[llvm] [AMDGPU] Extending wave reduction intrinsics to support `i64` types. (PR #150169)

via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 25 02:26:16 PDT 2025


https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/150169

>From 2685c0efa55b514674a2257683675c75a1fd618e Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Sat, 19 Jul 2025 12:48:18 +0530
Subject: [PATCH 1/4] Extending wave reduction intrinsics to support `i64`
 types.

Supported Operations:
`min`, `max`, `umin`, `umax`, `and`, `or`, `xor`, `add`, `sub`
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  358 +++-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   49 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll  | 1356 ++++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll  |  854 +++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll  |  900 +++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll  |  900 +++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll   |  857 +++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll  | 1615 +++++++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll |  670 +++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll |  862 +++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll  | 1413 ++++++++++++++
 11 files changed, 9792 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d65c3ae76566b..1a2c614b09ca9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5095,19 +5095,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
 static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
   switch (Opc) {
   case AMDGPU::S_MIN_U32:
+  case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
     return std::numeric_limits<uint32_t>::max();
   case AMDGPU::S_MIN_I32:
+  case AMDGPU::V_CMP_LT_I64_e64: // min.i64
     return std::numeric_limits<int32_t>::max();
   case AMDGPU::S_MAX_U32:
+  case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
     return std::numeric_limits<uint32_t>::min();
   case AMDGPU::S_MAX_I32:
+  case AMDGPU::V_CMP_GT_I64_e64: // max.i64
     return std::numeric_limits<int32_t>::min();
   case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_SUB_U64_PSEUDO:
   case AMDGPU::S_OR_B32:
+  case AMDGPU::S_OR_B64:
   case AMDGPU::S_XOR_B32:
+  case AMDGPU::S_XOR_B64:
     return std::numeric_limits<uint32_t>::min();
   case AMDGPU::S_AND_B32:
+  case AMDGPU::S_AND_B64:
     return std::numeric_limits<uint32_t>::max();
   default:
     llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
@@ -5128,26 +5137,38 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
   Register DstReg = MI.getOperand(0).getReg();
   MachineBasicBlock *RetBB = nullptr;
+  bool is32BitOpc = TRI->getRegSizeInBits(*MRI.getRegClass(DstReg)) == 32;
   if (isSGPR) {
     switch (Opc) {
     case AMDGPU::S_MIN_U32:
+    case AMDGPU::V_CMP_LT_U64_e64: /*umin*/
     case AMDGPU::S_MIN_I32:
+    case AMDGPU::V_CMP_LT_I64_e64: /*min*/
     case AMDGPU::S_MAX_U32:
+    case AMDGPU::V_CMP_GT_U64_e64: /*umax*/
     case AMDGPU::S_MAX_I32:
+    case AMDGPU::V_CMP_GT_I64_e64: /*max*/
     case AMDGPU::S_AND_B32:
-    case AMDGPU::S_OR_B32: {
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_OR_B32:
+    case AMDGPU::S_OR_B64: {
       // Idempotent operations.
-      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+      unsigned movOpc = is32BitOpc ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      BuildMI(BB, MI, DL, TII->get(movOpc), DstReg).addReg(SrcReg);
       RetBB = &BB;
       break;
     }
     case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_XOR_B64:
     case AMDGPU::S_ADD_I32:
-    case AMDGPU::S_SUB_I32: {
+    case AMDGPU::S_ADD_U64_PSEUDO:
+    case AMDGPU::S_SUB_I32:
+    case AMDGPU::S_SUB_U64_PSEUDO: {
       const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
       const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
       Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
-      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+      Register ActiveLanes =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
 
       bool IsWave32 = ST.isWave32();
       unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -5162,21 +5183,68 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                                 .addReg(Exec->getOperand(0).getReg());
 
       switch (Opc) {
-      case AMDGPU::S_XOR_B32: {
+      case AMDGPU::S_XOR_B32:
+      case AMDGPU::S_XOR_B64: {
         // Performing an XOR operation on a uniform value
         // depends on the parity of the number of active lanes.
         // For even parity, the result will be 0, for odd
         // parity the result will be the same as the input value.
-        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+        Register ParityRegister =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
 
         auto ParityReg =
             BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
                 .addReg(NewAccumulator->getOperand(0).getReg())
-                .addImm(1);
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(ParityReg->getOperand(0).getReg());
-        break;
+                .addImm(1)
+                .setOperandDead(3); // Dead scc
+        if (is32BitOpc) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+              .addReg(SrcReg)
+              .addReg(ParityReg->getOperand(0).getReg());
+          break;
+        } else {
+          Register DestSub0 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register DestSub1 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register Op1H_Op0L_Reg =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register CarryReg =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+          const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+          const TargetRegisterClass *SrcSubRC =
+              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+          MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+          MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+              .add(Op1L)
+              .addReg(ParityReg->getOperand(0).getReg());
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+              .add(Op1H)
+              .addReg(ParityReg->getOperand(0).getReg());
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+              .add(Op1L)
+              .addReg(ParityReg->getOperand(0).getReg());
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+              .addReg(CarryReg)
+              .addReg(Op1H_Op0L_Reg)
+              .setOperandDead(3); // Dead scc
+
+          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+              .addReg(DestSub0)
+              .addImm(AMDGPU::sub0)
+              .addReg(DestSub1)
+              .addImm(AMDGPU::sub1);
+          break;
+        }
       }
       case AMDGPU::S_SUB_I32: {
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
@@ -5197,6 +5265,76 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
+      case AMDGPU::S_ADD_U64_PSEUDO:
+      case AMDGPU::S_SUB_U64_PSEUDO: {
+        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1H_Op0L_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1L_Op0H_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValLo =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValHi =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+        const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
+        const TargetRegisterClass *Src1SubRC =
+            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
+
+        MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
+        MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedValLo)
+              .addReg(NewAccumulator->getOperand(0).getReg())
+              .addImm(-1);
+
+          MachineInstr *NegatedHi =
+              BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+                  .addReg(NegatedValLo)
+                  .addImm(31)
+                  .setOperandDead(3); // Dead scc
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+              .add(Op1L)
+              .addReg(NegatedHi->getOperand(0).getReg());
+        }
+        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
+                                 ? NegatedValLo
+                                 : NewAccumulator->getOperand(0).getReg();
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+            .add(Op1L)
+            .addReg(LowOpcode);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+            .add(Op1L)
+            .addReg(LowOpcode);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+            .add(Op1H)
+            .addReg(LowOpcode);
+
+        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
+            .addReg(CarryReg)
+            .addReg(Op1H_Op0L_Reg)
+            .setOperandDead(3); // Dead scc
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+              .addReg(HiVal)
+              .addReg(Op1L_Op0H_Reg)
+              .setOperandDead(3); // Dead scc
+        }
+        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+            .addReg(DestSub0)
+            .addImm(AMDGPU::sub0)
+            .addReg(DestSub1)
+            .addImm(AMDGPU::sub1);
+        break;
+      }
       }
       RetBB = &BB;
     }
@@ -5222,55 +5360,193 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
     const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
     Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
-
+    Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
     Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
     Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
     Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-
-    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
-    Register LaneValueReg =
-        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
 
     bool IsWave32 = ST.isWave32();
-    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
     // Create initial values of induction variable from Exec, Accumulator and
     // insert branch instr to newly created ComputeBlock
-    uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
-    auto TmpSReg =
-        BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
-    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
-        .addImm(InitalValue);
+    uint32_t IdentityValue = getIdentityValueForWaveReduction(Opc);
+    auto TmpSReg = BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator)
+                       .addReg(ExecReg);
+    if (is32BitOpc) {
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
+          .addImm(IdentityValue);
+    } else {
+      Register Identitylo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      Register Identityhi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      MachineInstr *IdenHi =
+          BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identityhi)
+              .addImm(IdentityValue);
+      switch (Opc) {
+      case AMDGPU::V_CMP_LT_U64_e64:
+      case AMDGPU::V_CMP_LT_I64_e64:
+        IdentityValue = int32_t(-1); // u|min
+        break;
+      case AMDGPU::V_CMP_GT_U64_e64:
+      case AMDGPU::V_CMP_GT_I64_e64:
+        IdentityValue = int32_t(0); // u|max
+        break;
+      }
+      MachineInstr *IdenLo =
+          BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identitylo)
+              .addImm(IdentityValue);
+      BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), IdentityValReg)
+          .addReg(IdenLo->getOperand(0).getReg())
+          .addImm(AMDGPU::sub0)
+          .addReg(IdenHi->getOperand(0).getReg())
+          .addImm(AMDGPU::sub1);
+    }
     // clang-format off
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
         .addMBB(ComputeLoop);
     // clang-format on
 
     // Start constructing ComputeLoop
-    I = ComputeLoop->end();
+    I = ComputeLoop->begin();
     auto Accumulator =
         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
-            .addReg(InitalValReg)
+            .addReg(IdentityValReg)
             .addMBB(&BB);
     auto ActiveBits =
         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
             .addReg(TmpSReg->getOperand(0).getReg())
             .addMBB(&BB);
 
+    I = ComputeLoop->end();
+    MachineInstr *NewAccumulator;
     // Perform the computations
     unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
     auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                    .addReg(ActiveBits->getOperand(0).getReg());
-    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
-                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
-                         .addReg(SrcReg)
-                         .addReg(FF1->getOperand(0).getReg());
-    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
-                              .addReg(Accumulator->getOperand(0).getReg())
-                              .addReg(LaneValue->getOperand(0).getReg());
+    if (is32BitOpc) {
+      MachineInstr *LaneValue =
+          BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                  LaneValueReg)
+              .addReg(SrcReg)
+              .addReg(FF1->getOperand(0).getReg());
+      NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                           .addReg(Accumulator->getOperand(0).getReg())
+                           .addReg(LaneValue->getOperand(0).getReg());
+    } else {
+      Register LaneValueLoReg =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register LaneValueHiReg =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+      const TargetRegisterClass *SrcSubRC =
+          TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+      MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+      MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+      // lane value input should be in an sgpr
+      MachineInstr *LaneValueLo =
+          BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                  LaneValueLoReg)
+              .add(Op1L)
+              .addReg(FF1->getOperand(0).getReg());
+      MachineInstr *LaneValueHi =
+          BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                  LaneValueHiReg)
+              .add(Op1H)
+              .addReg(FF1->getOperand(0).getReg());
+      auto LaneValue = BuildMI(*ComputeLoop, I, DL,
+                               TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
+                           .addReg(LaneValueLo->getOperand(0).getReg())
+                           .addImm(AMDGPU::sub0)
+                           .addReg(LaneValueHi->getOperand(0).getReg())
+                           .addImm(AMDGPU::sub1);
+      switch (Opc) {
+      case ::AMDGPU::S_OR_B64:
+      case ::AMDGPU::S_AND_B64:
+      case ::AMDGPU::S_XOR_B64: {
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                             .addReg(Accumulator->getOperand(0).getReg())
+                             .addReg(LaneValue->getOperand(0).getReg())
+                             .setOperandDead(3); // Dead scc
+        break;
+      }
+      case AMDGPU::V_CMP_GT_I64_e64:
+      case AMDGPU::V_CMP_GT_U64_e64:
+      case AMDGPU::V_CMP_LT_I64_e64:
+      case AMDGPU::V_CMP_LT_U64_e64: {
+        Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+        Register ComparisonResultReg =
+            MRI.createVirtualRegister(WaveMaskRegClass);
+        const TargetRegisterClass *VregClass =
+            ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
+                                   : &AMDGPU::VReg_64RegClass;
+        const TargetRegisterClass *VSubRegClass =
+            TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
+        Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
+        MachineOperand SrcReg0Sub0 =
+            TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+                                         VregClass, AMDGPU::sub0, VSubRegClass);
+        MachineOperand SrcReg0Sub1 =
+            TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+                                         VregClass, AMDGPU::sub1, VSubRegClass);
+        BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
+                AccumulatorVReg)
+            .add(SrcReg0Sub0)
+            .addImm(AMDGPU::sub0)
+            .add(SrcReg0Sub1)
+            .addImm(AMDGPU::sub1);
+        auto LaneMask = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
+                            .addReg(LaneValue->getOperand(0).getReg())
+                            .addReg(AccumulatorVReg);
+
+        unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+        BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
+            .addReg(LaneMask->getOperand(0).getReg())
+            .addReg(ActiveBits->getOperand(0).getReg());
 
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                                 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
+                             .addReg(LaneValue->getOperand(0).getReg())
+                             .addReg(Accumulator->getOperand(0).getReg());
+        break;
+      }
+      case ::AMDGPU::S_ADD_U64_PSEUDO:
+      case ::AMDGPU::S_SUB_U64_PSEUDO: {
+        unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
+                                                           : AMDGPU::S_SUB_U32;
+        unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
+                                                           : AMDGPU::S_SUBB_U32;
+        Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
+            MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
+            &AMDGPU::SReg_32RegClass);
+        MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
+            MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
+            &AMDGPU::SReg_32RegClass);
+        MachineInstr *DestLoComputation =
+            BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
+                .add(Accumlo)
+                .addReg(LaneValueLo->getOperand(0).getReg());
+        MachineInstr *DestHiComputation =
+            BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
+                .add(Accumhi)
+                .addReg(LaneValueHi->getOperand(0).getReg());
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                                 TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+                             .addReg(DestLoComputation->getOperand(0).getReg())
+                             .addImm(AMDGPU::sub0)
+                             .addReg(DestHiComputation->getOperand(0).getReg())
+                             .addImm(AMDGPU::sub1);
+        break;
+      }
+      }
+    }
     // Manipulate the iterator to get the next active lane
     unsigned BITSETOpc =
         IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
@@ -5310,22 +5586,40 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   switch (MI.getOpcode()) {
   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
   case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
   case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
   case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d05be8f95c618..f02b28681c5c1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -304,28 +304,57 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
     (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
 
 // clang-format off
-defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
+
 multiclass
-    AMDGPUWaveReducePseudoGenerator<string Op, string DataType> {
+    AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> {
   let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
     def !toupper(Op) #"_PSEUDO_" #DataType
-        : VPseudoInstSI<(outs SGPR_32 : $sdst),
-                        (ins VSrc_b32 : $src, VSrc_b32 : $strategy),
-                        [(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
+        : VPseudoInstSI<(outs RetReg : $sdst),
+                        (ins Reg : $src, VSrc_b32 : $strategy),
+                        [(set ty : $sdst, (!cast<AMDGPUWaveReduce>("int_amdgcn_wave_reduce_" #Op) ty : $src, i32 : $strategy))]> {}
   }
 }
 // clang-format on
 
+class WaveReduceOp<string OpName, string TypeStr, ValueType Ty,
+                   RegisterClass ReturnRegisterClass, SrcRegOrImm9 RC> {
+  string Name = OpName;
+  string TypeString = TypeStr;
+  ValueType VT = Ty;
+  RegisterClass RetReg = ReturnRegisterClass;
+  SrcRegOrImm9 Reg = RC;
+}
+
 // Input list : [Operation_name,
-//              type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)]
+//              type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
+//              bit-width
+//              output register class,
+//              input register class]
 defvar Operations = [
-  ["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"],
-  ["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"],
-  ["xor", "B32"]
+  WaveReduceOp<"umin", "U32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"min", "I32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"umax", "U32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"max", "I32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"add", "I32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"sub", "I32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"and", "B32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"or", "B32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"xor", "B32", i32, SGPR_32, VSrc_b32>,
+
+  WaveReduceOp<"umin", "U64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"add", "I64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"sub", "I64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>,
 ];
 
 foreach Op = Operations in {
-  defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>;
+  defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op.Name, Op.TypeString,
+                                                      Op.VT, Op.RetReg, Op.Reg>;
 }
 
 let usesCustomInserter = 1, Defs = [VCC] in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
index d2ca1d8136043..b6af8b4bb798d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -1226,6 +1226,1362 @@ endif:
   store i32 %combine, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    s_mul_i32 s0, s2, s4
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s1, s2, s4
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s3, s4
+; GFX8DAGISEL-NEXT:    s_add_u32 s1, s1, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s5, s[4:5]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s4, s2, s5
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s2, s2, s5
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s3, s5
+; GFX8GISEL-NEXT:    s_add_u32 s5, s2, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s5, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s5
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s3, s5
+; GFX9DAGISEL-NEXT:    s_add_u32 s5, s2, s3
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s5, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s4, s2, s5
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s2, s2, s5
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s3, s5
+; GFX9GISEL-NEXT:    s_add_u32 s5, s2, s3
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1064DAGISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1064GISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1064GISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1032DAGISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1032GISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1032GISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1164DAGISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1164GISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1164GISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1132DAGISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1132GISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1132GISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %in, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s3, s[2:3]
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s3, 0x7b
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s3
+; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s3, 0
+; GFX8DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s3, s[2:3]
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s3, 0x7b
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s3
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s3, 0
+; GFX8GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s3, s[2:3]
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s3, 0x7b
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s3
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s3, 0
+; GFX9DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s3, s[2:3]
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s3, 0x7b
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s3
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s3, 0
+; GFX9GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s3, 0x7b, s2
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s4, s2, 0
+; GFX1064DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1064DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s3, 0x7b, s2
+; GFX1064GISEL-NEXT:    s_mul_i32 s4, s2, 0
+; GFX1064GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1064GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s3, 0x7b, s2
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s2, 0
+; GFX1032DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1032DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s3, 0x7b, s2
+; GFX1032GISEL-NEXT:    s_mul_i32 s4, s2, 0
+; GFX1032GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1032GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s3, 0x7b, s2
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s4, s2, 0
+; GFX1164DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1164DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s3, 0x7b, s2
+; GFX1164GISEL-NEXT:    s_mul_i32 s4, s2, 0
+; GFX1164GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1164GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s3, 0x7b, s2
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s4, s2, 0
+; GFX1132DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1132DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s3, 0x7b, s2
+; GFX1132GISEL-NEXT:    s_mul_i32 s4, s2, 0
+; GFX1132GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1132GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 123, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: poison_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s3, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s0, s3
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s4, s0, s3
+; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s1, s3
+; GFX8DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s3, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s0, s3
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s4, s0, s3
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s1, s3
+; GFX8GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s3, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s0, s3
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s4, s0, s3
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s1, s3
+; GFX9DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s3, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s0, s3
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s4, s0, s3
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s1, s3
+; GFX9GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s3, s0, s2
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s4, s1, s2
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1064DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s3, s0, s2
+; GFX1064GISEL-NEXT:    s_mul_i32 s4, s1, s2
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1064GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s3, s0, s2
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s1, s2
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1032DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s3, s0, s2
+; GFX1032GISEL-NEXT:    s_mul_i32 s4, s1, s2
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1032GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s3, s0, s2
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s4, s1, s2
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1164DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s3, s0, s2
+; GFX1164GISEL-NEXT:    s_mul_i32 s4, s1, s2
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1164GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s3, s0, s2
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s4, s1, s2
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1132DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s3, s0, s2
+; GFX1132GISEL-NEXT:    s_mul_i32 s4, s1, s2
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1132GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 poison, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
+; GFX8DAGISEL-LABEL: divergent_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
+; GFX8DAGISEL-NEXT:    s_add_u32 s4, s4, s9
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX8DAGISEL-NEXT:    s_addc_u32 s5, s5, s10
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX8GISEL-NEXT:    v_readlane_b32 s9, v2, s8
+; GFX8GISEL-NEXT:    v_readlane_b32 s10, v3, s8
+; GFX8GISEL-NEXT:    s_add_u32 s4, s4, s9
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX8GISEL-NEXT:    s_addc_u32 s5, s5, s10
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
+; GFX9DAGISEL-NEXT:    s_add_u32 s4, s4, s9
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX9DAGISEL-NEXT:    s_addc_u32 s5, s5, s10
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX9GISEL-NEXT:    v_readlane_b32 s9, v2, s8
+; GFX9GISEL-NEXT:    v_readlane_b32 s10, v3, s8
+; GFX9GISEL-NEXT:    s_add_u32 s4, s4, s9
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX9GISEL-NEXT:    s_addc_u32 s5, s5, s10
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX1064DAGISEL-NEXT:    s_add_u32 s4, s4, s9
+; GFX1064DAGISEL-NEXT:    s_addc_u32 s5, s5, s10
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v2, s8
+; GFX1064GISEL-NEXT:    v_readlane_b32 s10, v3, s8
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX1064GISEL-NEXT:    s_add_u32 s4, s4, s9
+; GFX1064GISEL-NEXT:    s_addc_u32 s5, s5, s10
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032DAGISEL-NEXT:    s_add_u32 s4, s4, s8
+; GFX1032DAGISEL-NEXT:    s_addc_u32 s5, s5, s9
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032GISEL-NEXT:    s_add_u32 s4, s4, s8
+; GFX1032GISEL-NEXT:    s_addc_u32 s5, s5, s9
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s4, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s5, v2, s4
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v3, s4
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s4
+; GFX1164DAGISEL-NEXT:    s_add_u32 s0, s0, s5
+; GFX1164DAGISEL-NEXT:    s_addc_u32 s1, s1, s6
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s4, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s5, v2, s4
+; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v3, s4
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s4
+; GFX1164GISEL-NEXT:    s_add_u32 s0, s0, s5
+; GFX1164GISEL-NEXT:    s_addc_u32 s1, s1, s6
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_add_u32 s0, s0, s4
+; GFX1132DAGISEL-NEXT:    s_addc_u32 s1, s1, s5
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_add_u32 s0, s0, s4
+; GFX1132GISEL-NEXT:    s_addc_u32 s1, s1, s5
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %id.x, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 %in2) {
+; GFX8DAGISEL-LABEL: divergent_cfg_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s7, s[6:7]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s6, s2, s7
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
+; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s3, s7
+; GFX8DAGISEL-NEXT:    s_add_u32 s7, s2, s3
+; GFX8DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s7, s[6:7]
+; GFX8DAGISEL-NEXT:    s_mul_i32 s6, s4, s7
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s4, s4, s7
+; GFX8DAGISEL-NEXT:    s_mul_i32 s5, s5, s7
+; GFX8DAGISEL-NEXT:    s_add_u32 s7, s4, s5
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s7, s[6:7]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s6, s2, s7
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s3, s7
+; GFX8GISEL-NEXT:    s_add_u32 s7, s2, s3
+; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s7, s[6:7]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s6, s4, s7
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s4, s4, s7
+; GFX8GISEL-NEXT:    s_mul_i32 s5, s5, s7
+; GFX8GISEL-NEXT:    s_add_u32 s7, s4, s5
+; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s5, s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s5
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s3, s5
+; GFX9DAGISEL-NEXT:    s_add_u32 s5, s2, s3
+; GFX9DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s5, s[4:5]
+; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s6, s5
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s6, s6, s5
+; GFX9DAGISEL-NEXT:    s_mul_i32 s5, s7, s5
+; GFX9DAGISEL-NEXT:    s_add_u32 s5, s6, s5
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s7, s[6:7]
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s6, s2, s7
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s3, s7
+; GFX9GISEL-NEXT:    s_add_u32 s7, s2, s3
+; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s6, s8, s4
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s5, s8, s4
+; GFX9GISEL-NEXT:    s_mul_i32 s4, s9, s4
+; GFX9GISEL-NEXT:    s_add_u32 s7, s5, s4
+; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s9, s2, s8
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s3, s8
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s8, s2, s8
+; GFX1064DAGISEL-NEXT:    s_add_u32 s9, s9, s3
+; GFX1064DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s5, s6, s4
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s7, s7, s4
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s4, s6, s4
+; GFX1064DAGISEL-NEXT:    s_add_u32 s5, s5, s7
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1064DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s7, s2, s6
+; GFX1064GISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1064GISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1064GISEL-NEXT:    s_add_u32 s7, s7, s3
+; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s5, s6, s4
+; GFX1064GISEL-NEXT:    s_mul_i32 s7, s7, s4
+; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s4
+; GFX1064GISEL-NEXT:    s_add_u32 s7, s5, s7
+; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s2, s4
+; GFX1032DAGISEL-NEXT:    s_add_u32 s5, s5, s3
+; GFX1032DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s2
+; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s5, s6, s3
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s7, s7, s3
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s6, s3
+; GFX1032DAGISEL-NEXT:    s_add_u32 s5, s5, s7
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1032DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s7, s2, s6
+; GFX1032GISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1032GISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1032GISEL-NEXT:    s_add_u32 s7, s7, s3
+; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s4, s6, s3
+; GFX1032GISEL-NEXT:    s_mul_i32 s5, s7, s3
+; GFX1032GISEL-NEXT:    s_mul_i32 s6, s6, s3
+; GFX1032GISEL-NEXT:    s_add_u32 s7, s4, s5
+; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s9, s2, s8
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s3, s8
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s8, s2, s8
+; GFX1164DAGISEL-NEXT:    s_add_u32 s9, s9, s3
+; GFX1164DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[6:7]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s7, s4, s6
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s5, s5, s6
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT:    s_add_u32 s5, s7, s5
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1164DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s7, s2, s6
+; GFX1164GISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1164GISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1164GISEL-NEXT:    s_add_u32 s7, s7, s3
+; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s7, s4, s6
+; GFX1164GISEL-NEXT:    s_mul_i32 s5, s5, s6
+; GFX1164GISEL-NEXT:    s_mul_i32 s6, s4, s6
+; GFX1164GISEL-NEXT:    s_add_u32 s7, s7, s5
+; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s7, s2, s6
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1132DAGISEL-NEXT:    s_add_u32 s7, s7, s3
+; GFX1132DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s2
+; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s6, s4, s3
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s5, s5, s3
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s4, s4, s3
+; GFX1132DAGISEL-NEXT:    s_add_u32 s5, s6, s5
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1132DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s7, s2, s6
+; GFX1132GISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1132GISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1132GISEL-NEXT:    s_add_u32 s7, s7, s3
+; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s7, s4, s3
+; GFX1132GISEL-NEXT:    s_mul_i32 s5, s5, s3
+; GFX1132GISEL-NEXT:    s_mul_i32 s6, s4, s3
+; GFX1132GISEL-NEXT:    s_add_u32 s7, s7, s5
+; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %in2, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i64 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i64 %combine, ptr addrspace(1) %out
+  ret void
+}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX10DAGISEL: {{.*}}
 ; GFX10GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
index 356b0e73b39e7..55e6189f65675 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
@@ -980,3 +980,857 @@ endif:
   store i32 %combine, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.and.i64(i64 %in, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: const_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.and.i64(i64 123, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: poison_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value_i64:
+; GFX11DAGISEL:       ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
+; GFX11DAGISEL-NEXT:    s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value_i64:
+; GFX11GISEL:       ; %bb.0: ; %entry
+; GFX11GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
+; GFX11GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.and.i64(i64 poison, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
+; GFX8DAGISEL-LABEL: divergent_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX8DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX8GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX9DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX9GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX1064DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX1064GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s5, v3, s6
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s6
+; GFX1164DAGISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s6
+; GFX1164GISEL-NEXT:    v_readlane_b32 s5, v3, s6
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s6
+; GFX1164GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.and.i64(i64 %id.x, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 %in2) {
+; GFX8DAGISEL-LABEL: divergent_cfg_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s4, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1164DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s6, exec_lo, s6
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s6, s6
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s6
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1132DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i64 @llvm.amdgcn.wave.reduce.and.i64(i64 %in2, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i64 @llvm.amdgcn.wave.reduce.and.i64(i64 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i64 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i64 %combine, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
index 7dc0cb05b0abe..96b67e71fcd28 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
@@ -980,3 +980,903 @@ endif:
   store i32 %combine, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.max.i64(i64 %in, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: const_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.max.i64(i64 123, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: poison_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value_i64:
+; GFX11DAGISEL:       ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
+; GFX11DAGISEL-NEXT:    s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value_i64:
+; GFX11GISEL:       ; %bb.0: ; %entry
+; GFX11GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
+; GFX11GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.max.i64(i64 poison, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
+; GFX8DAGISEL-LABEL: divergent_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT:    s_brev_b32 s5, 1
+; GFX8DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX8DAGISEL-NEXT:    v_cmp_gt_i64_e32 vcc, s[8:9], v[4:5]
+; GFX8DAGISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX8DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT:    s_brev_b32 s5, 1
+; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX8GISEL-NEXT:    v_cmp_gt_i64_e32 vcc, s[8:9], v[4:5]
+; GFX8GISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX8GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9DAGISEL-NEXT:    s_brev_b32 s5, 1
+; GFX9DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX9DAGISEL-NEXT:    v_cmp_gt_i64_e32 vcc, s[8:9], v[4:5]
+; GFX9DAGISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX9DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT:    s_brev_b32 s5, 1
+; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX9GISEL-NEXT:    v_cmp_gt_i64_e32 vcc, s[8:9], v[4:5]
+; GFX9GISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX9GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064DAGISEL-NEXT:    s_brev_b32 s5, 1
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX1064DAGISEL-NEXT:    v_cmp_gt_i64_e32 vcc, s[8:9], v[4:5]
+; GFX1064DAGISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX1064DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT:    s_brev_b32 s5, 1
+; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX1064GISEL-NEXT:    v_cmp_gt_i64_e32 vcc, s[8:9], v[4:5]
+; GFX1064GISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX1064GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032DAGISEL-NEXT:    s_brev_b32 s5, 1
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032DAGISEL-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[8:9], v[4:5]
+; GFX1032DAGISEL-NEXT:    s_and_b32 s10, vcc_lo, s6
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT:    s_brev_b32 s5, 1
+; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032GISEL-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[8:9], v[4:5]
+; GFX1032GISEL-NEXT:    s_and_b32 s10, vcc_lo, s6
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_brev_b32 s1, 1
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v5, s1
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s8
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s5, v3, s8
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[4:5]
+; GFX1164DAGISEL-NEXT:    s_and_b64 s[6:7], vcc, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s8
+; GFX1164DAGISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_brev_b32 s1, 1
+; GFX1164GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v5, s1
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s8
+; GFX1164GISEL-NEXT:    v_readlane_b32 s5, v3, s8
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[4:5]
+; GFX1164GISEL-NEXT:    s_and_b64 s[6:7], vcc, s[2:3]
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s8
+; GFX1164GISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_brev_b32 s1, 1
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[4:5]
+; GFX1132DAGISEL-NEXT:    s_and_b32 s6, vcc_lo, s2
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_brev_b32 s1, 1
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[4:5]
+; GFX1132GISEL-NEXT:    s_and_b32 s6, vcc_lo, s2
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.max.i64(i64 %id.x, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 %in2) {
+; GFX8DAGISEL-LABEL: divergent_cfg_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s4, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1164DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s6, exec_lo, s6
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s6, s6
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s6
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1132DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i64 @llvm.amdgcn.wave.reduce.max.i64(i64 %in2, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i64 @llvm.amdgcn.wave.reduce.max.i64(i64 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i64 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i64 %combine, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
index 7cb0e6533c722..4e0c9ec111cbe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
@@ -980,3 +980,903 @@ endif:
   store i32 %combine, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.min.i64(i64 %in, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: const_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.min.i64(i64 123, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: poison_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value_i64:
+; GFX11DAGISEL:       ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
+; GFX11DAGISEL-NEXT:    s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value_i64:
+; GFX11GISEL:       ; %bb.0: ; %entry
+; GFX11GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
+; GFX11GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.min.i64(i64 poison, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
+; GFX8DAGISEL-LABEL: divergent_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT:    s_brev_b32 s5, -2
+; GFX8DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX8DAGISEL-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[4:5]
+; GFX8DAGISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX8DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT:    s_brev_b32 s5, -2
+; GFX8GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX8GISEL-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[4:5]
+; GFX8GISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX8GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9DAGISEL-NEXT:    s_brev_b32 s5, -2
+; GFX9DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX9DAGISEL-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[4:5]
+; GFX9DAGISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX9DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT:    s_brev_b32 s5, -2
+; GFX9GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX9GISEL-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[4:5]
+; GFX9GISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX9GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064DAGISEL-NEXT:    s_brev_b32 s5, -2
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[4:5]
+; GFX1064DAGISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX1064DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT:    s_brev_b32 s5, -2
+; GFX1064GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX1064GISEL-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[4:5]
+; GFX1064GISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX1064GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032DAGISEL-NEXT:    s_brev_b32 s5, -2
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[8:9], v[4:5]
+; GFX1032DAGISEL-NEXT:    s_and_b32 s10, vcc_lo, s6
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT:    s_brev_b32 s5, -2
+; GFX1032GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032GISEL-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[8:9], v[4:5]
+; GFX1032GISEL-NEXT:    s_and_b32 s10, vcc_lo, s6
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_brev_b32 s1, -2
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v5, s1
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s8
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s5, v3, s8
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[4:5]
+; GFX1164DAGISEL-NEXT:    s_and_b64 s[6:7], vcc, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s8
+; GFX1164DAGISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_brev_b32 s1, -2
+; GFX1164GISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v5, s1
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s8
+; GFX1164GISEL-NEXT:    v_readlane_b32 s5, v3, s8
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[4:5]
+; GFX1164GISEL-NEXT:    s_and_b64 s[6:7], vcc, s[2:3]
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s8
+; GFX1164GISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_brev_b32 s1, -2
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[4:5]
+; GFX1132DAGISEL-NEXT:    s_and_b32 s6, vcc_lo, s2
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_brev_b32 s1, -2
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[4:5]
+; GFX1132GISEL-NEXT:    s_and_b32 s6, vcc_lo, s2
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.min.i64(i64 %id.x, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 %in2) {
+; GFX8DAGISEL-LABEL: divergent_cfg_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s4, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1164DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s6, exec_lo, s6
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s6, s6
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s6
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1132DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i64 @llvm.amdgcn.wave.reduce.min.i64(i64 %in2, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i64 @llvm.amdgcn.wave.reduce.min.i64(i64 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i64 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i64 %combine, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
index e08787e6ba70a..5dcd1de6d05d7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
@@ -12,6 +12,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
 
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
 define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-LABEL: uniform_value:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -980,3 +982,858 @@ endif:
   store i32 %combine, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.or.i64(i64 %in, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: const_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.or.i64(i64 123, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: poison_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value_i64:
+; GFX11DAGISEL:       ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
+; GFX11DAGISEL-NEXT:    s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value_i64:
+; GFX11GISEL:       ; %bb.0: ; %entry
+; GFX11GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
+; GFX11GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.or.i64(i64 poison, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
+; GFX8DAGISEL-LABEL: divergent_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX8DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX8GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX9DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX9GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX1064DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX1064GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s5, v3, s6
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s6
+; GFX1164DAGISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s6
+; GFX1164GISEL-NEXT:    v_readlane_b32 s5, v3, s6
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s6
+; GFX1164GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  ; %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i64 @llvm.amdgcn.wave.reduce.or.i64(i64 %id.x, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 %in2) {
+; GFX8DAGISEL-LABEL: divergent_cfg_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s4, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1164DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s6, exec_lo, s6
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s6, s6
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s6
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1132DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i64 @llvm.amdgcn.wave.reduce.or.i64(i64 %in2, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i64 @llvm.amdgcn.wave.reduce.or.i64(i64 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i64 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i64 %combine, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
index edb888a21f735..aff5c01ee174b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
@@ -1275,6 +1275,1621 @@ endif:
   store i32 %combine, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8DAGISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    s_ashr_i32 s0, s4, 31
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    s_mul_i32 s1, s2, s0
+; GFX8DAGISEL-NEXT:    s_mul_i32 s0, s2, s4
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s4
+; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX8DAGISEL-NEXT:    s_add_u32 s2, s2, s3
+; GFX8DAGISEL-NEXT:    s_add_u32 s1, s2, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8GISEL-NEXT:    s_mul_i32 s5, s4, -1
+; GFX8GISEL-NEXT:    s_ashr_i32 s4, s5, 31
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s6, s2, s4
+; GFX8GISEL-NEXT:    s_mul_i32 s4, s2, s5
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s2, s2, s5
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s3, s5
+; GFX8GISEL-NEXT:    s_add_u32 s2, s2, s3
+; GFX8GISEL-NEXT:    s_add_u32 s5, s2, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9DAGISEL-NEXT:    s_mul_i32 s5, s4, -1
+; GFX9DAGISEL-NEXT:    s_ashr_i32 s4, s5, 31
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s6, s2, s4
+; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s5
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s3, s5
+; GFX9DAGISEL-NEXT:    s_add_u32 s2, s2, s3
+; GFX9DAGISEL-NEXT:    s_add_u32 s5, s2, s6
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9GISEL-NEXT:    s_mul_i32 s5, s4, -1
+; GFX9GISEL-NEXT:    s_ashr_i32 s4, s5, 31
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s6, s2, s4
+; GFX9GISEL-NEXT:    s_mul_i32 s4, s2, s5
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s2, s2, s5
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s3, s5
+; GFX9GISEL-NEXT:    s_add_u32 s2, s2, s3
+; GFX9GISEL-NEXT:    s_add_u32 s5, s2, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1064DAGISEL-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s5, s2, s5
+; GFX1064DAGISEL-NEXT:    s_add_u32 s3, s6, s3
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1064DAGISEL-NEXT:    s_add_u32 s3, s3, s5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064GISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1064GISEL-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
+; GFX1064GISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1064GISEL-NEXT:    s_mul_i32 s5, s2, s5
+; GFX1064GISEL-NEXT:    s_add_u32 s3, s6, s3
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1064GISEL-NEXT:    s_add_u32 s3, s3, s5
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1032DAGISEL-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s5, s2, s5
+; GFX1032DAGISEL-NEXT:    s_add_u32 s3, s6, s3
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1032DAGISEL-NEXT:    s_add_u32 s3, s3, s5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1032GISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1032GISEL-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
+; GFX1032GISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1032GISEL-NEXT:    s_mul_i32 s5, s2, s5
+; GFX1032GISEL-NEXT:    s_add_u32 s3, s6, s3
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1032GISEL-NEXT:    s_add_u32 s3, s3, s5
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1164DAGISEL-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s5, s2, s5
+; GFX1164DAGISEL-NEXT:    s_add_u32 s3, s6, s3
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1164DAGISEL-NEXT:    s_add_u32 s3, s3, s5
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1164GISEL-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
+; GFX1164GISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1164GISEL-NEXT:    s_mul_i32 s5, s2, s5
+; GFX1164GISEL-NEXT:    s_add_u32 s3, s6, s3
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1164GISEL-NEXT:    s_add_u32 s3, s3, s5
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s5, s2, s5
+; GFX1132DAGISEL-NEXT:    s_add_u32 s3, s6, s3
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1132DAGISEL-NEXT:    s_add_u32 s3, s3, s5
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1132GISEL-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
+; GFX1132GISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1132GISEL-NEXT:    s_mul_i32 s5, s2, s5
+; GFX1132GISEL-NEXT:    s_add_u32 s3, s6, s3
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1132GISEL-NEXT:    s_add_u32 s3, s3, s5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 %in, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s2, -1
+; GFX8DAGISEL-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX8DAGISEL-NEXT:    s_mul_i32 s4, s2, 0x7b
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s3, 0x7b
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s5, 0x7b, s3
+; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s3, 0
+; GFX8DAGISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX8DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s2, -1
+; GFX8GISEL-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX8GISEL-NEXT:    s_mul_i32 s4, s2, 0x7b
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s3, 0x7b
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s5, 0x7b, s3
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s3, 0
+; GFX8GISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX8GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s2, -1
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s2, 0x7b
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s3, 0x7b
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s5, 0x7b, s3
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s3, 0
+; GFX9DAGISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX9DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s2, -1
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX9GISEL-NEXT:    s_mul_i32 s4, s2, 0x7b
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s3, 0x7b
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s5, 0x7b, s3
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s3, 0
+; GFX9GISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX9GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1064DAGISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s5, s2, 0
+; GFX1064DAGISEL-NEXT:    s_mulk_i32 s3, 0x7b
+; GFX1064DAGISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1064DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1064DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1064GISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1064GISEL-NEXT:    s_mul_i32 s5, s2, 0
+; GFX1064GISEL-NEXT:    s_mulk_i32 s3, 0x7b
+; GFX1064GISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1064GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1064GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1032DAGISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s5, s2, 0
+; GFX1032DAGISEL-NEXT:    s_mulk_i32 s3, 0x7b
+; GFX1032DAGISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1032DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1032DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1032GISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1032GISEL-NEXT:    s_mul_i32 s5, s2, 0
+; GFX1032GISEL-NEXT:    s_mulk_i32 s3, 0x7b
+; GFX1032GISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1032GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1032GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1164DAGISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s5, s2, 0
+; GFX1164DAGISEL-NEXT:    s_mulk_i32 s3, 0x7b
+; GFX1164DAGISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1164DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1164DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1164GISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1164GISEL-NEXT:    s_mul_i32 s5, s2, 0
+; GFX1164GISEL-NEXT:    s_mulk_i32 s3, 0x7b
+; GFX1164GISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1164GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1164GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s5, s2, 0
+; GFX1132DAGISEL-NEXT:    s_mulk_i32 s3, 0x7b
+; GFX1132DAGISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1132DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1132DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1132GISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1132GISEL-NEXT:    s_mul_i32 s5, s2, 0
+; GFX1132GISEL-NEXT:    s_mulk_i32 s3, 0x7b
+; GFX1132GISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1132GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1132GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 123, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: poison_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s2, -1
+; GFX8DAGISEL-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s4, s0, s2
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s0, s3
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s5, s0, s3
+; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s1, s3
+; GFX8DAGISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX8DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s2, -1
+; GFX8GISEL-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s4, s0, s2
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s0, s3
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s5, s0, s3
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s1, s3
+; GFX8GISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX8GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s2, -1
+; GFX9DAGISEL-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s0, s2
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s0, s3
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s5, s0, s3
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s1, s3
+; GFX9DAGISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX9DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s2, -1
+; GFX9GISEL-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s4, s0, s2
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s0, s3
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s5, s0, s3
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s1, s3
+; GFX9GISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX9GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1064DAGISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s5, s1, s2
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s0, s3
+; GFX1064DAGISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1064DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1064GISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1064GISEL-NEXT:    s_mul_i32 s5, s1, s2
+; GFX1064GISEL-NEXT:    s_mul_i32 s3, s0, s3
+; GFX1064GISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1064GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1032DAGISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s5, s1, s2
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, s0, s3
+; GFX1032DAGISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1032DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1032GISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1032GISEL-NEXT:    s_mul_i32 s5, s1, s2
+; GFX1032GISEL-NEXT:    s_mul_i32 s3, s0, s3
+; GFX1032GISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1032GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1164DAGISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s5, s1, s2
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s0, s3
+; GFX1164DAGISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1164DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1164GISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1164GISEL-NEXT:    s_mul_i32 s5, s1, s2
+; GFX1164GISEL-NEXT:    s_mul_i32 s3, s0, s3
+; GFX1164GISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1164GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s5, s1, s2
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, s0, s3
+; GFX1132DAGISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1132DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1132GISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1132GISEL-NEXT:    s_mul_i32 s5, s1, s2
+; GFX1132GISEL-NEXT:    s_mul_i32 s3, s0, s3
+; GFX1132GISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1132GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 poison, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
+; GFX8DAGISEL-LABEL: divergent_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
+; GFX8DAGISEL-NEXT:    s_sub_u32 s4, s4, s9
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX8DAGISEL-NEXT:    s_subb_u32 s5, s5, s10
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX8GISEL-NEXT:    v_readlane_b32 s9, v2, s8
+; GFX8GISEL-NEXT:    v_readlane_b32 s10, v3, s8
+; GFX8GISEL-NEXT:    s_sub_u32 s4, s4, s9
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX8GISEL-NEXT:    s_subb_u32 s5, s5, s10
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
+; GFX9DAGISEL-NEXT:    s_sub_u32 s4, s4, s9
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX9DAGISEL-NEXT:    s_subb_u32 s5, s5, s10
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX9GISEL-NEXT:    v_readlane_b32 s9, v2, s8
+; GFX9GISEL-NEXT:    v_readlane_b32 s10, v3, s8
+; GFX9GISEL-NEXT:    s_sub_u32 s4, s4, s9
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX9GISEL-NEXT:    s_subb_u32 s5, s5, s10
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX1064DAGISEL-NEXT:    s_sub_u32 s4, s4, s9
+; GFX1064DAGISEL-NEXT:    s_subb_u32 s5, s5, s10
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v2, s8
+; GFX1064GISEL-NEXT:    v_readlane_b32 s10, v3, s8
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX1064GISEL-NEXT:    s_sub_u32 s4, s4, s9
+; GFX1064GISEL-NEXT:    s_subb_u32 s5, s5, s10
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032DAGISEL-NEXT:    s_sub_u32 s4, s4, s8
+; GFX1032DAGISEL-NEXT:    s_subb_u32 s5, s5, s9
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032GISEL-NEXT:    s_sub_u32 s4, s4, s8
+; GFX1032GISEL-NEXT:    s_subb_u32 s5, s5, s9
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s4, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s5, v2, s4
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v3, s4
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s4
+; GFX1164DAGISEL-NEXT:    s_sub_u32 s0, s0, s5
+; GFX1164DAGISEL-NEXT:    s_subb_u32 s1, s1, s6
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s4, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s5, v2, s4
+; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v3, s4
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s4
+; GFX1164GISEL-NEXT:    s_sub_u32 s0, s0, s5
+; GFX1164GISEL-NEXT:    s_subb_u32 s1, s1, s6
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_sub_u32 s0, s0, s4
+; GFX1132DAGISEL-NEXT:    s_subb_u32 s1, s1, s5
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_sub_u32 s0, s0, s4
+; GFX1132GISEL-NEXT:    s_subb_u32 s1, s1, s5
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 %id.x, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 %in2) {
+; GFX8DAGISEL-LABEL: divergent_cfg_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8DAGISEL-NEXT:    s_mul_i32 s7, s6, -1
+; GFX8DAGISEL-NEXT:    s_ashr_i32 s6, s7, 31
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s10, s2, s6
+; GFX8DAGISEL-NEXT:    s_mul_i32 s6, s2, s7
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
+; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s3, s7
+; GFX8DAGISEL-NEXT:    s_add_u32 s2, s2, s3
+; GFX8DAGISEL-NEXT:    s_add_u32 s7, s2, s10
+; GFX8DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8DAGISEL-NEXT:    s_mul_i32 s7, s6, -1
+; GFX8DAGISEL-NEXT:    s_ashr_i32 s6, s7, 31
+; GFX8DAGISEL-NEXT:    s_mul_i32 s8, s4, s6
+; GFX8DAGISEL-NEXT:    s_mul_i32 s6, s4, s7
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s4, s4, s7
+; GFX8DAGISEL-NEXT:    s_mul_i32 s5, s5, s7
+; GFX8DAGISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX8DAGISEL-NEXT:    s_add_u32 s7, s4, s8
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8DAGISEL-NEXT:  .LBB9_4: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8GISEL-NEXT:    s_mul_i32 s7, s6, -1
+; GFX8GISEL-NEXT:    s_ashr_i32 s6, s7, 31
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s10, s2, s6
+; GFX8GISEL-NEXT:    s_mul_i32 s6, s2, s7
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s3, s7
+; GFX8GISEL-NEXT:    s_add_u32 s2, s2, s3
+; GFX8GISEL-NEXT:    s_add_u32 s7, s2, s10
+; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8GISEL-NEXT:    s_mul_i32 s7, s6, -1
+; GFX8GISEL-NEXT:    s_ashr_i32 s6, s7, 31
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s8, s4, s6
+; GFX8GISEL-NEXT:    s_mul_i32 s6, s4, s7
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s4, s4, s7
+; GFX8GISEL-NEXT:    s_mul_i32 s5, s5, s7
+; GFX8GISEL-NEXT:    s_add_u32 s4, s4, s5
+; GFX8GISEL-NEXT:    s_add_u32 s7, s4, s8
+; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9DAGISEL-NEXT:    s_mul_i32 s5, s4, -1
+; GFX9DAGISEL-NEXT:    s_ashr_i32 s4, s5, 31
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s10, s2, s4
+; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s5
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s3, s5
+; GFX9DAGISEL-NEXT:    s_add_u32 s2, s2, s3
+; GFX9DAGISEL-NEXT:    s_add_u32 s5, s2, s10
+; GFX9DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9DAGISEL-NEXT:    s_mul_i32 s5, s4, -1
+; GFX9DAGISEL-NEXT:    s_ashr_i32 s4, s5, 31
+; GFX9DAGISEL-NEXT:    s_mul_i32 s8, s6, s4
+; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s6, s5
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s6, s6, s5
+; GFX9DAGISEL-NEXT:    s_mul_i32 s5, s7, s5
+; GFX9DAGISEL-NEXT:    s_add_u32 s5, s6, s5
+; GFX9DAGISEL-NEXT:    s_add_u32 s5, s5, s8
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT:  .LBB9_4: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX9GISEL-NEXT:    s_mul_i32 s7, s6, -1
+; GFX9GISEL-NEXT:    s_ashr_i32 s6, s7, 31
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s10, s2, s6
+; GFX9GISEL-NEXT:    s_mul_i32 s6, s2, s7
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s3, s7
+; GFX9GISEL-NEXT:    s_add_u32 s2, s2, s3
+; GFX9GISEL-NEXT:    s_add_u32 s7, s2, s10
+; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9GISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX9GISEL-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s6, s8, s4
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s7, s8, s4
+; GFX9GISEL-NEXT:    s_mul_i32 s4, s9, s4
+; GFX9GISEL-NEXT:    s_mul_i32 s5, s8, s5
+; GFX9GISEL-NEXT:    s_add_u32 s4, s7, s4
+; GFX9GISEL-NEXT:    s_add_u32 s7, s4, s5
+; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s8, s8, -1
+; GFX1064DAGISEL-NEXT:    s_ashr_i32 s9, s8, 31
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s10, s2, s8
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s3, s8
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s9, s2, s9
+; GFX1064DAGISEL-NEXT:    s_add_u32 s3, s10, s3
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s8, s2, s8
+; GFX1064DAGISEL-NEXT:    s_add_u32 s9, s3, s9
+; GFX1064DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1064DAGISEL-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s8, s6, s4
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s7, s7, s4
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s5, s6, s5
+; GFX1064DAGISEL-NEXT:    s_add_u32 s7, s8, s7
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s4, s6, s4
+; GFX1064DAGISEL-NEXT:    s_add_u32 s5, s7, s5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1064DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, -1
+; GFX1064GISEL-NEXT:    s_ashr_i32 s7, s6, 31
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s10, s2, s6
+; GFX1064GISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1064GISEL-NEXT:    s_mul_i32 s7, s2, s7
+; GFX1064GISEL-NEXT:    s_add_u32 s3, s10, s3
+; GFX1064GISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1064GISEL-NEXT:    s_add_u32 s7, s3, s7
+; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064GISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1064GISEL-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s8, s6, s4
+; GFX1064GISEL-NEXT:    s_mul_i32 s7, s7, s4
+; GFX1064GISEL-NEXT:    s_mul_i32 s5, s6, s5
+; GFX1064GISEL-NEXT:    s_add_u32 s7, s8, s7
+; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s4
+; GFX1064GISEL-NEXT:    s_add_u32 s7, s7, s5
+; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1032DAGISEL-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s9, s2, s4
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s5, s2, s5
+; GFX1032DAGISEL-NEXT:    s_add_u32 s3, s9, s3
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s2, s4
+; GFX1032DAGISEL-NEXT:    s_add_u32 s5, s3, s5
+; GFX1032DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s2
+; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, s3, -1
+; GFX1032DAGISEL-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s5, s6, s3
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s7, s7, s3
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s8, s6, s4
+; GFX1032DAGISEL-NEXT:    s_add_u32 s5, s5, s7
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s6, s3
+; GFX1032DAGISEL-NEXT:    s_add_u32 s5, s5, s8
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1032DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1032GISEL-NEXT:    s_mul_i32 s6, s6, -1
+; GFX1032GISEL-NEXT:    s_ashr_i32 s7, s6, 31
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s9, s2, s6
+; GFX1032GISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1032GISEL-NEXT:    s_mul_i32 s7, s2, s7
+; GFX1032GISEL-NEXT:    s_add_u32 s3, s9, s3
+; GFX1032GISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1032GISEL-NEXT:    s_add_u32 s7, s3, s7
+; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT:    s_mul_i32 s3, s3, -1
+; GFX1032GISEL-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s5, s6, s3
+; GFX1032GISEL-NEXT:    s_mul_i32 s7, s7, s3
+; GFX1032GISEL-NEXT:    s_mul_i32 s4, s6, s4
+; GFX1032GISEL-NEXT:    s_add_u32 s5, s5, s7
+; GFX1032GISEL-NEXT:    s_mul_i32 s6, s6, s3
+; GFX1032GISEL-NEXT:    s_add_u32 s7, s5, s4
+; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s8, s8, -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_ashr_i32 s9, s8, 31
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s10, s2, s8
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s3, s8
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s9, s2, s9
+; GFX1164DAGISEL-NEXT:    s_add_u32 s3, s10, s3
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s8, s2, s8
+; GFX1164DAGISEL-NEXT:    s_add_u32 s9, s3, s9
+; GFX1164DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[6:7]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s6, s6, -1
+; GFX1164DAGISEL-NEXT:    s_ashr_i32 s7, s6, 31
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s8, s4, s6
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s5, s5, s6
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s7, s4, s7
+; GFX1164DAGISEL-NEXT:    s_add_u32 s5, s8, s5
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT:    s_add_u32 s5, s5, s7
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1164DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164GISEL-NEXT:    s_mul_i32 s6, s6, -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_ashr_i32 s7, s6, 31
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s10, s2, s6
+; GFX1164GISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1164GISEL-NEXT:    s_mul_i32 s7, s2, s7
+; GFX1164GISEL-NEXT:    s_add_u32 s3, s10, s3
+; GFX1164GISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1164GISEL-NEXT:    s_add_u32 s7, s3, s7
+; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164GISEL-NEXT:    s_mul_i32 s6, s6, -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_ashr_i32 s7, s6, 31
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s8, s4, s6
+; GFX1164GISEL-NEXT:    s_mul_i32 s5, s5, s6
+; GFX1164GISEL-NEXT:    s_mul_i32 s7, s4, s7
+; GFX1164GISEL-NEXT:    s_add_u32 s5, s8, s5
+; GFX1164GISEL-NEXT:    s_mul_i32 s6, s4, s6
+; GFX1164GISEL-NEXT:    s_add_u32 s7, s5, s7
+; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s6, s6, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_ashr_i32 s7, s6, 31
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s9, s2, s6
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s7, s2, s7
+; GFX1132DAGISEL-NEXT:    s_add_u32 s3, s9, s3
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1132DAGISEL-NEXT:    s_add_u32 s7, s3, s7
+; GFX1132DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s2
+; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, s3, -1
+; GFX1132DAGISEL-NEXT:    s_ashr_i32 s6, s3, 31
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s7, s4, s3
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s5, s5, s3
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s6, s4, s6
+; GFX1132DAGISEL-NEXT:    s_add_u32 s5, s7, s5
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s4, s4, s3
+; GFX1132DAGISEL-NEXT:    s_add_u32 s5, s5, s6
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1132DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1132GISEL-NEXT:    s_mul_i32 s6, s6, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_ashr_i32 s7, s6, 31
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s9, s2, s6
+; GFX1132GISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1132GISEL-NEXT:    s_mul_i32 s7, s2, s7
+; GFX1132GISEL-NEXT:    s_add_u32 s3, s9, s3
+; GFX1132GISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1132GISEL-NEXT:    s_add_u32 s7, s3, s7
+; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT:    s_mul_i32 s3, s3, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_ashr_i32 s6, s3, 31
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s7, s4, s3
+; GFX1132GISEL-NEXT:    s_mul_i32 s5, s5, s3
+; GFX1132GISEL-NEXT:    s_mul_i32 s8, s4, s6
+; GFX1132GISEL-NEXT:    s_add_u32 s5, s7, s5
+; GFX1132GISEL-NEXT:    s_mul_i32 s6, s4, s3
+; GFX1132GISEL-NEXT:    s_add_u32 s7, s5, s8
+; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 %in2, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i64 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i64 %combine, ptr addrspace(1) %out
+  ret void
+}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX10DAGISEL: {{.*}}
 ; GFX10GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index ba7a816184cd8..a4a5b01a873b7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -947,3 +947,673 @@ endif:
   store i32 %combine, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.umax.i64(i64 %in, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: const_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0x7b
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.umax.i64(i64 123, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: poison_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value_i64:
+; GFX11DAGISEL:       ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT:    s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value_i64:
+; GFX11GISEL:       ; %bb.0: ; %entry
+; GFX11GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.umax.i64(i64 poison, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
+; GFX8DAGISEL-LABEL: divergent_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: divergent_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: divergent_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.umax.i64(i64 %id.x, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 %in2) {
+; GFX8DAGISEL-LABEL: divergent_cfg_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s4, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1164DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s6, exec_lo, s6
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s6, s6
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s6
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1132DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i64 @llvm.amdgcn.wave.reduce.umax.i64(i64 %in2, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i64 @llvm.amdgcn.wave.reduce.umax.i64(i64 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i64 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i64 %combine, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index 3eaa89c957474..29a78855d6629 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -947,3 +947,865 @@ endif:
   store i32 %combine, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.umin.i64(i64 %in, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: const_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0x7b
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.umin.i64(i64 123, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value_i64(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: poison_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value_i64:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value_i64:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value_i64:
+; GFX11DAGISEL:       ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT:    s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value_i64:
+; GFX11GISEL:       ; %bb.0: ; %entry
+; GFX11GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.umin.i64(i64 poison, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
+; GFX8DAGISEL-LABEL: divergent_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[4:5]
+; GFX8DAGISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX8DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX8GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[4:5]
+; GFX8GISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX8GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[4:5]
+; GFX9DAGISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX9DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX9GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[4:5]
+; GFX9GISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX9GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[4:5]
+; GFX1064DAGISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX1064DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s12
+; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s12
+; GFX1064GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[4:5]
+; GFX1064GISEL-NEXT:    s_and_b64 s[10:11], vcc, s[6:7]
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
+; GFX1064GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[8:9], v[4:5]
+; GFX1032DAGISEL-NEXT:    s_and_b32 s10, vcc_lo, s6
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032GISEL-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[8:9], v[4:5]
+; GFX1032GISEL-NEXT:    s_and_b32 s10, vcc_lo, s6
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v5, s1
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s8
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s5, v3, s8
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX1164DAGISEL-NEXT:    s_and_b64 s[6:7], vcc, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s8
+; GFX1164DAGISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v5, s1
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s8
+; GFX1164GISEL-NEXT:    v_readlane_b32 s5, v3, s8
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX1164GISEL-NEXT:    s_and_b64 s[6:7], vcc, s[2:3]
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s8
+; GFX1164GISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[4:5]
+; GFX1132DAGISEL-NEXT:    s_and_b32 s6, vcc_lo, s2
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[4:5]
+; GFX1132GISEL-NEXT:    s_and_b32 s6, vcc_lo, s2
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.umin.i64(i64 %id.x, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 %in2) {
+; GFX8DAGISEL-LABEL: divergent_cfg_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s4, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
+; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1164DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s6, exec_lo, s6
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s6, s6
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s6
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %if
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1132DAGISEL-NEXT:  ; %bb.2: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i64 @llvm.amdgcn.wave.reduce.umin.i64(i64 %in2, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i64 @llvm.amdgcn.wave.reduce.umin.i64(i64 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i64 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i64 %combine, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
index 5b21d5c3aaeb6..b96954d030fef 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
@@ -1279,6 +1279,1419 @@ endif:
   store i32 %combine, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8DAGISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    s_mul_i32 s0, s2, s4
+; GFX8DAGISEL-NEXT:    s_mul_i32 s1, s3, s4
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s4
+; GFX8DAGISEL-NEXT:    s_add_u32 s1, s2, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8GISEL-NEXT:    s_and_b32 s5, s4, 1
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s4, s2, s5
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s3, s5
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s2, s2, s5
+; GFX8GISEL-NEXT:    s_add_u32 s5, s2, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9DAGISEL-NEXT:    s_and_b32 s5, s4, 1
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s3, s5
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s5
+; GFX9DAGISEL-NEXT:    s_add_u32 s5, s2, s3
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9GISEL-NEXT:    s_and_b32 s5, s4, 1
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s4, s2, s5
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s3, s5
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s2, s2, s5
+; GFX9GISEL-NEXT:    s_add_u32 s5, s2, s3
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1064DAGISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1064GISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1032DAGISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1032DAGISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1032GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1032GISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1164DAGISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1164GISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1132DAGISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1132DAGISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s2, s4
+; GFX1132GISEL-NEXT:    s_add_u32 s3, s5, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %in, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_and_b32 s3, s2, 1
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s3, 0x7b
+; GFX8DAGISEL-NEXT:    s_mul_i32 s4, s3, 0
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s3, 0x7b, s3
+; GFX8DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_and_b32 s3, s2, 1
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s3, 0x7b
+; GFX8GISEL-NEXT:    s_mul_i32 s4, s3, 0
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s3, 0x7b, s3
+; GFX8GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    s_and_b32 s3, s2, 1
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s3, 0x7b
+; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s3, 0
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s3, 0x7b, s3
+; GFX9DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    s_and_b32 s3, s2, 1
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s3, 0x7b
+; GFX9GISEL-NEXT:    s_mul_i32 s4, s3, 0
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s3, 0x7b, s3
+; GFX9GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s2, 0
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1064DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1064DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT:    s_mul_i32 s3, s2, 0
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1064GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1064GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, s2, 0
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1032DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1032DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1032GISEL-NEXT:    s_mul_i32 s3, s2, 0
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1032GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1032GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s2, 0
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1164DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1164DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164GISEL-NEXT:    s_mul_i32 s3, s2, 0
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1164GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1164GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, s2, 0
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1132DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1132DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1132GISEL-NEXT:    s_mul_i32 s3, s2, 0
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s4, 0x7b, s2
+; GFX1132GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1132GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 123, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: poison_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_and_b32 s3, s2, 1
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s0, s3
+; GFX8DAGISEL-NEXT:    s_mul_i32 s4, s1, s3
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s3, s0, s3
+; GFX8DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_and_b32 s3, s2, 1
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s0, s3
+; GFX8GISEL-NEXT:    s_mul_i32 s4, s1, s3
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s3, s0, s3
+; GFX8GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    s_and_b32 s3, s2, 1
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s0, s3
+; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s1, s3
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s3, s0, s3
+; GFX9DAGISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    s_and_b32 s3, s2, 1
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s0, s3
+; GFX9GISEL-NEXT:    s_mul_i32 s4, s1, s3
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s3, s0, s3
+; GFX9GISEL-NEXT:    s_add_u32 s3, s3, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s1, s2
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1064DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s3, s1, s2
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1064GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, s1, s2
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1032DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s3, s1, s2
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1032GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s1, s2
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1164DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s3, s1, s2
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1164GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, s1, s2
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1132DAGISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s3, s1, s2
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1132GISEL-NEXT:    s_add_u32 s3, s4, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 poison, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
+; GFX8DAGISEL-LABEL: divergent_value_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX8GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX9GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s10
+; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s10
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX1064GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
+; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
+; GFX1032GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s5, v3, s6
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s6
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s6
+; GFX1164GISEL-NEXT:    v_readlane_b32 s5, v3, s6
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s6
+; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, s0
+; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v3, s3
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %id.x, i32 1)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 %in2) {
+; GFX8DAGISEL-LABEL: divergent_cfg_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8DAGISEL-NEXT:    s_and_b32 s7, s6, 1
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s6, s2, s7
+; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s3, s7
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
+; GFX8DAGISEL-NEXT:    s_add_u32 s7, s2, s3
+; GFX8DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8DAGISEL-NEXT:    s_and_b32 s7, s6, 1
+; GFX8DAGISEL-NEXT:    s_mul_i32 s6, s4, s7
+; GFX8DAGISEL-NEXT:    s_mul_i32 s5, s5, s7
+; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s4, s4, s7
+; GFX8DAGISEL-NEXT:    s_add_u32 s7, s4, s5
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8GISEL-NEXT:    s_and_b32 s7, s6, 1
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s6, s2, s7
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s3, s7
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
+; GFX8GISEL-NEXT:    s_add_u32 s7, s2, s3
+; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8GISEL-NEXT:    s_and_b32 s7, s6, 1
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s6, s4, s7
+; GFX8GISEL-NEXT:    s_mul_i32 s5, s5, s7
+; GFX8GISEL-NEXT:    s_mul_hi_u32 s4, s4, s7
+; GFX8GISEL-NEXT:    s_add_u32 s7, s4, s5
+; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9DAGISEL-NEXT:    s_and_b32 s5, s4, 1
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s3, s5
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s5
+; GFX9DAGISEL-NEXT:    s_add_u32 s5, s2, s3
+; GFX9DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9DAGISEL-NEXT:    s_and_b32 s5, s4, 1
+; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s6, s5
+; GFX9DAGISEL-NEXT:    s_mul_i32 s7, s7, s5
+; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX9DAGISEL-NEXT:    s_add_u32 s5, s5, s7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX9GISEL-NEXT:    s_and_b32 s7, s6, 1
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s6, s2, s7
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s3, s7
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
+; GFX9GISEL-NEXT:    s_add_u32 s7, s2, s3
+; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s6, s8, s4
+; GFX9GISEL-NEXT:    s_mul_i32 s5, s9, s4
+; GFX9GISEL-NEXT:    s_mul_hi_u32 s4, s8, s4
+; GFX9GISEL-NEXT:    s_add_u32 s7, s4, s5
+; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
+; GFX1064DAGISEL-NEXT:    s_and_b32 s8, s8, 1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s3, s8
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s9, s2, s8
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s8, s2, s8
+; GFX1064DAGISEL-NEXT:    s_add_u32 s9, s9, s3
+; GFX1064DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s5, s7, s4
+; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s7, s6, s4
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s4, s6, s4
+; GFX1064DAGISEL-NEXT:    s_add_u32 s5, s7, s5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1064DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064GISEL-NEXT:    s_and_b32 s6, s6, 1
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s7, s2, s6
+; GFX1064GISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1064GISEL-NEXT:    s_add_u32 s7, s7, s3
+; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s5, s7, s4
+; GFX1064GISEL-NEXT:    s_mul_hi_u32 s7, s6, s4
+; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s4
+; GFX1064GISEL-NEXT:    s_add_u32 s7, s7, s5
+; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX1032DAGISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s5, s2, s4
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s2, s4
+; GFX1032DAGISEL-NEXT:    s_add_u32 s5, s5, s3
+; GFX1032DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s2
+; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s5, s7, s3
+; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s7, s6, s3
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s6, s3
+; GFX1032DAGISEL-NEXT:    s_add_u32 s5, s7, s5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1032DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1032GISEL-NEXT:    s_and_b32 s6, s6, 1
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s7, s2, s6
+; GFX1032GISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1032GISEL-NEXT:    s_add_u32 s7, s7, s3
+; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s4, s7, s3
+; GFX1032GISEL-NEXT:    s_mul_hi_u32 s5, s6, s3
+; GFX1032GISEL-NEXT:    s_mul_i32 s6, s6, s3
+; GFX1032GISEL-NEXT:    s_add_u32 s7, s5, s4
+; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
+; GFX1164DAGISEL-NEXT:    s_and_b32 s8, s8, 1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s3, s8
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s9, s2, s8
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s8, s2, s8
+; GFX1164DAGISEL-NEXT:    s_add_u32 s9, s9, s3
+; GFX1164DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[6:7]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_and_b32 s6, s6, 1
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s5, s5, s6
+; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s7, s4, s6
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT:    s_add_u32 s5, s7, s5
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1164DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164GISEL-NEXT:    s_and_b32 s6, s6, 1
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s7, s2, s6
+; GFX1164GISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1164GISEL-NEXT:    s_add_u32 s7, s7, s3
+; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164GISEL-NEXT:    s_and_b32 s6, s6, 1
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s5, s5, s6
+; GFX1164GISEL-NEXT:    s_mul_hi_u32 s7, s4, s6
+; GFX1164GISEL-NEXT:    s_mul_i32 s6, s4, s6
+; GFX1164GISEL-NEXT:    s_add_u32 s7, s7, s5
+; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1132DAGISEL-NEXT:    s_and_b32 s6, s6, 1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s7, s2, s6
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1132DAGISEL-NEXT:    s_add_u32 s7, s7, s3
+; GFX1132DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s2
+; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s5, s5, s3
+; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s6, s4, s3
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s4, s4, s3
+; GFX1132DAGISEL-NEXT:    s_add_u32 s5, s6, s5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1132DAGISEL-NEXT:  ; %bb.4: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1132GISEL-NEXT:    s_and_b32 s6, s6, 1
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s3, s3, s6
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s7, s2, s6
+; GFX1132GISEL-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1132GISEL-NEXT:    s_add_u32 s7, s7, s3
+; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s5, s5, s3
+; GFX1132GISEL-NEXT:    s_mul_hi_u32 s7, s4, s3
+; GFX1132GISEL-NEXT:    s_mul_i32 s6, s4, s3
+; GFX1132GISEL-NEXT:    s_add_u32 s7, s7, s5
+; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %in2, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i64 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i64 %combine, ptr addrspace(1) %out
+  ret void
+}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX10DAGISEL: {{.*}}
 ; GFX10GISEL: {{.*}}

>From 160fd1ff09022b086c6b71ef0f99d372ef01ee01 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Thu, 24 Jul 2025 17:01:24 +0530
Subject: [PATCH 2/4] Removing redundant variables.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 134 ++++++++++------------
 1 file changed, 62 insertions(+), 72 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1a2c614b09ca9..9afc4fea18451 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5176,23 +5176,22 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
       unsigned CountReg =
           IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
 
-      auto Exec =
           BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
 
-      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
-                                .addReg(Exec->getOperand(0).getReg());
+          auto NewAccumulator =
+              BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+                  .addReg(ExecMask);
 
-      switch (Opc) {
-      case AMDGPU::S_XOR_B32:
-      case AMDGPU::S_XOR_B64: {
-        // Performing an XOR operation on a uniform value
-        // depends on the parity of the number of active lanes.
-        // For even parity, the result will be 0, for odd
-        // parity the result will be the same as the input value.
-        Register ParityRegister =
-            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          switch (Opc) {
+          case AMDGPU::S_XOR_B32:
+          case AMDGPU::S_XOR_B64: {
+            // Performing an XOR operation on a uniform value
+            // depends on the parity of the number of active lanes.
+            // For even parity, the result will be 0, for odd
+            // parity the result will be the same as the input value.
+            Register ParityRegister =
+                MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
 
-        auto ParityReg =
             BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
                 .addReg(NewAccumulator->getOperand(0).getReg())
                 .addImm(1)
@@ -5200,7 +5199,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
         if (is32BitOpc) {
           BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
               .addReg(SrcReg)
-              .addReg(ParityReg->getOperand(0).getReg());
+              .addReg(ParityRegister);
           break;
         } else {
           Register DestSub0 =
@@ -5223,15 +5222,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
 
           BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
               .add(Op1L)
-              .addReg(ParityReg->getOperand(0).getReg());
+              .addReg(ParityRegister);
 
           BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
               .add(Op1H)
-              .addReg(ParityReg->getOperand(0).getReg());
+              .addReg(ParityRegister);
 
           BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
               .add(Op1L)
-              .addReg(ParityReg->getOperand(0).getReg());
+              .addReg(ParityRegister);
 
           BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
               .addReg(CarryReg)
@@ -5250,12 +5249,11 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
 
         // Take the negation of the source operand.
-        auto InvertedValReg =
-            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
-                .addImm(-1)
-                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
+            .addImm(-1)
+            .addReg(SrcReg);
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NegatedVal)
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
@@ -5294,14 +5292,13 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
               .addReg(NewAccumulator->getOperand(0).getReg())
               .addImm(-1);
 
-          MachineInstr *NegatedHi =
               BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
                   .addReg(NegatedValLo)
                   .addImm(31)
                   .setOperandDead(3); // Dead scc
-          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
-              .add(Op1L)
-              .addReg(NegatedHi->getOperand(0).getReg());
+              BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+                  .add(Op1L)
+                  .addReg(NegatedValHi);
         }
         Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
                                  ? NegatedValLo
@@ -5374,17 +5371,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     // Create initial values of induction variable from Exec, Accumulator and
     // insert branch instr to newly created ComputeBlock
     uint32_t IdentityValue = getIdentityValueForWaveReduction(Opc);
-    auto TmpSReg = BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator)
-                       .addReg(ExecReg);
+    BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
     if (is32BitOpc) {
       BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
           .addImm(IdentityValue);
     } else {
       Register Identitylo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
       Register Identityhi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      MachineInstr *IdenHi =
-          BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identityhi)
-              .addImm(IdentityValue);
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identityhi)
+          .addImm(IdentityValue);
       switch (Opc) {
       case AMDGPU::V_CMP_LT_U64_e64:
       case AMDGPU::V_CMP_LT_I64_e64:
@@ -5395,14 +5390,14 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
         IdentityValue = int32_t(0); // u|max
         break;
       }
-      MachineInstr *IdenLo =
           BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identitylo)
               .addImm(IdentityValue);
-      BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), IdentityValReg)
-          .addReg(IdenLo->getOperand(0).getReg())
-          .addImm(AMDGPU::sub0)
-          .addReg(IdenHi->getOperand(0).getReg())
-          .addImm(AMDGPU::sub1);
+          BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
+                  IdentityValReg)
+              .addReg(Identitylo)
+              .addImm(AMDGPU::sub0)
+              .addReg(Identityhi)
+              .addImm(AMDGPU::sub1);
     }
     // clang-format off
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
@@ -5417,24 +5412,23 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addMBB(&BB);
     auto ActiveBits =
         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
-            .addReg(TmpSReg->getOperand(0).getReg())
+            .addReg(LoopIterator)
             .addMBB(&BB);
 
     I = ComputeLoop->end();
     MachineInstr *NewAccumulator;
     // Perform the computations
     unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
-    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
-                   .addReg(ActiveBits->getOperand(0).getReg());
+    BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+        .addReg(ActiveBitsReg);
     if (is32BitOpc) {
-      MachineInstr *LaneValue =
-          BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
-                  LaneValueReg)
-              .addReg(SrcReg)
-              .addReg(FF1->getOperand(0).getReg());
+      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+              LaneValueReg)
+          .addReg(SrcReg)
+          .addReg(FF1Reg);
       NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
                            .addReg(Accumulator->getOperand(0).getReg())
-                           .addReg(LaneValue->getOperand(0).getReg());
+                           .addReg(LaneValueReg);
     } else {
       Register LaneValueLoReg =
           MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -5453,17 +5447,17 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
           BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
                   LaneValueLoReg)
               .add(Op1L)
-              .addReg(FF1->getOperand(0).getReg());
+              .addReg(FF1Reg);
       MachineInstr *LaneValueHi =
           BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
                   LaneValueHiReg)
               .add(Op1H)
-              .addReg(FF1->getOperand(0).getReg());
+              .addReg(FF1Reg);
       auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                                TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
-                           .addReg(LaneValueLo->getOperand(0).getReg())
+                           .addReg(LaneValueLoReg)
                            .addImm(AMDGPU::sub0)
-                           .addReg(LaneValueHi->getOperand(0).getReg())
+                           .addReg(LaneValueHiReg)
                            .addImm(AMDGPU::sub1);
       switch (Opc) {
       case ::AMDGPU::S_OR_B64:
@@ -5500,14 +5494,14 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addImm(AMDGPU::sub0)
             .add(SrcReg0Sub1)
             .addImm(AMDGPU::sub1);
-        auto LaneMask = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
-                            .addReg(LaneValue->getOperand(0).getReg())
-                            .addReg(AccumulatorVReg);
+        BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
+            .addReg(LaneValue->getOperand(0).getReg())
+            .addReg(AccumulatorVReg);
 
         unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
         BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
-            .addReg(LaneMask->getOperand(0).getReg())
-            .addReg(ActiveBits->getOperand(0).getReg());
+            .addReg(LaneMaskReg)
+            .addReg(ActiveBitsReg);
 
         NewAccumulator = BuildMI(*ComputeLoop, I, DL,
                                  TII->get(AMDGPU::S_CSELECT_B64), DstReg)
@@ -5529,19 +5523,17 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
         MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
             MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
             &AMDGPU::SReg_32RegClass);
-        MachineInstr *DestLoComputation =
-            BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
-                .add(Accumlo)
-                .addReg(LaneValueLo->getOperand(0).getReg());
-        MachineInstr *DestHiComputation =
-            BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
-                .add(Accumhi)
-                .addReg(LaneValueHi->getOperand(0).getReg());
+        BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
+            .add(Accumlo)
+            .addReg(LaneValueLo->getOperand(0).getReg());
+        BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
+            .add(Accumhi)
+            .addReg(LaneValueHi->getOperand(0).getReg());
         NewAccumulator = BuildMI(*ComputeLoop, I, DL,
                                  TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
-                             .addReg(DestLoComputation->getOperand(0).getReg())
+                             .addReg(DestLo)
                              .addImm(AMDGPU::sub0)
-                             .addReg(DestHiComputation->getOperand(0).getReg())
+                             .addReg(DestHi)
                              .addImm(AMDGPU::sub1);
         break;
       }
@@ -5550,21 +5542,19 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     // Manipulate the iterator to get the next active lane
     unsigned BITSETOpc =
         IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
-    auto NewActiveBits =
-        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
-            .addReg(FF1->getOperand(0).getReg())
-            .addReg(ActiveBits->getOperand(0).getReg());
+    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+        .addReg(FF1Reg)
+        .addReg(ActiveBitsReg);
 
     // Add phi nodes
     Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
         .addMBB(ComputeLoop);
-    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
-        .addMBB(ComputeLoop);
+    ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
 
     // Creating branching
     unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
     BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
-        .addReg(NewActiveBits->getOperand(0).getReg())
+        .addReg(NewActiveBitsReg)
         .addImm(0);
     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
         .addMBB(ComputeLoop);

>From 02d585140a0831646fd6f9443b153975774f1cba Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Thu, 24 Jul 2025 17:19:41 +0530
Subject: [PATCH 3/4] Getting negated value using S_SUB_I32 in place of
 S_MUL_I32.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  27 ++--
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll  | 120 +++++++++---------
 2 files changed, 73 insertions(+), 74 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9afc4fea18451..55e48e97e00c2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5244,13 +5244,13 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
               .addImm(AMDGPU::sub1);
           break;
         }
-      }
+          }
       case AMDGPU::S_SUB_I32: {
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
 
         // Take the negation of the source operand.
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
-            .addImm(-1)
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
+            .addImm(0)
             .addReg(SrcReg);
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
             .addReg(NegatedVal)
@@ -5288,17 +5288,16 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
 
         if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
-          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedValLo)
-              .addReg(NewAccumulator->getOperand(0).getReg())
-              .addImm(-1);
-
-              BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
-                  .addReg(NegatedValLo)
-                  .addImm(31)
-                  .setOperandDead(3); // Dead scc
-              BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
-                  .add(Op1L)
-                  .addReg(NegatedValHi);
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
+              .addImm(0)
+              .addReg(NewAccumulator->getOperand(0).getReg());
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+              .addReg(NegatedValLo)
+              .addImm(31)
+              .setOperandDead(3); // Dead scc
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+              .add(Op1L)
+              .addReg(NegatedValHi);
         }
         Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
                                  ? NegatedValLo
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
index aff5c01ee174b..3d7be58c42980 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
@@ -20,7 +20,7 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX8DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -35,7 +35,7 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX8GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX8GISEL-NEXT:    s_mul_i32 s2, s3, s2
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
@@ -51,7 +51,7 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX9DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
@@ -65,7 +65,7 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX9GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX9GISEL-NEXT:    s_mul_i32 s2, s3, s2
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -80,7 +80,7 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1064DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
@@ -95,7 +95,7 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1064GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1064GISEL-NEXT:    s_mul_i32 s2, s3, s2
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -110,7 +110,7 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1032DAGISEL-NEXT:    s_sub_i32 s2, 0, s2
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
@@ -125,7 +125,7 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1032GISEL-NEXT:    s_sub_i32 s2, 0, s2
 ; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, s3
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -140,7 +140,7 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1164DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
@@ -156,7 +156,7 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1164GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_mul_i32 s2, s3, s2
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
@@ -172,7 +172,7 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1132DAGISEL-NEXT:    s_sub_i32 s2, 0, s2
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
@@ -188,7 +188,7 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1132GISEL-NEXT:    s_sub_i32 s2, 0, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_mul_i32 s2, s2, s3
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, s2
@@ -792,7 +792,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX8DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
 ; GFX8DAGISEL-NEXT:  .LBB4_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
@@ -833,7 +833,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX8GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX8GISEL-NEXT:    s_mul_i32 s6, s3, s2
 ; GFX8GISEL-NEXT:  .LBB4_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -871,7 +871,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX9DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
 ; GFX9DAGISEL-NEXT:  .LBB4_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
@@ -911,7 +911,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX9GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX9GISEL-NEXT:    s_mul_i32 s6, s3, s2
 ; GFX9GISEL-NEXT:  .LBB4_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -948,7 +948,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1064DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
 ; GFX1064DAGISEL-NEXT:  .LBB4_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
@@ -988,7 +988,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1064GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s3, s2
 ; GFX1064GISEL-NEXT:  .LBB4_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -1025,7 +1025,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT:    s_mul_i32 s1, s1, -1
+; GFX1032DAGISEL-NEXT:    s_sub_i32 s1, 0, s1
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
 ; GFX1032DAGISEL-NEXT:  .LBB4_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
@@ -1065,7 +1065,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT:    s_mul_i32 s0, s0, -1
+; GFX1032GISEL-NEXT:    s_sub_i32 s0, 0, s0
 ; GFX1032GISEL-NEXT:    s_mul_i32 s0, s0, s2
 ; GFX1032GISEL-NEXT:  .LBB4_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
@@ -1105,7 +1105,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1164DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
 ; GFX1164DAGISEL-NEXT:  .LBB4_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
@@ -1149,7 +1149,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1164GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s3, s2
 ; GFX1164GISEL-NEXT:  .LBB4_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
@@ -1190,7 +1190,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT:    s_mul_i32 s1, s1, -1
+; GFX1132DAGISEL-NEXT:    s_sub_i32 s1, 0, s1
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
 ; GFX1132DAGISEL-NEXT:  .LBB4_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
@@ -1234,7 +1234,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT:    s_mul_i32 s0, s0, -1
+; GFX1132GISEL-NEXT:    s_sub_i32 s0, 0, s0
 ; GFX1132GISEL-NEXT:    s_mul_i32 s0, s0, s2
 ; GFX1132GISEL-NEXT:  .LBB4_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
@@ -1282,7 +1282,7 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX8DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX8DAGISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX8DAGISEL-NEXT:    s_sub_i32 s4, 0, s4
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8DAGISEL-NEXT:    s_ashr_i32 s0, s4, 31
@@ -1303,7 +1303,7 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX8GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX8GISEL-NEXT:    s_mul_i32 s5, s4, -1
+; GFX8GISEL-NEXT:    s_sub_i32 s5, 0, s4
 ; GFX8GISEL-NEXT:    s_ashr_i32 s4, s5, 31
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mul_i32 s6, s2, s4
@@ -1324,7 +1324,7 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9DAGISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9DAGISEL-NEXT:    s_mul_i32 s5, s4, -1
+; GFX9DAGISEL-NEXT:    s_sub_i32 s5, 0, s4
 ; GFX9DAGISEL-NEXT:    s_ashr_i32 s4, s5, 31
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s6, s2, s4
@@ -1344,7 +1344,7 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9GISEL-NEXT:    s_mul_i32 s5, s4, -1
+; GFX9GISEL-NEXT:    s_sub_i32 s5, 0, s4
 ; GFX9GISEL-NEXT:    s_ashr_i32 s4, s5, 31
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mul_i32 s6, s2, s4
@@ -1365,7 +1365,7 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064DAGISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1064DAGISEL-NEXT:    s_sub_i32 s4, 0, s4
 ; GFX1064DAGISEL-NEXT:    s_ashr_i32 s5, s4, 31
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
@@ -1385,7 +1385,7 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064GISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1064GISEL-NEXT:    s_sub_i32 s4, 0, s4
 ; GFX1064GISEL-NEXT:    s_ashr_i32 s5, s4, 31
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
@@ -1405,7 +1405,7 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
-; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1032DAGISEL-NEXT:    s_sub_i32 s4, 0, s4
 ; GFX1032DAGISEL-NEXT:    s_ashr_i32 s5, s4, 31
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
@@ -1425,7 +1425,7 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX1032GISEL-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
-; GFX1032GISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1032GISEL-NEXT:    s_sub_i32 s4, 0, s4
 ; GFX1032GISEL-NEXT:    s_ashr_i32 s5, s4, 31
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
@@ -1446,7 +1446,7 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1164DAGISEL-NEXT:    s_sub_i32 s4, 0, s4
 ; GFX1164DAGISEL-NEXT:    s_ashr_i32 s5, s4, 31
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
@@ -1467,7 +1467,7 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164GISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1164GISEL-NEXT:    s_sub_i32 s4, 0, s4
 ; GFX1164GISEL-NEXT:    s_ashr_i32 s5, s4, 31
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
@@ -1487,7 +1487,7 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
-; GFX1132DAGISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1132DAGISEL-NEXT:    s_sub_i32 s4, 0, s4
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ashr_i32 s5, s4, 31
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1509,7 +1509,7 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132GISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1132GISEL-NEXT:    s_sub_i32 s4, 0, s4
 ; GFX1132GISEL-NEXT:    s_ashr_i32 s5, s4, 31
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mul_hi_u32 s6, s2, s4
@@ -2309,7 +2309,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX8DAGISEL-NEXT:    s_mul_i32 s7, s6, -1
+; GFX8DAGISEL-NEXT:    s_sub_i32 s7, 0, s6
 ; GFX8DAGISEL-NEXT:    s_ashr_i32 s6, s7, 31
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s10, s2, s6
@@ -2328,7 +2328,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX8DAGISEL-NEXT:    s_mul_i32 s7, s6, -1
+; GFX8DAGISEL-NEXT:    s_sub_i32 s7, 0, s6
 ; GFX8DAGISEL-NEXT:    s_ashr_i32 s6, s7, 31
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s8, s4, s6
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s6, s4, s7
@@ -2356,7 +2356,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX8GISEL-NEXT:    s_mul_i32 s7, s6, -1
+; GFX8GISEL-NEXT:    s_sub_i32 s7, 0, s6
 ; GFX8GISEL-NEXT:    s_ashr_i32 s6, s7, 31
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mul_i32 s10, s2, s6
@@ -2373,7 +2373,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX8GISEL-NEXT:    s_mul_i32 s7, s6, -1
+; GFX8GISEL-NEXT:    s_sub_i32 s7, 0, s6
 ; GFX8GISEL-NEXT:    s_ashr_i32 s6, s7, 31
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mul_i32 s8, s4, s6
@@ -2403,7 +2403,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9DAGISEL-NEXT:    s_mul_i32 s5, s4, -1
+; GFX9DAGISEL-NEXT:    s_sub_i32 s5, 0, s4
 ; GFX9DAGISEL-NEXT:    s_ashr_i32 s4, s5, 31
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s10, s2, s4
@@ -2422,7 +2422,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9DAGISEL-NEXT:    s_mul_i32 s5, s4, -1
+; GFX9DAGISEL-NEXT:    s_sub_i32 s5, 0, s4
 ; GFX9DAGISEL-NEXT:    s_ashr_i32 s4, s5, 31
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s8, s6, s4
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s6, s5
@@ -2449,7 +2449,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX9GISEL-NEXT:    s_mul_i32 s7, s6, -1
+; GFX9GISEL-NEXT:    s_sub_i32 s7, 0, s6
 ; GFX9GISEL-NEXT:    s_ashr_i32 s6, s7, 31
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mul_i32 s10, s2, s6
@@ -2466,7 +2466,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9GISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX9GISEL-NEXT:    s_sub_i32 s4, 0, s4
 ; GFX9GISEL-NEXT:    s_ashr_i32 s5, s4, 31
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mul_i32 s6, s8, s4
@@ -2496,7 +2496,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
-; GFX1064DAGISEL-NEXT:    s_mul_i32 s8, s8, -1
+; GFX1064DAGISEL-NEXT:    s_sub_i32 s8, 0, s8
 ; GFX1064DAGISEL-NEXT:    s_ashr_i32 s9, s8, 31
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s10, s2, s8
@@ -2514,7 +2514,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064DAGISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1064DAGISEL-NEXT:    s_sub_i32 s4, 0, s4
 ; GFX1064DAGISEL-NEXT:    s_ashr_i32 s5, s4, 31
 ; GFX1064DAGISEL-NEXT:    s_mul_hi_u32 s8, s6, s4
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s7, s7, s4
@@ -2541,7 +2541,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, -1
+; GFX1064GISEL-NEXT:    s_sub_i32 s6, 0, s6
 ; GFX1064GISEL-NEXT:    s_ashr_i32 s7, s6, 31
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mul_hi_u32 s10, s2, s6
@@ -2558,7 +2558,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064GISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1064GISEL-NEXT:    s_sub_i32 s4, 0, s4
 ; GFX1064GISEL-NEXT:    s_ashr_i32 s5, s4, 31
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mul_hi_u32 s8, s6, s4
@@ -2588,7 +2588,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
-; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s4, -1
+; GFX1032DAGISEL-NEXT:    s_sub_i32 s4, 0, s4
 ; GFX1032DAGISEL-NEXT:    s_ashr_i32 s5, s4, 31
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s9, s2, s4
@@ -2606,7 +2606,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, s3, -1
+; GFX1032DAGISEL-NEXT:    s_sub_i32 s3, 0, s3
 ; GFX1032DAGISEL-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX1032DAGISEL-NEXT:    s_mul_hi_u32 s5, s6, s3
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s7, s7, s3
@@ -2633,7 +2633,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
-; GFX1032GISEL-NEXT:    s_mul_i32 s6, s6, -1
+; GFX1032GISEL-NEXT:    s_sub_i32 s6, 0, s6
 ; GFX1032GISEL-NEXT:    s_ashr_i32 s7, s6, 31
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mul_hi_u32 s9, s2, s6
@@ -2650,7 +2650,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX1032GISEL-NEXT:    s_mul_i32 s3, s3, -1
+; GFX1032GISEL-NEXT:    s_sub_i32 s3, 0, s3
 ; GFX1032GISEL-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mul_hi_u32 s5, s6, s3
@@ -2683,7 +2683,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
-; GFX1164DAGISEL-NEXT:    s_mul_i32 s8, s8, -1
+; GFX1164DAGISEL-NEXT:    s_sub_i32 s8, 0, s8
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ashr_i32 s9, s8, 31
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2704,7 +2704,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT:    s_mul_i32 s6, s6, -1
+; GFX1164DAGISEL-NEXT:    s_sub_i32 s6, 0, s6
 ; GFX1164DAGISEL-NEXT:    s_ashr_i32 s7, s6, 31
 ; GFX1164DAGISEL-NEXT:    s_mul_hi_u32 s8, s4, s6
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s5, s5, s6
@@ -2734,7 +2734,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1164GISEL-NEXT:    s_mul_i32 s6, s6, -1
+; GFX1164GISEL-NEXT:    s_sub_i32 s6, 0, s6
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ashr_i32 s7, s6, 31
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2753,7 +2753,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1164GISEL-NEXT:    s_mul_i32 s6, s6, -1
+; GFX1164GISEL-NEXT:    s_sub_i32 s6, 0, s6
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ashr_i32 s7, s6, 31
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2787,7 +2787,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
-; GFX1132DAGISEL-NEXT:    s_mul_i32 s6, s6, -1
+; GFX1132DAGISEL-NEXT:    s_sub_i32 s6, 0, s6
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ashr_i32 s7, s6, 31
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2807,7 +2807,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, s3, -1
+; GFX1132DAGISEL-NEXT:    s_sub_i32 s3, 0, s3
 ; GFX1132DAGISEL-NEXT:    s_ashr_i32 s6, s3, 31
 ; GFX1132DAGISEL-NEXT:    s_mul_hi_u32 s7, s4, s3
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s5, s5, s3
@@ -2837,7 +2837,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
-; GFX1132GISEL-NEXT:    s_mul_i32 s6, s6, -1
+; GFX1132GISEL-NEXT:    s_sub_i32 s6, 0, s6
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ashr_i32 s7, s6, 31
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2856,7 +2856,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX1132GISEL-NEXT:    s_mul_i32 s3, s3, -1
+; GFX1132GISEL-NEXT:    s_sub_i32 s3, 0, s3
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ashr_i32 s6, s3, 31
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)

>From 67c137d578c98938860c19b0be9bd347be913ea0 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Fri, 25 Jul 2025 14:53:34 +0530
Subject: [PATCH 4/4] Running Clang Format

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 111 +++++++++++-----------
 1 file changed, 55 insertions(+), 56 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 55e48e97e00c2..81dec3c9205b1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5196,54 +5196,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                 .addReg(NewAccumulator->getOperand(0).getReg())
                 .addImm(1)
                 .setOperandDead(3); // Dead scc
-        if (is32BitOpc) {
-          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-              .addReg(SrcReg)
-              .addReg(ParityRegister);
-          break;
-        } else {
-          Register DestSub0 =
-              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-          Register DestSub1 =
-              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-          Register Op1H_Op0L_Reg =
-              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-          Register CarryReg =
-              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
-          const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
-          const TargetRegisterClass *SrcSubRC =
-              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
-
-          MachineOperand Op1L = TII->buildExtractSubRegOrImm(
-              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
-          MachineOperand Op1H = TII->buildExtractSubRegOrImm(
-              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
-
-          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
-              .add(Op1L)
-              .addReg(ParityRegister);
-
-          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
-              .add(Op1H)
-              .addReg(ParityRegister);
-
-          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
-              .add(Op1L)
-              .addReg(ParityRegister);
-
-          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
-              .addReg(CarryReg)
-              .addReg(Op1H_Op0L_Reg)
-              .setOperandDead(3); // Dead scc
-
-          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
-              .addReg(DestSub0)
-              .addImm(AMDGPU::sub0)
-              .addReg(DestSub1)
-              .addImm(AMDGPU::sub1);
-          break;
-        }
+            if (is32BitOpc) {
+              BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+                  .addReg(SrcReg)
+                  .addReg(ParityRegister);
+              break;
+            } else {
+              Register DestSub0 =
+                  MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+              Register DestSub1 =
+                  MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+              Register Op1H_Op0L_Reg =
+                  MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+              Register CarryReg =
+                  MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+              const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+              const TargetRegisterClass *SrcSubRC =
+                  TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+              MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+                  MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+              MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+                  MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+
+              BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+                  .add(Op1L)
+                  .addReg(ParityRegister);
+
+              BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+                  .add(Op1H)
+                  .addReg(ParityRegister);
+
+              BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+                  .add(Op1L)
+                  .addReg(ParityRegister);
+
+              BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+                  .addReg(CarryReg)
+                  .addReg(Op1H_Op0L_Reg)
+                  .setOperandDead(3); // Dead scc
+
+              BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+                  .addReg(DestSub0)
+                  .addImm(AMDGPU::sub0)
+                  .addReg(DestSub1)
+                  .addImm(AMDGPU::sub1);
+              break;
+            }
           }
       case AMDGPU::S_SUB_I32: {
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
@@ -5389,14 +5389,13 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
         IdentityValue = int32_t(0); // u|max
         break;
       }
-          BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identitylo)
-              .addImm(IdentityValue);
-          BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
-                  IdentityValReg)
-              .addReg(Identitylo)
-              .addImm(AMDGPU::sub0)
-              .addReg(Identityhi)
-              .addImm(AMDGPU::sub1);
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identitylo)
+          .addImm(IdentityValue);
+      BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), IdentityValReg)
+          .addReg(Identitylo)
+          .addImm(AMDGPU::sub0)
+          .addReg(Identityhi)
+          .addImm(AMDGPU::sub1);
     }
     // clang-format off
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))



More information about the llvm-commits mailing list