[llvm] [AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions (PR #140694)

Thu Jun 19 04:43:00 PDT 2025

https://github.com/chrisjbris updated https://github.com/llvm/llvm-project/pull/140694

>From ca612294c82774572ab0cf3e90df753199affb7a Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 20 May 2025 05:14:36 -0500
Subject: [PATCH] [AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make
 use of 64-bit wide instructions

Make use of s_or_b64/s_and_b64/s_xor_b64 for v2i32. Legalising these
causes a number of test regressions, so extra work in the combiner and
Tablegen patterns was necessary.

- Use custom lowering for v2i32 rotr instead of additional patterns. Modify
performOrCombine() to remove some identity OR operations.

- Fix rotr regression by adding lowerROTR() on the legalizer codepath.

- Add test case to rotr.ll

- Extend performFNegCombine() for the SELECT case.

- Modify performSelectCombine() and foldFreeOpFromSelect() to prevent the
performFNegCombine() changes from being unwound.

- Add cases to or.ll and xor.ll to demonstrate the generation of the
  s_or_b64 and s_xor_b64 instructions for the v2i32 cases. Previously
  this was inhibited by "-amdgpu-scalarize-global-loads=false".

- Fix shl64_reduce regression by performing the scalarisation
previously performed by the vector legaliser in performShlCombine().

- Update test changes - not regressions.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 105 ++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  96 ++-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |   1 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  47 +-
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  15 +
 llvm/lib/Target/AMDGPU/VOP2Instructions.td    |   8 +-
 llvm/test/CodeGen/AMDGPU/and.ll               |   3 +-
 llvm/test/CodeGen/AMDGPU/bf16-conversions.ll  |  24 +-
 llvm/test/CodeGen/AMDGPU/bfi_int.ll           |   4 +-
 .../AMDGPU/copysign-simplify-demanded-bits.ll |   4 +-
 .../AMDGPU/dag-preserve-disjoint-flag.ll      |  36 +-
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |  16 +-
 llvm/test/CodeGen/AMDGPU/fshr.ll              | 188 ++---
 llvm/test/CodeGen/AMDGPU/or.ll                | 677 +++++++++++++++++-
 llvm/test/CodeGen/AMDGPU/rotr.ll              | 128 ++++
 llvm/test/CodeGen/AMDGPU/shl64_reduce.ll      |   2 +-
 .../CodeGen/AMDGPU/vector_range_metadata.ll   |   8 +-
 llvm/test/CodeGen/AMDGPU/xor.ll               | 630 +++++++++++++++-
 18 files changed, 1769 insertions(+), 223 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index c51cc2a2fe529..06c80a9cd794d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4025,9 +4025,9 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc to it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
-  DAGCombinerInfo &DCI, const SDLoc &SL,
-  unsigned Opc, SDValue LHS,
-  uint32_t ValLo, uint32_t ValHi) const {
+    DAGCombinerInfo &DCI, const SDLoc &SL,
+     unsigned Opc, SDValue LHS,
+    uint32_t ValLo, uint32_t ValHi) const {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
@@ -4056,6 +4056,57 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization, some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code should make sure this takes place.
+  // ConstantSDNode *CVANDRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
+    uint64_t AndIndex = RHS->getConstantOperandVal(1);
+    if (VAND->getOpcode() == ISD::AND && CRRHS) {
+      SDValue LHSAND = VAND.getOperand(0);
+      SDValue RHSAND = VAND.getOperand(1);
+      if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+        // Part of shlcombine is to optimise for the case where it's possible
+        // to reduce shl64 to shl32 if the shift range is [32-63]. This
+        // transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31) ]. The
+        // '&' is then elided by ISel. The vector code for this was being
+        // completely scalarised by the vector legalizer, but now v2i32 is
+        // made legal the vector legaliser only partially scalarises the
+        // vector operations and the AND was not elided. This check enables us
+        // to locate and scalarise the v2i32 AND and re-enable ISel to elide
+        // the AND instruction.
+        ConstantSDNode *CANDL = dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+        ConstantSDNode *CANDR = dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+        if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+            RHSAND->getConstantOperandVal(1) == 0x1f) {
+          // Get the non-const AND operands and produce scalar AND
+          const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+          const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+          SDValue Lo =
+              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, Zero);
+          SDValue Hi =
+              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+          SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+          SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+          SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+          if (AndIndex == 0) {
+            return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc, LoAnd,
+                               N->getFlags());
+          } else if (AndIndex == 1) {
+            return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc, HiAnd,
+                               N->getFlags());
+          } else {
+            return SDValue();
+          }
+        }
+    }
+  }
+  }
+
   unsigned RHSVal;
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
@@ -4097,8 +4148,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
   // the same code size.
@@ -4701,8 +4750,27 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
     if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
       return SDValue();
 
-    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
-                                     SDLoc(N), Cond, LHS, RHS);
+    // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
+    // lowered directly to a V_CNDMASK_. So prevent the fneg from being pulled
+    // out in this case. For now I've made the logic as specific to the case as
+    // possible, hopefully this can be relaxed in future.
+    if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
+      SDValue LHSB = LHS.getOperand(0);
+      SDValue RHSB = RHS.getOperand(0);
+      if (LHSB.getOpcode() == ISD::BITCAST &&
+          RHSB->getOpcode() == ISD::BITCAST) {
+        EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
+        EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
+        if (LHSB.getValueType() == MVT::f32 &&
+            RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
+            RHSBOpTy == MVT::i32) {
+          return SDValue();
+        }
+      }
+    }
+
+    return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
+                                     RHS);
   }
 
   bool Inv = false;
@@ -4755,8 +4823,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
       if (Inv)
         std::swap(NewLHS, NewRHS);
 
-      SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
-                                      Cond, NewLHS, NewRHS);
+      SDValue NewSelect =
+          DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
       DCI.AddToWorklist(NewSelect.getNode());
       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
     }
@@ -5094,8 +5162,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::SELECT: {
     // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+    // This combine became necessary recently to prevent a regression in
+    // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
+    // Specifically, additional instructions were added to the final codegen.
+    // When adding this combine a case was added to foldFreeOpFromSelect to
+    // prevent this combine from being undone under certain conditions.
     // TODO: Invert conditions of foldFreeOpFromSelect
-    return SDValue();
+    SDValue Cond = N0.getOperand(0);
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N0.getOperand(2);
+    EVT LHVT = LHS.getValueType();
+    EVT RHVT = RHS.getValueType();
+    // The regression was limited to i32/v2i32.
+    if (RHVT != MVT::i32 && LHVT != MVT::i32)
+      return SDValue();
+
+    SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
+    SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
+    SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
+    return Op;
   }
   case ISD::BITCAST: {
     SDLoc SL(N);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 30535ae88f7ba..dfdb2c25f105c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -438,6 +438,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
   }
 
+  setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
+  // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
+  // instead lower to cndmask in SITargetLowering::LowerSELECT().
+  setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
+  // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
+  // alignbit.
+  setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
+
   setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                      Custom);
 
@@ -5930,6 +5938,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
 }
 
+// Custom lowering of ROTR for vectors of i32, implemented by unrolling the
+// operation with UnrollVectorOp(). This is a workaround for a regression,
+// caused by legalising v2i32 or, whereby rotr codegen gained unnecessary
+// extra instructions to extract the result from the vector.
+SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
+  [[maybe_unused]] EVT VT = Op.getValueType();
+
+  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
+          VT == MVT::v16i32) &&
+         "Unexpected ValueType.");
+
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
 // wider vector type is legal.
 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
@@ -6120,6 +6142,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return lowerGET_FPENV(Op, DAG);
   case ISD::SET_FPENV:
     return lowerSET_FPENV(Op, DAG);
+  case ISD::ROTR:
+    return lowerROTR(Op, DAG);
   }
   return SDValue();
 }
@@ -12996,6 +13020,48 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
     }
   }
 
+  // Detect identity v2i32 OR and replace with identity source node.
+  // Specifically an Or that has operands constructed from the same source node
+  // via extract_vector_elt and build_vector. I.E.
+  // v2i32 or(
+  //   v2i32 build_vector(
+  //     i32 extract_elt(%IdentitySrc, 0),
+  //     i32 0
+  //   ),
+  //   v2i32 build_vector(
+  //     i32 0,
+  //     i32 extract_elt(%IdentitySrc, 1)
+  //   ) )
+  // =>
+  // v2i32 %IdentitySrc
+
+  if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
+      RHS->getOpcode() == ISD::BUILD_VECTOR) {
+
+    if (auto *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1)))
+      if (auto *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0))) {
+
+        // Test for and normalise build vectors.
+        if (LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
+
+          // Get the extract_vector_element operands.
+          SDValue LEVE = LHS->getOperand(0);
+          SDValue REVE = RHS->getOperand(1);
+
+          if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+              REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+            // Check that different elements from the same vector are
+            // extracted.
+            if (LEVE->getOperand(0) == REVE->getOperand(0) &&
+                LEVE->getOperand(1) != REVE->getOperand(1)) {
+              SDValue IdentitySrc = LEVE.getOperand(0);
+              return IdentitySrc;
+            }
+          }
+        }
+      }
+  }
+
   if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
     return SDValue();
 
@@ -13040,13 +13106,39 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
   if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
     return RV;
 
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
+  if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
+
+    const ConstantSDNode *CRHS_0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
+    const ConstantSDNode *CRHS_1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    SDValue LHS_0 = LHS.getOperand(0);
+    SDValue LHS_1 = LHS.getOperand(1);
+
+    if (LHS.getOpcode() == ISD::VSELECT) {
+      if (CRHS_0 && CRHS_0->getAPIntValue().isSignMask() &&
+          shouldFoldFNegIntoSrc(N, LHS_0))
+        if (CRHS_1 && CRHS_1->getAPIntValue().isSignMask() &&
+            shouldFoldFNegIntoSrc(N, LHS_1)) {
+          SDLoc DL(N);
+          SDValue CastLHS =
+              DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
+          SDValue CastRHS =
+              DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
+          SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
+          SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
+          SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
+                                          LHS->getOperand(0), FNegLHS, FNegRHS);
+          return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
+        }
+    }
+  }
+
   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
-  SelectionDAG &DAG = DCI.DAG;
 
-  EVT VT = N->getValueType(0);
   if (CRHS && VT == MVT::i64) {
     if (SDValue Split =
             splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d71a22722129e..9c6bfc1d86f95 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -441,6 +441,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
 
   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 1419f63202a7c..a116dc95fb4ba 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1807,7 +1807,6 @@ def : GCNPat <
 >;
 }
 
-
 /********** ================================ **********/
 /********** Floating point absolute/negative **********/
 /********** ================================ **********/
@@ -2361,9 +2360,9 @@ def : AMDGPUPatIgnoreCopies <
                 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
 >;
 
-// 64-bit version
+foreach vt = [i64, v2i32] in {
 def : AMDGPUPatIgnoreCopies <
-  (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
+  (DivergentBinFrag<xor> vt:$z, (and vt:$x, (xor vt:$y, vt:$z))),
   (REG_SEQUENCE VReg_64,
     (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
               (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
@@ -2372,6 +2371,7 @@ def : AMDGPUPatIgnoreCopies <
               (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
               (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
 >;
+}
 
 def : AMDGPUPat <
   (fcopysign f32:$src0, f32:$src1),
@@ -2415,30 +2415,25 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
 } // end True16Predicate = NotHasTrue16BitInsts
 
 let True16Predicate = UseRealTrue16Insts in {
-def : GCNPat <
-  (rotr i32:$src0, i32:$src1),
-  (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
-                          /* src1_modifiers */ 0, $src0,
-                          /* src2_modifiers */ 0,
-                          (EXTRACT_SUBREG $src1, lo16),
-                          /* clamp */ 0, /* op_sel */ 0)
->;
-
-def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
-          (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
-                          (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
-                          0, /* src1_modifiers */
-                          (i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
-                          0, /* src2_modifiers */
-                          (i16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)),
-                          /* clamp */ 0, /* op_sel */ 0)>;
+  def : GCNPat<(rotr i32:$src0, i32:$src1),
+               (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
+                   /* src1_modifiers */ 0, $src0,
+                   /* src2_modifiers */ 0, (EXTRACT_SUBREG $src1, lo16),
+                   /* clamp */ 0, /* op_sel */ 0)>;
 
-def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
-          (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
-                          /* src1_modifiers */ 0, $src1,
-                          /* src2_modifiers */ 0,
-                          (EXTRACT_SUBREG VGPR_32:$src2, lo16),
-                          /* clamp */ 0, /* op_sel */ 0)>;
+  def : GCNPat<
+            (i32(trunc(srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
+            (V_ALIGNBIT_B32_t16_e64 0,                     /* src0_modifiers */
+                (i32(EXTRACT_SUBREG(i64 $src0), sub1)), 0, /* src1_modifiers */
+                (i32(EXTRACT_SUBREG(i64 $src0), sub0)), 0, /* src2_modifiers */
+                (i16(EXTRACT_SUBREG VGPR_32:$src1, lo16)),
+                /* clamp */ 0, /* op_sel */ 0)>;
+
+  def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
+               (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
+                   /* src1_modifiers */ 0, $src1,
+                   /* src2_modifiers */ 0, (EXTRACT_SUBREG VGPR_32:$src2, lo16),
+                   /* clamp */ 0, /* op_sel */ 0)>;
 } // end True16Predicate = UseRealTrue16Insts
 
 let True16Predicate = UseFakeTrue16Insts in {
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e0a36758534d5..473dbd6ec54cb 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1779,6 +1779,21 @@ def : GCNPat <
   (S_MOV_B32 imm:$imm)
 >;
 
+def : GCNPat <
+  (v2i32 (UniformBinFrag<and> v2i32:$x, v2i32:$y)),
+  (S_AND_B64 SReg_64:$x, SReg_64:$y)
+>;
+
+def : GCNPat <
+  (v2i32 (UniformBinFrag<or> v2i32:$x, v2i32:$y)),
+  (S_OR_B64 SReg_64:$x, SReg_64:$y)
+>;
+
+def : GCNPat <
+  (v2i32 (UniformBinFrag<xor> v2i32:$x, v2i32:$y)),
+  (S_XOR_B64 SReg_64:$x, SReg_64:$y)
+>;
+
 // Same as a 32-bit inreg
 def : GCNPat<
   (i32 (UniformUnaryFrag<sext> i16:$src)),
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 0c7e20fc1ebf3..efa9c465f794e 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -954,9 +954,9 @@ def : DivergentClampingBinOp<sub, V_SUB_CO_U32_e64>;
 def : DivergentBinOp<adde, V_ADDC_U32_e32>;
 def : DivergentBinOp<sube, V_SUBB_U32_e32>;
 
-class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
+class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst, ValueType vt = i64> :
   GCNPat<
-      (DivergentBinFrag<Op> i64:$src0, i64:$src1),
+      (DivergentBinFrag<Op> vt:$src0, vt:$src1),
       (REG_SEQUENCE VReg_64,
         (Inst
           (i32 (EXTRACT_SUBREG $src0, sub0)),
@@ -973,6 +973,10 @@ def :  divergent_i64_BinOp <and, V_AND_B32_e64>;
 def :  divergent_i64_BinOp <or,  V_OR_B32_e64>;
 def :  divergent_i64_BinOp <xor, V_XOR_B32_e64>;
 
+def :  divergent_i64_BinOp <and, V_AND_B32_e64, v2i32>;
+def :  divergent_i64_BinOp <or,  V_OR_B32_e64, v2i32>;
+def :  divergent_i64_BinOp <xor, V_XOR_B32_e64, v2i32>;
+
 // mul24 w/ 64 bit output.
 class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
   (i64 (Op i32:$src0, i32:$src1)),
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index c6233642110ea..05402b3c89409 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -8,8 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
-; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b64
 
 define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index a597faa028f22..ca8f7736f6093 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -151,25 +151,25 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
 ; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v:
 ; GFX-950:       ; %bb.0:
 ; GFX-950-NEXT:    v_cvt_f32_f64_e32 v6, v[2:3]
+; GFX-950-NEXT:    v_and_b32_e32 v4, 1, v6
+; GFX-950-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
-; GFX-950-NEXT:    v_and_b32_e32 v7, 1, v6
 ; GFX-950-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]|
-; GFX-950-NEXT:    v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5]
-; GFX-950-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v7
+; GFX-950-NEXT:    v_cmp_nlg_f64_e64 s[0:1], v[2:3], v[4:5]
+; GFX-950-NEXT:    v_cvt_f32_f64_e32 v7, v[0:1]
 ; GFX-950-NEXT:    v_cndmask_b32_e64 v2, -1, 1, s[2:3]
 ; GFX-950-NEXT:    v_add_u32_e32 v2, v6, v2
-; GFX-950-NEXT:    s_or_b64 vcc, vcc, s[0:1]
-; GFX-950-NEXT:    v_cvt_f32_f64_e32 v5, v[0:1]
+; GFX-950-NEXT:    s_or_b64 vcc, s[0:1], vcc
 ; GFX-950-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
-; GFX-950-NEXT:    v_and_b32_e32 v6, 1, v5
+; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v7
+; GFX-950-NEXT:    v_and_b32_e32 v8, 1, v7
 ; GFX-950-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
-; GFX-950-NEXT:    v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3]
-; GFX-950-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v6
+; GFX-950-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX-950-NEXT:    v_cmp_nlg_f64_e64 s[0:1], v[0:1], v[2:3]
 ; GFX-950-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[2:3]
-; GFX-950-NEXT:    v_add_u32_e32 v0, v5, v0
-; GFX-950-NEXT:    s_or_b64 vcc, vcc, s[0:1]
-; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX-950-NEXT:    v_add_u32_e32 v0, v7, v0
+; GFX-950-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v4
 ; GFX-950-NEXT:    ; return to shader part epilog
   %res = fptrunc <2 x double> %src to <2 x bfloat>
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index b372dec383344..987555fbaaafb 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -582,15 +582,15 @@ define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %
 ; GFX7-LABEL: v_bitselect_v2i32_pat1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
+; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bitselect_v2i32_pat1:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
+; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bitselect_v2i32_pat1:
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index 021104114d796..f5227eed458d6 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -31,8 +31,8 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3
 ; GFX9-LABEL: test_pown_reduced_fast_v2f16_known_odd:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_or_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v0
@@ -126,8 +126,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v4, 1, v4
 ; GFX9-NEXT:    v_or_b32_e32 v6, 1, v5
+; GFX9-NEXT:    v_or_b32_e32 v4, 1, v4
 ; GFX9-NEXT:    v_cvt_f64_i32_e32 v[4:5], v4
 ; GFX9-NEXT:    v_cvt_f64_i32_e32 v[6:7], v6
 ; GFX9-NEXT:    s_brev_b32 s4, -2
diff --git a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
index d63a36c4b2958..7e2e8b577e085 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
@@ -28,12 +28,15 @@ define amdgpu_ps <2 x i32> @s_or_v2i32_disjoint(<2 x i32> inreg %a, <2 x i32> in
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; CHECK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY2]], [[COPY]], implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY3]], [[COPY1]], implicit-def dead $scc
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_1]]
-  ; CHECK-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
-  ; CHECK-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = disjoint S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+  ; CHECK-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec
   ; CHECK-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; CHECK-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; CHECK-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
@@ -64,10 +67,23 @@ define <2 x i32> @v_or_v2i32_disjoint(<2 x i32> %a, <2 x i32> %b) {
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
-  ; CHECK-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
-  ; CHECK-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
-  ; CHECK-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+  ; CHECK-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY5]], killed [[COPY4]], implicit $exec
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+  ; CHECK-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY7]], killed [[COPY6]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR_B32_e64_1]], %subreg.sub0, killed [[V_OR_B32_e64_]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1
+  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY8]]
+  ; CHECK-NEXT:   $vgpr1 = COPY [[COPY9]]
   ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
   %result = or disjoint <2 x i32> %a, %b
   ret <2 x i32> %result
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 1b092b283290a..ea662f299e76a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1645,12 +1645,12 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_cselect_b32 s1, s1, s3
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, -v0, -v1, vcc
 ; GFX7-NEXT:    s_cselect_b32 s0, s0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, -v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1669,10 +1669,10 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_cselect_b32 s1, s1, s3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, -v0, -v1, vcc
 ; GFX9-NEXT:    s_cselect_b32 s0, s0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
@@ -1683,17 +1683,17 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x10
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x18
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-NEXT:    s_bitcmp1_b32 s6, 0
 ; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, s3, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -s3, -v0, vcc_lo
 ; GFX11-NEXT:    s_and_b32 s6, vcc_lo, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s1, s1, s3
 ; GFX11-NEXT:    s_cselect_b32 s0, s0, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, s1, -v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, s1, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 4a79096442c96..7afd99ddb0ef6 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -2010,61 +2010,61 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; SI-LABEL: v_fshr_v2i24:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; SI-NEXT:    s_mov_b32 s4, 0xaaaaaab
-; SI-NEXT:    v_mul_hi_u32 v6, v6, s4
-; SI-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
+; SI-NEXT:    v_mul_hi_u32 v6, v4, s4
+; SI-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
 ; SI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
-; SI-NEXT:    v_mul_hi_u32 v6, v7, s4
+; SI-NEXT:    v_mul_hi_u32 v6, v5, s4
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 8, v4
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
-; SI-NEXT:    v_mul_u32_u24_e32 v3, 24, v6
-; SI-NEXT:    v_sub_i32_e32 v3, vcc, v5, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v3
-; SI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
+; SI-NEXT:    v_mul_u32_u24_e32 v2, 24, v6
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v5, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
+; SI-NEXT:    v_alignbit_b32 v1, v1, v3, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_v2i24:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
+; VI-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; VI-NEXT:    s_mov_b32 s4, 0xaaaaaab
-; VI-NEXT:    v_mul_hi_u32 v6, v6, s4
-; VI-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
+; VI-NEXT:    v_mul_hi_u32 v6, v4, s4
+; VI-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; VI-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
 ; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v6
-; VI-NEXT:    v_mul_hi_u32 v6, v7, s4
+; VI-NEXT:    v_mul_hi_u32 v6, v5, s4
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 8, v4
 ; VI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
-; VI-NEXT:    v_mul_u32_u24_e32 v3, 24, v6
-; VI-NEXT:    v_sub_u32_e32 v3, vcc, v5, v3
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 8, v3
-; VI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
+; VI-NEXT:    v_mul_u32_u24_e32 v2, 24, v6
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v5, v2
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v2
+; VI-NEXT:    v_alignbit_b32 v1, v1, v3, v2
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i24:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaab
-; GFX9-NEXT:    v_mul_hi_u32 v6, v6, s4
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
+; GFX9-NEXT:    v_mul_hi_u32 v6, v4, s4
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
-; GFX9-NEXT:    v_mul_hi_u32 v6, v7, s4
+; GFX9-NEXT:    v_mul_hi_u32 v6, v5, s4
 ; GFX9-NEXT:    v_add_u32_e32 v4, 8, v4
 ; GFX9-NEXT:    v_alignbit_b32 v0, v0, v2, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
-; GFX9-NEXT:    v_mul_u32_u24_e32 v3, 24, v6
-; GFX9-NEXT:    v_sub_u32_e32 v3, v5, v3
-; GFX9-NEXT:    v_add_u32_e32 v3, 8, v3
-; GFX9-NEXT:    v_alignbit_b32 v1, v1, v2, v3
+; GFX9-NEXT:    v_mul_u32_u24_e32 v2, 24, v6
+; GFX9-NEXT:    v_sub_u32_e32 v2, v5, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, 8, v2
+; GFX9-NEXT:    v_alignbit_b32 v1, v1, v3, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_v2i24:
@@ -2075,12 +2075,12 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; GFX10-LABEL: v_fshr_v2i24:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX10-NEXT:    v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX10-NEXT:    v_mul_hi_u32 v7, 0xaaaaaab, v7
+; GFX10-NEXT:    v_mul_hi_u32 v6, 0xaaaaaab, v4
+; GFX10-NEXT:    v_mul_hi_u32 v7, 0xaaaaaab, v5
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v7, 24, v7
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
@@ -2091,109 +2091,29 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: v_fshr_v2i24:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX11-TRUE16-NEXT:    v_mul_hi_u32 v7, 0xaaaaaab, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
-; GFX11-TRUE16-NEXT:    v_mul_u32_u24_e32 v7, 24, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
-; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 8, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 8, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_alignbit_b32 v0, v0, v2, v4.l
-; GFX11-TRUE16-NEXT:    v_alignbit_b32 v1, v1, v3, v5.l
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: v_fshr_v2i24:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX11-FAKE16-NEXT:    v_mul_hi_u32 v7, 0xaaaaaab, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
-; GFX11-FAKE16-NEXT:    v_mul_u32_u24_e32 v7, 24, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
-; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 8, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 8, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v0, v2, v4
-; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, v1, v3, v5
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-TRUE16-LABEL: v_fshr_v2i24:
-; GFX12-TRUE16:       ; %bb.0:
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
-; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
-; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX12-TRUE16-NEXT:    v_mul_hi_u32 v7, 0xaaaaaab, v7
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
-; GFX12-TRUE16-NEXT:    v_mul_u32_u24_e32 v7, 24, v7
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
-; GFX12-TRUE16-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 8, v4
-; GFX12-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 8, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_alignbit_b32 v0, v0, v2, v4.l
-; GFX12-TRUE16-NEXT:    v_alignbit_b32 v1, v1, v3, v5.l
-; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-FAKE16-LABEL: v_fshr_v2i24:
-; GFX12-FAKE16:       ; %bb.0:
-; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
-; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
-; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX12-FAKE16-NEXT:    v_mul_hi_u32 v7, 0xaaaaaab, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
-; GFX12-FAKE16-NEXT:    v_mul_u32_u24_e32 v7, 24, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
-; GFX12-FAKE16-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 8, v4
-; GFX12-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 8, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_alignbit_b32 v0, v0, v2, v4
-; GFX12-FAKE16-NEXT:    v_alignbit_b32 v1, v1, v3, v5
-; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_fshr_v2i24:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_hi_u32 v6, 0xaaaaaab, v4
+; GFX11-NEXT:    v_mul_hi_u32 v7, 0xaaaaaab, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
+; GFX11-NEXT:    v_mul_u32_u24_e32 v7, 24, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
+; GFX11-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 8, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 8, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_alignbit_b32 v0, v0, v2, v4
+; GFX11-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
   ret <2 x i24> %ret
 }
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 1abd2e6b60f2f..26751b289a385 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -1,8 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=true  -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6S %s
+; RUN:  llc -amdgpu-scalarize-global-loads=true  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8S %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
 
+;; Added tests with "-amdgpu-scalarize-global-loads=true" to allow the generation of s_or_b64, particularly in the v2i32 case. See SWDEV-517886.
+;; Also removed the previously unused "GCN" check-prefixes from the test.
+
 define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX6-LABEL: or_v2i32:
 ; GFX6:       ; %bb.0:
@@ -18,8 +23,8 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -37,11 +42,39 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX8-NEXT:    s_mov_b32 s4, s0
 ; GFX8-NEXT:    s_mov_b32 s5, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: or_v2i32:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6S-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: or_v2i32:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8S-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: or_v2i32:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -112,6 +145,44 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX8-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: or_v4i32:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX6S-NEXT:    s_mov_b32 s11, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s10, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b32 s3, s3, s7
+; GFX6S-NEXT:    s_or_b32 s2, s2, s6
+; GFX6S-NEXT:    s_or_b32 s1, s1, s5
+; GFX6S-NEXT:    s_or_b32 s0, s0, s4
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6S-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6S-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6S-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6S-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: or_v4i32:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX8S-NEXT:    s_mov_b32 s11, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s10, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b32 s3, s3, s7
+; GFX8S-NEXT:    s_or_b32 s2, s2, s6
+; GFX8S-NEXT:    s_or_b32 s1, s1, s5
+; GFX8S-NEXT:    s_or_b32 s0, s0, s4
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8S-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8S-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8S-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8S-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: or_v4i32:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
@@ -167,6 +238,32 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: scalar_or_i32:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s6, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_mov_b32 s4, s0
+; GFX6S-NEXT:    s_or_b32 s0, s2, s3
+; GFX6S-NEXT:    s_mov_b32 s5, s1
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6S-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: scalar_or_i32:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s6, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_mov_b32 s4, s0
+; GFX8S-NEXT:    s_or_b32 s0, s2, s3
+; GFX8S-NEXT:    s_mov_b32 s5, s1
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8S-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: scalar_or_i32:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -221,6 +318,34 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: vector_or_i32:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT:    s_load_dword s4, s[4:5], 0xd
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b32 s4, s5, s4
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6S-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: vector_or_i32:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT:    s_load_dword s4, s[4:5], 0x34
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b32 s4, s5, s4
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8S-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: vector_or_i32:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -268,6 +393,30 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a)
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: scalar_or_literal_i32:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dword s6, s[4:5], 0xb
+; GFX6S-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b32 s4, s6, 0x1869f
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6S-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: scalar_or_literal_i32:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8S-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b32 s4, s6, 0x1869f
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8S-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: scalar_or_literal_i32:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -312,6 +461,34 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: scalar_or_literal_i64:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
+; GFX6S-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b32 s4, s7, 0xf237b
+; GFX6S-NEXT:    s_or_b32 s5, s6, 0x3039
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6S-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: scalar_or_literal_i64:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX8S-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b32 s4, s7, 0xf237b
+; GFX8S-NEXT:    s_or_b32 s5, s6, 0x3039
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8S-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: scalar_or_literal_i64:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -375,6 +552,51 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: scalar_or_literal_multi_use_i64:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6S-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
+; GFX6S-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x1d
+; GFX6S-NEXT:    s_movk_i32 s8, 0x3039
+; GFX6S-NEXT:    s_mov_b32 s9, 0xf237b
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    v_mov_b32_e32 v1, s7
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT:    s_add_u32 s0, s4, 0x3039
+; GFX6S-NEXT:    s_addc_u32 s1, s5, 0xf237b
+; GFX6S-NEXT:    s_waitcnt expcnt(0)
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6S-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT:    s_waitcnt vmcnt(0)
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: scalar_or_literal_multi_use_i64:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX8S-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8S-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x74
+; GFX8S-NEXT:    s_movk_i32 s8, 0x3039
+; GFX8S-NEXT:    s_mov_b32 s9, 0xf237b
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT:    s_add_u32 s0, s4, 0x3039
+; GFX8S-NEXT:    s_addc_u32 s1, s5, 0xf237b
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8S-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT:    s_waitcnt vmcnt(0)
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: scalar_or_literal_multi_use_i64:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 12, @6, KC0[CB0:0-32], KC1[]
@@ -432,6 +654,32 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: scalar_or_inline_imm_i64:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
+; GFX6S-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b32 s4, s6, 63
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6S-NEXT:    v_mov_b32_e32 v1, s7
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: scalar_or_inline_imm_i64:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX8S-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b32 s4, s6, 63
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8S-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: scalar_or_inline_imm_i64:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -492,6 +740,49 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: scalar_or_inline_imm_multi_use_i64:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6S-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s6, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_mov_b32 s4, s0
+; GFX6S-NEXT:    s_or_b32 s0, s2, 63
+; GFX6S-NEXT:    s_mov_b32 s5, s1
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6S-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6S-NEXT:    s_add_u32 s0, s8, 63
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6S-NEXT:    s_addc_u32 s1, s9, 0
+; GFX6S-NEXT:    s_waitcnt expcnt(0)
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6S-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6S-NEXT:    s_waitcnt vmcnt(0)
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: scalar_or_inline_imm_multi_use_i64:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX8S-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s6, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_mov_b32 s4, s0
+; GFX8S-NEXT:    s_or_b32 s0, s2, 63
+; GFX8S-NEXT:    s_mov_b32 s5, s1
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8S-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8S-NEXT:    s_add_u32 s0, s8, 63
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8S-NEXT:    s_addc_u32 s1, s9, 0
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8S-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8S-NEXT:    s_waitcnt vmcnt(0)
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: scalar_or_inline_imm_multi_use_i64:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 9, @6, KC0[CB0:0-32], KC1[]
@@ -545,6 +836,32 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: scalar_or_neg_inline_imm_i64:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dword s6, s[4:5], 0x13
+; GFX6S-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    v_mov_b32_e32 v1, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b32 s4, s6, -8
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: scalar_or_neg_inline_imm_i64:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dword s6, s[4:5], 0x4c
+; GFX8S-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    v_mov_b32_e32 v1, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b32 s4, s6, -8
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: scalar_or_neg_inline_imm_i64:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -599,6 +916,32 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: vector_or_literal_i32:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b32 s4, s4, 0xffff
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6S-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: vector_or_literal_i32:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b32 s4, s4, 0xffff
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8S-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: vector_or_literal_i32:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -658,6 +1001,32 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out,
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: vector_or_inline_immediate_i32:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b32 s4, s4, 4
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6S-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: vector_or_inline_immediate_i32:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b32 s4, s4, 4
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8S-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: vector_or_inline_immediate_i32:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -711,6 +1080,36 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: scalar_or_i64:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6S-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s6, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_mov_b32 s4, s0
+; GFX6S-NEXT:    s_mov_b32 s5, s1
+; GFX6S-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6S-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: scalar_or_i64:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX8S-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s6, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_mov_b32 s4, s0
+; GFX8S-NEXT:    s_mov_b32 s5, s1
+; GFX8S-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8S-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: scalar_or_i64:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -774,6 +1173,38 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: vector_or_i64:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
+; GFX6S-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6S-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: vector_or_i64:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
+; GFX8S-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8S-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: vector_or_i64:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -841,6 +1272,36 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: scalar_vector_or_i64:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6S-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: scalar_vector_or_i64:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8S-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: scalar_vector_or_i64:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -903,6 +1364,36 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: vector_or_i64_loadimm:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b32 s5, s5, 0x146f
+; GFX6S-NEXT:    s_or_b32 s4, s4, 0xdf77987f
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6S-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: vector_or_i64_loadimm:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b32 s5, s5, 0x146f
+; GFX8S-NEXT:    s_or_b32 s4, s4, 0xdf77987f
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8S-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: vector_or_i64_loadimm:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -965,6 +1456,34 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: vector_or_i64_imm:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b32 s4, s4, 8
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6S-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: vector_or_i64_imm:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b32 s4, s4, 8
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8S-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: vector_or_i64_imm:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1026,6 +1545,34 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: vector_or_i64_neg_inline_imm:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT:    v_mov_b32_e32 v1, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b32 s4, s4, -8
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: vector_or_i64_neg_inline_imm:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT:    v_mov_b32_e32 v1, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b32 s4, s4, -8
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: vector_or_i64_neg_inline_imm:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1089,6 +1636,34 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: vector_or_i64_neg_literal:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT:    v_mov_b32_e32 v1, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b32 s4, s4, 0xffffff38
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: vector_or_i64_neg_literal:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT:    v_mov_b32_e32 v1, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b32 s4, s4, 0xffffff38
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8S-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: vector_or_i64_neg_literal:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1140,6 +1715,32 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32],
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: trunc_i64_or_to_i32:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dword s6, s[4:5], 0x13
+; GFX6S-NEXT:    s_load_dword s7, s[4:5], 0x1d
+; GFX6S-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s2, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_or_b32 s4, s7, s6
+; GFX6S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6S-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: trunc_i64_or_to_i32:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dword s6, s[4:5], 0x4c
+; GFX8S-NEXT:    s_load_dword s7, s[4:5], 0x74
+; GFX8S-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8S-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s2, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_or_b32 s4, s7, s6
+; GFX8S-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8S-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: trunc_i64_or_to_i32:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -1211,6 +1812,46 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: or_i1:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s6, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_load_dword s8, s[8:9], 0x0
+; GFX6S-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX6S-NEXT:    s_mov_b32 s4, s0
+; GFX6S-NEXT:    s_mov_b32 s5, s1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    v_mul_f32_e64 v0, 1.0, s8
+; GFX6S-NEXT:    v_mul_f32_e64 v1, 1.0, s2
+; GFX6S-NEXT:    v_max_f32_e32 v0, v1, v0
+; GFX6S-NEXT:    v_cmp_le_f32_e32 vcc, 0, v0
+; GFX6S-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX6S-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: or_i1:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s6, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_load_dword s8, s[8:9], 0x0
+; GFX8S-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX8S-NEXT:    s_mov_b32 s4, s0
+; GFX8S-NEXT:    s_mov_b32 s5, s1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    v_mul_f32_e64 v0, 1.0, s8
+; GFX8S-NEXT:    v_mul_f32_e64 v1, 1.0, s2
+; GFX8S-NEXT:    v_max_f32_e32 v0, v1, v0
+; GFX8S-NEXT:    v_cmp_le_f32_e32 vcc, 0, v0
+; GFX8S-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8S-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: or_i1:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -1274,6 +1915,38 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c
 ; GFX8-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
+; GFX6S-LABEL: s_or_i1:
+; GFX6S:       ; %bb.0:
+; GFX6S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GFX6S-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6S-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6S-NEXT:    s_mov_b32 s6, -1
+; GFX6S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT:    s_cmp_eq_u32 s0, s1
+; GFX6S-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX6S-NEXT:    s_cmp_eq_u32 s2, s3
+; GFX6S-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX6S-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX6S-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX6S-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX6S-NEXT:    s_endpgm
+;
+; GFX8S-LABEL: s_or_i1:
+; GFX8S:       ; %bb.0:
+; GFX8S-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX8S-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8S-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8S-NEXT:    s_mov_b32 s6, -1
+; GFX8S-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT:    s_cmp_eq_u32 s0, s1
+; GFX8S-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX8S-NEXT:    s_cmp_eq_u32 s2, s3
+; GFX8S-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX8S-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX8S-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8S-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX8S-NEXT:    s_endpgm
+;
 ; EG-LABEL: s_or_i1:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 14, @4, KC0[CB0:0-32], KC1[]
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index d6e361d6e297e..7322e2f239ee8 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -228,6 +228,134 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i32> %y) {
+; R600-LABEL: rotr_v8i32:
+; R600:       ; %bb.0: ; %entry
+; R600-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; R600-NEXT:    CF_END
+; R600-NEXT:    ALU clause starting at 4:
+; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[5].X, KC0[5].X, KC0[7].X,
+; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[4].W, KC0[4].W, KC0[6].W,
+; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[4].Z, KC0[4].Z, KC0[6].Z,
+; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[4].Y, KC0[4].Y, KC0[6].Y,
+; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; R600-NEXT:     BIT_ALIGN_INT * T2.W, KC0[6].X, KC0[6].X, KC0[8].X,
+; R600-NEXT:     BIT_ALIGN_INT * T2.Z, KC0[5].W, KC0[5].W, KC0[7].W,
+; R600-NEXT:     BIT_ALIGN_INT * T2.Y, KC0[5].Z, KC0[5].Z, KC0[7].Z,
+; R600-NEXT:     BIT_ALIGN_INT * T2.X, KC0[5].Y, KC0[5].Y, KC0[7].Y,
+; R600-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
+; R600-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; R600-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; SI-LABEL: rotr_v8i32:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x11
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s19
+; SI-NEXT:    v_alignbit_b32 v3, s11, s11, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s18
+; SI-NEXT:    v_alignbit_b32 v2, s10, s10, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s17
+; SI-NEXT:    v_alignbit_b32 v1, s9, s9, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_alignbit_b32 v0, s8, s8, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s23
+; SI-NEXT:    v_alignbit_b32 v7, s15, s15, v4
+; SI-NEXT:    v_mov_b32_e32 v4, s22
+; SI-NEXT:    v_alignbit_b32 v6, s14, s14, v4
+; SI-NEXT:    v_mov_b32_e32 v4, s21
+; SI-NEXT:    v_alignbit_b32 v5, s13, s13, v4
+; SI-NEXT:    v_mov_b32_e32 v4, s20
+; SI-NEXT:    v_alignbit_b32 v4, s12, s12, v4
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; GFX8-LABEL: rotr_v8i32:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x44
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s18
+; GFX8-NEXT:    v_mov_b32_e32 v4, s17
+; GFX8-NEXT:    v_alignbit_b32 v2, s10, s10, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, s9, s9, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, s23
+; GFX8-NEXT:    v_alignbit_b32 v7, s15, s15, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, s22
+; GFX8-NEXT:    s_add_u32 s2, s0, 16
+; GFX8-NEXT:    v_alignbit_b32 v6, s14, s14, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, s21
+; GFX8-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NEXT:    v_alignbit_b32 v5, s13, s13, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, s20
+; GFX8-NEXT:    v_mov_b32_e32 v9, s3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s19
+; GFX8-NEXT:    v_alignbit_b32 v4, s12, s12, v4
+; GFX8-NEXT:    v_mov_b32_e32 v8, s2
+; GFX8-NEXT:    v_alignbit_b32 v3, s11, s11, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s16
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
+; GFX8-NEXT:    v_alignbit_b32 v0, s8, s8, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX10-LABEL: rotr_v8i32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x44
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_alignbit_b32 v7, s15, s15, s23
+; GFX10-NEXT:    v_alignbit_b32 v6, s14, s14, s22
+; GFX10-NEXT:    v_alignbit_b32 v5, s13, s13, s21
+; GFX10-NEXT:    v_alignbit_b32 v4, s12, s12, s20
+; GFX10-NEXT:    v_alignbit_b32 v3, s11, s11, s19
+; GFX10-NEXT:    v_alignbit_b32 v2, s10, s10, s18
+; GFX10-NEXT:    v_alignbit_b32 v1, s9, s9, s17
+; GFX10-NEXT:    v_alignbit_b32 v0, s8, s8, s16
+; GFX10-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: rotr_v8i32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b512 s[8:23], s[4:5], 0x44
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v8, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_alignbit_b32 v7, s15, s15, s23
+; GFX11-NEXT:    v_alignbit_b32 v6, s14, s14, s22
+; GFX11-NEXT:    v_alignbit_b32 v5, s13, s13, s21
+; GFX11-NEXT:    v_alignbit_b32 v4, s12, s12, s20
+; GFX11-NEXT:    v_alignbit_b32 v3, s11, s11, s19
+; GFX11-NEXT:    v_alignbit_b32 v2, s10, s10, s18
+; GFX11-NEXT:    v_alignbit_b32 v1, s9, s9, s17
+; GFX11-NEXT:    v_alignbit_b32 v0, s8, s8, s16
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-NEXT:    s_endpgm
+entry:
+  %tmp0 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %y  ; 32 - y, per lane
+  %tmp1 = shl <8 x i32> %x, %tmp0  ; x << (32 - y)
+  %tmp2 = lshr <8 x i32> %x, %y  ; x >> y (logical)
+  %tmp3 = or <8 x i32> %tmp1, %tmp2  ; OR of both shifts: the rotate-right idiom under test
+  store <8 x i32> %tmp3, ptr addrspace(1) %in  ; write the result back through %in
+  ret void
+}
+
 declare i16 @llvm.fshr.i16(i16, i16, i16)
 
 define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr addrspace(1) nocapture readonly %sourceB, ptr addrspace(1) nocapture %destValues) {
diff --git a/llvm/test/CodeGen/AMDGPU/shl64_reduce.ll b/llvm/test/CodeGen/AMDGPU/shl64_reduce.ll
index 21b7ed4d6b779..3e3eab2aed177 100644
--- a/llvm/test/CodeGen/AMDGPU/shl64_reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl64_reduce.ll
@@ -528,4 +528,4 @@ define <4 x i64> @shl_v4_maxmin(<4 x i64> %arg0, <4 x i64> noundef %arg1) {
   %min = call <4 x i64> @llvm.umin.i64(<4 x i64> %max,  <4 x i64> splat (i64 63))
   %shl = shl <4 x i64> %arg0, %min
   ret <4 x i64> %shl
-}
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
index d496634ae474f..8af4a8de7b266 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
@@ -18,11 +18,11 @@ define <2 x i32> @test_add2x32(ptr %a_ptr, ptr %b_ptr) {
 ; CHECK-LABEL: test_add2x32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_load_dword v4, v[2:3]
-; CHECK-NEXT:    flat_load_dword v5, v[0:1]
-; CHECK-NEXT:    v_mov_b32_e32 v1, 48
+; CHECK-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; CHECK-NEXT:    flat_load_dwordx2 v[6:7], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v0, v5, v4
+; CHECK-NEXT:    v_or_b32_e32 v1, v5, v7
+; CHECK-NEXT:    v_or_b32_e32 v0, v4, v6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %a = load <2 x i32>, ptr %a_ptr, !range !2, !noundef !{}
   %b = load <2 x i32>, ptr %b_ptr, !range !3, !noundef !{}
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 00bb7b24786f5..3808c73ae7de3 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -1,6 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SIS %s
+; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VIS %s
+
+;; The "-amdgpu-scalarize-global-loads=true" runs (SIS/VIS prefixes) allow the generation of s_xor_b64, particularly in the
+;; v2i32 case. See SWDEV-517886.
+;; The previously unused "GCN" check prefix has been removed from the RUN lines.
 
 define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
 ; SI-LABEL: xor_v2i32:
@@ -21,8 +27,8 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_xor_b32_e32 v1, v3, v1
 ; SI-NEXT:    v_xor_b32_e32 v0, v2, v0
+; SI-NEXT:    v_xor_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -40,10 +46,43 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; VI-NEXT:    v_xor_b32_e32 v0, v0, v2
+; VI-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: xor_v2i32:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
+; SIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; SIS-NEXT:    s_mov_b32 s3, 0xf000
+; SIS-NEXT:    s_mov_b32 s2, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_xor_b64 s[4:5], s[6:7], s[4:5]
+; SIS-NEXT:    v_mov_b32_e32 v0, s4
+; SIS-NEXT:    v_mov_b32_e32 v1, s5
+; SIS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: xor_v2i32:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_xor_b64 s[0:1], s[2:3], s[4:5]
+; VIS-NEXT:    v_mov_b32_e32 v3, s1
+; VIS-NEXT:    v_mov_b32_e32 v2, s0
+; VIS-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT:    s_endpgm
+
   %a = load <2 x i32>, ptr addrspace(1) %in0
   %b = load <2 x i32>, ptr addrspace(1) %in1
   %result = xor <2 x i32> %a, %b
@@ -97,6 +136,48 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; VI-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: xor_v4i32:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; SIS-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x0
+; SIS-NEXT:    s_mov_b32 s3, 0xf000
+; SIS-NEXT:    s_mov_b32 s2, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_xor_b32 s7, s7, s11
+; SIS-NEXT:    s_xor_b32 s6, s6, s10
+; SIS-NEXT:    s_xor_b32 s5, s5, s9
+; SIS-NEXT:    s_xor_b32 s4, s4, s8
+; SIS-NEXT:    v_mov_b32_e32 v0, s4
+; SIS-NEXT:    v_mov_b32_e32 v1, s5
+; SIS-NEXT:    v_mov_b32_e32 v2, s6
+; SIS-NEXT:    v_mov_b32_e32 v3, s7
+; SIS-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: xor_v4i32:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; VIS-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x0
+; VIS-NEXT:    v_mov_b32_e32 v4, s0
+; VIS-NEXT:    v_mov_b32_e32 v5, s1
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_xor_b32 s0, s7, s11
+; VIS-NEXT:    s_xor_b32 s1, s6, s10
+; VIS-NEXT:    s_xor_b32 s2, s5, s9
+; VIS-NEXT:    s_xor_b32 s3, s4, s8
+; VIS-NEXT:    v_mov_b32_e32 v0, s3
+; VIS-NEXT:    v_mov_b32_e32 v1, s2
+; VIS-NEXT:    v_mov_b32_e32 v2, s1
+; VIS-NEXT:    v_mov_b32_e32 v3, s0
+; VIS-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; VIS-NEXT:    s_endpgm
   %a = load <4 x i32>, ptr addrspace(1) %in0
   %b = load <4 x i32>, ptr addrspace(1) %in1
   %result = xor <4 x i32> %a, %b
@@ -152,6 +233,47 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0,
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: xor_i1:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SIS-NEXT:    s_mov_b32 s7, 0xf000
+; SIS-NEXT:    s_mov_b32 s6, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_load_dword s8, s[2:3], 0x0
+; SIS-NEXT:    s_load_dword s9, s[4:5], 0x0
+; SIS-NEXT:    s_mov_b32 s4, s0
+; SIS-NEXT:    s_mov_b32 s5, s1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    v_cmp_ge_f32_e64 s[0:1], s8, 0
+; SIS-NEXT:    v_cmp_ge_f32_e64 s[2:3], s9, 1.0
+; SIS-NEXT:    v_mov_b32_e32 v0, s9
+; SIS-NEXT:    v_mov_b32_e32 v1, s8
+; SIS-NEXT:    s_xor_b64 vcc, s[0:1], s[2:3]
+; SIS-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SIS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: xor_i1:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_load_dword s6, s[2:3], 0x0
+; VIS-NEXT:    s_load_dword s4, s[4:5], 0x0
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    v_cmp_ge_f32_e64 s[0:1], s6, 0
+; VIS-NEXT:    v_cmp_ge_f32_e64 s[2:3], s4, 1.0
+; VIS-NEXT:    v_mov_b32_e32 v2, s4
+; VIS-NEXT:    v_mov_b32_e32 v3, s6
+; VIS-NEXT:    s_xor_b64 vcc, s[0:1], s[2:3]
+; VIS-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; VIS-NEXT:    flat_store_dword v[0:1], v2
+; VIS-NEXT:    s_endpgm
+
   %a = load float, ptr addrspace(1) %in0
   %b = load float, ptr addrspace(1) %in1
   %acmp = fcmp oge float %a, 0.000000e+00
@@ -206,6 +328,50 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0
 ; VI-NEXT:    v_and_b32_e32 v2, 1, v2
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: v_xor_i1:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SIS-NEXT:    s_mov_b32 s7, 0xf000
+; SIS-NEXT:    s_mov_b32 s6, -1
+; SIS-NEXT:    s_mov_b32 s14, s6
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_mov_b32 s12, s2
+; SIS-NEXT:    s_mov_b32 s13, s3
+; SIS-NEXT:    s_mov_b32 s15, s7
+; SIS-NEXT:    s_mov_b32 s10, s6
+; SIS-NEXT:    s_mov_b32 s11, s7
+; SIS-NEXT:    buffer_load_ubyte v0, off, s[12:15], 0 glc
+; SIS-NEXT:    s_waitcnt vmcnt(0)
+; SIS-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 glc
+; SIS-NEXT:    s_waitcnt vmcnt(0)
+; SIS-NEXT:    s_mov_b32 s4, s0
+; SIS-NEXT:    s_mov_b32 s5, s1
+; SIS-NEXT:    v_xor_b32_e32 v0, v0, v1
+; SIS-NEXT:    v_and_b32_e32 v0, 1, v0
+; SIS-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: v_xor_i1:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    v_mov_b32_e32 v0, s2
+; VIS-NEXT:    v_mov_b32_e32 v1, s3
+; VIS-NEXT:    v_mov_b32_e32 v2, s4
+; VIS-NEXT:    v_mov_b32_e32 v3, s5
+; VIS-NEXT:    flat_load_ubyte v4, v[0:1] glc
+; VIS-NEXT:    s_waitcnt vmcnt(0)
+; VIS-NEXT:    flat_load_ubyte v2, v[2:3] glc
+; VIS-NEXT:    s_waitcnt vmcnt(0)
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    v_xor_b32_e32 v2, v4, v2
+; VIS-NEXT:    v_and_b32_e32 v2, 1, v2
+; VIS-NEXT:    flat_store_byte v[0:1], v2
+; VIS-NEXT:    s_endpgm
   %a = load volatile i1, ptr addrspace(1) %in0
   %b = load volatile i1, ptr addrspace(1) %in1
   %xor = xor i1 %a, %b
@@ -253,6 +419,36 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1
 ; VI-NEXT:    v_xor_b32_e32 v2, v4, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: vector_xor_i32:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_load_dword s6, s[2:3], 0x0
+; SIS-NEXT:    s_load_dword s4, s[4:5], 0x0
+; SIS-NEXT:    s_mov_b32 s3, 0xf000
+; SIS-NEXT:    s_mov_b32 s2, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_xor_b32 s4, s6, s4
+; SIS-NEXT:    v_mov_b32_e32 v0, s4
+; SIS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: vector_xor_i32:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_load_dword s2, s[2:3], 0x0
+; VIS-NEXT:    s_load_dword s3, s[4:5], 0x0
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_xor_b32 s0, s2, s3
+; VIS-NEXT:    v_mov_b32_e32 v2, s0
+; VIS-NEXT:    flat_store_dword v[0:1], v2
+; VIS-NEXT:    s_endpgm
   %a = load i32, ptr addrspace(1) %in0
   %b = load i32, ptr addrspace(1) %in1
   %result = xor i32 %a, %b
@@ -284,6 +480,30 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: scalar_xor_i32:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT:    s_mov_b32 s7, 0xf000
+; SIS-NEXT:    s_mov_b32 s6, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_mov_b32 s4, s0
+; SIS-NEXT:    s_xor_b32 s0, s2, s3
+; SIS-NEXT:    s_mov_b32 s5, s1
+; SIS-NEXT:    v_mov_b32_e32 v0, s0
+; SIS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: scalar_xor_i32:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_xor_b32 s2, s2, s3
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    v_mov_b32_e32 v2, s2
+; VIS-NEXT:    flat_store_dword v[0:1], v2
+; VIS-NEXT:    s_endpgm
   %result = xor i32 %a, %b
   store i32 %result, ptr addrspace(1) %out
   ret void
@@ -313,6 +533,30 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: scalar_not_i32:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dword s6, s[4:5], 0xb
+; SIS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SIS-NEXT:    s_mov_b32 s3, 0xf000
+; SIS-NEXT:    s_mov_b32 s2, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_not_b32 s4, s6
+; SIS-NEXT:    v_mov_b32_e32 v0, s4
+; SIS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: scalar_not_i32:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; VIS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_not_b32 s2, s2
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    v_mov_b32_e32 v2, s2
+; VIS-NEXT:    flat_store_dword v[0:1], v2
+; VIS-NEXT:    s_endpgm
   %result = xor i32 %a, -1
   store i32 %result, ptr addrspace(1) %out
   ret void
@@ -350,6 +594,32 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1
 ; VI-NEXT:    v_not_b32_e32 v2, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: vector_not_i32:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SIS-NEXT:    s_mov_b32 s3, 0xf000
+; SIS-NEXT:    s_mov_b32 s2, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_not_b32 s4, s4
+; SIS-NEXT:    v_mov_b32_e32 v0, s4
+; SIS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: vector_not_i32:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_load_dword s2, s[2:3], 0x0
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_not_b32 s0, s2
+; VIS-NEXT:    v_mov_b32_e32 v2, s0
+; VIS-NEXT:    flat_store_dword v[0:1], v2
+; VIS-NEXT:    s_endpgm
   %a = load i32, ptr addrspace(1) %in0
   %b = load i32, ptr addrspace(1) %in1
   %result = xor i32 %a, -1
@@ -399,6 +669,38 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1
 ; VI-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: vector_xor_i64:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
+; SIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; SIS-NEXT:    s_mov_b32 s3, 0xf000
+; SIS-NEXT:    s_mov_b32 s2, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_xor_b64 s[4:5], s[6:7], s[4:5]
+; SIS-NEXT:    v_mov_b32_e32 v0, s4
+; SIS-NEXT:    v_mov_b32_e32 v1, s5
+; SIS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: vector_xor_i64:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_xor_b64 s[0:1], s[2:3], s[4:5]
+; VIS-NEXT:    v_mov_b32_e32 v3, s1
+; VIS-NEXT:    v_mov_b32_e32 v2, s0
+; VIS-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT:    s_endpgm
   %a = load i64, ptr addrspace(1) %in0
   %b = load i64, ptr addrspace(1) %in1
   %result = xor i64 %a, %b
@@ -434,6 +736,34 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: scalar_xor_i64:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SIS-NEXT:    s_mov_b32 s7, 0xf000
+; SIS-NEXT:    s_mov_b32 s6, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_mov_b32 s4, s0
+; SIS-NEXT:    s_mov_b32 s5, s1
+; SIS-NEXT:    s_xor_b64 s[0:1], s[2:3], s[8:9]
+; SIS-NEXT:    v_mov_b32_e32 v0, s0
+; SIS-NEXT:    v_mov_b32_e32 v1, s1
+; SIS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: scalar_xor_i64:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    s_xor_b64 s[0:1], s[2:3], s[4:5]
+; VIS-NEXT:    v_mov_b32_e32 v3, s1
+; VIS-NEXT:    v_mov_b32_e32 v2, s0
+; VIS-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT:    s_endpgm
   %result = xor i64 %a, %b
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -465,6 +795,32 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: scalar_not_i64:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT:    s_mov_b32 s7, 0xf000
+; SIS-NEXT:    s_mov_b32 s6, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_mov_b32 s4, s0
+; SIS-NEXT:    s_mov_b32 s5, s1
+; SIS-NEXT:    s_not_b64 s[0:1], s[2:3]
+; SIS-NEXT:    v_mov_b32_e32 v0, s0
+; SIS-NEXT:    v_mov_b32_e32 v1, s1
+; SIS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: scalar_not_i64:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    s_not_b64 s[0:1], s[2:3]
+; VIS-NEXT:    v_mov_b32_e32 v3, s1
+; VIS-NEXT:    v_mov_b32_e32 v2, s0
+; VIS-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT:    s_endpgm
   %result = xor i64 %a, -1
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -504,6 +860,34 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1
 ; VI-NEXT:    v_not_b32_e32 v1, v1
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: vector_not_i64:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SIS-NEXT:    s_mov_b32 s3, 0xf000
+; SIS-NEXT:    s_mov_b32 s2, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_not_b64 s[4:5], s[4:5]
+; SIS-NEXT:    v_mov_b32_e32 v0, s4
+; SIS-NEXT:    v_mov_b32_e32 v1, s5
+; SIS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: vector_not_i64:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_not_b64 s[0:1], s[2:3]
+; VIS-NEXT:    v_mov_b32_e32 v3, s1
+; VIS-NEXT:    v_mov_b32_e32 v2, s0
+; VIS-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT:    s_endpgm
   %a = load i64, ptr addrspace(1) %in0
   %b = load i64, ptr addrspace(1) %in1
   %result = xor i64 %a, -1
@@ -570,6 +954,59 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i
 ; VI-NEXT:  .LBB12_4:
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; VI-NEXT:    s_branch .LBB12_2
+;
+; SIS-LABEL: xor_cf:
+; SIS:       ; %bb.0: ; %entry
+; SIS-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SIS-NEXT:    s_mov_b64 s[10:11], 0
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    v_cmp_ne_u64_e64 s[8:9], s[4:5], 0
+; SIS-NEXT:    s_and_b64 vcc, exec, s[8:9]
+; SIS-NEXT:    s_cbranch_vccz .LBB12_4
+; SIS-NEXT:  ; %bb.1: ; %else
+; SIS-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0
+; SIS-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_mov_b64 vcc, vcc
+; SIS-NEXT:    s_cbranch_vccnz .LBB12_3
+; SIS-NEXT:  .LBB12_2: ; %if
+; SIS-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
+; SIS-NEXT:  .LBB12_3: ; %endif
+; SIS-NEXT:    v_mov_b32_e32 v0, s8
+; SIS-NEXT:    s_mov_b32 s3, 0xf000
+; SIS-NEXT:    s_mov_b32 s2, -1
+; SIS-NEXT:    v_mov_b32_e32 v1, s9
+; SIS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT:    s_endpgm
+; SIS-NEXT:  .LBB12_4:
+; SIS-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; SIS-NEXT:    s_branch .LBB12_2
+;
+; VIS-LABEL: xor_cf:
+; VIS:       ; %bb.0: ; %entry
+; VIS-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VIS-NEXT:    s_mov_b64 s[8:9], 0
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; VIS-NEXT:    s_cbranch_scc0 .LBB12_4
+; VIS-NEXT:  ; %bb.1: ; %else
+; VIS-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VIS-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
+; VIS-NEXT:    s_cbranch_vccnz .LBB12_3
+; VIS-NEXT:  .LBB12_2: ; %if
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_xor_b64 s[2:3], s[4:5], s[6:7]
+; VIS-NEXT:  .LBB12_3: ; %endif
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    v_mov_b32_e32 v2, s2
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    v_mov_b32_e32 v3, s3
+; VIS-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT:    s_endpgm
+; VIS-NEXT:  .LBB12_4:
+; VIS-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; VIS-NEXT:    s_branch .LBB12_2
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
@@ -616,6 +1053,34 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: scalar_xor_literal_i64:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
+; SIS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SIS-NEXT:    s_mov_b32 s3, 0xf000
+; SIS-NEXT:    s_mov_b32 s2, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_xor_b32 s4, s7, 0xf237b
+; SIS-NEXT:    s_xor_b32 s5, s6, 0x3039
+; SIS-NEXT:    v_mov_b32_e32 v0, s5
+; SIS-NEXT:    v_mov_b32_e32 v1, s4
+; SIS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: scalar_xor_literal_i64:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
+; VIS-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_xor_b32 s1, s1, 0xf237b
+; VIS-NEXT:    s_xor_b32 s0, s0, 0x3039
+; VIS-NEXT:    v_mov_b32_e32 v2, s2
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    v_mov_b32_e32 v3, s3
+; VIS-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VIS-NEXT:    s_endpgm
   %or = xor i64 %a, 4261135838621753
   store i64 %or, ptr addrspace(1) %out
   ret void
@@ -664,6 +1129,49 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: scalar_xor_literal_multi_use_i64:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SIS-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x13
+; SIS-NEXT:    s_movk_i32 s8, 0x3039
+; SIS-NEXT:    s_mov_b32 s9, 0xf237b
+; SIS-NEXT:    s_mov_b32 s3, 0xf000
+; SIS-NEXT:    s_mov_b32 s2, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SIS-NEXT:    v_mov_b32_e32 v0, s4
+; SIS-NEXT:    v_mov_b32_e32 v1, s5
+; SIS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT:    s_add_u32 s0, s6, 0x3039
+; SIS-NEXT:    s_addc_u32 s1, s7, 0xf237b
+; SIS-NEXT:    s_waitcnt expcnt(0)
+; SIS-NEXT:    v_mov_b32_e32 v0, s0
+; SIS-NEXT:    v_mov_b32_e32 v1, s1
+; SIS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT:    s_waitcnt vmcnt(0)
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: scalar_xor_literal_multi_use_i64:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4c
+; VIS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VIS-NEXT:    s_movk_i32 s6, 0x3039
+; VIS-NEXT:    s_mov_b32 s7, 0xf237b
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
+; VIS-NEXT:    v_mov_b32_e32 v0, s4
+; VIS-NEXT:    v_mov_b32_e32 v3, s1
+; VIS-NEXT:    v_mov_b32_e32 v1, s5
+; VIS-NEXT:    v_mov_b32_e32 v2, s0
+; VIS-NEXT:    s_add_u32 s0, s2, 0x3039
+; VIS-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT:    s_addc_u32 s1, s3, 0xf237b
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; VIS-NEXT:    s_waitcnt vmcnt(0)
+; VIS-NEXT:    s_endpgm
   %or = xor i64 %a, 4261135838621753
   store i64 %or, ptr addrspace(1) %out
 
@@ -698,6 +1206,32 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: scalar_xor_inline_imm_i64:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
+; SIS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SIS-NEXT:    s_mov_b32 s3, 0xf000
+; SIS-NEXT:    s_mov_b32 s2, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_xor_b32 s4, s6, 63
+; SIS-NEXT:    v_mov_b32_e32 v0, s4
+; SIS-NEXT:    v_mov_b32_e32 v1, s7
+; SIS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: scalar_xor_inline_imm_i64:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
+; VIS-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_xor_b32 s0, s0, 63
+; VIS-NEXT:    v_mov_b32_e32 v2, s2
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v3, s3
+; VIS-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VIS-NEXT:    s_endpgm
   %or = xor i64 %a, 63
   store i64 %or, ptr addrspace(1) %out
   ret void
@@ -729,6 +1263,33 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out,
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: scalar_xor_neg_inline_imm_i64:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
+; SIS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SIS-NEXT:    s_mov_b32 s3, 0xf000
+; SIS-NEXT:    s_mov_b32 s2, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_xor_b64 s[4:5], s[6:7], -8
+; SIS-NEXT:    v_mov_b32_e32 v0, s4
+; SIS-NEXT:    v_mov_b32_e32 v1, s5
+; SIS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: scalar_xor_neg_inline_imm_i64:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
+; VIS-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_xor_b64 s[0:1], s[0:1], -8
+; VIS-NEXT:    v_mov_b32_e32 v0, s2
+; VIS-NEXT:    v_mov_b32_e32 v3, s1
+; VIS-NEXT:    v_mov_b32_e32 v1, s3
+; VIS-NEXT:    v_mov_b32_e32 v2, s0
+; VIS-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT:    s_endpgm
+
   %or = xor i64 %a, -8
   store i64 %or, ptr addrspace(1) %out
   ret void
@@ -768,6 +1329,34 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out,
 ; VI-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: vector_xor_i64_neg_inline_imm:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SIS-NEXT:    s_mov_b32 s3, 0xf000
+; SIS-NEXT:    s_mov_b32 s2, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_xor_b64 s[4:5], s[4:5], -8
+; SIS-NEXT:    v_mov_b32_e32 v0, s4
+; SIS-NEXT:    v_mov_b32_e32 v1, s5
+; SIS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: vector_xor_i64_neg_inline_imm:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_xor_b64 s[0:1], s[2:3], -8
+; VIS-NEXT:    v_mov_b32_e32 v3, s1
+; VIS-NEXT:    v_mov_b32_e32 v2, s0
+; VIS-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT:    s_endpgm
   %loada = load i64, ptr addrspace(1) %a, align 8
   %or = xor i64 %loada, -8
   store i64 %or, ptr addrspace(1) %out
@@ -808,10 +1397,39 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    v_xor_b32_e32 v0, 0xdf77987f, v0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; SIS-LABEL: vector_xor_literal_i64:
+; SIS:       ; %bb.0:
+; SIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SIS-NEXT:    s_mov_b32 s3, 0xf000
+; SIS-NEXT:    s_mov_b32 s2, -1
+; SIS-NEXT:    s_waitcnt lgkmcnt(0)
+; SIS-NEXT:    s_xor_b32 s5, s5, 0x146f
+; SIS-NEXT:    s_xor_b32 s4, s4, 0xdf77987f
+; SIS-NEXT:    v_mov_b32_e32 v0, s4
+; SIS-NEXT:    v_mov_b32_e32 v1, s5
+; SIS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT:    s_endpgm
+;
+; VIS-LABEL: vector_xor_literal_i64:
+; VIS:       ; %bb.0:
+; VIS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VIS-NEXT:    v_mov_b32_e32 v0, s0
+; VIS-NEXT:    v_mov_b32_e32 v1, s1
+; VIS-NEXT:    s_waitcnt lgkmcnt(0)
+; VIS-NEXT:    s_xor_b32 s0, s3, 0x146f
+; VIS-NEXT:    s_xor_b32 s1, s2, 0xdf77987f
+; VIS-NEXT:    v_mov_b32_e32 v2, s1
+; VIS-NEXT:    v_mov_b32_e32 v3, s0
+; VIS-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT:    s_endpgm
+
   %loada = load i64, ptr addrspace(1) %a, align 8
   %or = xor i64 %loada, 22470723082367
   store i64 %or, ptr addrspace(1) %out
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}