[llvm] [AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions (PR #140694)
Chris Jackson via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 25 11:20:45 PDT 2025
https://github.com/chrisjbris updated https://github.com/llvm/llvm-project/pull/140694
>From f0c57fd46655333bdec0c2d971e232638ca03e63 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 20 May 2025 05:14:36 -0500
Subject: [PATCH 01/15] [AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions
to make use of 64-bit wide instructions
Make use of s_or_b64/s_and_b64/s_xor_b64 for v2i32. Legalising these
causes a number of test regressions, so extra work in the combiner and
TableGen patterns was necessary.
- Use Custom lowering for v2i32 rotr instead of additional patterns.
  Modify performOrCombine() to remove some identity or operations.
- Fix the rotr regression by adding lowerROTR() on the legalizer
  codepath.
- Add a test case to rotr.ll.
- Extend performFNegCombine() for the SELECT case.
- Modify performSelectCombine() and foldFreeOpFromSelect() to prevent
  the performFNegCombine() changes from being unwound.
- Add cases to or.ll and xor.ll to demonstrate the generation of the
  s_or_b64 and s_xor_b64 instructions for the v2i32 cases (a minimal IR
  sketch follows below). Previously this was inhibited by
  "-amdgpu-scalarize-global-loads=false".
- Fix the shl/srl64_reduce regression by performing the scalarisation
  previously performed by the vector legaliser in the combiner.
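
As a minimal illustration (a sketch; the kernel below is illustrative
rather than copied verbatim from the tests), a uniform v2i32 or should
now select to a single s_or_b64 instead of two 32-bit ors:

  define amdgpu_kernel void @or_v2i32_uniform(ptr addrspace(1) %out,
                                              <2 x i32> %a, <2 x i32> %b) {
    ; Kernel arguments are uniform, so the or stays on the SALU.
    %or = or <2 x i32> %a, %b
    store <2 x i32> %or, ptr addrspace(1) %out
    ret void
  }
  ; expected: s_or_b64 s[0:1], s[0:1], s[2:3] (or similar register pairs)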
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 146 +++-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 95 ++-
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 47 +-
llvm/lib/Target/AMDGPU/SOPInstructions.td | 15 +
llvm/lib/Target/AMDGPU/VOP2Instructions.td | 8 +-
llvm/test/CodeGen/AMDGPU/bf16-conversions.ll | 24 +-
llvm/test/CodeGen/AMDGPU/bfi_int.ll | 4 +-
.../AMDGPU/copysign-simplify-demanded-bits.ll | 4 +-
.../AMDGPU/dag-preserve-disjoint-flag.ll | 36 +-
.../CodeGen/AMDGPU/fneg-modifier-casting.ll | 16 +-
llvm/test/CodeGen/AMDGPU/fshr.ll | 188 ++---
llvm/test/CodeGen/AMDGPU/or.ll | 677 +++++++++++++++++-
llvm/test/CodeGen/AMDGPU/rotr.ll | 128 ++++
.../CodeGen/AMDGPU/vector_range_metadata.ll | 8 +-
llvm/test/CodeGen/AMDGPU/xor.ll | 630 +++++++++++++++-
16 files changed, 1807 insertions(+), 220 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3db2b3bff2d36..facb183b89531 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4025,9 +4025,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
- DAGCombinerInfo &DCI, const SDLoc &SL,
- unsigned Opc, SDValue LHS,
- uint32_t ValLo, uint32_t ValHi) const {
+ DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
+ uint32_t ValLo, uint32_t ValHi) const {
SelectionDAG &DAG = DCI.DAG;
SDValue Lo, Hi;
std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
@@ -4056,6 +4055,53 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
+ // When the shl64_reduce optimisation is reached after vector
+ // legalization, some scalarising has already occurred. Once ISD::AND was
+ // legalised for v2i32, the AND instructions were no longer being elided,
+ // as described below. The following code makes sure the elision still
+ // takes place.
+ if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ SDValue VAND = RHS.getOperand(0);
+ if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+ uint64_t AndIndex = RHS->getConstantOperandVal(1);
+ if (VAND->getOpcode() == ISD::AND && CRRHS) {
+ SDValue LHSAND = VAND.getOperand(0);
+ SDValue RHSAND = VAND.getOperand(1);
+ if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+ // Part of the shl combine is to reduce a 64-bit shl to a 32-bit shl
+ // when the shift amount is known to be in the range [32, 63]. This
+ // transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31)]. The
+ // '&' is then elided by ISel. The vector code for this was previously
+ // fully scalarised by the vector legalizer, but now that v2i32 is
+ // legal the legaliser only partially scalarises the vector operations
+ // and the AND was not elided. This check enables us to locate and
+ // scalarise the v2i32 AND and re-enable ISel to elide it.
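+ // For example (a sketch of the shift-amount operand; Idx is 0 or 1):
+ //   (extract_elt (and v2i32 %y, (build_vector 31, 31)), Idx)
+ //   --> (and (extract_elt %y, Idx), 31)
+ // which ISel can then elide.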
+ ConstantSDNode *CANDL =
+ dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+ ConstantSDNode *CANDR =
+ dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+ if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+ RHSAND->getConstantOperandVal(1) == 0x1f) {
+ // Get the non-constant AND operands and produce the scalar ANDs.
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+ LHSAND, Zero);
+ SDValue Hi =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+ SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+ SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+ SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+ if (AndIndex == 0 || AndIndex == 1)
+ return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
+ AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+ }
+ }
+ }
+ }
+ }
+
unsigned RHSVal;
if (CRHS) {
RHSVal = CRHS->getZExtValue();
@@ -4097,8 +4143,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
if (VT.getScalarType() != MVT::i64)
return SDValue();
- // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
// On some subtargets, 64-bit shift is a quarter rate instruction. In the
// common case, splitting this into a move and a 32-bit shift is faster and
// the same code size.
@@ -4189,6 +4233,53 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
SDLoc SL(N);
unsigned RHSVal;
+ // When the srl64_reduce optimisation is reached after vector
+ // legalization, some scalarising has already occurred. Once ISD::AND was
+ // legalised for v2i32, the AND instructions were no longer being elided,
+ // as described below. The following code makes sure the elision still
+ // takes place.
+ if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ SDValue VAND = RHS.getOperand(0);
+ if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+ uint64_t AndIndex = RHS->getConstantOperandVal(1);
+ if (VAND->getOpcode() == ISD::AND && CRRHS) {
+ SDValue LHSAND = VAND.getOperand(0);
+ SDValue RHSAND = VAND.getOperand(1);
+ if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+ // Part of the srl combine is to reduce a 64-bit srl to a 32-bit srl
+ // when the shift amount is known to be in the range [32, 63]. This
+ // transforms: DST = srl i64 X, Y to [srl i32 X, (Y & 31), 0]. The
+ // '&' is then elided by ISel. The vector code for this was previously
+ // fully scalarised by the vector legalizer, but now that v2i32 is
+ // legal the legaliser only partially scalarises the vector operations
+ // and the AND was not elided. This check enables us to locate and
+ // scalarise the v2i32 AND and re-enable ISel to elide it.
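+ // As with the shl case above, the shift-amount operand is scalarised
+ // (a sketch; Idx is 0 or 1):
+ //   (extract_elt (and v2i32 %y, (build_vector 31, 31)), Idx)
+ //   --> (and (extract_elt %y, Idx), 31)
+ // which ISel can then elide.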
+ ConstantSDNode *CANDL =
+ dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+ ConstantSDNode *CANDR =
+ dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+ if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+ RHSAND->getConstantOperandVal(1) == 0x1f) {
+ // Get the non-constant AND operands and produce the scalar ANDs.
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+ LHSAND, Zero);
+ SDValue Hi =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+ SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+ SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+ SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+ if (AndIndex == 0 || AndIndex == 1)
+ return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
+ AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+ }
+ }
+ }
+ }
+ }
+
if (CRHS) {
RHSVal = CRHS->getZExtValue();
@@ -4701,8 +4792,26 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
return SDValue();
- return distributeOpThroughSelect(DCI, LHS.getOpcode(),
- SDLoc(N), Cond, LHS, RHS);
+ // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can
+ // be lowered directly to a V_CNDMASK with fneg source modifiers, so
+ // prevent the fneg from being pulled out in this case. For now the
+ // check is as specific to this case as possible; hopefully it can be
+ // relaxed in the future.
+ if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
+ SDValue LHSB = LHS.getOperand(0);
+ SDValue RHSB = RHS.getOperand(0);
+ if (LHSB.getOpcode() == ISD::BITCAST &&
+ RHSB->getOpcode() == ISD::BITCAST) {
+ EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
+ EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
+ if (LHSB.getValueType() == MVT::f32 &&
+ RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
+ RHSBOpTy == MVT::i32)
+ return SDValue();
+ }
+ }
+
+ return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
+ RHS);
}
bool Inv = false;
@@ -4755,8 +4864,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
if (Inv)
std::swap(NewLHS, NewRHS);
- SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
- Cond, NewLHS, NewRHS);
+ SDValue NewSelect =
+ DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
DCI.AddToWorklist(NewSelect.getNode());
return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
}
@@ -5094,8 +5203,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
}
case ISD::SELECT: {
// fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+ // This combine became necessary to prevent a regression in
+ // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor:
+ // additional instructions appeared in the final codegen. A matching
+ // guard was added to foldFreeOpFromSelect() to prevent this combine
+ // from being undone under certain conditions.
// TODO: Invert conditions of foldFreeOpFromSelect
- return SDValue();
+ SDValue Cond = N0.getOperand(0);
+ SDValue LHS = N0.getOperand(1);
+ SDValue RHS = N0.getOperand(2);
+ EVT LHVT = LHS.getValueType();
+ EVT RHVT = RHS.getValueType();
+ // The regression was limited to i32 and v2i32.
+ if (RHVT != MVT::i32 && LHVT != MVT::i32)
+ return SDValue();
+
+ SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
+ SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
+ SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
+ return Op;
}
case ISD::BITCAST: {
SDLoc SL(N);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 17c7fb7bb1533..80bccf5274829 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -438,6 +438,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
}
+ setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
+ // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
+ // instead lower to cndmask in SITargetLowering::LowerSELECT().
+ setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
+ // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
+ // alignbit.
+ setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
+
setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
Custom);
@@ -6041,6 +6049,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}
+// Enable lowering of ROTR for vNi32 types. This works around a regression,
+// caused by legalising v2i32 or, whereby extra unnecessary instructions
+// were added to the codegen for rotr operations, including extra
+// instructions to extract the result from the vector.
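+// The unrolled per-element rotates are subsequently selected to
+// v_alignbit_b32 by the existing rotr patterns.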
+SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
+ [[maybe_unused]] EVT VT = Op.getValueType();
+
+ assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
+ VT == MVT::v16i32) &&
+ "Unexpected ValueType.");
+
+ return DAG.UnrollVectorOp(Op.getNode());
+}
+
// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
@@ -6232,6 +6254,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerGET_FPENV(Op, DAG);
case ISD::SET_FPENV:
return lowerSET_FPENV(Op, DAG);
+ case ISD::ROTR:
+ return lowerROTR(Op, DAG);
}
return SDValue();
}
@@ -13136,6 +13160,47 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
}
}
+ // Detect an identity v2i32 OR and replace it with the identity source
+ // node. Specifically, an OR whose operands are constructed from the same
+ // source node via extract_vector_elt and build_vector, i.e.
+ // v2i32 or(
+ // v2i32 build_vector(
+ // i32 extract_elt(%IdentitySrc, 0),
+ // i32 0
+ // ),
+ // v2i32 build_vector(
+ // i32 0,
+ // i32 extract_elt(%IdentitySrc, 1)
+ // ) )
+ // =>
+ // v2i32 %IdentitySrc
+
+ if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
+ RHS->getOpcode() == ISD::BUILD_VECTOR) {
+
+ ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+ ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
+
+ // Check that the build_vectors supply the expected zero elements.
+ if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
+
+ // Get the extract_vector_elt operands.
+ SDValue LEVE = LHS->getOperand(0);
+ SDValue REVE = RHS->getOperand(1);
+
+ if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ // Check that different elements from the same vector are
+ // extracted.
+ if (LEVE->getOperand(0) == REVE->getOperand(0) &&
+ LEVE->getOperand(1) != REVE->getOperand(1)) {
+ SDValue IdentitySrc = LEVE.getOperand(0);
+ return IdentitySrc;
+ }
+ }
+ }
+ }
+
if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
return SDValue();
@@ -13180,13 +13245,39 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
return RV;
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
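+ // Fold v2i32 (xor (vselect c, a, b), (build_vector SignMask, SignMask))
+ // -> (bitcast (vselect c, (fneg (bitcast a)), (fneg (bitcast b)))) so
+ // that the fnegs can be absorbed as source modifiers.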
+ if (VT == MVT::v2i32 && LHS.getOpcode() == ISD::VSELECT &&
+     RHS.getOpcode() == ISD::BUILD_VECTOR) {
+
+ const ConstantSDNode *CRHS0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
+ const ConstantSDNode *CRHS1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+ SDValue LHS_0 = LHS.getOperand(0);
+ SDValue LHS_1 = LHS.getOperand(1);
+
+ if (CRHS0 && CRHS0->getAPIntValue().isSignMask() &&
+ shouldFoldFNegIntoSrc(N, LHS_0) && CRHS1 &&
+ CRHS1->getAPIntValue().isSignMask() &&
+ shouldFoldFNegIntoSrc(N, LHS_1)) {
+
+ SDLoc DL(N);
+ SDValue CastLHS =
+ DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
+ SDValue CastRHS =
+ DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
+ SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
+ SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
+ SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
+ LHS->getOperand(0), FNegLHS, FNegRHS);
+ return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
+ }
+ }
+
const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
- SelectionDAG &DAG = DCI.DAG;
- EVT VT = N->getValueType(0);
if (CRHS && VT == MVT::i64) {
if (SDValue Split =
splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 89fb12b52c3e6..17bf1f19cae6d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -442,6 +442,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 7b45023dd3c77..1b93046b5d3e6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1869,7 +1869,6 @@ def : GCNPat <
>;
}
-
/********** ================================ **********/
/********** Floating point absolute/negative **********/
/********** ================================ **********/
@@ -2423,9 +2422,9 @@ def : AMDGPUPatIgnoreCopies <
(COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
>;
-// 64-bit version
+foreach vt = [i64, v2i32] in {
def : AMDGPUPatIgnoreCopies <
- (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
+ (DivergentBinFrag<xor> vt:$z, (and vt:$x, (xor vt:$y, vt:$z))),
(REG_SEQUENCE VReg_64,
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
(i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
@@ -2434,6 +2433,7 @@ def : AMDGPUPatIgnoreCopies <
(i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
+}
def : AMDGPUPat <
(fcopysign f32:$src0, f32:$src1),
@@ -2477,30 +2477,25 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
} // end True16Predicate = NotHasTrue16BitInsts
let True16Predicate = UseRealTrue16Insts in {
-def : GCNPat <
- (rotr i32:$src0, i32:$src1),
- (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src0,
- /* src2_modifiers */ 0,
- (EXTRACT_SUBREG $src1, lo16),
- /* clamp */ 0, /* op_sel */ 0)
->;
-
-def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
- (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
- (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
- 0, /* src1_modifiers */
- (i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
- 0, /* src2_modifiers */
- (i16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)),
- /* clamp */ 0, /* op_sel */ 0)>;
+ def : GCNPat<(rotr i32:$src0, i32:$src1),
+ (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
+ /* src1_modifiers */ 0, $src0,
+ /* src2_modifiers */ 0, (EXTRACT_SUBREG $src1, lo16),
+ /* clamp */ 0, /* op_sel */ 0)>;
-def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
- (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src1,
- /* src2_modifiers */ 0,
- (EXTRACT_SUBREG VGPR_32:$src2, lo16),
- /* clamp */ 0, /* op_sel */ 0)>;
+ def : GCNPat<
+ (i32(trunc(srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
+ (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
+ (i32(EXTRACT_SUBREG(i64 $src0), sub1)), 0, /* src1_modifiers */
+ (i32(EXTRACT_SUBREG(i64 $src0), sub0)), 0, /* src2_modifiers */
+ (i16(EXTRACT_SUBREG VGPR_32:$src1, lo16)),
+ /* clamp */ 0, /* op_sel */ 0)>;
+
+ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
+ (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
+ /* src1_modifiers */ 0, $src1,
+ /* src2_modifiers */ 0, (EXTRACT_SUBREG VGPR_32:$src2, lo16),
+ /* clamp */ 0, /* op_sel */ 0)>;
} // end True16Predicate = UseRealTrue16Insts
let True16Predicate = UseFakeTrue16Insts in {
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 376c6eb135b1e..c3be83ecdffdd 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1792,6 +1792,21 @@ def : GCNPat <
(S_MOV_B32 imm:$imm)
>;
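+// A uniform v2i32 occupies an SGPR pair (SReg_64), so its bitwise ops can
+// be selected to the 64-bit SALU instructions directly.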
+def : GCNPat <
+ (v2i32 (UniformBinFrag<and> v2i32:$x, v2i32:$y)),
+ (S_AND_B64 SReg_64:$x, SReg_64:$y)
+>;
+
+def : GCNPat <
+ (v2i32 (UniformBinFrag<or> v2i32:$x, v2i32:$y)),
+ (S_OR_B64 SReg_64:$x, SReg_64:$y)
+>;
+
+def : GCNPat <
+ (v2i32 (UniformBinFrag<xor> v2i32:$x, v2i32:$y)),
+ (S_XOR_B64 SReg_64:$x, SReg_64:$y)
+>;
+
// Same as a 32-bit inreg
def : GCNPat<
(i32 (UniformUnaryFrag<sext> i16:$src)),
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 0c7e20fc1ebf3..efa9c465f794e 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -954,9 +954,9 @@ def : DivergentClampingBinOp<sub, V_SUB_CO_U32_e64>;
def : DivergentBinOp<adde, V_ADDC_U32_e32>;
def : DivergentBinOp<sube, V_SUBB_U32_e32>;
-class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
+class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst, ValueType vt = i64> :
GCNPat<
- (DivergentBinFrag<Op> i64:$src0, i64:$src1),
+ (DivergentBinFrag<Op> vt:$src0, vt:$src1),
(REG_SEQUENCE VReg_64,
(Inst
(i32 (EXTRACT_SUBREG $src0, sub0)),
@@ -973,6 +973,10 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>;
def : divergent_i64_BinOp <or, V_OR_B32_e64>;
def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
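+// Divergent v2i32 bitwise ops reuse the i64 pattern: each 32-bit half is
+// handled by a 32-bit VALU instruction.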
+def : divergent_i64_BinOp <and, V_AND_B32_e64, v2i32>;
+def : divergent_i64_BinOp <or, V_OR_B32_e64, v2i32>;
+def : divergent_i64_BinOp <xor, V_XOR_B32_e64, v2i32>;
+
// mul24 w/ 64 bit output.
class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
(i64 (Op i32:$src0, i32:$src1)),
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index a597faa028f22..ca8f7736f6093 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -151,25 +151,25 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v:
; GFX-950: ; %bb.0:
; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[2:3]
+; GFX-950-NEXT: v_and_b32_e32 v4, 1, v6
+; GFX-950-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]|
-; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5]
-; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
+; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], v[2:3], v[4:5]
+; GFX-950-NEXT: v_cvt_f32_f64_e32 v7, v[0:1]
; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3]
; GFX-950-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
-; GFX-950-NEXT: v_cvt_f32_f64_e32 v5, v[0:1]
+; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
-; GFX-950-NEXT: v_and_b32_e32 v6, 1, v5
+; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v7
+; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
-; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3]
-; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v6
+; GFX-950-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], v[0:1], v[2:3]
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
-; GFX-950-NEXT: v_add_u32_e32 v0, v5, v0
-; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
-; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX-950-NEXT: v_add_u32_e32 v0, v7, v0
+; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
+; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4
; GFX-950-NEXT: ; return to shader part epilog
%res = fptrunc <2 x double> %src to <2 x bfloat>
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index b372dec383344..987555fbaaafb 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -582,15 +582,15 @@ define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %
; GFX7-LABEL: v_bitselect_v2i32_pat1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX7-NEXT: v_bfi_b32 v1, v3, v1, v5
+; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bitselect_v2i32_pat1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX8-NEXT: v_bfi_b32 v1, v3, v1, v5
+; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bitselect_v2i32_pat1:
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index 021104114d796..f5227eed458d6 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -31,8 +31,8 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3
; GFX9-LABEL: test_pown_reduced_fast_v2f16_known_odd:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
; GFX9-NEXT: v_or_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0
@@ -126,8 +126,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v4, 1, v4
; GFX9-NEXT: v_or_b32_e32 v6, 1, v5
+; GFX9-NEXT: v_or_b32_e32 v4, 1, v4
; GFX9-NEXT: v_cvt_f64_i32_e32 v[4:5], v4
; GFX9-NEXT: v_cvt_f64_i32_e32 v[6:7], v6
; GFX9-NEXT: s_brev_b32 s4, -2
diff --git a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
index d63a36c4b2958..7e2e8b577e085 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
@@ -28,12 +28,15 @@ define amdgpu_ps <2 x i32> @s_or_v2i32_disjoint(<2 x i32> inreg %a, <2 x i32> in
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY2]], [[COPY]], implicit-def dead $scc
- ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY3]], [[COPY1]], implicit-def dead $scc
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_1]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = disjoint S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec
; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
@@ -64,10 +67,23 @@ define <2 x i32> @v_or_v2i32_disjoint(<2 x i32> %a, <2 x i32> %b) {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
- ; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
- ; CHECK-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+ ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY5]], killed [[COPY4]], implicit $exec
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+ ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY7]], killed [[COPY6]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR_B32_e64_1]], %subreg.sub0, killed [[V_OR_B32_e64_]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY8]]
+ ; CHECK-NEXT: $vgpr1 = COPY [[COPY9]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
%result = or disjoint <2 x i32> %a, %b
ret <2 x i32> %result
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 1b092b283290a..ea662f299e76a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1645,12 +1645,12 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_cselect_b32 s1, s1, s3
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v0, -v0, -v1, vcc
; GFX7-NEXT: s_cselect_b32 s0, s0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -1669,10 +1669,10 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b32 s1, s1, s3
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, -v0, -v1, vcc
; GFX9-NEXT: s_cselect_b32 s0, s0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
@@ -1683,17 +1683,17 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x10
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x18
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s1
; GFX11-NEXT: s_bitcmp1_b32 s6, 0
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, -s3, -v0, vcc_lo
; GFX11-NEXT: s_and_b32 s6, vcc_lo, exec_lo
; GFX11-NEXT: s_cselect_b32 s1, s1, s3
; GFX11-NEXT: s_cselect_b32 s0, s0, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, -v0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, s1, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 4a79096442c96..7afd99ddb0ef6 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -2010,61 +2010,61 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; SI-LABEL: v_fshr_v2i24:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xffffff, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; SI-NEXT: s_mov_b32 s4, 0xaaaaaab
-; SI-NEXT: v_mul_hi_u32 v6, v6, s4
-; SI-NEXT: v_and_b32_e32 v7, 0xffffff, v5
+; SI-NEXT: v_mul_hi_u32 v6, v4, s4
+; SI-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
-; SI-NEXT: v_mul_hi_u32 v6, v7, s4
+; SI-NEXT: v_mul_hi_u32 v6, v5, s4
; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4
; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
-; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v6
-; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3
+; SI-NEXT: v_mul_u32_u24_e32 v2, 24, v6
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2
+; SI-NEXT: v_alignbit_b32 v1, v1, v3, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i24:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v6, 0xffffff, v4
+; VI-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; VI-NEXT: s_mov_b32 s4, 0xaaaaaab
-; VI-NEXT: v_mul_hi_u32 v6, v6, s4
-; VI-NEXT: v_and_b32_e32 v7, 0xffffff, v5
+; VI-NEXT: v_mul_hi_u32 v6, v4, s4
+; VI-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; VI-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6
-; VI-NEXT: v_mul_hi_u32 v6, v7, s4
+; VI-NEXT: v_mul_hi_u32 v6, v5, s4
; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4
; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
-; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v6
-; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v3
-; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3
-; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3
+; VI-NEXT: v_mul_u32_u24_e32 v2, 24, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v5, v2
+; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2
+; VI-NEXT: v_alignbit_b32 v1, v1, v3, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaab
-; GFX9-NEXT: v_mul_hi_u32 v6, v6, s4
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff, v5
+; GFX9-NEXT: v_mul_hi_u32 v6, v4, s4
+; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX9-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6
-; GFX9-NEXT: v_mul_hi_u32 v6, v7, s4
+; GFX9-NEXT: v_mul_hi_u32 v6, v5, s4
; GFX9-NEXT: v_add_u32_e32 v4, 8, v4
; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3
-; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v6
-; GFX9-NEXT: v_sub_u32_e32 v3, v5, v3
-; GFX9-NEXT: v_add_u32_e32 v3, 8, v3
-; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3
+; GFX9-NEXT: v_mul_u32_u24_e32 v2, 24, v6
+; GFX9-NEXT: v_sub_u32_e32 v2, v5, v2
+; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
+; GFX9-NEXT: v_alignbit_b32 v1, v1, v3, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i24:
@@ -2075,12 +2075,12 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; GFX10-LABEL: v_fshr_v2i24:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v5
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
+; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4
+; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5
; GFX10-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; GFX10-NEXT: v_mul_u32_u24_e32 v7, 24, v7
; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6
@@ -2091,109 +2091,29 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: v_fshr_v2i24:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX11-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6
-; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6
-; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: v_fshr_v2i24:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX11-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6
-; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6
-; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-TRUE16-LABEL: v_fshr_v2i24:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX12-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6
-; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6
-; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4
-; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l
-; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-FAKE16-LABEL: v_fshr_v2i24:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX12-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6
-; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6
-; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4
-; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5
-; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_fshr_v2i24:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4
+; GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_u32_u24_e32 v6, 24, v6
+; GFX11-NEXT: v_mul_u32_u24_e32 v7, 24, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6
+; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 8, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 8, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4
+; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
ret <2 x i24> %ret
}
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 1abd2e6b60f2f..26751b289a385 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -1,8 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6S %s
+; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8S %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
+;; Added tests with "-amdgpu-scalarize-global-loads=true" to allow the generation of s_or_b64, particularly in the v2i32 case. See SWDEV-517886.
+;; Also removed the previously unused "GCN" check-prefixes from the test.
+
define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6-LABEL: or_v2i32:
; GFX6: ; %bb.0:
@@ -18,8 +23,8 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
@@ -37,11 +42,39 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_mov_b32 s5, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: or_v2i32:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX6S-NEXT: v_mov_b32_e32 v0, s4
+; GFX6S-NEXT: v_mov_b32_e32 v1, s5
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: or_v2i32:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX8S-NEXT: v_mov_b32_e32 v0, s4
+; GFX8S-NEXT: v_mov_b32_e32 v1, s5
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: or_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -112,6 +145,44 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: or_v4i32:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX6S-NEXT: s_mov_b32 s11, 0xf000
+; GFX6S-NEXT: s_mov_b32 s10, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b32 s3, s3, s7
+; GFX6S-NEXT: s_or_b32 s2, s2, s6
+; GFX6S-NEXT: s_or_b32 s1, s1, s5
+; GFX6S-NEXT: s_or_b32 s0, s0, s4
+; GFX6S-NEXT: v_mov_b32_e32 v0, s0
+; GFX6S-NEXT: v_mov_b32_e32 v1, s1
+; GFX6S-NEXT: v_mov_b32_e32 v2, s2
+; GFX6S-NEXT: v_mov_b32_e32 v3, s3
+; GFX6S-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: or_v4i32:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX8S-NEXT: s_mov_b32 s11, 0xf000
+; GFX8S-NEXT: s_mov_b32 s10, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b32 s3, s3, s7
+; GFX8S-NEXT: s_or_b32 s2, s2, s6
+; GFX8S-NEXT: s_or_b32 s1, s1, s5
+; GFX8S-NEXT: s_or_b32 s0, s0, s4
+; GFX8S-NEXT: v_mov_b32_e32 v0, s0
+; GFX8S-NEXT: v_mov_b32_e32 v1, s1
+; GFX8S-NEXT: v_mov_b32_e32 v2, s2
+; GFX8S-NEXT: v_mov_b32_e32 v3, s3
+; GFX8S-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: or_v4i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
@@ -167,6 +238,32 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: scalar_or_i32:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT: s_mov_b32 s7, 0xf000
+; GFX6S-NEXT: s_mov_b32 s6, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_mov_b32 s4, s0
+; GFX6S-NEXT: s_or_b32 s0, s2, s3
+; GFX6S-NEXT: s_mov_b32 s5, s1
+; GFX6S-NEXT: v_mov_b32_e32 v0, s0
+; GFX6S-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: scalar_or_i32:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT: s_mov_b32 s7, 0xf000
+; GFX8S-NEXT: s_mov_b32 s6, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_mov_b32 s4, s0
+; GFX8S-NEXT: s_or_b32 s0, s2, s3
+; GFX8S-NEXT: s_mov_b32 s5, s1
+; GFX8S-NEXT: v_mov_b32_e32 v0, s0
+; GFX8S-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: scalar_or_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -221,6 +318,34 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: vector_or_i32:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT: s_load_dword s4, s[4:5], 0xd
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b32 s4, s5, s4
+; GFX6S-NEXT: v_mov_b32_e32 v0, s4
+; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: vector_or_i32:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT: s_load_dword s4, s[4:5], 0x34
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b32 s4, s5, s4
+; GFX8S-NEXT: v_mov_b32_e32 v0, s4
+; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: vector_or_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -268,6 +393,30 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: scalar_or_literal_i32:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dword s6, s[4:5], 0xb
+; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b32 s4, s6, 0x1869f
+; GFX6S-NEXT: v_mov_b32_e32 v0, s4
+; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: scalar_or_literal_i32:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b32 s4, s6, 0x1869f
+; GFX8S-NEXT: v_mov_b32_e32 v0, s4
+; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: scalar_or_literal_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -312,6 +461,34 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: scalar_or_literal_i64:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
+; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b32 s4, s7, 0xf237b
+; GFX6S-NEXT: s_or_b32 s5, s6, 0x3039
+; GFX6S-NEXT: v_mov_b32_e32 v0, s5
+; GFX6S-NEXT: v_mov_b32_e32 v1, s4
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: scalar_or_literal_i64:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b32 s4, s7, 0xf237b
+; GFX8S-NEXT: s_or_b32 s5, s6, 0x3039
+; GFX8S-NEXT: v_mov_b32_e32 v0, s5
+; GFX8S-NEXT: v_mov_b32_e32 v1, s4
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: scalar_or_literal_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -375,6 +552,51 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: scalar_or_literal_multi_use_i64:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
+; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d
+; GFX6S-NEXT: s_movk_i32 s8, 0x3039
+; GFX6S-NEXT: s_mov_b32 s9, 0xf237b
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX6S-NEXT: v_mov_b32_e32 v0, s6
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: v_mov_b32_e32 v1, s7
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT: s_add_u32 s0, s4, 0x3039
+; GFX6S-NEXT: s_addc_u32 s1, s5, 0xf237b
+; GFX6S-NEXT: s_waitcnt expcnt(0)
+; GFX6S-NEXT: v_mov_b32_e32 v0, s0
+; GFX6S-NEXT: v_mov_b32_e32 v1, s1
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT: s_waitcnt vmcnt(0)
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: scalar_or_literal_multi_use_i64:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x74
+; GFX8S-NEXT: s_movk_i32 s8, 0x3039
+; GFX8S-NEXT: s_mov_b32 s9, 0xf237b
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX8S-NEXT: v_mov_b32_e32 v0, s6
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: v_mov_b32_e32 v1, s7
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT: s_add_u32 s0, s4, 0x3039
+; GFX8S-NEXT: s_addc_u32 s1, s5, 0xf237b
+; GFX8S-NEXT: v_mov_b32_e32 v0, s0
+; GFX8S-NEXT: v_mov_b32_e32 v1, s1
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT: s_waitcnt vmcnt(0)
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: scalar_or_literal_multi_use_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[]
@@ -432,6 +654,32 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: scalar_or_inline_imm_i64:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
+; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b32 s4, s6, 63
+; GFX6S-NEXT: v_mov_b32_e32 v0, s4
+; GFX6S-NEXT: v_mov_b32_e32 v1, s7
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: scalar_or_inline_imm_i64:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b32 s4, s6, 63
+; GFX8S-NEXT: v_mov_b32_e32 v0, s4
+; GFX8S-NEXT: v_mov_b32_e32 v1, s7
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: scalar_or_inline_imm_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -492,6 +740,49 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: scalar_or_inline_imm_multi_use_i64:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6S-NEXT: s_mov_b32 s7, 0xf000
+; GFX6S-NEXT: s_mov_b32 s6, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_mov_b32 s4, s0
+; GFX6S-NEXT: s_or_b32 s0, s2, 63
+; GFX6S-NEXT: s_mov_b32 s5, s1
+; GFX6S-NEXT: v_mov_b32_e32 v0, s0
+; GFX6S-NEXT: v_mov_b32_e32 v1, s3
+; GFX6S-NEXT: s_add_u32 s0, s8, 63
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6S-NEXT: s_addc_u32 s1, s9, 0
+; GFX6S-NEXT: s_waitcnt expcnt(0)
+; GFX6S-NEXT: v_mov_b32_e32 v0, s0
+; GFX6S-NEXT: v_mov_b32_e32 v1, s1
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6S-NEXT: s_waitcnt vmcnt(0)
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: scalar_or_inline_imm_multi_use_i64:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX8S-NEXT: s_mov_b32 s7, 0xf000
+; GFX8S-NEXT: s_mov_b32 s6, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_mov_b32 s4, s0
+; GFX8S-NEXT: s_or_b32 s0, s2, 63
+; GFX8S-NEXT: s_mov_b32 s5, s1
+; GFX8S-NEXT: v_mov_b32_e32 v0, s0
+; GFX8S-NEXT: v_mov_b32_e32 v1, s3
+; GFX8S-NEXT: s_add_u32 s0, s8, 63
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8S-NEXT: s_addc_u32 s1, s9, 0
+; GFX8S-NEXT: v_mov_b32_e32 v0, s0
+; GFX8S-NEXT: v_mov_b32_e32 v1, s1
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8S-NEXT: s_waitcnt vmcnt(0)
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: scalar_or_inline_imm_multi_use_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 9, @6, KC0[CB0:0-32], KC1[]
@@ -545,6 +836,32 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: scalar_or_neg_inline_imm_i64:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dword s6, s[4:5], 0x13
+; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: v_mov_b32_e32 v1, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b32 s4, s6, -8
+; GFX6S-NEXT: v_mov_b32_e32 v0, s4
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: scalar_or_neg_inline_imm_i64:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x4c
+; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: v_mov_b32_e32 v1, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b32 s4, s6, -8
+; GFX8S-NEXT: v_mov_b32_e32 v0, s4
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: scalar_or_neg_inline_imm_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -599,6 +916,32 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: vector_or_literal_i32:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b32 s4, s4, 0xffff
+; GFX6S-NEXT: v_mov_b32_e32 v0, s4
+; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: vector_or_literal_i32:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b32 s4, s4, 0xffff
+; GFX8S-NEXT: v_mov_b32_e32 v0, s4
+; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: vector_or_literal_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -658,6 +1001,32 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out,
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: vector_or_inline_immediate_i32:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b32 s4, s4, 4
+; GFX6S-NEXT: v_mov_b32_e32 v0, s4
+; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: vector_or_inline_immediate_i32:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b32 s4, s4, 4
+; GFX8S-NEXT: v_mov_b32_e32 v0, s4
+; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: vector_or_inline_immediate_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -711,6 +1080,36 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: scalar_or_i64:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6S-NEXT: s_mov_b32 s7, 0xf000
+; GFX6S-NEXT: s_mov_b32 s6, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_mov_b32 s4, s0
+; GFX6S-NEXT: s_mov_b32 s5, s1
+; GFX6S-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX6S-NEXT: v_mov_b32_e32 v0, s0
+; GFX6S-NEXT: v_mov_b32_e32 v1, s1
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: scalar_or_i64:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX8S-NEXT: s_mov_b32 s7, 0xf000
+; GFX8S-NEXT: s_mov_b32 s6, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_mov_b32 s4, s0
+; GFX8S-NEXT: s_mov_b32 s5, s1
+; GFX8S-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX8S-NEXT: v_mov_b32_e32 v0, s0
+; GFX8S-NEXT: v_mov_b32_e32 v1, s1
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: scalar_or_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -774,6 +1173,38 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: vector_or_i64:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX6S-NEXT: v_mov_b32_e32 v0, s4
+; GFX6S-NEXT: v_mov_b32_e32 v1, s5
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: vector_or_i64:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX8S-NEXT: v_mov_b32_e32 v0, s4
+; GFX8S-NEXT: v_mov_b32_e32 v1, s5
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: vector_or_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -841,6 +1272,36 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: scalar_vector_or_i64:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX6S-NEXT: v_mov_b32_e32 v0, s4
+; GFX6S-NEXT: v_mov_b32_e32 v1, s5
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: scalar_vector_or_i64:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX8S-NEXT: v_mov_b32_e32 v0, s4
+; GFX8S-NEXT: v_mov_b32_e32 v1, s5
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: scalar_vector_or_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -903,6 +1364,36 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: vector_or_i64_loadimm:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b32 s5, s5, 0x146f
+; GFX6S-NEXT: s_or_b32 s4, s4, 0xdf77987f
+; GFX6S-NEXT: v_mov_b32_e32 v0, s4
+; GFX6S-NEXT: v_mov_b32_e32 v1, s5
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: vector_or_i64_loadimm:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b32 s5, s5, 0x146f
+; GFX8S-NEXT: s_or_b32 s4, s4, 0xdf77987f
+; GFX8S-NEXT: v_mov_b32_e32 v0, s4
+; GFX8S-NEXT: v_mov_b32_e32 v1, s5
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: vector_or_i64_loadimm:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -965,6 +1456,34 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: vector_or_i64_imm:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b32 s4, s4, 8
+; GFX6S-NEXT: v_mov_b32_e32 v0, s4
+; GFX6S-NEXT: v_mov_b32_e32 v1, s5
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: vector_or_i64_imm:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b32 s4, s4, 8
+; GFX8S-NEXT: v_mov_b32_e32 v0, s4
+; GFX8S-NEXT: v_mov_b32_e32 v1, s5
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: vector_or_i64_imm:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1026,6 +1545,34 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: vector_or_i64_neg_inline_imm:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT: v_mov_b32_e32 v1, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b32 s4, s4, -8
+; GFX6S-NEXT: v_mov_b32_e32 v0, s4
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: vector_or_i64_neg_inline_imm:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT: v_mov_b32_e32 v1, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b32 s4, s4, -8
+; GFX8S-NEXT: v_mov_b32_e32 v0, s4
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: vector_or_i64_neg_inline_imm:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1089,6 +1636,34 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: vector_or_i64_neg_literal:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT: v_mov_b32_e32 v1, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b32 s4, s4, 0xffffff38
+; GFX6S-NEXT: v_mov_b32_e32 v0, s4
+; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: vector_or_i64_neg_literal:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT: v_mov_b32_e32 v1, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b32 s4, s4, 0xffffff38
+; GFX8S-NEXT: v_mov_b32_e32 v0, s4
+; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: vector_or_i64_neg_literal:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1140,6 +1715,32 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32],
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: trunc_i64_or_to_i32:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dword s6, s[4:5], 0x13
+; GFX6S-NEXT: s_load_dword s7, s[4:5], 0x1d
+; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6S-NEXT: s_mov_b32 s3, 0xf000
+; GFX6S-NEXT: s_mov_b32 s2, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_or_b32 s4, s7, s6
+; GFX6S-NEXT: v_mov_b32_e32 v0, s4
+; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: trunc_i64_or_to_i32:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x4c
+; GFX8S-NEXT: s_load_dword s7, s[4:5], 0x74
+; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8S-NEXT: s_mov_b32 s3, 0xf000
+; GFX8S-NEXT: s_mov_b32 s2, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_or_b32 s4, s7, s6
+; GFX8S-NEXT: v_mov_b32_e32 v0, s4
+; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: trunc_i64_or_to_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -1211,6 +1812,46 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: or_i1:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6S-NEXT: s_mov_b32 s7, 0xf000
+; GFX6S-NEXT: s_mov_b32 s6, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_load_dword s8, s[8:9], 0x0
+; GFX6S-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX6S-NEXT: s_mov_b32 s4, s0
+; GFX6S-NEXT: s_mov_b32 s5, s1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: v_mul_f32_e64 v0, 1.0, s8
+; GFX6S-NEXT: v_mul_f32_e64 v1, 1.0, s2
+; GFX6S-NEXT: v_max_f32_e32 v0, v1, v0
+; GFX6S-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
+; GFX6S-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX6S-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: or_i1:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8S-NEXT: s_mov_b32 s7, 0xf000
+; GFX8S-NEXT: s_mov_b32 s6, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_load_dword s8, s[8:9], 0x0
+; GFX8S-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8S-NEXT: s_mov_b32 s4, s0
+; GFX8S-NEXT: s_mov_b32 s5, s1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: v_mul_f32_e64 v0, 1.0, s8
+; GFX8S-NEXT: v_mul_f32_e64 v1, 1.0, s2
+; GFX8S-NEXT: v_max_f32_e32 v0, v1, v0
+; GFX8S-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
+; GFX8S-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8S-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: or_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -1274,6 +1915,38 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c
; GFX8-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
+; GFX6S-LABEL: s_or_i1:
+; GFX6S: ; %bb.0:
+; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6S-NEXT: s_mov_b32 s7, 0xf000
+; GFX6S-NEXT: s_mov_b32 s6, -1
+; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6S-NEXT: s_cmp_eq_u32 s0, s1
+; GFX6S-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX6S-NEXT: s_cmp_eq_u32 s2, s3
+; GFX6S-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX6S-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX6S-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX6S-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX6S-NEXT: s_endpgm
+;
+; GFX8S-LABEL: s_or_i1:
+; GFX8S: ; %bb.0:
+; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8S-NEXT: s_mov_b32 s7, 0xf000
+; GFX8S-NEXT: s_mov_b32 s6, -1
+; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8S-NEXT: s_cmp_eq_u32 s0, s1
+; GFX8S-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX8S-NEXT: s_cmp_eq_u32 s2, s3
+; GFX8S-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX8S-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX8S-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8S-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX8S-NEXT: s_endpgm
+;
; EG-LABEL: s_or_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index d6e361d6e297e..7322e2f239ee8 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -228,6 +228,134 @@ entry:
ret void
}
+define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i32> %y) {
+; R600-LABEL: rotr_v8i32:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[5].X, KC0[5].X, KC0[7].X,
+; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[4].W, KC0[4].W, KC0[6].W,
+; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[4].Z, KC0[4].Z, KC0[6].Z,
+; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[4].Y, KC0[4].Y, KC0[6].Y,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; R600-NEXT: BIT_ALIGN_INT * T2.W, KC0[6].X, KC0[6].X, KC0[8].X,
+; R600-NEXT: BIT_ALIGN_INT * T2.Z, KC0[5].W, KC0[5].W, KC0[7].W,
+; R600-NEXT: BIT_ALIGN_INT * T2.Y, KC0[5].Z, KC0[5].Z, KC0[7].Z,
+; R600-NEXT: BIT_ALIGN_INT * T2.X, KC0[5].Y, KC0[5].Y, KC0[7].Y,
+; R600-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; R600-NEXT: LSHR * T3.X, PV.W, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; SI-LABEL: rotr_v8i32:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x11
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s19
+; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0
+; SI-NEXT: v_mov_b32_e32 v0, s18
+; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0
+; SI-NEXT: v_mov_b32_e32 v0, s17
+; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0
+; SI-NEXT: v_mov_b32_e32 v0, s16
+; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; SI-NEXT: v_mov_b32_e32 v4, s23
+; SI-NEXT: v_alignbit_b32 v7, s15, s15, v4
+; SI-NEXT: v_mov_b32_e32 v4, s22
+; SI-NEXT: v_alignbit_b32 v6, s14, s14, v4
+; SI-NEXT: v_mov_b32_e32 v4, s21
+; SI-NEXT: v_alignbit_b32 v5, s13, s13, v4
+; SI-NEXT: v_mov_b32_e32 v4, s20
+; SI-NEXT: v_alignbit_b32 v4, s12, s12, v4
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX8-LABEL: rotr_v8i32:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s18
+; GFX8-NEXT: v_mov_b32_e32 v4, s17
+; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1
+; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, s23
+; GFX8-NEXT: v_alignbit_b32 v7, s15, s15, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, s22
+; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_alignbit_b32 v6, s14, s14, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, s21
+; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_alignbit_b32 v5, s13, s13, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
+; GFX8-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s19
+; GFX8-NEXT: v_alignbit_b32 v4, s12, s12, v4
+; GFX8-NEXT: v_mov_b32_e32 v8, s2
+; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s16
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: rotr_v8i32:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v8, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v7, s15, s15, s23
+; GFX10-NEXT: v_alignbit_b32 v6, s14, s14, s22
+; GFX10-NEXT: v_alignbit_b32 v5, s13, s13, s21
+; GFX10-NEXT: v_alignbit_b32 v4, s12, s12, s20
+; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s19
+; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s18
+; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s17
+; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s16
+; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: rotr_v8i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v8, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v7, s15, s15, s23
+; GFX11-NEXT: v_alignbit_b32 v6, s14, s14, s22
+; GFX11-NEXT: v_alignbit_b32 v5, s13, s13, s21
+; GFX11-NEXT: v_alignbit_b32 v4, s12, s12, s20
+; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s19
+; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s18
+; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s17
+; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s16
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-NEXT: s_endpgm
+entry:
+ %tmp0 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %y
+ %tmp1 = shl <8 x i32> %x, %tmp0
+ %tmp2 = lshr <8 x i32> %x, %y
+ %tmp3 = or <8 x i32> %tmp1, %tmp2
+ store <8 x i32> %tmp3, ptr addrspace(1) %in
+ ret void
+}
+
declare i16 @llvm.fshr.i16(i16, i16, i16)
define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr addrspace(1) nocapture readonly %sourceB, ptr addrspace(1) nocapture %destValues) {
diff --git a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
index d496634ae474f..8af4a8de7b266 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
@@ -18,11 +18,11 @@ define <2 x i32> @test_add2x32(ptr %a_ptr, ptr %b_ptr) {
; CHECK-LABEL: test_add2x32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_load_dword v4, v[2:3]
-; CHECK-NEXT: flat_load_dword v5, v[0:1]
-; CHECK-NEXT: v_mov_b32_e32 v1, 48
+; CHECK-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_or_b32_e32 v0, v5, v4
+; CHECK-NEXT: v_or_b32_e32 v1, v5, v7
+; CHECK-NEXT: v_or_b32_e32 v0, v4, v6
; CHECK-NEXT: s_setpc_b64 s[30:31]
%a = load <2 x i32>, ptr %a_ptr, !range !2, !noundef !{}
%b = load <2 x i32>, ptr %b_ptr, !range !3, !noundef !{}
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 00bb7b24786f5..3808c73ae7de3 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -1,6 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SIS %s
+; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VIS %s
+
+;; Added tests with "-amdgpu-scalarize-global-loads=true" to allow the generation
+;; of s_xor_b64, particularly in the v2i32 case. See SWDEV-517886.
+;; Also removed the previously unused "GCN" check-prefixes from the test.
define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; SI-LABEL: xor_v2i32:
@@ -21,8 +27,8 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v1, v3, v1
; SI-NEXT: v_xor_b32_e32 v0, v2, v0
+; SI-NEXT: v_xor_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -40,10 +46,43 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_xor_b32_e32 v1, v1, v3
; VI-NEXT: v_xor_b32_e32 v0, v0, v2
+; VI-NEXT: v_xor_b32_e32 v1, v1, v3
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: xor_v2i32:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; SIS-NEXT: s_mov_b32 s3, 0xf000
+; SIS-NEXT: s_mov_b32 s2, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5]
+; SIS-NEXT: v_mov_b32_e32 v0, s4
+; SIS-NEXT: v_mov_b32_e32 v1, s5
+; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: xor_v2i32:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
+; VIS-NEXT: v_mov_b32_e32 v3, s1
+; VIS-NEXT: v_mov_b32_e32 v2, s0
+; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT: s_endpgm
+
%a = load <2 x i32>, ptr addrspace(1) %in0
%b = load <2 x i32>, ptr addrspace(1) %in1
%result = xor <2 x i32> %a, %b
@@ -97,6 +136,48 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-NEXT: v_xor_b32_e32 v0, v0, v4
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: xor_v4i32:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; SIS-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; SIS-NEXT: s_mov_b32 s3, 0xf000
+; SIS-NEXT: s_mov_b32 s2, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_xor_b32 s7, s7, s11
+; SIS-NEXT: s_xor_b32 s6, s6, s10
+; SIS-NEXT: s_xor_b32 s5, s5, s9
+; SIS-NEXT: s_xor_b32 s4, s4, s8
+; SIS-NEXT: v_mov_b32_e32 v0, s4
+; SIS-NEXT: v_mov_b32_e32 v1, s5
+; SIS-NEXT: v_mov_b32_e32 v2, s6
+; SIS-NEXT: v_mov_b32_e32 v3, s7
+; SIS-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: xor_v4i32:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; VIS-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; VIS-NEXT: v_mov_b32_e32 v4, s0
+; VIS-NEXT: v_mov_b32_e32 v5, s1
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_xor_b32 s0, s7, s11
+; VIS-NEXT: s_xor_b32 s1, s6, s10
+; VIS-NEXT: s_xor_b32 s2, s5, s9
+; VIS-NEXT: s_xor_b32 s3, s4, s8
+; VIS-NEXT: v_mov_b32_e32 v0, s3
+; VIS-NEXT: v_mov_b32_e32 v1, s2
+; VIS-NEXT: v_mov_b32_e32 v2, s1
+; VIS-NEXT: v_mov_b32_e32 v3, s0
+; VIS-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VIS-NEXT: s_endpgm
%a = load <4 x i32>, ptr addrspace(1) %in0
%b = load <4 x i32>, ptr addrspace(1) %in1
%result = xor <4 x i32> %a, %b
@@ -152,6 +233,47 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0,
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: xor_i1:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SIS-NEXT: s_mov_b32 s7, 0xf000
+; SIS-NEXT: s_mov_b32 s6, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_load_dword s8, s[2:3], 0x0
+; SIS-NEXT: s_load_dword s9, s[4:5], 0x0
+; SIS-NEXT: s_mov_b32 s4, s0
+; SIS-NEXT: s_mov_b32 s5, s1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: v_cmp_ge_f32_e64 s[0:1], s8, 0
+; SIS-NEXT: v_cmp_ge_f32_e64 s[2:3], s9, 1.0
+; SIS-NEXT: v_mov_b32_e32 v0, s9
+; SIS-NEXT: v_mov_b32_e32 v1, s8
+; SIS-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; SIS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SIS-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: xor_i1:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_load_dword s6, s[2:3], 0x0
+; VIS-NEXT: s_load_dword s4, s[4:5], 0x0
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: v_cmp_ge_f32_e64 s[0:1], s6, 0
+; VIS-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, 1.0
+; VIS-NEXT: v_mov_b32_e32 v2, s4
+; VIS-NEXT: v_mov_b32_e32 v3, s6
+; VIS-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; VIS-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; VIS-NEXT: flat_store_dword v[0:1], v2
+; VIS-NEXT: s_endpgm
+
%a = load float, ptr addrspace(1) %in0
%b = load float, ptr addrspace(1) %in1
%acmp = fcmp oge float %a, 0.000000e+00
@@ -206,6 +328,50 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0
; VI-NEXT: v_and_b32_e32 v2, 1, v2
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: v_xor_i1:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SIS-NEXT: s_mov_b32 s7, 0xf000
+; SIS-NEXT: s_mov_b32 s6, -1
+; SIS-NEXT: s_mov_b32 s14, s6
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_mov_b32 s12, s2
+; SIS-NEXT: s_mov_b32 s13, s3
+; SIS-NEXT: s_mov_b32 s15, s7
+; SIS-NEXT: s_mov_b32 s10, s6
+; SIS-NEXT: s_mov_b32 s11, s7
+; SIS-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 glc
+; SIS-NEXT: s_waitcnt vmcnt(0)
+; SIS-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 glc
+; SIS-NEXT: s_waitcnt vmcnt(0)
+; SIS-NEXT: s_mov_b32 s4, s0
+; SIS-NEXT: s_mov_b32 s5, s1
+; SIS-NEXT: v_xor_b32_e32 v0, v0, v1
+; SIS-NEXT: v_and_b32_e32 v0, 1, v0
+; SIS-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: v_xor_i1:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: v_mov_b32_e32 v0, s2
+; VIS-NEXT: v_mov_b32_e32 v1, s3
+; VIS-NEXT: v_mov_b32_e32 v2, s4
+; VIS-NEXT: v_mov_b32_e32 v3, s5
+; VIS-NEXT: flat_load_ubyte v4, v[0:1] glc
+; VIS-NEXT: s_waitcnt vmcnt(0)
+; VIS-NEXT: flat_load_ubyte v2, v[2:3] glc
+; VIS-NEXT: s_waitcnt vmcnt(0)
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: v_xor_b32_e32 v2, v4, v2
+; VIS-NEXT: v_and_b32_e32 v2, 1, v2
+; VIS-NEXT: flat_store_byte v[0:1], v2
+; VIS-NEXT: s_endpgm
%a = load volatile i1, ptr addrspace(1) %in0
%b = load volatile i1, ptr addrspace(1) %in1
%xor = xor i1 %a, %b
@@ -253,6 +419,36 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_xor_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: vector_xor_i32:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_load_dword s6, s[2:3], 0x0
+; SIS-NEXT: s_load_dword s4, s[4:5], 0x0
+; SIS-NEXT: s_mov_b32 s3, 0xf000
+; SIS-NEXT: s_mov_b32 s2, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_xor_b32 s4, s6, s4
+; SIS-NEXT: v_mov_b32_e32 v0, s4
+; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: vector_xor_i32:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_load_dword s2, s[2:3], 0x0
+; VIS-NEXT: s_load_dword s3, s[4:5], 0x0
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_xor_b32 s0, s2, s3
+; VIS-NEXT: v_mov_b32_e32 v2, s0
+; VIS-NEXT: flat_store_dword v[0:1], v2
+; VIS-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in0
%b = load i32, ptr addrspace(1) %in1
%result = xor i32 %a, %b
@@ -284,6 +480,30 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: scalar_xor_i32:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT: s_mov_b32 s7, 0xf000
+; SIS-NEXT: s_mov_b32 s6, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_mov_b32 s4, s0
+; SIS-NEXT: s_xor_b32 s0, s2, s3
+; SIS-NEXT: s_mov_b32 s5, s1
+; SIS-NEXT: v_mov_b32_e32 v0, s0
+; SIS-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: scalar_xor_i32:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_xor_b32 s2, s2, s3
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: v_mov_b32_e32 v2, s2
+; VIS-NEXT: flat_store_dword v[0:1], v2
+; VIS-NEXT: s_endpgm
%result = xor i32 %a, %b
store i32 %result, ptr addrspace(1) %out
ret void
@@ -313,6 +533,30 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: scalar_not_i32:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dword s6, s[4:5], 0xb
+; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SIS-NEXT: s_mov_b32 s3, 0xf000
+; SIS-NEXT: s_mov_b32 s2, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_not_b32 s4, s6
+; SIS-NEXT: v_mov_b32_e32 v0, s4
+; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: scalar_not_i32:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_not_b32 s2, s2
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: v_mov_b32_e32 v2, s2
+; VIS-NEXT: flat_store_dword v[0:1], v2
+; VIS-NEXT: s_endpgm
%result = xor i32 %a, -1
store i32 %result, ptr addrspace(1) %out
ret void
@@ -350,6 +594,32 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_not_b32_e32 v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: vector_not_i32:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_load_dword s4, s[2:3], 0x0
+; SIS-NEXT: s_mov_b32 s3, 0xf000
+; SIS-NEXT: s_mov_b32 s2, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_not_b32 s4, s4
+; SIS-NEXT: v_mov_b32_e32 v0, s4
+; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: vector_not_i32:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_load_dword s2, s[2:3], 0x0
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_not_b32 s0, s2
+; VIS-NEXT: v_mov_b32_e32 v2, s0
+; VIS-NEXT: flat_store_dword v[0:1], v2
+; VIS-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in0
%b = load i32, ptr addrspace(1) %in1
%result = xor i32 %a, -1
@@ -399,6 +669,38 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_xor_b32_e32 v1, v1, v3
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: vector_xor_i64:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; SIS-NEXT: s_mov_b32 s3, 0xf000
+; SIS-NEXT: s_mov_b32 s2, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5]
+; SIS-NEXT: v_mov_b32_e32 v0, s4
+; SIS-NEXT: v_mov_b32_e32 v1, s5
+; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: vector_xor_i64:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
+; VIS-NEXT: v_mov_b32_e32 v3, s1
+; VIS-NEXT: v_mov_b32_e32 v2, s0
+; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT: s_endpgm
%a = load i64, ptr addrspace(1) %in0
%b = load i64, ptr addrspace(1) %in1
%result = xor i64 %a, %b
@@ -434,6 +736,34 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: scalar_xor_i64:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SIS-NEXT: s_mov_b32 s7, 0xf000
+; SIS-NEXT: s_mov_b32 s6, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_mov_b32 s4, s0
+; SIS-NEXT: s_mov_b32 s5, s1
+; SIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9]
+; SIS-NEXT: v_mov_b32_e32 v0, s0
+; SIS-NEXT: v_mov_b32_e32 v1, s1
+; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: scalar_xor_i64:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
+; VIS-NEXT: v_mov_b32_e32 v3, s1
+; VIS-NEXT: v_mov_b32_e32 v2, s0
+; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT: s_endpgm
%result = xor i64 %a, %b
store i64 %result, ptr addrspace(1) %out
ret void
@@ -465,6 +795,32 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: scalar_not_i64:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT: s_mov_b32 s7, 0xf000
+; SIS-NEXT: s_mov_b32 s6, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_mov_b32 s4, s0
+; SIS-NEXT: s_mov_b32 s5, s1
+; SIS-NEXT: s_not_b64 s[0:1], s[2:3]
+; SIS-NEXT: v_mov_b32_e32 v0, s0
+; SIS-NEXT: v_mov_b32_e32 v1, s1
+; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: scalar_not_i64:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: s_not_b64 s[0:1], s[2:3]
+; VIS-NEXT: v_mov_b32_e32 v3, s1
+; VIS-NEXT: v_mov_b32_e32 v2, s0
+; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT: s_endpgm
%result = xor i64 %a, -1
store i64 %result, ptr addrspace(1) %out
ret void
@@ -504,6 +860,34 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_not_b32_e32 v1, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: vector_not_i64:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SIS-NEXT: s_mov_b32 s3, 0xf000
+; SIS-NEXT: s_mov_b32 s2, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_not_b64 s[4:5], s[4:5]
+; SIS-NEXT: v_mov_b32_e32 v0, s4
+; SIS-NEXT: v_mov_b32_e32 v1, s5
+; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: vector_not_i64:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_not_b64 s[0:1], s[2:3]
+; VIS-NEXT: v_mov_b32_e32 v3, s1
+; VIS-NEXT: v_mov_b32_e32 v2, s0
+; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT: s_endpgm
%a = load i64, ptr addrspace(1) %in0
%b = load i64, ptr addrspace(1) %in1
%result = xor i64 %a, -1
@@ -570,6 +954,59 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i
; VI-NEXT: .LBB12_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
; VI-NEXT: s_branch .LBB12_2
+;
+; SIS-LABEL: xor_cf:
+; SIS: ; %bb.0: ; %entry
+; SIS-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SIS-NEXT: s_mov_b64 s[10:11], 0
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0
+; SIS-NEXT: s_and_b64 vcc, exec, s[8:9]
+; SIS-NEXT: s_cbranch_vccz .LBB12_4
+; SIS-NEXT: ; %bb.1: ; %else
+; SIS-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
+; SIS-NEXT: s_andn2_b64 vcc, exec, s[10:11]
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_mov_b64 vcc, vcc
+; SIS-NEXT: s_cbranch_vccnz .LBB12_3
+; SIS-NEXT: .LBB12_2: ; %if
+; SIS-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
+; SIS-NEXT: .LBB12_3: ; %endif
+; SIS-NEXT: v_mov_b32_e32 v0, s8
+; SIS-NEXT: s_mov_b32 s3, 0xf000
+; SIS-NEXT: s_mov_b32 s2, -1
+; SIS-NEXT: v_mov_b32_e32 v1, s9
+; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT: s_endpgm
+; SIS-NEXT: .LBB12_4:
+; SIS-NEXT: ; implicit-def: $sgpr8_sgpr9
+; SIS-NEXT: s_branch .LBB12_2
+;
+; VIS-LABEL: xor_cf:
+; VIS: ; %bb.0: ; %entry
+; VIS-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VIS-NEXT: s_mov_b64 s[8:9], 0
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_cmp_lg_u64 s[4:5], 0
+; VIS-NEXT: s_cbranch_scc0 .LBB12_4
+; VIS-NEXT: ; %bb.1: ; %else
+; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VIS-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; VIS-NEXT: s_cbranch_vccnz .LBB12_3
+; VIS-NEXT: .LBB12_2: ; %if
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7]
+; VIS-NEXT: .LBB12_3: ; %endif
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: v_mov_b32_e32 v2, s2
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: v_mov_b32_e32 v3, s3
+; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT: s_endpgm
+; VIS-NEXT: .LBB12_4:
+; VIS-NEXT: ; implicit-def: $sgpr2_sgpr3
+; VIS-NEXT: s_branch .LBB12_2
entry:
%0 = icmp eq i64 %a, 0
br i1 %0, label %if, label %else
@@ -616,6 +1053,34 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: scalar_xor_literal_i64:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
+; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SIS-NEXT: s_mov_b32 s3, 0xf000
+; SIS-NEXT: s_mov_b32 s2, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_xor_b32 s4, s7, 0xf237b
+; SIS-NEXT: s_xor_b32 s5, s6, 0x3039
+; SIS-NEXT: v_mov_b32_e32 v0, s5
+; SIS-NEXT: v_mov_b32_e32 v1, s4
+; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: scalar_xor_literal_i64:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
+; VIS-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_xor_b32 s1, s1, 0xf237b
+; VIS-NEXT: s_xor_b32 s0, s0, 0x3039
+; VIS-NEXT: v_mov_b32_e32 v2, s2
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: v_mov_b32_e32 v3, s3
+; VIS-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VIS-NEXT: s_endpgm
%or = xor i64 %a, 4261135838621753
store i64 %or, ptr addrspace(1) %out
ret void
@@ -664,6 +1129,49 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou
; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: scalar_xor_literal_multi_use_i64:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SIS-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x13
+; SIS-NEXT: s_movk_i32 s8, 0x3039
+; SIS-NEXT: s_mov_b32 s9, 0xf237b
+; SIS-NEXT: s_mov_b32 s3, 0xf000
+; SIS-NEXT: s_mov_b32 s2, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SIS-NEXT: v_mov_b32_e32 v0, s4
+; SIS-NEXT: v_mov_b32_e32 v1, s5
+; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT: s_add_u32 s0, s6, 0x3039
+; SIS-NEXT: s_addc_u32 s1, s7, 0xf237b
+; SIS-NEXT: s_waitcnt expcnt(0)
+; SIS-NEXT: v_mov_b32_e32 v0, s0
+; SIS-NEXT: v_mov_b32_e32 v1, s1
+; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT: s_waitcnt vmcnt(0)
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: scalar_xor_literal_multi_use_i64:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4c
+; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VIS-NEXT: s_movk_i32 s6, 0x3039
+; VIS-NEXT: s_mov_b32 s7, 0xf237b
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
+; VIS-NEXT: v_mov_b32_e32 v0, s4
+; VIS-NEXT: v_mov_b32_e32 v3, s1
+; VIS-NEXT: v_mov_b32_e32 v1, s5
+; VIS-NEXT: v_mov_b32_e32 v2, s0
+; VIS-NEXT: s_add_u32 s0, s2, 0x3039
+; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT: s_addc_u32 s1, s3, 0xf237b
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
+; VIS-NEXT: s_waitcnt vmcnt(0)
+; VIS-NEXT: s_endpgm
%or = xor i64 %a, 4261135838621753
store i64 %or, ptr addrspace(1) %out
@@ -698,6 +1206,32 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: scalar_xor_inline_imm_i64:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
+; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SIS-NEXT: s_mov_b32 s3, 0xf000
+; SIS-NEXT: s_mov_b32 s2, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_xor_b32 s4, s6, 63
+; SIS-NEXT: v_mov_b32_e32 v0, s4
+; SIS-NEXT: v_mov_b32_e32 v1, s7
+; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: scalar_xor_inline_imm_i64:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
+; VIS-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_xor_b32 s0, s0, 63
+; VIS-NEXT: v_mov_b32_e32 v2, s2
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v3, s3
+; VIS-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VIS-NEXT: s_endpgm
%or = xor i64 %a, 63
store i64 %or, ptr addrspace(1) %out
ret void
@@ -729,6 +1263,33 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out,
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: scalar_xor_neg_inline_imm_i64:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
+; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SIS-NEXT: s_mov_b32 s3, 0xf000
+; SIS-NEXT: s_mov_b32 s2, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_xor_b64 s[4:5], s[6:7], -8
+; SIS-NEXT: v_mov_b32_e32 v0, s4
+; SIS-NEXT: v_mov_b32_e32 v1, s5
+; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: scalar_xor_neg_inline_imm_i64:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
+; VIS-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_xor_b64 s[0:1], s[0:1], -8
+; VIS-NEXT: v_mov_b32_e32 v0, s2
+; VIS-NEXT: v_mov_b32_e32 v3, s1
+; VIS-NEXT: v_mov_b32_e32 v1, s3
+; VIS-NEXT: v_mov_b32_e32 v2, s0
+; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT: s_endpgm
+
%or = xor i64 %a, -8
store i64 %or, ptr addrspace(1) %out
ret void
@@ -768,6 +1329,34 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out,
; VI-NEXT: v_xor_b32_e32 v1, -1, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: vector_xor_i64_neg_inline_imm:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SIS-NEXT: s_mov_b32 s3, 0xf000
+; SIS-NEXT: s_mov_b32 s2, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_xor_b64 s[4:5], s[4:5], -8
+; SIS-NEXT: v_mov_b32_e32 v0, s4
+; SIS-NEXT: v_mov_b32_e32 v1, s5
+; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: vector_xor_i64_neg_inline_imm:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], -8
+; VIS-NEXT: v_mov_b32_e32 v3, s1
+; VIS-NEXT: v_mov_b32_e32 v2, s0
+; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT: s_endpgm
%loada = load i64, ptr addrspace(1) %a, align 8
%or = xor i64 %loada, -8
store i64 %or, ptr addrspace(1) %out
@@ -808,10 +1397,39 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add
; VI-NEXT: v_xor_b32_e32 v0, 0xdf77987f, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; SIS-LABEL: vector_xor_literal_i64:
+; SIS: ; %bb.0:
+; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SIS-NEXT: s_mov_b32 s3, 0xf000
+; SIS-NEXT: s_mov_b32 s2, -1
+; SIS-NEXT: s_waitcnt lgkmcnt(0)
+; SIS-NEXT: s_xor_b32 s5, s5, 0x146f
+; SIS-NEXT: s_xor_b32 s4, s4, 0xdf77987f
+; SIS-NEXT: v_mov_b32_e32 v0, s4
+; SIS-NEXT: v_mov_b32_e32 v1, s5
+; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SIS-NEXT: s_endpgm
+;
+; VIS-LABEL: vector_xor_literal_i64:
+; VIS: ; %bb.0:
+; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VIS-NEXT: v_mov_b32_e32 v0, s0
+; VIS-NEXT: v_mov_b32_e32 v1, s1
+; VIS-NEXT: s_waitcnt lgkmcnt(0)
+; VIS-NEXT: s_xor_b32 s0, s3, 0x146f
+; VIS-NEXT: s_xor_b32 s1, s2, 0xdf77987f
+; VIS-NEXT: v_mov_b32_e32 v2, s1
+; VIS-NEXT: v_mov_b32_e32 v3, s0
+; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VIS-NEXT: s_endpgm
+
%loada = load i64, ptr addrspace(1) %a, align 8
%or = xor i64 %loada, 22470723082367
store i64 %or, ptr addrspace(1) %out
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
>From 1a96d4b16dcf0630c593106f25fdfa8c7cf4dd39 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Thu, 19 Jun 2025 09:03:24 -0500
Subject: [PATCH 02/15] Remove over-enthusiastic clang-format
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index facb183b89531..13f537ec14055 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4025,8 +4025,9 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
- DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
- uint32_t ValLo, uint32_t ValHi) const {
+ DAGCombinerInfo &DCI, const SDLoc &SL,
+ unsigned Opc, SDValue LHS,
+ uint32_t ValLo, uint32_t ValHi) const {
SelectionDAG &DAG = DCI.DAG;
SDValue Lo, Hi;
std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
>From afbe7ccf49040fd379cbcdbfc294e8085d1ae459 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 23 Jun 2025 10:35:11 -0500
Subject: [PATCH 03/15] Respond to some review comments
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 +++
llvm/test/CodeGen/AMDGPU/or.ll | 3 ---
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 80bccf5274829..660c00fbeb11e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13250,6 +13250,9 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ // Fold the fneg of a vselect into the vselect operands:
+ // xor (vselect c, a, b), splat(0x80000000) ->
+ // bitcast (vselect c, (fneg (bitcast a)), (fneg (bitcast b)))
if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
const ConstantSDNode *CRHS0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
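[For context, a minimal IR sketch of the pattern this combine targets; this is a
hypothetical reduced example, not a test from the patch, and the function name is
illustrative. A sign-bit xor of a select is an fneg of both select operands once
the values are viewed as floats:

  ; Hypothetical reduced test; not part of this patch.
  define <2 x i32> @fneg_of_vselect(i1 %c, <2 x i32> %a, <2 x i32> %b) {
    %sel = select i1 %c, <2 x i32> %a, <2 x i32> %b
    ; xor with splat(0x80000000) flips only the per-element sign bits.
    %neg = xor <2 x i32> %sel, <i32 -2147483648, i32 -2147483648>
    ret <2 x i32> %neg
  }

Per the commit notes on foldFreeOpFromSelect, the intent is that the fneg then
becomes a free source modifier on each select operand rather than a separate
xor of the select result.]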
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 26751b289a385..b55c6423a0de8 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -5,9 +5,6 @@
; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8S %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
-;; Added tests with "-amdgpu-scalarize-global-loads=true" to allow the generation of s_or_b64, particularly in the v2i32 case. See SWDEV-517886.
-;; Also removed the previously unused "GCN" check-prefixes from the test.
-
define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6-LABEL: or_v2i32:
; GFX6: ; %bb.0:
>From d5473051bc020a642586320db35b357e264393f7 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 23 Jun 2025 10:48:35 -0500
Subject: [PATCH 04/15] Add reviewer-requested tests
---
llvm/test/CodeGen/AMDGPU/or.ll | 102 +++++++++++---------------------
llvm/test/CodeGen/AMDGPU/xor.ll | 98 ++++++++++++++----------------
2 files changed, 76 insertions(+), 124 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index b55c6423a0de8..f4855c0056b53 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -1,9 +1,39 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6S %s
-; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8S %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
+
+
+define amdgpu_ps <2 x i32> @s_or_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
+; GFX6-LABEL: s_or_v2i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_or_v2i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: ; return to shader part epilog
+ %result = or <2 x i32> %num, %den
+ ret <2 x i32> %result
+}
+
+define <2 x i32> @v_or_v2i32(<2 x i32> %num, <2 x i32> %den) {
+; GFX6-LABEL: v_or_v2i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_or_v2i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %result = or <2 x i32> %num, %den
+ ret <2 x i32> %result
+}
define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6-LABEL: or_v2i32:
@@ -43,7 +73,6 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: or_v2i32:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -57,7 +86,6 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX6S-NEXT: v_mov_b32_e32 v1, s5
; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: or_v2i32:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -71,7 +99,6 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX8S-NEXT: v_mov_b32_e32 v1, s5
; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: or_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -141,7 +168,6 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: or_v4i32:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
@@ -160,7 +186,6 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX6S-NEXT: v_mov_b32_e32 v3, s3
; GFX6S-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: or_v4i32:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
@@ -179,7 +204,6 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX8S-NEXT: v_mov_b32_e32 v3, s3
; GFX8S-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: or_v4i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
@@ -234,7 +258,6 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: scalar_or_i32:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -247,7 +270,6 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
; GFX6S-NEXT: v_mov_b32_e32 v0, s0
; GFX6S-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: scalar_or_i32:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -260,7 +282,6 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
; GFX8S-NEXT: v_mov_b32_e32 v0, s0
; GFX8S-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: scalar_or_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -314,7 +335,6 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8-NEXT: v_or_b32_e32 v0, s12, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: vector_or_i32:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -328,7 +348,6 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX6S-NEXT: v_mov_b32_e32 v0, s4
; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: vector_or_i32:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -342,7 +361,6 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8S-NEXT: v_mov_b32_e32 v0, s4
; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: vector_or_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -389,7 +407,6 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: scalar_or_literal_i32:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dword s6, s[4:5], 0xb
@@ -401,7 +418,6 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a)
; GFX6S-NEXT: v_mov_b32_e32 v0, s4
; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: scalar_or_literal_i32:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x2c
@@ -413,7 +429,6 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a)
; GFX8S-NEXT: v_mov_b32_e32 v0, s4
; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: scalar_or_literal_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -457,7 +472,6 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: scalar_or_literal_i64:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
@@ -471,7 +485,6 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32
; GFX6S-NEXT: v_mov_b32_e32 v1, s4
; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: scalar_or_literal_i64:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
@@ -485,7 +498,6 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32
; GFX8S-NEXT: v_mov_b32_e32 v1, s4
; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: scalar_or_literal_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -548,7 +560,6 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: scalar_or_literal_multi_use_i64:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
@@ -571,7 +582,6 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out
; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6S-NEXT: s_waitcnt vmcnt(0)
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: scalar_or_literal_multi_use_i64:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
@@ -593,7 +603,6 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out
; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8S-NEXT: s_waitcnt vmcnt(0)
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: scalar_or_literal_multi_use_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[]
@@ -650,7 +659,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: scalar_or_inline_imm_i64:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
@@ -663,7 +671,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x
; GFX6S-NEXT: v_mov_b32_e32 v1, s7
; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: scalar_or_inline_imm_i64:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
@@ -676,7 +683,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x
; GFX8S-NEXT: v_mov_b32_e32 v1, s7
; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: scalar_or_inline_imm_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -736,7 +742,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: scalar_or_inline_imm_multi_use_i64:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -758,7 +763,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %
; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6S-NEXT: s_waitcnt vmcnt(0)
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: scalar_or_inline_imm_multi_use_i64:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -779,7 +783,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %
; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8S-NEXT: s_waitcnt vmcnt(0)
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: scalar_or_inline_imm_multi_use_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 9, @6, KC0[CB0:0-32], KC1[]
@@ -832,7 +835,6 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: scalar_or_neg_inline_imm_i64:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dword s6, s[4:5], 0x13
@@ -845,7 +847,6 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [
; GFX6S-NEXT: v_mov_b32_e32 v0, s4
; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: scalar_or_neg_inline_imm_i64:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x4c
@@ -858,7 +859,6 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [
; GFX8S-NEXT: v_mov_b32_e32 v0, s4
; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: scalar_or_neg_inline_imm_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -912,7 +912,6 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: v_or_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: vector_or_literal_i32:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -925,7 +924,6 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr
; GFX6S-NEXT: v_mov_b32_e32 v0, s4
; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: vector_or_literal_i32:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -938,7 +936,6 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr
; GFX8S-NEXT: v_mov_b32_e32 v0, s4
; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: vector_or_literal_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -997,7 +994,6 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out,
; GFX8-NEXT: v_or_b32_e32 v0, 4, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: vector_or_inline_immediate_i32:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1010,7 +1006,6 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out,
; GFX6S-NEXT: v_mov_b32_e32 v0, s4
; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: vector_or_inline_immediate_i32:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -1023,7 +1018,6 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out,
; GFX8S-NEXT: v_mov_b32_e32 v0, s4
; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: vector_or_inline_immediate_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1076,7 +1070,6 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: scalar_or_i64:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1091,7 +1084,6 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX6S-NEXT: v_mov_b32_e32 v1, s1
; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: scalar_or_i64:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -1106,7 +1098,6 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX8S-NEXT: v_mov_b32_e32 v1, s1
; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: scalar_or_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -1169,7 +1160,6 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: vector_or_i64:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1185,7 +1175,6 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX6S-NEXT: v_mov_b32_e32 v1, s5
; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: vector_or_i64:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -1201,7 +1190,6 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8S-NEXT: v_mov_b32_e32 v1, s5
; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: vector_or_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -1268,7 +1256,6 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: v_or_b32_e32 v1, s13, v1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: scalar_vector_or_i64:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1283,7 +1270,6 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs
; GFX6S-NEXT: v_mov_b32_e32 v1, s5
; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: scalar_vector_or_i64:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -1298,7 +1284,6 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs
; GFX8S-NEXT: v_mov_b32_e32 v1, s5
; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: scalar_vector_or_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1360,7 +1345,6 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: v_or_b32_e32 v0, 0xdf77987f, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: vector_or_i64_loadimm:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1375,7 +1359,6 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr
; GFX6S-NEXT: v_mov_b32_e32 v1, s5
; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: vector_or_i64_loadimm:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -1390,7 +1373,6 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr
; GFX8S-NEXT: v_mov_b32_e32 v1, s5
; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: vector_or_i64_loadimm:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1452,7 +1434,6 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac
; GFX8-NEXT: v_or_b32_e32 v0, 8, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: vector_or_i64_imm:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1466,7 +1447,6 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac
; GFX6S-NEXT: v_mov_b32_e32 v1, s5
; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: vector_or_i64_imm:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -1480,7 +1460,6 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac
; GFX8S-NEXT: v_mov_b32_e32 v1, s5
; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: vector_or_i64_imm:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1541,7 +1520,6 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p
; GFX8-NEXT: v_or_b32_e32 v0, -8, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: vector_or_i64_neg_inline_imm:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1555,7 +1533,6 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p
; GFX6S-NEXT: v_mov_b32_e32 v0, s4
; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: vector_or_i64_neg_inline_imm:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -1569,7 +1546,6 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p
; GFX8S-NEXT: v_mov_b32_e32 v0, s4
; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: vector_or_i64_neg_inline_imm:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1632,7 +1608,6 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr
; GFX8-NEXT: v_or_b32_e32 v0, 0xffffff38, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: vector_or_i64_neg_literal:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1646,7 +1621,6 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr
; GFX6S-NEXT: v_mov_b32_e32 v0, s4
; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: vector_or_i64_neg_literal:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -1660,7 +1634,6 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr
; GFX8S-NEXT: v_mov_b32_e32 v0, s4
; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: vector_or_i64_neg_literal:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1711,7 +1684,6 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32],
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: trunc_i64_or_to_i32:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dword s6, s[4:5], 0x13
@@ -1724,7 +1696,6 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32],
; GFX6S-NEXT: v_mov_b32_e32 v0, s4
; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: trunc_i64_or_to_i32:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x4c
@@ -1737,7 +1708,6 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32],
; GFX8S-NEXT: v_mov_b32_e32 v0, s4
; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: trunc_i64_or_to_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -1808,7 +1778,6 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: or_i1:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1828,7 +1797,6 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p
; GFX6S-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6S-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: or_i1:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
@@ -1848,7 +1816,6 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p
; GFX8S-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8S-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: or_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -1911,7 +1878,6 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; GFX6S-LABEL: s_or_i1:
; GFX6S: ; %bb.0:
; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
@@ -1927,7 +1893,6 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c
; GFX6S-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX6S-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6S-NEXT: s_endpgm
-;
; GFX8S-LABEL: s_or_i1:
; GFX8S: ; %bb.0:
; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -1943,7 +1908,6 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c
; GFX8S-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8S-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX8S-NEXT: s_endpgm
-;
; EG-LABEL: s_or_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 3808c73ae7de3..d7e780a5ddf74 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -1,12 +1,38 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SIS %s
-; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VIS %s
-;; Added tests with "-amdgpu-scalarize-global-loads=true" to allow the generation of s_xor_b64, particularly in the v2i32 case. See
-;; SWDEV-517886.
-;; Also removed the previously unused "GCN" check-prefixes from the test.
+define amdgpu_ps <2 x i32> @s_xor_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
+; SI-LABEL: s_xor_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: s_xor_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT: ; return to shader part epilog
+ %result = xor <2 x i32> %num, %den
+ ret <2 x i32> %result
+}
+
+define <2 x i32> @v_xor_v2i32(<2 x i32> %num, <2 x i32> %den) {
+; SI-LABEL: v_xor_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v1, v1, v3
+; SI-NEXT: v_xor_b32_e32 v0, v0, v2
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_xor_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v1, v1, v3
+; VI-NEXT: v_xor_b32_e32 v0, v0, v2
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %result = xor <2 x i32> %num, %den
+ ret <2 x i32> %result
+}
define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; SI-LABEL: xor_v2i32:
@@ -50,7 +76,6 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-NEXT: v_xor_b32_e32 v1, v1, v3
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: xor_v2i32:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -66,7 +91,6 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; SIS-NEXT: v_mov_b32_e32 v1, s5
; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: xor_v2i32:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -136,7 +160,6 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-NEXT: v_xor_b32_e32 v0, v0, v4
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: xor_v4i32:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -157,7 +180,6 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; SIS-NEXT: v_mov_b32_e32 v3, s7
; SIS-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: xor_v4i32:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -233,7 +255,6 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0,
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: xor_i1:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -254,7 +275,6 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0,
; SIS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SIS-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: xor_i1:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -328,7 +348,6 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0
; VI-NEXT: v_and_b32_e32 v2, 1, v2
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: v_xor_i1:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -352,7 +371,6 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0
; SIS-NEXT: v_and_b32_e32 v0, 1, v0
; SIS-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: v_xor_i1:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -419,7 +437,6 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_xor_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: vector_xor_i32:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -434,7 +451,6 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1
; SIS-NEXT: v_mov_b32_e32 v0, s4
; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: vector_xor_i32:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -480,7 +496,6 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: scalar_xor_i32:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -493,7 +508,6 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
; SIS-NEXT: v_mov_b32_e32 v0, s0
; SIS-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: scalar_xor_i32:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -533,7 +547,6 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: scalar_not_i32:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dword s6, s[4:5], 0xb
@@ -545,7 +558,6 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
; SIS-NEXT: v_mov_b32_e32 v0, s4
; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: scalar_not_i32:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dword s2, s[4:5], 0x2c
@@ -594,7 +606,6 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_not_b32_e32 v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: vector_not_i32:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -607,7 +618,6 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1
; SIS-NEXT: v_mov_b32_e32 v0, s4
; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: vector_not_i32:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -669,7 +679,6 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_xor_b32_e32 v1, v1, v3
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: vector_xor_i64:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -685,7 +694,6 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1
; SIS-NEXT: v_mov_b32_e32 v1, s5
; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: vector_xor_i64:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -736,7 +744,6 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: scalar_xor_i64:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -751,7 +758,6 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
; SIS-NEXT: v_mov_b32_e32 v1, s1
; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: scalar_xor_i64:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -795,7 +801,6 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: scalar_not_i64:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -809,7 +814,6 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
; SIS-NEXT: v_mov_b32_e32 v1, s1
; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: scalar_not_i64:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -860,7 +864,6 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_not_b32_e32 v1, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: vector_not_i64:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -874,7 +877,6 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1
; SIS-NEXT: v_mov_b32_e32 v1, s5
; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: vector_not_i64:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -903,7 +905,7 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
; SI-NEXT: s_and_b64 vcc, exec, s[10:11]
-; SI-NEXT: s_cbranch_vccz .LBB12_4
+; SI-NEXT: s_cbranch_vccz .LBB14_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_mov_b32 s15, 0xf000
; SI-NEXT: s_mov_b32 s14, -1
@@ -911,21 +913,21 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; SI-NEXT: s_cbranch_vccnz .LBB12_3
-; SI-NEXT: .LBB12_2: ; %if
+; SI-NEXT: s_cbranch_vccnz .LBB14_3
+; SI-NEXT: .LBB14_2: ; %if
; SI-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: .LBB12_3: ; %endif
+; SI-NEXT: .LBB14_3: ; %endif
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
-; SI-NEXT: .LBB12_4:
+; SI-NEXT: .LBB14_4:
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1
-; SI-NEXT: s_branch .LBB12_2
+; SI-NEXT: s_branch .LBB14_2
;
; VI-LABEL: xor_cf:
; VI: ; %bb.0: ; %entry
@@ -933,28 +935,27 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i
; VI-NEXT: s_mov_b64 s[8:9], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
-; VI-NEXT: s_cbranch_scc0 .LBB12_4
+; VI-NEXT: s_cbranch_scc0 .LBB14_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; VI-NEXT: s_cbranch_vccnz .LBB12_3
-; VI-NEXT: .LBB12_2: ; %if
+; VI-NEXT: s_cbranch_vccnz .LBB14_3
+; VI-NEXT: .LBB14_2: ; %if
; VI-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: .LBB12_3: ; %endif
+; VI-NEXT: .LBB14_3: ; %endif
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
-; VI-NEXT: .LBB12_4:
+; VI-NEXT: .LBB14_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
-; VI-NEXT: s_branch .LBB12_2
-;
+; VI-NEXT: s_branch .LBB14_2
; SIS-LABEL: xor_cf:
; SIS: ; %bb.0: ; %entry
; SIS-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -981,7 +982,6 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i
; SIS-NEXT: .LBB12_4:
; SIS-NEXT: ; implicit-def: $sgpr8_sgpr9
; SIS-NEXT: s_branch .LBB12_2
-;
; VIS-LABEL: xor_cf:
; VIS: ; %bb.0: ; %entry
; VIS-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
@@ -1053,7 +1053,6 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: scalar_xor_literal_i64:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
@@ -1067,7 +1066,6 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3
; SIS-NEXT: v_mov_b32_e32 v1, s4
; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: scalar_xor_literal_i64:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
@@ -1129,7 +1127,6 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou
; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: scalar_xor_literal_multi_use_i64:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
@@ -1151,7 +1148,6 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou
; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SIS-NEXT: s_waitcnt vmcnt(0)
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: scalar_xor_literal_multi_use_i64:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4c
@@ -1206,7 +1202,6 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: scalar_xor_inline_imm_i64:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
@@ -1219,7 +1214,6 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x
; SIS-NEXT: v_mov_b32_e32 v1, s7
; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: scalar_xor_inline_imm_i64:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
@@ -1263,7 +1257,6 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out,
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: scalar_xor_neg_inline_imm_i64:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
@@ -1276,7 +1269,6 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out,
; SIS-NEXT: v_mov_b32_e32 v1, s5
; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: scalar_xor_neg_inline_imm_i64:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
@@ -1329,7 +1321,6 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out,
; VI-NEXT: v_xor_b32_e32 v1, -1, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: vector_xor_i64_neg_inline_imm:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1343,7 +1334,6 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out,
; SIS-NEXT: v_mov_b32_e32 v1, s5
; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: vector_xor_i64_neg_inline_imm:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -1397,7 +1387,6 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add
; VI-NEXT: v_xor_b32_e32 v0, 0xdf77987f, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
-;
; SIS-LABEL: vector_xor_literal_i64:
; SIS: ; %bb.0:
; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1412,7 +1401,6 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add
; SIS-NEXT: v_mov_b32_e32 v1, s5
; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SIS-NEXT: s_endpgm
-;
; VIS-LABEL: vector_xor_literal_i64:
; VIS: ; %bb.0:
; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
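Taken together, the new s_xor_v2i32/v_xor_v2i32 (and s_or_v2i32/v_or_v2i32)
tests pin down both halves of the legalisation: uniform <2 x i32> operands
live in an aligned SGPR pair and select the single 64-bit SALU form, while
divergent operands still lower to two 32-bit VALU ops, since there is no
64-bit VALU xor/or:

  s_xor_b64 s[0:1], s[0:1], s[2:3]  ; uniform case, one SALU instruction
  v_xor_b32_e32 v0, v0, v2          ; divergent case, one VALU op per lane
  v_xor_b32_e32 v1, v1, v3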
>From fa2fa04a91f07da3fe36056502d7592c1d2de0a4 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 23 Jun 2025 10:54:16 -0500
Subject: [PATCH 05/15] Suppress over-enthusiastic clang-format
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 ++--
llvm/test/CodeGen/AMDGPU/or.ll | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 13f537ec14055..aa091801a218e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4865,8 +4865,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
if (Inv)
std::swap(NewLHS, NewRHS);
- SDValue NewSelect =
- DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
+ SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
+ Cond, NewLHS, NewRHS);
DCI.AddToWorklist(NewSelect.getNode());
return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
}
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index f4855c0056b53..b1d9c665ebf08 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
define amdgpu_ps <2 x i32> @s_or_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
; GFX6-LABEL: s_or_v2i32:
>From 601beec61cfac77caa1a1c0a8f3f440d9aec5c72 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 23 Jun 2025 13:01:29 -0500
Subject: [PATCH 06/15] Temporarily remove r600 from or.ll test
---
llvm/test/CodeGen/AMDGPU/or.ll | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index b1d9c665ebf08..0a71a644652fe 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -1,7 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
define amdgpu_ps <2 x i32> @s_or_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
; GFX6-LABEL: s_or_v2i32:
>From 50ecdffd28f7fdf40e2bf6819482d8b00f06c58a Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 24 Jun 2025 06:11:18 -0500
Subject: [PATCH 07/15] Add SGPR and VGPR tests to and.ll and temporarily
remove the r600 run line.
---
llvm/test/CodeGen/AMDGPU/and.ll | 79 ++++++++++++++-------------------
1 file changed, 34 insertions(+), 45 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index e5fe9195e2dcc..4673df3183cfa 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -1,10 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
-; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
+define amdgpu_ps <2 x i32> @s_and_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
+; GFX6-LABEL: s_and_v2i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_and_v2i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: ; return to shader part epilog
+ %result = and <2 x i32> %num, %den
+ ret <2 x i32> %result
+}
+
+define <2 x i32> @v_and_v2i32(<2 x i32> %num, <2 x i32> %den) {
+; GFX6-LABEL: v_and_v2i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_and_v2i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v1, v1, v3
+; GFX8-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %result = and <2 x i32> %num, %den
+ ret <2 x i32> %result
+}
+
define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6-LABEL: test2:
; GFX6: ; %bb.0:
@@ -14,8 +45,7 @@ define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s5, s5, s7
-; GFX6-NEXT: s_and_b32 s4, s4, s6
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -29,13 +59,11 @@ define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s5, s5, s7
-; GFX8-NEXT: s_and_b32 s4, s4, s6
+; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: test2:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -99,7 +127,6 @@ define amdgpu_kernel void @test4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: test4:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
@@ -154,7 +181,6 @@ define amdgpu_kernel void @s_and_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -194,7 +220,6 @@ define amdgpu_kernel void @s_and_constant_i32(ptr addrspace(1) %out, i32 %a) {
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_constant_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -252,7 +277,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_0(ptr addrspace(1) %out,
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_multi_use_constant_i32_0:
; EG: ; %bb.0:
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
@@ -309,7 +333,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_1(ptr addrspace(1) %out,
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_multi_use_constant_i32_1:
; EG: ; %bb.0:
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
@@ -371,7 +394,6 @@ define amdgpu_kernel void @v_and_i32_vgpr_vgpr(ptr addrspace(1) %out, ptr addrsp
; GFX8-NEXT: v_and_b32_e32 v2, v5, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: v_and_i32_vgpr_vgpr:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
@@ -440,7 +462,6 @@ define amdgpu_kernel void @v_and_i32_sgpr_vgpr(ptr addrspace(1) %out, i32 %a, pt
; GFX8-NEXT: v_and_b32_e32 v2, s2, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: v_and_i32_sgpr_vgpr:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
@@ -504,7 +525,6 @@ define amdgpu_kernel void @v_and_i32_vgpr_sgpr(ptr addrspace(1) %out, ptr addrsp
; GFX8-NEXT: v_and_b32_e32 v2, s4, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: v_and_i32_vgpr_sgpr:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
@@ -568,7 +588,6 @@ define amdgpu_kernel void @v_and_constant_i32(ptr addrspace(1) %out, ptr addrspa
; GFX8-NEXT: v_and_b32_e32 v0, 0x12d687, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: v_and_constant_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
@@ -630,7 +649,6 @@ define amdgpu_kernel void @v_and_inline_imm_64_i32(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_and_b32_e32 v0, 64, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: v_and_inline_imm_64_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
@@ -692,7 +710,6 @@ define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(ptr addrspace(1) %out, pt
; GFX8-NEXT: v_and_b32_e32 v0, -16, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: v_and_inline_imm_neg_16_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
@@ -749,7 +766,6 @@ define amdgpu_kernel void @s_and_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -794,7 +810,6 @@ define amdgpu_kernel void @s_and_i1(ptr addrspace(1) %out, i1 %a, i1 %b) {
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
@@ -857,7 +872,6 @@ define amdgpu_kernel void @s_and_constant_i64(ptr addrspace(1) %out, i64 %a) {
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_constant_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -921,7 +935,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_multi_use_constant_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 10, @6, KC0[CB0:0-32], KC1[]
@@ -975,7 +988,6 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32,
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_32_bit_constant_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -1046,7 +1058,6 @@ define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out,
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_multi_use_inline_imm_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 17, @6, KC0[CB0:0-32], KC1[]
@@ -1130,7 +1141,6 @@ define amdgpu_kernel void @v_and_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX8-NEXT: v_and_b32_e32 v0, v0, v2
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: v_and_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
@@ -1199,7 +1209,6 @@ define amdgpu_kernel void @v_and_constant_i64(ptr addrspace(1) %out, ptr addrspa
; GFX8-NEXT: v_and_b32_e32 v0, 0xab19b207, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: v_and_constant_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
@@ -1280,7 +1289,6 @@ define amdgpu_kernel void @v_and_multi_use_constant_i64(ptr addrspace(1) %out, p
; GFX8-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: v_and_multi_use_constant_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[]
@@ -1382,7 +1390,6 @@ define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(ptr addrspace(1) %out,
; GFX8-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: v_and_multi_use_inline_imm_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[]
@@ -1466,7 +1473,6 @@ define amdgpu_kernel void @v_and_i64_32_bit_constant(ptr addrspace(1) %out, ptr
; GFX8-NEXT: v_and_b32_e32 v0, 0x12d687, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: v_and_i64_32_bit_constant:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
@@ -1530,7 +1536,6 @@ define amdgpu_kernel void @v_and_inline_imm_i64(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: v_and_b32_e32 v0, 64, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: v_and_inline_imm_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
@@ -1595,7 +1600,6 @@ define amdgpu_kernel void @v_and_inline_neg_imm_i64(ptr addrspace(1) %out, ptr a
; GFX8-NEXT: v_and_b32_e32 v0, -8, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: v_and_inline_neg_imm_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
@@ -1648,7 +1652,6 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_imm_64_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -1699,7 +1702,6 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(ptr addrspace(1) %ou
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_imm_64_i64_noshrink:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
@@ -1748,7 +1750,6 @@ define amdgpu_kernel void @s_and_inline_imm_1_i64(ptr addrspace(1) %out, ptr add
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_imm_1_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -1791,7 +1792,6 @@ define amdgpu_kernel void @s_and_inline_imm_1.0_i64(ptr addrspace(1) %out, ptr a
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_imm_1.0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -1835,7 +1835,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(ptr addrspace(1) %out, p
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_imm_neg_1.0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -1879,7 +1878,6 @@ define amdgpu_kernel void @s_and_inline_imm_0.5_i64(ptr addrspace(1) %out, ptr a
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_imm_0.5_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -1923,7 +1921,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(ptr addrspace(1) %out, p
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_imm_neg_0.5_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -1967,7 +1964,6 @@ define amdgpu_kernel void @s_and_inline_imm_2.0_i64(ptr addrspace(1) %out, ptr a
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_imm_2.0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -2011,7 +2007,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(ptr addrspace(1) %out, p
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_imm_neg_2.0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -2055,7 +2050,6 @@ define amdgpu_kernel void @s_and_inline_imm_4.0_i64(ptr addrspace(1) %out, ptr a
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_imm_4.0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -2099,7 +2093,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(ptr addrspace(1) %out, p
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_imm_neg_4.0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -2146,7 +2139,6 @@ define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(ptr addrspace(1) %out, p
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_imm_f32_4.0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -2189,7 +2181,6 @@ define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(ptr addrspace(1) %ou
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_imm_f32_neg_4.0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -2234,7 +2225,6 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_high_imm_f32_4.0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -2278,7 +2268,6 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(ptr addrspace(1
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-;
; EG-LABEL: s_and_inline_high_imm_f32_neg_4.0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
>From a5ba93d90ae8ba580d35664cc6d474a1504999bc Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 24 Jun 2025 06:50:20 -0500
Subject: [PATCH 08/15] Remove dead check-lines from or.ll
---
llvm/test/CodeGen/AMDGPU/or.ll | 625 ---------------------------------
1 file changed, 625 deletions(-)
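Context for the deletion below: FileCheck only verifies directives whose prefix is named by some RUN line via -check-prefix(es), so the GFX6S/GFX8S blocks became dead once no RUN line in or.ll requested those prefixes any more. A minimal sketch of the distinction (RUN line and label are illustrative, not copied from the test):
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GFX6 %s
; GFX6-LABEL: or_v2i32:   ; verified: a RUN line names GFX6
; GFX6S-LABEL: or_v2i32:  ; ignored: no RUN line names GFX6S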
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 0a71a644652fe..0da53f2a95953 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -72,32 +72,6 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: or_v2i32:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX6S-NEXT: v_mov_b32_e32 v0, s4
-; GFX6S-NEXT: v_mov_b32_e32 v1, s5
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: or_v2i32:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX8S-NEXT: v_mov_b32_e32 v0, s4
-; GFX8S-NEXT: v_mov_b32_e32 v1, s5
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: or_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -167,42 +141,6 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: or_v4i32:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GFX6S-NEXT: s_mov_b32 s11, 0xf000
-; GFX6S-NEXT: s_mov_b32 s10, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b32 s3, s3, s7
-; GFX6S-NEXT: s_or_b32 s2, s2, s6
-; GFX6S-NEXT: s_or_b32 s1, s1, s5
-; GFX6S-NEXT: s_or_b32 s0, s0, s4
-; GFX6S-NEXT: v_mov_b32_e32 v0, s0
-; GFX6S-NEXT: v_mov_b32_e32 v1, s1
-; GFX6S-NEXT: v_mov_b32_e32 v2, s2
-; GFX6S-NEXT: v_mov_b32_e32 v3, s3
-; GFX6S-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: or_v4i32:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GFX8S-NEXT: s_mov_b32 s11, 0xf000
-; GFX8S-NEXT: s_mov_b32 s10, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b32 s3, s3, s7
-; GFX8S-NEXT: s_or_b32 s2, s2, s6
-; GFX8S-NEXT: s_or_b32 s1, s1, s5
-; GFX8S-NEXT: s_or_b32 s0, s0, s4
-; GFX8S-NEXT: v_mov_b32_e32 v0, s0
-; GFX8S-NEXT: v_mov_b32_e32 v1, s1
-; GFX8S-NEXT: v_mov_b32_e32 v2, s2
-; GFX8S-NEXT: v_mov_b32_e32 v3, s3
-; GFX8S-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: or_v4i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
@@ -257,30 +195,6 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: scalar_or_i32:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6S-NEXT: s_mov_b32 s7, 0xf000
-; GFX6S-NEXT: s_mov_b32 s6, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_mov_b32 s4, s0
-; GFX6S-NEXT: s_or_b32 s0, s2, s3
-; GFX6S-NEXT: s_mov_b32 s5, s1
-; GFX6S-NEXT: v_mov_b32_e32 v0, s0
-; GFX6S-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: scalar_or_i32:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8S-NEXT: s_mov_b32 s7, 0xf000
-; GFX8S-NEXT: s_mov_b32 s6, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_mov_b32 s4, s0
-; GFX8S-NEXT: s_or_b32 s0, s2, s3
-; GFX8S-NEXT: s_mov_b32 s5, s1
-; GFX8S-NEXT: v_mov_b32_e32 v0, s0
-; GFX8S-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: scalar_or_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -334,32 +248,6 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8-NEXT: v_or_b32_e32 v0, s12, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: vector_or_i32:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6S-NEXT: s_load_dword s4, s[4:5], 0xd
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_load_dword s5, s[2:3], 0x0
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b32 s4, s5, s4
-; GFX6S-NEXT: v_mov_b32_e32 v0, s4
-; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: vector_or_i32:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8S-NEXT: s_load_dword s4, s[4:5], 0x34
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_load_dword s5, s[2:3], 0x0
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b32 s4, s5, s4
-; GFX8S-NEXT: v_mov_b32_e32 v0, s4
-; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: vector_or_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -406,28 +294,6 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: scalar_or_literal_i32:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dword s6, s[4:5], 0xb
-; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b32 s4, s6, 0x1869f
-; GFX6S-NEXT: v_mov_b32_e32 v0, s4
-; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: scalar_or_literal_i32:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b32 s4, s6, 0x1869f
-; GFX8S-NEXT: v_mov_b32_e32 v0, s4
-; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: scalar_or_literal_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -471,32 +337,6 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: scalar_or_literal_i64:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
-; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b32 s4, s7, 0xf237b
-; GFX6S-NEXT: s_or_b32 s5, s6, 0x3039
-; GFX6S-NEXT: v_mov_b32_e32 v0, s5
-; GFX6S-NEXT: v_mov_b32_e32 v1, s4
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: scalar_or_literal_i64:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
-; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b32 s4, s7, 0xf237b
-; GFX8S-NEXT: s_or_b32 s5, s6, 0x3039
-; GFX8S-NEXT: v_mov_b32_e32 v0, s5
-; GFX8S-NEXT: v_mov_b32_e32 v1, s4
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: scalar_or_literal_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -559,49 +399,6 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: scalar_or_literal_multi_use_i64:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
-; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d
-; GFX6S-NEXT: s_movk_i32 s8, 0x3039
-; GFX6S-NEXT: s_mov_b32 s9, 0xf237b
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX6S-NEXT: v_mov_b32_e32 v0, s6
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: v_mov_b32_e32 v1, s7
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX6S-NEXT: s_add_u32 s0, s4, 0x3039
-; GFX6S-NEXT: s_addc_u32 s1, s5, 0xf237b
-; GFX6S-NEXT: s_waitcnt expcnt(0)
-; GFX6S-NEXT: v_mov_b32_e32 v0, s0
-; GFX6S-NEXT: v_mov_b32_e32 v1, s1
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX6S-NEXT: s_waitcnt vmcnt(0)
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: scalar_or_literal_multi_use_i64:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
-; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x74
-; GFX8S-NEXT: s_movk_i32 s8, 0x3039
-; GFX8S-NEXT: s_mov_b32 s9, 0xf237b
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX8S-NEXT: v_mov_b32_e32 v0, s6
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: v_mov_b32_e32 v1, s7
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8S-NEXT: s_add_u32 s0, s4, 0x3039
-; GFX8S-NEXT: s_addc_u32 s1, s5, 0xf237b
-; GFX8S-NEXT: v_mov_b32_e32 v0, s0
-; GFX8S-NEXT: v_mov_b32_e32 v1, s1
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8S-NEXT: s_waitcnt vmcnt(0)
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: scalar_or_literal_multi_use_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[]
@@ -658,30 +455,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: scalar_or_inline_imm_i64:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
-; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b32 s4, s6, 63
-; GFX6S-NEXT: v_mov_b32_e32 v0, s4
-; GFX6S-NEXT: v_mov_b32_e32 v1, s7
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: scalar_or_inline_imm_i64:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
-; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b32 s4, s6, 63
-; GFX8S-NEXT: v_mov_b32_e32 v0, s4
-; GFX8S-NEXT: v_mov_b32_e32 v1, s7
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: scalar_or_inline_imm_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -741,47 +514,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: scalar_or_inline_imm_multi_use_i64:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GFX6S-NEXT: s_mov_b32 s7, 0xf000
-; GFX6S-NEXT: s_mov_b32 s6, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_mov_b32 s4, s0
-; GFX6S-NEXT: s_or_b32 s0, s2, 63
-; GFX6S-NEXT: s_mov_b32 s5, s1
-; GFX6S-NEXT: v_mov_b32_e32 v0, s0
-; GFX6S-NEXT: v_mov_b32_e32 v1, s3
-; GFX6S-NEXT: s_add_u32 s0, s8, 63
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX6S-NEXT: s_addc_u32 s1, s9, 0
-; GFX6S-NEXT: s_waitcnt expcnt(0)
-; GFX6S-NEXT: v_mov_b32_e32 v0, s0
-; GFX6S-NEXT: v_mov_b32_e32 v1, s1
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX6S-NEXT: s_waitcnt vmcnt(0)
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: scalar_or_inline_imm_multi_use_i64:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
-; GFX8S-NEXT: s_mov_b32 s7, 0xf000
-; GFX8S-NEXT: s_mov_b32 s6, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_mov_b32 s4, s0
-; GFX8S-NEXT: s_or_b32 s0, s2, 63
-; GFX8S-NEXT: s_mov_b32 s5, s1
-; GFX8S-NEXT: v_mov_b32_e32 v0, s0
-; GFX8S-NEXT: v_mov_b32_e32 v1, s3
-; GFX8S-NEXT: s_add_u32 s0, s8, 63
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX8S-NEXT: s_addc_u32 s1, s9, 0
-; GFX8S-NEXT: v_mov_b32_e32 v0, s0
-; GFX8S-NEXT: v_mov_b32_e32 v1, s1
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX8S-NEXT: s_waitcnt vmcnt(0)
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: scalar_or_inline_imm_multi_use_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 9, @6, KC0[CB0:0-32], KC1[]
@@ -834,30 +566,6 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: scalar_or_neg_inline_imm_i64:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dword s6, s[4:5], 0x13
-; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: v_mov_b32_e32 v1, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b32 s4, s6, -8
-; GFX6S-NEXT: v_mov_b32_e32 v0, s4
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: scalar_or_neg_inline_imm_i64:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x4c
-; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: v_mov_b32_e32 v1, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b32 s4, s6, -8
-; GFX8S-NEXT: v_mov_b32_e32 v0, s4
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: scalar_or_neg_inline_imm_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -911,30 +619,6 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: v_or_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: vector_or_literal_i32:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b32 s4, s4, 0xffff
-; GFX6S-NEXT: v_mov_b32_e32 v0, s4
-; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: vector_or_literal_i32:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b32 s4, s4, 0xffff
-; GFX8S-NEXT: v_mov_b32_e32 v0, s4
-; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: vector_or_literal_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -993,30 +677,6 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out,
; GFX8-NEXT: v_or_b32_e32 v0, 4, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: vector_or_inline_immediate_i32:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b32 s4, s4, 4
-; GFX6S-NEXT: v_mov_b32_e32 v0, s4
-; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: vector_or_inline_immediate_i32:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b32 s4, s4, 4
-; GFX8S-NEXT: v_mov_b32_e32 v0, s4
-; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: vector_or_inline_immediate_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1069,34 +729,6 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: scalar_or_i64:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GFX6S-NEXT: s_mov_b32 s7, 0xf000
-; GFX6S-NEXT: s_mov_b32 s6, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_mov_b32 s4, s0
-; GFX6S-NEXT: s_mov_b32 s5, s1
-; GFX6S-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX6S-NEXT: v_mov_b32_e32 v0, s0
-; GFX6S-NEXT: v_mov_b32_e32 v1, s1
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: scalar_or_i64:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
-; GFX8S-NEXT: s_mov_b32 s7, 0xf000
-; GFX8S-NEXT: s_mov_b32 s6, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_mov_b32 s4, s0
-; GFX8S-NEXT: s_mov_b32 s5, s1
-; GFX8S-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX8S-NEXT: v_mov_b32_e32 v0, s0
-; GFX8S-NEXT: v_mov_b32_e32 v1, s1
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: scalar_or_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -1159,36 +791,6 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: vector_or_i64:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
-; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; GFX6S-NEXT: v_mov_b32_e32 v0, s4
-; GFX6S-NEXT: v_mov_b32_e32 v1, s5
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: vector_or_i64:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
-; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; GFX8S-NEXT: v_mov_b32_e32 v0, s4
-; GFX8S-NEXT: v_mov_b32_e32 v1, s5
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: vector_or_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -1255,34 +857,6 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: v_or_b32_e32 v1, s13, v1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: scalar_vector_or_i64:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; GFX6S-NEXT: v_mov_b32_e32 v0, s4
-; GFX6S-NEXT: v_mov_b32_e32 v1, s5
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: scalar_vector_or_i64:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; GFX8S-NEXT: v_mov_b32_e32 v0, s4
-; GFX8S-NEXT: v_mov_b32_e32 v1, s5
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: scalar_vector_or_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1344,34 +918,6 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: v_or_b32_e32 v0, 0xdf77987f, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: vector_or_i64_loadimm:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b32 s5, s5, 0x146f
-; GFX6S-NEXT: s_or_b32 s4, s4, 0xdf77987f
-; GFX6S-NEXT: v_mov_b32_e32 v0, s4
-; GFX6S-NEXT: v_mov_b32_e32 v1, s5
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: vector_or_i64_loadimm:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b32 s5, s5, 0x146f
-; GFX8S-NEXT: s_or_b32 s4, s4, 0xdf77987f
-; GFX8S-NEXT: v_mov_b32_e32 v0, s4
-; GFX8S-NEXT: v_mov_b32_e32 v1, s5
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: vector_or_i64_loadimm:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1433,32 +979,6 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac
; GFX8-NEXT: v_or_b32_e32 v0, 8, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: vector_or_i64_imm:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b32 s4, s4, 8
-; GFX6S-NEXT: v_mov_b32_e32 v0, s4
-; GFX6S-NEXT: v_mov_b32_e32 v1, s5
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: vector_or_i64_imm:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b32 s4, s4, 8
-; GFX8S-NEXT: v_mov_b32_e32 v0, s4
-; GFX8S-NEXT: v_mov_b32_e32 v1, s5
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: vector_or_i64_imm:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1519,32 +1039,6 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p
; GFX8-NEXT: v_or_b32_e32 v0, -8, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: vector_or_i64_neg_inline_imm:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6S-NEXT: v_mov_b32_e32 v1, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b32 s4, s4, -8
-; GFX6S-NEXT: v_mov_b32_e32 v0, s4
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: vector_or_i64_neg_inline_imm:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8S-NEXT: v_mov_b32_e32 v1, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b32 s4, s4, -8
-; GFX8S-NEXT: v_mov_b32_e32 v0, s4
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: vector_or_i64_neg_inline_imm:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1607,32 +1101,6 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr
; GFX8-NEXT: v_or_b32_e32 v0, 0xffffff38, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: vector_or_i64_neg_literal:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6S-NEXT: v_mov_b32_e32 v1, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b32 s4, s4, 0xffffff38
-; GFX6S-NEXT: v_mov_b32_e32 v0, s4
-; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: vector_or_i64_neg_literal:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8S-NEXT: v_mov_b32_e32 v1, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b32 s4, s4, 0xffffff38
-; GFX8S-NEXT: v_mov_b32_e32 v0, s4
-; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: vector_or_i64_neg_literal:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1683,30 +1151,6 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32],
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: trunc_i64_or_to_i32:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dword s6, s[4:5], 0x13
-; GFX6S-NEXT: s_load_dword s7, s[4:5], 0x1d
-; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6S-NEXT: s_mov_b32 s3, 0xf000
-; GFX6S-NEXT: s_mov_b32 s2, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_or_b32 s4, s7, s6
-; GFX6S-NEXT: v_mov_b32_e32 v0, s4
-; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: trunc_i64_or_to_i32:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x4c
-; GFX8S-NEXT: s_load_dword s7, s[4:5], 0x74
-; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8S-NEXT: s_mov_b32 s3, 0xf000
-; GFX8S-NEXT: s_mov_b32 s2, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_or_b32 s4, s7, s6
-; GFX8S-NEXT: v_mov_b32_e32 v0, s4
-; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: trunc_i64_or_to_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -1777,44 +1221,6 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: or_i1:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6S-NEXT: s_mov_b32 s7, 0xf000
-; GFX6S-NEXT: s_mov_b32 s6, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_load_dword s8, s[8:9], 0x0
-; GFX6S-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX6S-NEXT: s_mov_b32 s4, s0
-; GFX6S-NEXT: s_mov_b32 s5, s1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: v_mul_f32_e64 v0, 1.0, s8
-; GFX6S-NEXT: v_mul_f32_e64 v1, 1.0, s2
-; GFX6S-NEXT: v_max_f32_e32 v0, v1, v0
-; GFX6S-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
-; GFX6S-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6S-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: or_i1:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8S-NEXT: s_mov_b32 s7, 0xf000
-; GFX8S-NEXT: s_mov_b32 s6, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_load_dword s8, s[8:9], 0x0
-; GFX8S-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8S-NEXT: s_mov_b32 s4, s0
-; GFX8S-NEXT: s_mov_b32 s5, s1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: v_mul_f32_e64 v0, 1.0, s8
-; GFX8S-NEXT: v_mul_f32_e64 v1, 1.0, s2
-; GFX8S-NEXT: v_max_f32_e32 v0, v1, v0
-; GFX8S-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
-; GFX8S-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8S-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: or_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -1877,36 +1283,6 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; GFX6S-LABEL: s_or_i1:
-; GFX6S: ; %bb.0:
-; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
-; GFX6S-NEXT: s_mov_b32 s7, 0xf000
-; GFX6S-NEXT: s_mov_b32 s6, -1
-; GFX6S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6S-NEXT: s_cmp_eq_u32 s0, s1
-; GFX6S-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX6S-NEXT: s_cmp_eq_u32 s2, s3
-; GFX6S-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX6S-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX6S-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX6S-NEXT: buffer_store_byte v0, off, s[4:7], 0
-; GFX6S-NEXT: s_endpgm
-; GFX8S-LABEL: s_or_i1:
-; GFX8S: ; %bb.0:
-; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
-; GFX8S-NEXT: s_mov_b32 s7, 0xf000
-; GFX8S-NEXT: s_mov_b32 s6, -1
-; GFX8S-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8S-NEXT: s_cmp_eq_u32 s0, s1
-; GFX8S-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX8S-NEXT: s_cmp_eq_u32 s2, s3
-; GFX8S-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX8S-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX8S-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX8S-NEXT: buffer_store_byte v0, off, s[4:7], 0
-; GFX8S-NEXT: s_endpgm
; EG-LABEL: s_or_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
@@ -1935,4 +1311,3 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c
store i1 %or, ptr addrspace(1) %out
ret void
}
-
>From d759ddae9b0f7b873e78e560f9f9343987cd998b Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 24 Jun 2025 07:09:04 -0500
Subject: [PATCH 09/15] Apply reviewer comments to performFNegCombine
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 12 +++++-------
1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index aa091801a218e..14f46e5aafcbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5213,15 +5213,13 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
SDValue Cond = N0.getOperand(0);
SDValue LHS = N0.getOperand(1);
SDValue RHS = N0.getOperand(2);
- EVT LHVT = LHS.getValueType();
- EVT RHVT = RHS.getValueType();
- // The regression was limited to i32 v2/i32.
- if (RHVT != MVT::i32 && LHVT != MVT::i32)
+ EVT VT = LHS.getValueType();
+ if (VT != MVT::i32)
return SDValue();
- SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
- SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
- SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
+ SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, VT, LHS);
+ SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ SDValue Op = DAG.getNode(Opc, SL, VT, Cond, LFNeg, RFNeg);
return Op;
}
case ISD::BITCAST: {
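For reference, a schematic before/after of the combine this hunk simplifies, written in SelectionDAG-dump style (node numbers are illustrative, not from a real dump):
;  t1: i32 = select t0, tA, tB
;  t2: i32 = fneg t1             ; fneg (select c, a, b)
; becomes
;  t3: i32 = fneg tA
;  t4: i32 = fneg tB
;  t5: i32 = select t0, t3, t4   ; select c, (fneg a), (fneg b)
Checking a single VT suffices because a select's two value operands always share the result type, which is why the separate LHVT/RHVT checks were redundant.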
>From bfb37e4a710cedcfe569b82d2201cc1d4e356665 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 24 Jun 2025 07:12:41 -0500
Subject: [PATCH 10/15] Remove dead code
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 17 +----------------
1 file changed, 1 insertion(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 14f46e5aafcbd..a05d3cfb9e212 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5204,23 +5204,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
}
case ISD::SELECT: {
// fneg (select c, a, b) -> select c, (fneg a), (fneg b)
- // This combine became necessary recently to prevent a regression in
- // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
- // Specifically, additional instructions were added to the final codegen.
- // When adding this combine a case was added to performFNEGCombine to
- // prevent this combine from being undone under certain conditions.
// TODO: Invert conditions of foldFreeOpFromSelect
- SDValue Cond = N0.getOperand(0);
- SDValue LHS = N0.getOperand(1);
- SDValue RHS = N0.getOperand(2);
- EVT VT = LHS.getValueType();
- if (VT != MVT::i32)
- return SDValue();
-
- SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, VT, LHS);
- SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, VT, RHS);
- SDValue Op = DAG.getNode(Opc, SL, VT, Cond, LFNeg, RFNeg);
- return Op;
+ return SDValue();
}
case ISD::BITCAST: {
SDLoc SL(N);
>From 55a643da593a0d7ba04e4a9662ee344f5b960d78 Mon Sep 17 00:00:00 2001

From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 24 Jun 2025 10:36:20 -0500
Subject: [PATCH 11/15] Reinstate r600 tests in independent files. This has
already been done for tens of other tests.
---
llvm/test/CodeGen/AMDGPU/and.ll | 659 +-----------------
llvm/test/CodeGen/AMDGPU/and.r600.ll | 987 +++++++++++++++++++++++++++
llvm/test/CodeGen/AMDGPU/or.ll | 350 ----------
llvm/test/CodeGen/AMDGPU/or.r600.ll | 515 ++++++++++++++
4 files changed, 1503 insertions(+), 1008 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/and.r600.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/or.r600.ll
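The new *.r600.ll files keep the EG (Evergreen) checks alive under a dedicated RUN line, roughly of the form below (the exact -mcpu value is assumed for illustration), while and.ll and or.ll retain only the GCN RUN lines and prefixes:
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s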
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index 4673df3183cfa..29bfc253e2e7e 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -2,6 +2,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
+
declare i32 @llvm.amdgcn.workitem.id.x() #0
define amdgpu_ps <2 x i32> @s_and_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
@@ -64,23 +65,6 @@ define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: test2:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: AND_INT * T0.Y, T0.Y, T0.W,
-; EG-NEXT: AND_INT T0.X, T0.X, T0.Z,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
%a = load <2 x i32>, ptr addrspace(1) %in
%b = load <2 x i32>, ptr addrspace(1) %b_ptr
@@ -127,26 +111,6 @@ define amdgpu_kernel void @test4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: test4:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
-; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T0.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: AND_INT * T0.W, T0.W, T1.W,
-; EG-NEXT: AND_INT * T0.Z, T0.Z, T1.Z,
-; EG-NEXT: AND_INT * T0.Y, T0.Y, T1.Y,
-; EG-NEXT: AND_INT T0.X, T0.X, T1.X,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
%a = load <4 x i32>, ptr addrspace(1) %in
%b = load <4 x i32>, ptr addrspace(1) %b_ptr
@@ -181,16 +145,6 @@ define amdgpu_kernel void @s_and_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_i32:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT * T1.X, KC0[2].Z, KC0[2].W,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i32 %a, %b
store i32 %and, ptr addrspace(1) %out, align 4
ret void
@@ -220,16 +174,6 @@ define amdgpu_kernel void @s_and_constant_i32(ptr addrspace(1) %out, i32 %a) {
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_constant_i32:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT * T1.X, KC0[2].Z, literal.y,
-; EG-NEXT: 2(2.802597e-45), 1234567(1.729997e-39)
%and = and i32 %a, 1234567
store i32 %and, ptr addrspace(1) %out, align 4
ret void
@@ -277,19 +221,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_0(ptr addrspace(1) %out,
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_multi_use_constant_i32_0:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: MOV T0.X, literal.x,
-; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x,
-; EG-NEXT: 1234567(1.729997e-39), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T1.X, PV.W, KC0[2].W,
-; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i32 %a, 1234567
; Just to stop future replacement of copy to vgpr + store with VALU op.
@@ -333,19 +264,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_1(ptr addrspace(1) %out,
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_multi_use_constant_i32_1:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x,
-; EG-NEXT: 1234567(1.729997e-39), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.W, PV.W, KC0[2].W,
-; EG-NEXT: ADD_INT T0.X, PV.W, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45)
%and = and i32 %a, 1234567
%foo = add i32 %and, 1234567
%bar = add i32 %foo, %b
@@ -394,27 +312,6 @@ define amdgpu_kernel void @v_and_i32_vgpr_vgpr(ptr addrspace(1) %out, ptr addrsp
; GFX8-NEXT: v_and_b32_e32 v2, v5, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
-; EG-LABEL: v_and_i32_vgpr_vgpr:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
-; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: AND_INT T0.X, T0.X, T1.X,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
-; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.a = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
%gep.b = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid
@@ -462,25 +359,6 @@ define amdgpu_kernel void @v_and_i32_sgpr_vgpr(ptr addrspace(1) %out, i32 %a, pt
; GFX8-NEXT: v_and_b32_e32 v2, s2, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
-; EG-LABEL: v_and_i32_sgpr_vgpr:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: AND_INT T0.X, KC0[2].Z, T0.X,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
-; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.b = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -525,25 +403,6 @@ define amdgpu_kernel void @v_and_i32_vgpr_sgpr(ptr addrspace(1) %out, ptr addrsp
; GFX8-NEXT: v_and_b32_e32 v2, s4, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
-; EG-LABEL: v_and_i32_vgpr_sgpr:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: AND_INT T0.X, T0.X, KC0[2].W,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
-; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.a = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -588,24 +447,6 @@ define amdgpu_kernel void @v_and_constant_i32(ptr addrspace(1) %out, ptr addrspa
; GFX8-NEXT: v_and_b32_e32 v0, 0x12d687, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: v_and_constant_i32:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45)
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
%a = load i32, ptr addrspace(1) %gep, align 4
@@ -649,24 +490,6 @@ define amdgpu_kernel void @v_and_inline_imm_64_i32(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_and_b32_e32 v0, 64, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: v_and_inline_imm_64_i32:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45)
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
%a = load i32, ptr addrspace(1) %gep, align 4
@@ -710,24 +533,6 @@ define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(ptr addrspace(1) %out, pt
; GFX8-NEXT: v_and_b32_e32 v0, -16, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: v_and_inline_imm_neg_16_i32:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: -16(nan), 2(2.802597e-45)
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
%a = load i32, ptr addrspace(1) %gep, align 4
@@ -766,17 +571,6 @@ define amdgpu_kernel void @s_and_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT * T0.Y, KC0[3].X, KC0[3].Z,
-; EG-NEXT: AND_INT * T0.X, KC0[2].W, KC0[3].Y,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i64 %a, %b
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -810,33 +604,6 @@ define amdgpu_kernel void @s_and_i1(ptr addrspace(1) %out, i1 %a, i1 %b) {
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_i1:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @10, KC0[], KC1[]
-; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3
-; EG-NEXT: VTX_READ_8 T0.X, T0.X, 41, #3
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T0.X, 0.0,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T1.X, T0.X,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PS, 1,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T0.X, PV.W, PS,
-; EG-NEXT: LSHL * T0.W, literal.x, PS,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: MOV T0.Y, 0.0,
-; EG-NEXT: MOV * T0.Z, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i1 %a, %b
store i1 %and, ptr addrspace(1) %out
ret void
@@ -872,18 +639,6 @@ define amdgpu_kernel void @s_and_constant_i64(ptr addrspace(1) %out, i64 %a) {
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_constant_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
-; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 524288(7.346840e-40), 2(2.802597e-45)
%and = and i64 %a, 549756338176
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -935,26 +690,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_multi_use_constant_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 10, @6, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T4.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: ALU clause starting at 6:
-; EG-NEXT: AND_INT T0.X, KC0[3].Y, literal.x,
-; EG-NEXT: AND_INT * T1.X, KC0[3].Z, literal.y,
-; EG-NEXT: 524288(7.346840e-40), 128(1.793662e-43)
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T2.X, PV.W, literal.x,
-; EG-NEXT: AND_INT * T3.X, KC0[3].X, literal.y,
-; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT * T5.X, KC0[2].W, literal.y,
-; EG-NEXT: 2(2.802597e-45), 524288(7.346840e-40)
%and0 = and i64 %a, 549756338176
%and1 = and i64 %b, 549756338176
store volatile i64 %and0, ptr addrspace(1) %out
@@ -988,17 +723,6 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32,
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_32_bit_constant_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x,
-; EG-NEXT: MOV T0.Y, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45)
%and = and i64 %a, 1234567
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -1058,33 +782,6 @@ define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out,
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_multi_use_inline_imm_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 17, @6, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T5.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: ALU clause starting at 6:
-; EG-NEXT: LSHL T0.W, KC0[3].W, 1,
-; EG-NEXT: LSHL * T1.W, KC0[2].W, 1,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 62(8.688050e-44), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T0.X, PV.W, KC0[4].W,
-; EG-NEXT: AND_INT T1.W, T1.W, literal.x,
-; EG-NEXT: ADDC_UINT * T0.W, PV.W, KC0[4].W,
-; EG-NEXT: 62(8.688050e-44), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T1.X, KC0[5].X, PS,
-; EG-NEXT: ADDC_UINT * T0.W, PV.W, KC0[4].W,
-; EG-NEXT: ADD_INT T2.X, KC0[5].X, PV.W,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T3.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T4.X, T1.W, KC0[4].W,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl.a = shl i64 %a, 1
%shl.b = shl i64 %b, 1
%and0 = and i64 %shl.a, 62
@@ -1141,27 +838,6 @@ define amdgpu_kernel void @v_and_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX8-NEXT: v_and_b32_e32 v0, v0, v2
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: v_and_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
-; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
-; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: AND_INT * T0.Y, T0.Y, T1.Y,
-; EG-NEXT: AND_INT T0.X, T0.X, T1.X,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%a = load i64, ptr addrspace(1) %gep.a, align 8
@@ -1209,26 +885,6 @@ define amdgpu_kernel void @v_and_constant_i64(ptr addrspace(1) %out, ptr addrspa
; GFX8-NEXT: v_and_b32_e32 v0, 0xab19b207, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: v_and_constant_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: AND_INT * T0.Y, T0.Y, literal.x,
-; EG-NEXT: 286(4.007714e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: -1424379385(-5.460358e-13), 2(2.802597e-45)
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%a = load i64, ptr addrspace(1) %gep.a, align 8
@@ -1289,46 +945,6 @@ define amdgpu_kernel void @v_and_multi_use_constant_i64(ptr addrspace(1) %out, p
; GFX8-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-; EG-LABEL: v_and_multi_use_constant_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @12
-; EG-NEXT: ALU 0, @22, KC0[], KC1[]
-; EG-NEXT: TEX 0 @14
-; EG-NEXT: ALU 0, @23, KC0[], KC1[]
-; EG-NEXT: TEX 1 @16
-; EG-NEXT: ALU 10, @24, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T5.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T5.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T4.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_32 T1.X, T1.X, 4, #1
-; EG-NEXT: Fetch clause starting at 14:
-; EG-NEXT: VTX_READ_32 T2.X, T2.X, 0, #1
-; EG-NEXT: Fetch clause starting at 16:
-; EG-NEXT: VTX_READ_32 T3.X, T3.X, 4, #1
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 20:
-; EG-NEXT: MOV * T0.X, KC0[2].Z,
-; EG-NEXT: MOV * T1.X, PV.X,
-; EG-NEXT: ALU clause starting at 22:
-; EG-NEXT: MOV * T2.X, T0.X,
-; EG-NEXT: ALU clause starting at 23:
-; EG-NEXT: MOV * T3.X, T0.X,
-; EG-NEXT: ALU clause starting at 24:
-; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
-; EG-NEXT: AND_INT * T3.X, T3.X, literal.y,
-; EG-NEXT: -1424379385(-5.460358e-13), 286(4.007714e-43)
-; EG-NEXT: AND_INT T2.X, T2.X, literal.x,
-; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
-; EG-NEXT: -1424379385(-5.460358e-13), 2(2.802597e-45)
-; EG-NEXT: AND_INT T1.X, T1.X, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 286(4.007714e-43), 4(5.605194e-45)
-; EG-NEXT: LSHR * T5.X, PV.W, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%a = load volatile i64, ptr addrspace(1) %aptr
%b = load volatile i64, ptr addrspace(1) %aptr
%and0 = and i64 %a, 1231231234567
@@ -1390,44 +1006,6 @@ define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(ptr addrspace(1) %out,
; GFX8-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-; EG-LABEL: v_and_multi_use_inline_imm_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @12
-; EG-NEXT: ALU 0, @22, KC0[], KC1[]
-; EG-NEXT: TEX 0 @14
-; EG-NEXT: ALU 0, @23, KC0[], KC1[]
-; EG-NEXT: TEX 1 @16
-; EG-NEXT: ALU 8, @24, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_32 T1.X, T1.X, 4, #1
-; EG-NEXT: Fetch clause starting at 14:
-; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
-; EG-NEXT: Fetch clause starting at 16:
-; EG-NEXT: VTX_READ_32 T2.X, T2.X, 4, #1
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 20:
-; EG-NEXT: MOV * T0.X, KC0[2].Z,
-; EG-NEXT: MOV * T1.X, PV.X,
-; EG-NEXT: ALU clause starting at 22:
-; EG-NEXT: MOV * T1.X, T0.X,
-; EG-NEXT: ALU clause starting at 23:
-; EG-NEXT: MOV * T2.X, T0.X,
-; EG-NEXT: ALU clause starting at 24:
-; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
-; EG-NEXT: AND_INT * T1.X, T1.X, literal.x,
-; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45)
-; EG-NEXT: LSHR T3.X, PV.W, literal.x,
-; EG-NEXT: MOV * T4.X, literal.y,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%a = load volatile i64, ptr addrspace(1) %aptr
%b = load volatile i64, ptr addrspace(1) %aptr
%and0 = and i64 %a, 63
@@ -1473,25 +1051,6 @@ define amdgpu_kernel void @v_and_i64_32_bit_constant(ptr addrspace(1) %out, ptr
; GFX8-NEXT: v_and_b32_e32 v0, 0x12d687, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: v_and_i64_32_bit_constant:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
-; EG-NEXT: MOV T0.Y, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45)
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%a = load i64, ptr addrspace(1) %gep.a, align 8
@@ -1536,25 +1095,6 @@ define amdgpu_kernel void @v_and_inline_imm_i64(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: v_and_b32_e32 v0, 64, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: v_and_inline_imm_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
-; EG-NEXT: MOV T0.Y, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45)
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%a = load i64, ptr addrspace(1) %gep.a, align 8
@@ -1600,24 +1140,6 @@ define amdgpu_kernel void @v_and_inline_neg_imm_i64(ptr addrspace(1) %out, ptr a
; GFX8-NEXT: v_and_b32_e32 v0, -8, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: v_and_inline_neg_imm_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: -8(nan), 2(2.802597e-45)
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%a = load i64, ptr addrspace(1) %gep.a, align 8
@@ -1652,17 +1174,6 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_imm_64_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x,
-; EG-NEXT: MOV T0.Y, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45)
%and = and i64 %a, 64
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -1702,21 +1213,6 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(ptr addrspace(1) %ou
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_imm_64_i64_noshrink:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: LSHL * T0.W, KC0[2].W, 1,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T0.X, PV.W, KC0[3].W,
-; EG-NEXT: ADDC_UINT T0.W, PV.W, KC0[3].W,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.Y, KC0[4].X, PV.W,
%shl = shl i64 %a, 1
%and = and i64 %shl, 64
%add = add i64 %and, %b
@@ -1750,17 +1246,6 @@ define amdgpu_kernel void @s_and_inline_imm_1_i64(ptr addrspace(1) %out, ptr add
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_imm_1_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT T0.X, KC0[2].W, 1,
-; EG-NEXT: MOV T0.Y, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i64 %a, 1
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -1792,18 +1277,6 @@ define amdgpu_kernel void @s_and_inline_imm_1.0_i64(ptr addrspace(1) %out, ptr a
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_imm_1.0_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
-; EG-NEXT: 1072693248(1.875000e+00), 0(0.000000e+00)
-; EG-NEXT: MOV T0.X, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i64 %a, 4607182418800017408
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -1835,18 +1308,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(ptr addrspace(1) %out, p
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_imm_neg_1.0_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
-; EG-NEXT: -1074790400(-1.875000e+00), 0(0.000000e+00)
-; EG-NEXT: MOV T0.X, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i64 %a, 13830554455654793216
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -1878,18 +1339,6 @@ define amdgpu_kernel void @s_and_inline_imm_0.5_i64(ptr addrspace(1) %out, ptr a
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_imm_0.5_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
-; EG-NEXT: 1071644672(1.750000e+00), 0(0.000000e+00)
-; EG-NEXT: MOV T0.X, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i64 %a, 4602678819172646912
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -1921,18 +1370,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(ptr addrspace(1) %out, p
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_imm_neg_0.5_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
-; EG-NEXT: -1075838976(-1.750000e+00), 0(0.000000e+00)
-; EG-NEXT: MOV T0.X, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i64 %a, 13826050856027422720
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -1964,18 +1401,6 @@ define amdgpu_kernel void @s_and_inline_imm_2.0_i64(ptr addrspace(1) %out, ptr a
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_imm_2.0_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
-; EG-NEXT: 1073741824(2.000000e+00), 0(0.000000e+00)
-; EG-NEXT: MOV T0.X, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i64 %a, 4611686018427387904
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -2007,18 +1432,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(ptr addrspace(1) %out, p
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_imm_neg_2.0_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
-; EG-NEXT: -1073741824(-2.000000e+00), 0(0.000000e+00)
-; EG-NEXT: MOV T0.X, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i64 %a, 13835058055282163712
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -2050,18 +1463,6 @@ define amdgpu_kernel void @s_and_inline_imm_4.0_i64(ptr addrspace(1) %out, ptr a
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_imm_4.0_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
-; EG-NEXT: 1074790400(2.250000e+00), 0(0.000000e+00)
-; EG-NEXT: MOV T0.X, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i64 %a, 4616189618054758400
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -2093,18 +1494,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(ptr addrspace(1) %out, p
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_imm_neg_4.0_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
-; EG-NEXT: -1072693248(-2.250000e+00), 0(0.000000e+00)
-; EG-NEXT: MOV T0.X, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i64 %a, 13839561654909534208
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -2139,17 +1528,6 @@ define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(ptr addrspace(1) %out, p
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_imm_f32_4.0_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x,
-; EG-NEXT: MOV T0.Y, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 1082130432(4.000000e+00), 2(2.802597e-45)
%and = and i64 %a, 1082130432
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -2181,17 +1559,6 @@ define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(ptr addrspace(1) %ou
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_imm_f32_neg_4.0_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: MOV * T0.Y, KC0[3].X,
-; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: -1065353216(-4.000000e+00), 2(2.802597e-45)
%and = and i64 %a, -1065353216
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -2225,18 +1592,6 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_high_imm_f32_4.0_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
-; EG-NEXT: 1082130432(4.000000e+00), 0(0.000000e+00)
-; EG-NEXT: MOV T0.X, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i64 %a, 4647714815446351872
store i64 %and, ptr addrspace(1) %out, align 8
ret void
@@ -2268,18 +1623,6 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(ptr addrspace(1
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_and_inline_high_imm_f32_neg_4.0_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
-; EG-NEXT: -1065353216(-4.000000e+00), 0(0.000000e+00)
-; EG-NEXT: MOV T0.X, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and = and i64 %a, 13871086852301127680
store i64 %and, ptr addrspace(1) %out, align 8
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/and.r600.ll b/llvm/test/CodeGen/AMDGPU/and.r600.ll
new file mode 100644
index 0000000000000..590b1ac899fcf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/and.r600.ll
@@ -0,0 +1,987 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
+
+
+define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; EG-LABEL: test2:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: AND_INT * T0.Y, T0.Y, T0.W,
+; EG-NEXT: AND_INT T0.X, T0.X, T0.Z,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
+ %a = load <2 x i32>, ptr addrspace(1) %in
+ %b = load <2 x i32>, ptr addrspace(1) %b_ptr
+ %result = and <2 x i32> %a, %b
+ store <2 x i32> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; EG-LABEL: test4:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT * T0.W, T0.W, T1.W,
+; EG-NEXT: AND_INT * T0.Z, T0.Z, T1.Z,
+; EG-NEXT: AND_INT * T0.Y, T0.Y, T1.Y,
+; EG-NEXT: AND_INT T0.X, T0.X, T1.X,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
+ %a = load <4 x i32>, ptr addrspace(1) %in
+ %b = load <4 x i32>, ptr addrspace(1) %b_ptr
+ %result = and <4 x i32> %a, %b
+ store <4 x i32> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @s_and_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
+; EG-LABEL: s_and_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT * T1.X, KC0[2].Z, KC0[2].W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i32 %a, %b
+ store i32 %and, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @s_and_constant_i32(ptr addrspace(1) %out, i32 %a) {
+; EG-LABEL: s_and_constant_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT * T1.X, KC0[2].Z, literal.y,
+; EG-NEXT: 2(2.802597e-45), 1234567(1.729997e-39)
+ %and = and i32 %a, 1234567
+ store i32 %and, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; FIXME: We should really duplicate the constant so that the SALU use
+; can fold into the s_and_b32 and the VALU one is materialized
+; directly without copying from the SGPR.
+
+; Second use is a VGPR use of the constant.
+
+define amdgpu_kernel void @s_and_multi_use_constant_i32_0(ptr addrspace(1) %out, i32 %a, i32 %b) {
+; EG-LABEL: s_and_multi_use_constant_i32_0:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: MOV T0.X, literal.x,
+; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 1234567(1.729997e-39), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T1.X, PV.W, KC0[2].W,
+; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i32 %a, 1234567
+
+ ; Just to stop future replacement of copy to vgpr + store with VALU op.
+ %foo = add i32 %and, %b
+ store volatile i32 %foo, ptr addrspace(1) %out
+ store volatile i32 1234567, ptr addrspace(1) %out
+ ret void
+}
+
+; Second use is another SGPR use of the constant.
+
+define amdgpu_kernel void @s_and_multi_use_constant_i32_1(ptr addrspace(1) %out, i32 %a, i32 %b) {
+; EG-LABEL: s_and_multi_use_constant_i32_1:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 1234567(1.729997e-39), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, PV.W, KC0[2].W,
+; EG-NEXT: ADD_INT T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45)
+ %and = and i32 %a, 1234567
+ %foo = add i32 %and, 1234567
+ %bar = add i32 %foo, %b
+ store volatile i32 %bar, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_and_i32_vgpr_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
+; EG-LABEL: v_and_i32_vgpr_vgpr:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: AND_INT T0.X, T0.X, T1.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep.a = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+ %gep.b = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %a = load i32, ptr addrspace(1) %gep.a
+ %b = load i32, ptr addrspace(1) %gep.b
+ %and = and i32 %a, %b
+ store i32 %and, ptr addrspace(1) %gep.out
+ ret void
+}
+
+define amdgpu_kernel void @v_and_i32_sgpr_vgpr(ptr addrspace(1) %out, i32 %a, ptr addrspace(1) %bptr) {
+; EG-LABEL: v_and_i32_sgpr_vgpr:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT T0.X, KC0[2].Z, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep.b = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %b = load i32, ptr addrspace(1) %gep.b
+ %and = and i32 %a, %b
+ store i32 %and, ptr addrspace(1) %gep.out
+ ret void
+}
+
+define amdgpu_kernel void @v_and_i32_vgpr_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i32 %b) {
+; EG-LABEL: v_and_i32_vgpr_sgpr:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT T0.X, T0.X, KC0[2].W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep.a = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %a = load i32, ptr addrspace(1) %gep.a
+ %and = and i32 %a, %b
+ store i32 %and, ptr addrspace(1) %gep.out
+ ret void
+}
+
+define amdgpu_kernel void @v_and_constant_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
+; EG-LABEL: v_and_constant_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45)
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+ %a = load i32, ptr addrspace(1) %gep, align 4
+ %and = and i32 %a, 1234567
+ store i32 %and, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v_and_inline_imm_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
+; EG-LABEL: v_and_inline_imm_64_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45)
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+ %a = load i32, ptr addrspace(1) %gep, align 4
+ %and = and i32 %a, 64
+ store i32 %and, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
+; EG-LABEL: v_and_inline_imm_neg_16_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: -16(nan), 2(2.802597e-45)
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+ %a = load i32, ptr addrspace(1) %gep, align 4
+ %and = and i32 %a, -16
+ store i32 %and, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @s_and_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
+; EG-LABEL: s_and_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, KC0[3].Z,
+; EG-NEXT: AND_INT * T0.X, KC0[2].W, KC0[3].Y,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, %b
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_i1(ptr addrspace(1) %out, i1 %a, i1 %b) {
+; EG-LABEL: s_and_i1:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @10, KC0[], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 41, #3
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T1.X, T0.X,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PS, 1,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i1 %a, %b
+ store i1 %and, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @s_and_constant_i64(ptr addrspace(1) %out, i64 %a) {
+; EG-LABEL: s_and_constant_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 524288(7.346840e-40), 2(2.802597e-45)
+ %and = and i64 %a, 549756338176
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
+; EG-LABEL: s_and_multi_use_constant_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 10, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T4.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: ALU clause starting at 6:
+; EG-NEXT: AND_INT T0.X, KC0[3].Y, literal.x,
+; EG-NEXT: AND_INT * T1.X, KC0[3].Z, literal.y,
+; EG-NEXT: 524288(7.346840e-40), 128(1.793662e-43)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.X, PV.W, literal.x,
+; EG-NEXT: AND_INT * T3.X, KC0[3].X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
+; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT * T5.X, KC0[2].W, literal.y,
+; EG-NEXT: 2(2.802597e-45), 524288(7.346840e-40)
+ %and0 = and i64 %a, 549756338176
+ %and1 = and i64 %b, 549756338176
+ store volatile i64 %and0, ptr addrspace(1) %out
+ store volatile i64 %and1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32, i64 %a) {
+; EG-LABEL: s_and_32_bit_constant_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45)
+ %and = and i64 %a, 1234567
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
+; EG-LABEL: s_and_multi_use_inline_imm_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 17, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T5.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: ALU clause starting at 6:
+; EG-NEXT: LSHL T0.W, KC0[3].W, 1,
+; EG-NEXT: LSHL * T1.W, KC0[2].W, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
+; EG-NEXT: 62(8.688050e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T0.X, PV.W, KC0[4].W,
+; EG-NEXT: AND_INT T1.W, T1.W, literal.x,
+; EG-NEXT: ADDC_UINT * T0.W, PV.W, KC0[4].W,
+; EG-NEXT: 62(8.688050e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T1.X, KC0[5].X, PS,
+; EG-NEXT: ADDC_UINT * T0.W, PV.W, KC0[4].W,
+; EG-NEXT: ADD_INT T2.X, KC0[5].X, PV.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T3.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T4.X, T1.W, KC0[4].W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %shl.a = shl i64 %a, 1
+ %shl.b = shl i64 %b, 1
+ %and0 = and i64 %shl.a, 62
+ %and1 = and i64 %shl.b, 62
+ %add0 = add i64 %and0, %c
+ %add1 = add i64 %and1, %c
+ store volatile i64 %add0, ptr addrspace(1) %out
+ store volatile i64 %add1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_and_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
+; EG-LABEL: v_and_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: AND_INT * T0.Y, T0.Y, T1.Y,
+; EG-NEXT: AND_INT T0.X, T0.X, T1.X,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
+ %a = load i64, ptr addrspace(1) %gep.a, align 8
+ %gep.b = getelementptr i64, ptr addrspace(1) %bptr, i32 %tid
+ %b = load i64, ptr addrspace(1) %gep.b, align 8
+ %and = and i64 %a, %b
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @v_and_constant_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
+; EG-LABEL: v_and_constant_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT * T0.Y, T0.Y, literal.x,
+; EG-NEXT: 286(4.007714e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: -1424379385(-5.460358e-13), 2(2.802597e-45)
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
+ %a = load i64, ptr addrspace(1) %gep.a, align 8
+ %and = and i64 %a, 1231231234567
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @v_and_multi_use_constant_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
+; EG-LABEL: v_and_multi_use_constant_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @12
+; EG-NEXT: ALU 0, @22, KC0[], KC1[]
+; EG-NEXT: TEX 0 @14
+; EG-NEXT: ALU 0, @23, KC0[], KC1[]
+; EG-NEXT: TEX 1 @16
+; EG-NEXT: ALU 10, @24, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T5.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T5.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T4.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: Fetch clause starting at 12:
+; EG-NEXT: VTX_READ_32 T1.X, T1.X, 4, #1
+; EG-NEXT: Fetch clause starting at 14:
+; EG-NEXT: VTX_READ_32 T2.X, T2.X, 0, #1
+; EG-NEXT: Fetch clause starting at 16:
+; EG-NEXT: VTX_READ_32 T3.X, T3.X, 4, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 20:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: MOV * T1.X, PV.X,
+; EG-NEXT: ALU clause starting at 22:
+; EG-NEXT: MOV * T2.X, T0.X,
+; EG-NEXT: ALU clause starting at 23:
+; EG-NEXT: MOV * T3.X, T0.X,
+; EG-NEXT: ALU clause starting at 24:
+; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
+; EG-NEXT: AND_INT * T3.X, T3.X, literal.y,
+; EG-NEXT: -1424379385(-5.460358e-13), 286(4.007714e-43)
+; EG-NEXT: AND_INT T2.X, T2.X, literal.x,
+; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
+; EG-NEXT: -1424379385(-5.460358e-13), 2(2.802597e-45)
+; EG-NEXT: AND_INT T1.X, T1.X, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 286(4.007714e-43), 4(5.605194e-45)
+; EG-NEXT: LSHR * T5.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %a = load volatile i64, ptr addrspace(1) %aptr
+ %b = load volatile i64, ptr addrspace(1) %aptr
+ %and0 = and i64 %a, 1231231234567
+ %and1 = and i64 %b, 1231231234567
+ store volatile i64 %and0, ptr addrspace(1) %out
+ store volatile i64 %and1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
+; EG-LABEL: v_and_multi_use_inline_imm_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @12
+; EG-NEXT: ALU 0, @22, KC0[], KC1[]
+; EG-NEXT: TEX 0 @14
+; EG-NEXT: ALU 0, @23, KC0[], KC1[]
+; EG-NEXT: TEX 1 @16
+; EG-NEXT: ALU 8, @24, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: Fetch clause starting at 12:
+; EG-NEXT: VTX_READ_32 T1.X, T1.X, 4, #1
+; EG-NEXT: Fetch clause starting at 14:
+; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
+; EG-NEXT: Fetch clause starting at 16:
+; EG-NEXT: VTX_READ_32 T2.X, T2.X, 4, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 20:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: MOV * T1.X, PV.X,
+; EG-NEXT: ALU clause starting at 22:
+; EG-NEXT: MOV * T1.X, T0.X,
+; EG-NEXT: ALU clause starting at 23:
+; EG-NEXT: MOV * T2.X, T0.X,
+; EG-NEXT: ALU clause starting at 24:
+; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
+; EG-NEXT: AND_INT * T1.X, T1.X, literal.x,
+; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45)
+; EG-NEXT: LSHR T3.X, PV.W, literal.x,
+; EG-NEXT: MOV * T4.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %a = load volatile i64, ptr addrspace(1) %aptr
+ %b = load volatile i64, ptr addrspace(1) %aptr
+ %and0 = and i64 %a, 63
+ %and1 = and i64 %b, 63
+ store volatile i64 %and0, ptr addrspace(1) %out
+ store volatile i64 %and1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_and_i64_32_bit_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
+; EG-LABEL: v_and_i64_32_bit_constant:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45)
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
+ %a = load i64, ptr addrspace(1) %gep.a, align 8
+ %and = and i64 %a, 1234567
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @v_and_inline_imm_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
+; EG-LABEL: v_and_inline_imm_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45)
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
+ %a = load i64, ptr addrspace(1) %gep.a, align 8
+ %and = and i64 %a, 64
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+; FIXME: Should be able to reduce load width
+
+define amdgpu_kernel void @v_and_inline_neg_imm_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
+; EG-LABEL: v_and_inline_neg_imm_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: -8(nan), 2(2.802597e-45)
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
+ %a = load i64, ptr addrspace(1) %gep.a, align 8
+ %and = and i64 %a, -8
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_64_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45)
+ %and = and i64 %a, 64
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a, i32, i64 %b) {
+; EG-LABEL: s_and_inline_imm_64_i64_noshrink:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHL * T0.W, KC0[2].W, 1,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
+; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T0.X, PV.W, KC0[3].W,
+; EG-NEXT: ADDC_UINT T0.W, PV.W, KC0[3].W,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.Y, KC0[4].X, PV.W,
+ %shl = shl i64 %a, 1
+ %and = and i64 %shl, 64
+ %add = add i64 %and, %b
+ store i64 %add, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_1_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT T0.X, KC0[2].W, 1,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 1
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_1.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_1.0_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: 1072693248(1.875000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 4607182418800017408
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_neg_1.0_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: -1074790400(-1.875000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 13830554455654793216
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_0.5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_0.5_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: 1071644672(1.750000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 4602678819172646912
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_neg_0.5_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: -1075838976(-1.750000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 13826050856027422720
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_2.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_2.0_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: 1073741824(2.000000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 4611686018427387904
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_neg_2.0_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: -1073741824(-2.000000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 13835058055282163712
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_4.0_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: 1074790400(2.250000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 4616189618054758400
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_neg_4.0_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: -1072693248(-2.250000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 13839561654909534208
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+; Test with the 64-bit integer bitpattern for a 32-bit float in the
+; low 32-bits, which is not a valid 64-bit inline immediate.
+
+define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_f32_4.0_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 1082130432(4.000000e+00), 2(2.802597e-45)
+ %and = and i64 %a, 1082130432
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_f32_neg_4.0_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: MOV * T0.Y, KC0[3].X,
+; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: -1065353216(-4.000000e+00), 2(2.802597e-45)
+ %and = and i64 %a, -1065353216
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+; Shift into upper 32-bits
+
+define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_high_imm_f32_4.0_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: 1082130432(4.000000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 4647714815446351872
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_high_imm_f32_neg_4.0_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: -1065353216(-4.000000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 13871086852301127680
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 0da53f2a95953..728067edcf399 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -72,23 +72,6 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: or_v2i32:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: OR_INT * T0.Y, T0.Y, T0.W,
-; EG-NEXT: OR_INT T0.X, T0.X, T0.Z,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
%a = load <2 x i32>, ptr addrspace(1) %in
%b = load <2 x i32>, ptr addrspace(1) %b_ptr
@@ -141,26 +124,6 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: or_v4i32:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
-; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T0.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: OR_INT * T0.W, T0.W, T1.W,
-; EG-NEXT: OR_INT * T0.Z, T0.Z, T1.Z,
-; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y,
-; EG-NEXT: OR_INT T0.X, T0.X, T1.X,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
%a = load <4 x i32>, ptr addrspace(1) %in
%b = load <4 x i32>, ptr addrspace(1) %b_ptr
@@ -195,16 +158,6 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: scalar_or_i32:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T1.X, KC0[2].Z, KC0[2].W,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%or = or i32 %a, %b
store i32 %or, ptr addrspace(1) %out
ret void
@@ -248,22 +201,6 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8-NEXT: v_or_b32_e32 v0, s12, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: vector_or_i32:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%loada = load i32, ptr addrspace(1) %a
%or = or i32 %loada, %b
store i32 %or, ptr addrspace(1) %out
@@ -294,16 +231,6 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: scalar_or_literal_i32:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T1.X, KC0[2].Z, literal.y,
-; EG-NEXT: 2(2.802597e-45), 99999(1.401284e-40)
%or = or i32 %a, 99999
store i32 %or, ptr addrspace(1) %out, align 4
ret void
@@ -337,18 +264,6 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: scalar_or_literal_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: OR_INT * T0.Y, KC0[5].X, literal.x,
-; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00)
-; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45)
%or = or i64 %a, 4261135838621753
store i64 %or, ptr addrspace(1) %out
ret void
@@ -399,28 +314,6 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-; EG-LABEL: scalar_or_literal_multi_use_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 6:
-; EG-NEXT: ADDC_UINT * T0.W, KC0[7].Y, literal.x,
-; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T0.X, KC0[7].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[7].Z, PV.W,
-; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T1.X, PV.W, literal.x,
-; EG-NEXT: MOV * T2.X, literal.y,
-; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T3.Y, KC0[5].X, literal.x,
-; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00)
-; EG-NEXT: OR_INT T3.X, KC0[4].W, literal.x,
-; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
-; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45)
%or = or i64 %a, 4261135838621753
store i64 %or, ptr addrspace(1) %out
@@ -455,17 +348,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: scalar_or_inline_imm_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: MOV * T0.Y, KC0[5].X,
-; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45)
%or = or i64 %a, 63
store i64 %or, ptr addrspace(1) %out
ret void
@@ -514,25 +396,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
-; EG-LABEL: scalar_or_inline_imm_multi_use_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 9, @6, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 6:
-; EG-NEXT: ADD_INT T0.X, KC0[3].Y, literal.x,
-; EG-NEXT: ADDC_UINT * T0.W, KC0[3].Y, literal.x,
-; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T1.X, KC0[3].Z, PV.W,
-; EG-NEXT: MOV * T2.X, literal.x,
-; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
-; EG-NEXT: MOV * T3.Y, KC0[3].X,
-; EG-NEXT: OR_INT T3.X, KC0[2].W, literal.x,
-; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
-; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45)
%or = or i64 %a, 63
store i64 %or, ptr addrspace(1) %out
%foo = add i64 %b, 63
@@ -566,18 +429,6 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: scalar_or_neg_inline_imm_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: -8(nan), 2(2.802597e-45)
-; EG-NEXT: MOV * T0.Y, literal.x,
-; EG-NEXT: -1(nan), 0(0.000000e+00)
%or = or i64 %a, -8
store i64 %or, ptr addrspace(1) %out
ret void
@@ -619,22 +470,6 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: v_or_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: vector_or_literal_i32:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
%loada = load i32, ptr addrspace(1) %a, align 4
%or = or i32 %loada, 65535
store i32 %or, ptr addrspace(1) %out, align 4
@@ -677,22 +512,6 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out,
; GFX8-NEXT: v_or_b32_e32 v0, 4, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: vector_or_inline_immediate_i32:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45)
%loada = load i32, ptr addrspace(1) %a, align 4
%or = or i32 %loada, 4
store i32 %or, ptr addrspace(1) %out, align 4
@@ -729,17 +548,6 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: scalar_or_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: OR_INT * T0.Y, KC0[3].X, KC0[3].Z,
-; EG-NEXT: OR_INT * T0.X, KC0[2].W, KC0[3].Y,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%or = or i64 %a, %b
store i64 %or, ptr addrspace(1) %out
ret void
@@ -791,25 +599,6 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: vector_or_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
-; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV T0.X, KC0[2].Z,
-; EG-NEXT: MOV * T1.X, KC0[2].W,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y,
-; EG-NEXT: OR_INT T0.X, T0.X, T1.X,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%loada = load i64, ptr addrspace(1) %a, align 8
%loadb = load i64, ptr addrspace(1) %b, align 8
%or = or i64 %loada, %loadb
@@ -857,23 +646,6 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: v_or_b32_e32 v1, s13, v1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: scalar_vector_or_i64:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: OR_INT * T0.Y, T0.Y, KC0[3].X,
-; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%loada = load i64, ptr addrspace(1) %a
%or = or i64 %loada, %b
store i64 %or, ptr addrspace(1) %out
@@ -918,24 +690,6 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: v_or_b32_e32 v0, 0xdf77987f, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: vector_or_i64_loadimm:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: OR_INT * T0.Y, T0.Y, literal.x,
-; EG-NEXT: 5231(7.330192e-42), 0(0.000000e+00)
-; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: -545810305(-1.784115e+19), 2(2.802597e-45)
%loada = load i64, ptr addrspace(1) %a, align 8
%or = or i64 %loada, 22470723082367
store i64 %or, ptr addrspace(1) %out
@@ -979,22 +733,6 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac
; GFX8-NEXT: v_or_b32_e32 v0, 8, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: vector_or_i64_imm:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
%loada = load i64, ptr addrspace(1) %a, align 8
%or = or i64 %loada, 8
store i64 %or, ptr addrspace(1) %out
@@ -1039,24 +777,6 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p
; GFX8-NEXT: v_or_b32_e32 v0, -8, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: vector_or_i64_neg_inline_imm:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: -8(nan), 2(2.802597e-45)
-; EG-NEXT: MOV * T0.Y, literal.x,
-; EG-NEXT: -1(nan), 0(0.000000e+00)
%loada = load i64, ptr addrspace(1) %a, align 8
%or = or i64 %loada, -8
store i64 %or, ptr addrspace(1) %out
@@ -1101,24 +821,6 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr
; GFX8-NEXT: v_or_b32_e32 v0, 0xffffff38, v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: vector_or_i64_neg_literal:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: -200(nan), 2(2.802597e-45)
-; EG-NEXT: MOV * T0.Y, literal.x,
-; EG-NEXT: -1(nan), 0(0.000000e+00)
%loada = load i64, ptr addrspace(1) %a, align 8
%or = or i64 %loada, -200
store i64 %or, ptr addrspace(1) %out
@@ -1151,16 +853,6 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32],
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: trunc_i64_or_to_i32:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.X, KC0[7].Y, KC0[4].W,
%add = or i64 %b, %a
%trunc = trunc i64 %add to i32
store i32 %trunc, ptr addrspace(1) %out, align 8
@@ -1221,26 +913,6 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: or_i1:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 4, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV T0.X, KC0[2].Z,
-; EG-NEXT: MOV * T1.X, KC0[2].W,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: MAX_DX10 * T0.W, T0.X, T1.X,
-; EG-NEXT: SETGE_DX10 * T0.W, PV.W, 0.0,
-; EG-NEXT: AND_INT T0.X, PV.W, 1,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%a = load float, ptr addrspace(1) %in0
%b = load float, ptr addrspace(1) %in1
%acmp = fcmp oge float %a, 0.000000e+00
@@ -1283,28 +955,6 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
-; EG-LABEL: s_or_i1:
-; EG: ; %bb.0:
-; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
-; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: SETE_INT T0.W, KC0[3].X, KC0[3].Y,
-; EG-NEXT: SETE_INT * T1.W, KC0[2].Z, KC0[2].W,
-; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PS, 1,
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T0.X, PV.W, PS,
-; EG-NEXT: LSHL * T0.W, literal.x, PS,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: MOV T0.Y, 0.0,
-; EG-NEXT: MOV * T0.Z, 0.0,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%cmp0 = icmp eq i32 %a, %b
%cmp1 = icmp eq i32 %c, %d
%or = or i1 %cmp0, %cmp1
diff --git a/llvm/test/CodeGen/AMDGPU/or.r600.ll b/llvm/test/CodeGen/AMDGPU/or.r600.ll
new file mode 100644
index 0000000000000..ed9d0085fd82a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/or.r600.ll
@@ -0,0 +1,515 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
+
+
+define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; EG-LABEL: or_v2i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: OR_INT * T0.Y, T0.Y, T0.W,
+; EG-NEXT: OR_INT T0.X, T0.X, T0.Z,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
+ %a = load <2 x i32>, ptr addrspace(1) %in
+ %b = load <2 x i32>, ptr addrspace(1) %b_ptr
+ %result = or <2 x i32> %a, %b
+ store <2 x i32> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; EG-LABEL: or_v4i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: OR_INT * T0.W, T0.W, T1.W,
+; EG-NEXT: OR_INT * T0.Z, T0.Z, T1.Z,
+; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y,
+; EG-NEXT: OR_INT T0.X, T0.X, T1.X,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
+ %a = load <4 x i32>, ptr addrspace(1) %in
+ %b = load <4 x i32>, ptr addrspace(1) %b_ptr
+ %result = or <4 x i32> %a, %b
+ store <4 x i32> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
+; EG-LABEL: scalar_or_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T1.X, KC0[2].Z, KC0[2].W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %or = or i32 %a, %b
+ store i32 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, i32 %b) {
+; EG-LABEL: vector_or_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %loada = load i32, ptr addrspace(1) %a
+ %or = or i32 %loada, %b
+ store i32 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) {
+; EG-LABEL: scalar_or_literal_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T1.X, KC0[2].Z, literal.y,
+; EG-NEXT: 2(2.802597e-45), 99999(1.401284e-40)
+ %or = or i32 %a, 99999
+ store i32 %or, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+; EG-LABEL: scalar_or_literal_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: OR_INT * T0.Y, KC0[5].X, literal.x,
+; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45)
+ %or = or i64 %a, 4261135838621753
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
+; EG-LABEL: scalar_or_literal_multi_use_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 6:
+; EG-NEXT: ADDC_UINT * T0.W, KC0[7].Y, literal.x,
+; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T0.X, KC0[7].Y, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[7].Z, PV.W,
+; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T1.X, PV.W, literal.x,
+; EG-NEXT: MOV * T2.X, literal.y,
+; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T3.Y, KC0[5].X, literal.x,
+; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00)
+; EG-NEXT: OR_INT T3.X, KC0[4].W, literal.x,
+; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
+; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45)
+ %or = or i64 %a, 4261135838621753
+ store i64 %or, ptr addrspace(1) %out
+
+ %foo = add i64 %b, 4261135838621753
+ store volatile i64 %foo, ptr addrspace(1) poison
+ ret void
+}
+
+define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+; EG-LABEL: scalar_or_inline_imm_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: MOV * T0.Y, KC0[5].X,
+; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45)
+ %or = or i64 %a, 63
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
+; EG-LABEL: scalar_or_inline_imm_multi_use_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 9, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 6:
+; EG-NEXT: ADD_INT T0.X, KC0[3].Y, literal.x,
+; EG-NEXT: ADDC_UINT * T0.W, KC0[3].Y, literal.x,
+; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T1.X, KC0[3].Z, PV.W,
+; EG-NEXT: MOV * T2.X, literal.x,
+; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV * T3.Y, KC0[3].X,
+; EG-NEXT: OR_INT T3.X, KC0[2].W, literal.x,
+; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
+; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45)
+ %or = or i64 %a, 63
+ store i64 %or, ptr addrspace(1) %out
+ %foo = add i64 %b, 63
+ store volatile i64 %foo, ptr addrspace(1) poison
+ ret void
+}
+
+define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+; EG-LABEL: scalar_or_neg_inline_imm_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: -8(nan), 2(2.802597e-45)
+; EG-NEXT: MOV * T0.Y, literal.x,
+; EG-NEXT: -1(nan), 0(0.000000e+00)
+ %or = or i64 %a, -8
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; EG-LABEL: vector_or_literal_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
+ %loada = load i32, ptr addrspace(1) %a, align 4
+ %or = or i32 %loada, 65535
+ store i32 %or, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; EG-LABEL: vector_or_inline_immediate_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45)
+ %loada = load i32, ptr addrspace(1) %a, align 4
+ %or = or i32 %loada, 4
+ store i32 %or, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
+; EG-LABEL: scalar_or_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: OR_INT * T0.Y, KC0[3].X, KC0[3].Z,
+; EG-NEXT: OR_INT * T0.X, KC0[2].W, KC0[3].Y,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %or = or i64 %a, %b
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; EG-LABEL: vector_or_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV T0.X, KC0[2].Z,
+; EG-NEXT: MOV * T1.X, KC0[2].W,
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y,
+; EG-NEXT: OR_INT T0.X, T0.X, T1.X,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %loada = load i64, ptr addrspace(1) %a, align 8
+ %loadb = load i64, ptr addrspace(1) %b, align 8
+ %or = or i64 %loada, %loadb
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, i64 %b) {
+; EG-LABEL: scalar_vector_or_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: OR_INT * T0.Y, T0.Y, KC0[3].X,
+; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %loada = load i64, ptr addrspace(1) %a
+ %or = or i64 %loada, %b
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; EG-LABEL: vector_or_i64_loadimm:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: OR_INT * T0.Y, T0.Y, literal.x,
+; EG-NEXT: 5231(7.330192e-42), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: -545810305(-1.784115e+19), 2(2.802597e-45)
+ %loada = load i64, ptr addrspace(1) %a, align 8
+ %or = or i64 %loada, 22470723082367
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+; FIXME: The or 0 should really be removed.
+define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; EG-LABEL: vector_or_i64_imm:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
+ %loada = load i64, ptr addrspace(1) %a, align 8
+ %or = or i64 %loada, 8
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; EG-LABEL: vector_or_i64_neg_inline_imm:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: -8(nan), 2(2.802597e-45)
+; EG-NEXT: MOV * T0.Y, literal.x,
+; EG-NEXT: -1(nan), 0(0.000000e+00)
+ %loada = load i64, ptr addrspace(1) %a, align 8
+ %or = or i64 %loada, -8
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; EG-LABEL: vector_or_i64_neg_literal:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: -200(nan), 2(2.802597e-45)
+; EG-NEXT: MOV * T0.Y, literal.x,
+; EG-NEXT: -1(nan), 0(0.000000e+00)
+ %loada = load i64, ptr addrspace(1) %a, align 8
+ %or = or i64 %loada, -200
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
+; EG-LABEL: trunc_i64_or_to_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T1.X, KC0[7].Y, KC0[4].W,
+ %add = or i64 %b, %a
+ %trunc = trunc i64 %add to i32
+ store i32 %trunc, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; EG-LABEL: or_i1:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 4, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV T0.X, KC0[2].Z,
+; EG-NEXT: MOV * T1.X, KC0[2].W,
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: MAX_DX10 * T0.W, T0.X, T1.X,
+; EG-NEXT: SETGE_DX10 * T0.W, PV.W, 0.0,
+; EG-NEXT: AND_INT T0.X, PV.W, 1,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %a = load float, ptr addrspace(1) %in0
+ %b = load float, ptr addrspace(1) %in1
+ %acmp = fcmp oge float %a, 0.000000e+00
+ %bcmp = fcmp oge float %b, 0.000000e+00
+ %or = or i1 %acmp, %bcmp
+ %result = zext i1 %or to i32
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+; EG-LABEL: s_or_i1:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: SETE_INT T0.W, KC0[3].X, KC0[3].Y,
+; EG-NEXT: SETE_INT * T1.W, KC0[2].Z, KC0[2].W,
+; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T0.W, PS, PV.W,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PS, 1,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %cmp0 = icmp eq i32 %a, %b
+ %cmp1 = icmp eq i32 %c, %d
+ %or = or i1 %cmp0, %cmp1
+ store i1 %or, ptr addrspace(1) %out
+ ret void
+}
>From 12f29b7408224964752d185bca09755ca72b30fe Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Wed, 25 Jun 2025 09:38:06 -0500
Subject: [PATCH 12/15] Remove unhelpful commentary from AMDGPUISelLowering.cpp.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 --------
1 file changed, 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a05d3cfb9e212..584aa30e31f9a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4056,10 +4056,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
- // When the shl64_reduce optimisation code is passed through vector
- // legalization some scalarising occurs. After ISD::AND was legalised, this
- // resulted in the AND instructions no longer being elided, as mentioned
- // below. The following code should make sure this takes place.
if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue VAND = RHS.getOperand(0);
if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
@@ -4234,10 +4230,6 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
SDLoc SL(N);
unsigned RHSVal;
- // When the shl64_reduce optimisation code is passed through vector
- // legalization some scalarising occurs. After ISD::AND was legalised, this
- // resulted in the AND instructions no longer being elided, as mentioned
- // below. The following code should make sure this takes place.
if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue VAND = RHS.getOperand(0);
if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
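(For context between these patches: the combine the deleted comment annotated is the one retained in the hunks above. Once v2i32 AND is legal, a 32-bit shift amount can reach the combiner as an extract_vector_elt of a masked vector, and the combiner has to look through the mask itself. A minimal sketch of that shape, with a hypothetical helper name and the known-bits reasoning reduced to a hard-coded splat of 31:

    #include "llvm/CodeGen/SelectionDAG.h"
    using namespace llvm;

    // Hypothetical helper, not the in-tree code: look through a splatted
    // (and v, 31) feeding an extract_vector_elt that is used as an i32
    // shift amount. A 32-bit shift reads only the low 5 bits of its
    // amount, so the mask is a no-op and can be elided.
    static SDValue lookThroughShiftAmountMask(SDValue Amt, SelectionDAG &DAG) {
      if (Amt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
        return SDValue();
      SDValue Vec = Amt.getOperand(0);
      if (Vec.getOpcode() != ISD::AND)
        return SDValue();
      if (ConstantSDNode *Mask = isConstOrConstSplat(Vec.getOperand(1)))
        if (Mask->getZExtValue() == 31)
          return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Amt),
                             Amt.getValueType(), Vec.getOperand(0),
                             Amt.getOperand(1));
      return SDValue();
    }

The retained code above additionally requires the extract index to be a constant, as the dyn_cast on RHS->getOperand(1) shows; the sketch leaves that check out.)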
>From b3e6e22fc9b00cf3b9f54f944fd9d2a0100487b5 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Wed, 25 Jun 2025 09:45:13 -0500
Subject: [PATCH 13/15] Remove unnecessary drive-by clang-format changes
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 41 +++++++++++++-----------
1 file changed, 23 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 1b93046b5d3e6..1835e540cfe21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2477,25 +2477,30 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
} // end True16Predicate = NotHasTrue16BitInsts
let True16Predicate = UseRealTrue16Insts in {
- def : GCNPat<(rotr i32:$src0, i32:$src1),
- (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src0,
- /* src2_modifiers */ 0, (EXTRACT_SUBREG $src1, lo16),
- /* clamp */ 0, /* op_sel */ 0)>;
+def : GCNPat <
+ (rotr i32:$src0, i32:$src1),
+ (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
+ /* src1_modifiers */ 0, $src0,
+ /* src2_modifiers */ 0,
+ (EXTRACT_SUBREG $src1, lo16),
+ /* clamp */ 0, /* op_sel */ 0)
+>;
- def : GCNPat<
- (i32(trunc(srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
- (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
- (i32(EXTRACT_SUBREG(i64 $src0), sub1)), 0, /* src1_modifiers */
- (i32(EXTRACT_SUBREG(i64 $src0), sub0)), 0, /* src2_modifiers */
- (i16(EXTRACT_SUBREG VGPR_32:$src1, lo16)),
- /* clamp */ 0, /* op_sel */ 0)>;
-
- def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
- (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src1,
- /* src2_modifiers */ 0, (EXTRACT_SUBREG VGPR_32:$src2, lo16),
- /* clamp */ 0, /* op_sel */ 0)>;
+def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
+ (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
+ (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
+ 0, /* src1_modifiers */
+ (i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
+ 0, /* src2_modifiers */
+ (i16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)),
+ /* clamp */ 0, /* op_sel */ 0)>;
+
+def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
+ (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
+ /* src1_modifiers */ 0, $src1,
+ /* src2_modifiers */ 0,
+ (EXTRACT_SUBREG VGPR_32:$src2, lo16),
+ /* clamp */ 0, /* op_sel */ 0)>;
} // end True16Predicate = UseRealTrue16Insts
let True16Predicate = UseFakeTrue16Insts in {
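(The rotr and fshr patterns above all map onto the alignbit semantics: V_ALIGNBIT_B32 extracts 32 contiguous bits from the 64-bit concatenation {src0, src1}, starting at bit (src2 & 31). Passing the same register as both data operands therefore gives a 32-bit rotate right, which is why the rotr pattern feeds $src0 in twice. A small reference model of that identity, plain C++ for illustration only:

    #include <cstdint>

    // alignbit(hi, lo, s) == bits [s, s+31] of the 64-bit value {hi, lo}.
    static uint32_t alignbit(uint32_t Hi, uint32_t Lo, uint32_t Shift) {
      uint64_t Concat = (uint64_t(Hi) << 32) | Lo;
      return uint32_t(Concat >> (Shift & 31));
    }

    // With both halves equal to x, the extracted window wraps around
    // exactly like a rotate right: rotr32(x, s) == alignbit(x, x, s).
    static uint32_t rotr32(uint32_t X, uint32_t S) {
      return alignbit(X, X, S);
    }
)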
>From eb851bf15218a0edfdbd653158a81cdb56419f6c Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Wed, 25 Jun 2025 09:48:58 -0500
Subject: [PATCH 14/15] Remove dead checks in xor.ll
---
llvm/test/CodeGen/AMDGPU/xor.ll | 572 --------------------------------
1 file changed, 572 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index d7e780a5ddf74..feb6ecd996516 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -76,36 +76,6 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-NEXT: v_xor_b32_e32 v1, v1, v3
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
-; SIS-LABEL: xor_v2i32:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
-; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; SIS-NEXT: s_mov_b32 s3, 0xf000
-; SIS-NEXT: s_mov_b32 s2, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5]
-; SIS-NEXT: v_mov_b32_e32 v0, s4
-; SIS-NEXT: v_mov_b32_e32 v1, s5
-; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: xor_v2i32:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
-; VIS-NEXT: v_mov_b32_e32 v3, s1
-; VIS-NEXT: v_mov_b32_e32 v2, s0
-; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
-; VIS-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %in0
%b = load <2 x i32>, ptr addrspace(1) %in1
@@ -160,46 +130,6 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-NEXT: v_xor_b32_e32 v0, v0, v4
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: s_endpgm
-; SIS-LABEL: xor_v4i32:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; SIS-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; SIS-NEXT: s_mov_b32 s3, 0xf000
-; SIS-NEXT: s_mov_b32 s2, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_xor_b32 s7, s7, s11
-; SIS-NEXT: s_xor_b32 s6, s6, s10
-; SIS-NEXT: s_xor_b32 s5, s5, s9
-; SIS-NEXT: s_xor_b32 s4, s4, s8
-; SIS-NEXT: v_mov_b32_e32 v0, s4
-; SIS-NEXT: v_mov_b32_e32 v1, s5
-; SIS-NEXT: v_mov_b32_e32 v2, s6
-; SIS-NEXT: v_mov_b32_e32 v3, s7
-; SIS-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: xor_v4i32:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VIS-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; VIS-NEXT: v_mov_b32_e32 v4, s0
-; VIS-NEXT: v_mov_b32_e32 v5, s1
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_xor_b32 s0, s7, s11
-; VIS-NEXT: s_xor_b32 s1, s6, s10
-; VIS-NEXT: s_xor_b32 s2, s5, s9
-; VIS-NEXT: s_xor_b32 s3, s4, s8
-; VIS-NEXT: v_mov_b32_e32 v0, s3
-; VIS-NEXT: v_mov_b32_e32 v1, s2
-; VIS-NEXT: v_mov_b32_e32 v2, s1
-; VIS-NEXT: v_mov_b32_e32 v3, s0
-; VIS-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VIS-NEXT: s_endpgm
%a = load <4 x i32>, ptr addrspace(1) %in0
%b = load <4 x i32>, ptr addrspace(1) %in1
%result = xor <4 x i32> %a, %b
@@ -255,44 +185,6 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0,
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
-; SIS-LABEL: xor_i1:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SIS-NEXT: s_mov_b32 s7, 0xf000
-; SIS-NEXT: s_mov_b32 s6, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_load_dword s8, s[2:3], 0x0
-; SIS-NEXT: s_load_dword s9, s[4:5], 0x0
-; SIS-NEXT: s_mov_b32 s4, s0
-; SIS-NEXT: s_mov_b32 s5, s1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: v_cmp_ge_f32_e64 s[0:1], s8, 0
-; SIS-NEXT: v_cmp_ge_f32_e64 s[2:3], s9, 1.0
-; SIS-NEXT: v_mov_b32_e32 v0, s9
-; SIS-NEXT: v_mov_b32_e32 v1, s8
-; SIS-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
-; SIS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SIS-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: xor_i1:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_load_dword s6, s[2:3], 0x0
-; VIS-NEXT: s_load_dword s4, s[4:5], 0x0
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: v_cmp_ge_f32_e64 s[0:1], s6, 0
-; VIS-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, 1.0
-; VIS-NEXT: v_mov_b32_e32 v2, s4
-; VIS-NEXT: v_mov_b32_e32 v3, s6
-; VIS-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
-; VIS-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; VIS-NEXT: flat_store_dword v[0:1], v2
-; VIS-NEXT: s_endpgm
%a = load float, ptr addrspace(1) %in0
%b = load float, ptr addrspace(1) %in1
@@ -348,48 +240,6 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0
; VI-NEXT: v_and_b32_e32 v2, 1, v2
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
-; SIS-LABEL: v_xor_i1:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; SIS-NEXT: s_mov_b32 s7, 0xf000
-; SIS-NEXT: s_mov_b32 s6, -1
-; SIS-NEXT: s_mov_b32 s14, s6
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_mov_b32 s12, s2
-; SIS-NEXT: s_mov_b32 s13, s3
-; SIS-NEXT: s_mov_b32 s15, s7
-; SIS-NEXT: s_mov_b32 s10, s6
-; SIS-NEXT: s_mov_b32 s11, s7
-; SIS-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 glc
-; SIS-NEXT: s_waitcnt vmcnt(0)
-; SIS-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 glc
-; SIS-NEXT: s_waitcnt vmcnt(0)
-; SIS-NEXT: s_mov_b32 s4, s0
-; SIS-NEXT: s_mov_b32 s5, s1
-; SIS-NEXT: v_xor_b32_e32 v0, v0, v1
-; SIS-NEXT: v_and_b32_e32 v0, 1, v0
-; SIS-NEXT: buffer_store_byte v0, off, s[4:7], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: v_xor_i1:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: v_mov_b32_e32 v0, s2
-; VIS-NEXT: v_mov_b32_e32 v1, s3
-; VIS-NEXT: v_mov_b32_e32 v2, s4
-; VIS-NEXT: v_mov_b32_e32 v3, s5
-; VIS-NEXT: flat_load_ubyte v4, v[0:1] glc
-; VIS-NEXT: s_waitcnt vmcnt(0)
-; VIS-NEXT: flat_load_ubyte v2, v[2:3] glc
-; VIS-NEXT: s_waitcnt vmcnt(0)
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: v_xor_b32_e32 v2, v4, v2
-; VIS-NEXT: v_and_b32_e32 v2, 1, v2
-; VIS-NEXT: flat_store_byte v[0:1], v2
-; VIS-NEXT: s_endpgm
%a = load volatile i1, ptr addrspace(1) %in0
%b = load volatile i1, ptr addrspace(1) %in1
%xor = xor i1 %a, %b
@@ -437,34 +287,6 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_xor_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
-; SIS-LABEL: vector_xor_i32:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_load_dword s6, s[2:3], 0x0
-; SIS-NEXT: s_load_dword s4, s[4:5], 0x0
-; SIS-NEXT: s_mov_b32 s3, 0xf000
-; SIS-NEXT: s_mov_b32 s2, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_xor_b32 s4, s6, s4
-; SIS-NEXT: v_mov_b32_e32 v0, s4
-; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: vector_xor_i32:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_load_dword s2, s[2:3], 0x0
-; VIS-NEXT: s_load_dword s3, s[4:5], 0x0
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_xor_b32 s0, s2, s3
-; VIS-NEXT: v_mov_b32_e32 v2, s0
-; VIS-NEXT: flat_store_dword v[0:1], v2
-; VIS-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in0
%b = load i32, ptr addrspace(1) %in1
%result = xor i32 %a, %b
@@ -496,28 +318,6 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
-; SIS-LABEL: scalar_xor_i32:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SIS-NEXT: s_mov_b32 s7, 0xf000
-; SIS-NEXT: s_mov_b32 s6, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_mov_b32 s4, s0
-; SIS-NEXT: s_xor_b32 s0, s2, s3
-; SIS-NEXT: s_mov_b32 s5, s1
-; SIS-NEXT: v_mov_b32_e32 v0, s0
-; SIS-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: scalar_xor_i32:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_xor_b32 s2, s2, s3
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: v_mov_b32_e32 v2, s2
-; VIS-NEXT: flat_store_dword v[0:1], v2
-; VIS-NEXT: s_endpgm
%result = xor i32 %a, %b
store i32 %result, ptr addrspace(1) %out
ret void
@@ -547,28 +347,6 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
-; SIS-LABEL: scalar_not_i32:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dword s6, s[4:5], 0xb
-; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SIS-NEXT: s_mov_b32 s3, 0xf000
-; SIS-NEXT: s_mov_b32 s2, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_not_b32 s4, s6
-; SIS-NEXT: v_mov_b32_e32 v0, s4
-; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: scalar_not_i32:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dword s2, s[4:5], 0x2c
-; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_not_b32 s2, s2
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: v_mov_b32_e32 v2, s2
-; VIS-NEXT: flat_store_dword v[0:1], v2
-; VIS-NEXT: s_endpgm
%result = xor i32 %a, -1
store i32 %result, ptr addrspace(1) %out
ret void
@@ -606,30 +384,6 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_not_b32_e32 v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
-; SIS-LABEL: vector_not_i32:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_load_dword s4, s[2:3], 0x0
-; SIS-NEXT: s_mov_b32 s3, 0xf000
-; SIS-NEXT: s_mov_b32 s2, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_not_b32 s4, s4
-; SIS-NEXT: v_mov_b32_e32 v0, s4
-; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: vector_not_i32:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_load_dword s2, s[2:3], 0x0
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_not_b32 s0, s2
-; VIS-NEXT: v_mov_b32_e32 v2, s0
-; VIS-NEXT: flat_store_dword v[0:1], v2
-; VIS-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in0
%b = load i32, ptr addrspace(1) %in1
%result = xor i32 %a, -1
@@ -679,36 +433,6 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_xor_b32_e32 v1, v1, v3
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
-; SIS-LABEL: vector_xor_i64:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
-; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; SIS-NEXT: s_mov_b32 s3, 0xf000
-; SIS-NEXT: s_mov_b32 s2, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5]
-; SIS-NEXT: v_mov_b32_e32 v0, s4
-; SIS-NEXT: v_mov_b32_e32 v1, s5
-; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: vector_xor_i64:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
-; VIS-NEXT: v_mov_b32_e32 v3, s1
-; VIS-NEXT: v_mov_b32_e32 v2, s0
-; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
-; VIS-NEXT: s_endpgm
%a = load i64, ptr addrspace(1) %in0
%b = load i64, ptr addrspace(1) %in1
%result = xor i64 %a, %b
@@ -744,32 +468,6 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
-; SIS-LABEL: scalar_xor_i64:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; SIS-NEXT: s_mov_b32 s7, 0xf000
-; SIS-NEXT: s_mov_b32 s6, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_mov_b32 s4, s0
-; SIS-NEXT: s_mov_b32 s5, s1
-; SIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9]
-; SIS-NEXT: v_mov_b32_e32 v0, s0
-; SIS-NEXT: v_mov_b32_e32 v1, s1
-; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: scalar_xor_i64:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
-; VIS-NEXT: v_mov_b32_e32 v3, s1
-; VIS-NEXT: v_mov_b32_e32 v2, s0
-; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
-; VIS-NEXT: s_endpgm
%result = xor i64 %a, %b
store i64 %result, ptr addrspace(1) %out
ret void
@@ -801,30 +499,6 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
-; SIS-LABEL: scalar_not_i64:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SIS-NEXT: s_mov_b32 s7, 0xf000
-; SIS-NEXT: s_mov_b32 s6, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_mov_b32 s4, s0
-; SIS-NEXT: s_mov_b32 s5, s1
-; SIS-NEXT: s_not_b64 s[0:1], s[2:3]
-; SIS-NEXT: v_mov_b32_e32 v0, s0
-; SIS-NEXT: v_mov_b32_e32 v1, s1
-; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: scalar_not_i64:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: s_not_b64 s[0:1], s[2:3]
-; VIS-NEXT: v_mov_b32_e32 v3, s1
-; VIS-NEXT: v_mov_b32_e32 v2, s0
-; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
-; VIS-NEXT: s_endpgm
%result = xor i64 %a, -1
store i64 %result, ptr addrspace(1) %out
ret void
@@ -864,32 +538,6 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_not_b32_e32 v1, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
-; SIS-LABEL: vector_not_i64:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SIS-NEXT: s_mov_b32 s3, 0xf000
-; SIS-NEXT: s_mov_b32 s2, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_not_b64 s[4:5], s[4:5]
-; SIS-NEXT: v_mov_b32_e32 v0, s4
-; SIS-NEXT: v_mov_b32_e32 v1, s5
-; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: vector_not_i64:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_not_b64 s[0:1], s[2:3]
-; VIS-NEXT: v_mov_b32_e32 v3, s1
-; VIS-NEXT: v_mov_b32_e32 v2, s0
-; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
-; VIS-NEXT: s_endpgm
%a = load i64, ptr addrspace(1) %in0
%b = load i64, ptr addrspace(1) %in1
%result = xor i64 %a, -1
@@ -956,57 +604,6 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i
; VI-NEXT: .LBB14_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
; VI-NEXT: s_branch .LBB14_2
-; SIS-LABEL: xor_cf:
-; SIS: ; %bb.0: ; %entry
-; SIS-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
-; SIS-NEXT: s_mov_b64 s[10:11], 0
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0
-; SIS-NEXT: s_and_b64 vcc, exec, s[8:9]
-; SIS-NEXT: s_cbranch_vccz .LBB12_4
-; SIS-NEXT: ; %bb.1: ; %else
-; SIS-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
-; SIS-NEXT: s_andn2_b64 vcc, exec, s[10:11]
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_mov_b64 vcc, vcc
-; SIS-NEXT: s_cbranch_vccnz .LBB12_3
-; SIS-NEXT: .LBB12_2: ; %if
-; SIS-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
-; SIS-NEXT: .LBB12_3: ; %endif
-; SIS-NEXT: v_mov_b32_e32 v0, s8
-; SIS-NEXT: s_mov_b32 s3, 0xf000
-; SIS-NEXT: s_mov_b32 s2, -1
-; SIS-NEXT: v_mov_b32_e32 v1, s9
-; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; SIS-NEXT: s_endpgm
-; SIS-NEXT: .LBB12_4:
-; SIS-NEXT: ; implicit-def: $sgpr8_sgpr9
-; SIS-NEXT: s_branch .LBB12_2
-; VIS-LABEL: xor_cf:
-; VIS: ; %bb.0: ; %entry
-; VIS-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
-; VIS-NEXT: s_mov_b64 s[8:9], 0
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_cmp_lg_u64 s[4:5], 0
-; VIS-NEXT: s_cbranch_scc0 .LBB12_4
-; VIS-NEXT: ; %bb.1: ; %else
-; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VIS-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; VIS-NEXT: s_cbranch_vccnz .LBB12_3
-; VIS-NEXT: .LBB12_2: ; %if
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7]
-; VIS-NEXT: .LBB12_3: ; %endif
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: v_mov_b32_e32 v2, s2
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: v_mov_b32_e32 v3, s3
-; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
-; VIS-NEXT: s_endpgm
-; VIS-NEXT: .LBB12_4:
-; VIS-NEXT: ; implicit-def: $sgpr2_sgpr3
-; VIS-NEXT: s_branch .LBB12_2
entry:
%0 = icmp eq i64 %a, 0
br i1 %0, label %if, label %else
@@ -1053,32 +650,6 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
-; SIS-LABEL: scalar_xor_literal_i64:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
-; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SIS-NEXT: s_mov_b32 s3, 0xf000
-; SIS-NEXT: s_mov_b32 s2, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_xor_b32 s4, s7, 0xf237b
-; SIS-NEXT: s_xor_b32 s5, s6, 0x3039
-; SIS-NEXT: v_mov_b32_e32 v0, s5
-; SIS-NEXT: v_mov_b32_e32 v1, s4
-; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: scalar_xor_literal_i64:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
-; VIS-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_xor_b32 s1, s1, 0xf237b
-; VIS-NEXT: s_xor_b32 s0, s0, 0x3039
-; VIS-NEXT: v_mov_b32_e32 v2, s2
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: v_mov_b32_e32 v3, s3
-; VIS-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VIS-NEXT: s_endpgm
%or = xor i64 %a, 4261135838621753
store i64 %or, ptr addrspace(1) %out
ret void
@@ -1127,47 +698,6 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou
; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
-; SIS-LABEL: scalar_xor_literal_multi_use_i64:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SIS-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x13
-; SIS-NEXT: s_movk_i32 s8, 0x3039
-; SIS-NEXT: s_mov_b32 s9, 0xf237b
-; SIS-NEXT: s_mov_b32 s3, 0xf000
-; SIS-NEXT: s_mov_b32 s2, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
-; SIS-NEXT: v_mov_b32_e32 v0, s4
-; SIS-NEXT: v_mov_b32_e32 v1, s5
-; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; SIS-NEXT: s_add_u32 s0, s6, 0x3039
-; SIS-NEXT: s_addc_u32 s1, s7, 0xf237b
-; SIS-NEXT: s_waitcnt expcnt(0)
-; SIS-NEXT: v_mov_b32_e32 v0, s0
-; SIS-NEXT: v_mov_b32_e32 v1, s1
-; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; SIS-NEXT: s_waitcnt vmcnt(0)
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: scalar_xor_literal_multi_use_i64:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4c
-; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
-; VIS-NEXT: s_movk_i32 s6, 0x3039
-; VIS-NEXT: s_mov_b32 s7, 0xf237b
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; VIS-NEXT: v_mov_b32_e32 v0, s4
-; VIS-NEXT: v_mov_b32_e32 v3, s1
-; VIS-NEXT: v_mov_b32_e32 v1, s5
-; VIS-NEXT: v_mov_b32_e32 v2, s0
-; VIS-NEXT: s_add_u32 s0, s2, 0x3039
-; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
-; VIS-NEXT: s_addc_u32 s1, s3, 0xf237b
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
-; VIS-NEXT: s_waitcnt vmcnt(0)
-; VIS-NEXT: s_endpgm
%or = xor i64 %a, 4261135838621753
store i64 %or, ptr addrspace(1) %out
@@ -1202,30 +732,6 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
-; SIS-LABEL: scalar_xor_inline_imm_i64:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
-; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SIS-NEXT: s_mov_b32 s3, 0xf000
-; SIS-NEXT: s_mov_b32 s2, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_xor_b32 s4, s6, 63
-; SIS-NEXT: v_mov_b32_e32 v0, s4
-; SIS-NEXT: v_mov_b32_e32 v1, s7
-; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: scalar_xor_inline_imm_i64:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
-; VIS-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_xor_b32 s0, s0, 63
-; VIS-NEXT: v_mov_b32_e32 v2, s2
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v3, s3
-; VIS-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VIS-NEXT: s_endpgm
%or = xor i64 %a, 63
store i64 %or, ptr addrspace(1) %out
ret void
@@ -1257,30 +763,6 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out,
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
-; SIS-LABEL: scalar_xor_neg_inline_imm_i64:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
-; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SIS-NEXT: s_mov_b32 s3, 0xf000
-; SIS-NEXT: s_mov_b32 s2, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_xor_b64 s[4:5], s[6:7], -8
-; SIS-NEXT: v_mov_b32_e32 v0, s4
-; SIS-NEXT: v_mov_b32_e32 v1, s5
-; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: scalar_xor_neg_inline_imm_i64:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
-; VIS-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_xor_b64 s[0:1], s[0:1], -8
-; VIS-NEXT: v_mov_b32_e32 v0, s2
-; VIS-NEXT: v_mov_b32_e32 v3, s1
-; VIS-NEXT: v_mov_b32_e32 v1, s3
-; VIS-NEXT: v_mov_b32_e32 v2, s0
-; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
-; VIS-NEXT: s_endpgm
%or = xor i64 %a, -8
store i64 %or, ptr addrspace(1) %out
@@ -1321,32 +803,6 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out,
; VI-NEXT: v_xor_b32_e32 v1, -1, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
-; SIS-LABEL: vector_xor_i64_neg_inline_imm:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SIS-NEXT: s_mov_b32 s3, 0xf000
-; SIS-NEXT: s_mov_b32 s2, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_xor_b64 s[4:5], s[4:5], -8
-; SIS-NEXT: v_mov_b32_e32 v0, s4
-; SIS-NEXT: v_mov_b32_e32 v1, s5
-; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: vector_xor_i64_neg_inline_imm:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], -8
-; VIS-NEXT: v_mov_b32_e32 v3, s1
-; VIS-NEXT: v_mov_b32_e32 v2, s0
-; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
-; VIS-NEXT: s_endpgm
%loada = load i64, ptr addrspace(1) %a, align 8
%or = xor i64 %loada, -8
store i64 %or, ptr addrspace(1) %out
@@ -1387,34 +843,6 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add
; VI-NEXT: v_xor_b32_e32 v0, 0xdf77987f, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
-; SIS-LABEL: vector_xor_literal_i64:
-; SIS: ; %bb.0:
-; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SIS-NEXT: s_mov_b32 s3, 0xf000
-; SIS-NEXT: s_mov_b32 s2, -1
-; SIS-NEXT: s_waitcnt lgkmcnt(0)
-; SIS-NEXT: s_xor_b32 s5, s5, 0x146f
-; SIS-NEXT: s_xor_b32 s4, s4, 0xdf77987f
-; SIS-NEXT: v_mov_b32_e32 v0, s4
-; SIS-NEXT: v_mov_b32_e32 v1, s5
-; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; SIS-NEXT: s_endpgm
-; VIS-LABEL: vector_xor_literal_i64:
-; VIS: ; %bb.0:
-; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VIS-NEXT: v_mov_b32_e32 v0, s0
-; VIS-NEXT: v_mov_b32_e32 v1, s1
-; VIS-NEXT: s_waitcnt lgkmcnt(0)
-; VIS-NEXT: s_xor_b32 s0, s3, 0x146f
-; VIS-NEXT: s_xor_b32 s1, s2, 0xdf77987f
-; VIS-NEXT: v_mov_b32_e32 v2, s1
-; VIS-NEXT: v_mov_b32_e32 v3, s0
-; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
-; VIS-NEXT: s_endpgm
%loada = load i64, ptr addrspace(1) %a, align 8
%or = xor i64 %loada, 22470723082367
>From fdc5c33d204a53d51f0cf8cf6d88aa633470f51d Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Wed, 25 Jun 2025 10:25:34 -0500
Subject: [PATCH 15/15] Remove unnecessary node duplication
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 584aa30e31f9a..8a42aa6839657 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -735,7 +735,7 @@ static bool hasSourceMods(const SDNode *N) {
case ISD::INLINEASM:
case ISD::INLINEASM_BR:
case AMDGPUISD::DIV_SCALE:
- case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_W_CHAIN:
// TODO: Should really be looking at the users of the bitcast. These are
// problematic because bitcasts are used to legalize all stores to integer
@@ -4086,9 +4086,10 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
LHSAND, Zero);
SDValue Hi =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
- SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
- SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
- SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+ SDValue LoAnd =
+ DAG.getNode(ISD::AND, SL, MVT::i32, Lo, RHSAND->getOperand(0));
+ SDValue HiAnd =
+ DAG.getNode(ISD::AND, SL, MVT::i32, Hi, RHSAND->getOperand(0));
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
if (AndIndex == 0 || AndIndex == 1)
return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
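For readers skimming the hunk above: the point of the change is to reuse the mask SDValue already attached to the RHS AND node instead of materialising a fresh 0x1f constant for each half, so the combine no longer duplicates an equivalent node in the DAG. A minimal sketch of the before/after pattern, with the surrounding combine code elided and assuming (as the hunk suggests) that RHSAND->getOperand(0) is the existing mask value:

// Before: a duplicate mask constant was created for the low and high halves.
SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);

// After: reuse the mask operand already present on the RHS AND node,
// so no redundant constant node is introduced.
SDValue Mask = RHSAND->getOperand(0);
SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, Mask);
SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, Mask);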