[llvm] [AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions (PR #140694)
Chris Jackson via llvm-commits
llvm-commits at lists.llvm.org
Wed May 28 02:46:46 PDT 2025
https://github.com/chrisjbris updated https://github.com/llvm/llvm-project/pull/140694
From 995e7fb4b9cfdf4735729f6be44ad0b8e7f5aedb Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 20 May 2025 05:14:36 -0500
Subject: [PATCH] [AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make
use of 64-bit wide instructions
Make use of s_or_b64/s_and_b64/s_xor_b64 for v2i32. Legalising these
causes a number of test regressions, so extra work in the combiner and
TableGen patterns was necessary.
- Use Custom lowering for v2i32 rotr instead of additional patterns, and
  modify performOrCombine() to remove some identity OR operations.
- Fix the rotr regression by adding lowerROTR() on the legalizer codepath.
- Add a test case to rotr.ll.
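An illustrative sketch (not part of the patch; the kernel below is made
up for this description): with v2i32 and/or/xor legal, a bitwise
operation on uniform values such as the following can now select to a
single 64-bit scalar instruction (s_or_b64) instead of two s_or_b32,
matching the and.ll/or.ll test updates below.

define amdgpu_kernel void @or_v2i32_uniform(ptr addrspace(1) %out,
                                            <2 x i32> %a, <2 x i32> %b) {
  ; Kernel arguments are uniform, so this or is matched by the new
  ; UniformBinFrag<or> pattern in SOPInstructions.td.
  %r = or <2 x i32> %a, %b
  store <2 x i32> %r, ptr addrspace(1) %out
  ret void
}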
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 105 +++++++++-
llvm/lib/Target/AMDGPU/SIISelLowering.h | 4 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 17 +-
llvm/lib/Target/AMDGPU/SOPInstructions.td | 15 ++
llvm/lib/Target/AMDGPU/VOP2Instructions.td | 8 +-
llvm/test/CodeGen/AMDGPU/and.ll | 3 +-
llvm/test/CodeGen/AMDGPU/bf16-conversions.ll | 24 +--
llvm/test/CodeGen/AMDGPU/bfi_int.ll | 4 +-
.../AMDGPU/copysign-simplify-demanded-bits.ll | 2 +-
.../AMDGPU/dag-preserve-disjoint-flag.ll | 36 +++-
.../CodeGen/AMDGPU/fneg-modifier-casting.ll | 2 +-
llvm/test/CodeGen/AMDGPU/fshr.ll | 188 +++++-------------
llvm/test/CodeGen/AMDGPU/or.ll | 4 +-
llvm/test/CodeGen/AMDGPU/rotr.ll | 128 ++++++++++++
.../CodeGen/AMDGPU/vector_range_metadata.ll | 8 +-
llvm/test/CodeGen/AMDGPU/xor.ll | 4 +-
16 files changed, 376 insertions(+), 176 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ba7e11a853347..083a62638ee09 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -430,6 +430,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
}
+ setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
+ // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
+ // instead lower to cndmask in SITargetLowering::LowerSELECT().
+ setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
+ // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
+ // alignbit.
+ setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
+
setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
Custom);
@@ -5929,6 +5937,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}
+// Enable lowering of ROTR for vNi32 types. This is a workaround for a
+// regression caused by legalising v2i32 or.
+SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ assert(Opc == ISD::ROTR && "Expected ROTR Opcode for lowerROTR.");
+
+ assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
+ VT == MVT::v16i32) &&
+ "Unexpected ValueType.");
+
+ return DAG.UnrollVectorOp(Op.getNode());
+}
+
// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
@@ -6115,6 +6137,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerGET_FPENV(Op, DAG);
case ISD::SET_FPENV:
return lowerSET_FPENV(Op, DAG);
+ case ISD::ROTR:
+ return lowerROTR(Op, DAG);
}
return SDValue();
}
@@ -12872,6 +12896,53 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
}
}
+ // Detect identity v2i32 OR and replace with identity source node.
+ // Specifically an Or that has operands constructed from the same source node
+// via extract_vector_elt and build_vector, i.e.
+ // v2i32 or(
+ // v2i32 build_vector(
+ // i32 extract_elt(%IdentitySrc, 0),
+ // i32 0
+ // ),
+ // v2i32 build_vector(
+ // i32 0,
+ // i32 extract_elt(%IdentitySrc, 1)
+ // )
+ // )
+ // =>
+ // v2i32 %IdentitySrc
+ if (VT == MVT::v2i32) {
+ if (LHS->getOpcode() == ISD::BUILD_VECTOR &&
+ RHS->getOpcode() == ISD::BUILD_VECTOR) {
+ LLVM_DEBUG(dbgs() << "### Performing v2i32 SIISelLowering "
+ "DAGCombine::CombineOR\n";);
+
+ if (auto *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1)))
+ if (auto *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0))) {
+
+ // Check that the other lane of each build_vector is zero.
+ if (LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
+
+ // Get the extract_vector_element operands.
+ SDValue LEVE = LHS->getOperand(0);
+ SDValue REVE = RHS->getOperand(1);
+
+ if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ // Check that different elements from the same vector are
+ // extracted.
+ if (LEVE->getOperand(0) == REVE->getOperand(0) &&
+ LEVE->getOperand(1) != REVE->getOperand(1)) {
+ LLVM_DEBUG(dbgs() << "### Found identity OR, folding...\n";);
+ SDValue IdentitySrc = LEVE.getOperand(0);
+ return IdentitySrc;
+ }
+ }
+ }
+ }
+ }
+ }
+
if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
return SDValue();
@@ -12916,13 +12987,43 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
return RV;
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
+
+ const ConstantSDNode *CRHS_0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
+ const ConstantSDNode *CRHS_1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+ SDValue LHS_0 = LHS.getOperand(0);
+ SDValue LHS_1 = LHS.getOperand(1);
+
+ if (LHS.getOpcode() == ISD::VSELECT && VT == MVT::v2i32) {
+ if (CRHS_0 && CRHS_0->getAPIntValue().isSignMask() &&
+ shouldFoldFNegIntoSrc(N, LHS_0))
+ if (CRHS_1 && CRHS_1->getAPIntValue().isSignMask() &&
+ shouldFoldFNegIntoSrc(N, LHS_1)) {
+ SDLoc DL(N);
+ SDValue CastLHS =
+ DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
+ SDValue CastRHS =
+ DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
+ SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
+ SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
+ SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
+ LHS->getOperand(0), FNegLHS, FNegRHS);
+ return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
+ }
+ }
+ // Possibly split vector here if one side does have a constant RHS.
+ }
+
+ // Add test for when only one of the RHS vector elements is a const. Might be
+ // possible to optimise this case.
+
const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
- SelectionDAG &DAG = DCI.DAG;
- EVT VT = N->getValueType(0);
if (CRHS && VT == MVT::i64) {
if (SDValue Split =
splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c42366a1c04c8..a1fdae384502a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -366,6 +366,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
+ // bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
+ // EVT VT) const override;
+
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override;
@@ -437,6 +440,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2e2913d88cc54..dd053703be74e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2334,9 +2334,9 @@ def : AMDGPUPatIgnoreCopies <
(COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
>;
-// 64-bit version
+foreach vt = [i64, v2i32] in {
def : AMDGPUPatIgnoreCopies <
- (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
+ (DivergentBinFrag<xor> vt:$z, (and vt:$x, (xor vt:$y, vt:$z))),
(REG_SEQUENCE VReg_64,
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
(i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
@@ -2345,6 +2345,7 @@ def : AMDGPUPatIgnoreCopies <
(i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
+}
def : AMDGPUPat <
(fcopysign f32:$src0, f32:$src1),
@@ -2375,9 +2376,14 @@ def : AMDGPUPat <
$src1), sub1)
>;
+def : AMDGPUPat <
+ (fneg (select i1:$src0, (f32 (bitconvert i32:$src1)), (f32 (bitconvert i32:$src2)))),
+ (V_CNDMASK_B32_e64 (i32 1), $src2, (i32 1), $src1, $src0)>;
+
let True16Predicate = NotHasTrue16BitInsts in {
def : ROTRPattern <V_ALIGNBIT_B32_e64>;
+
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
@@ -2388,6 +2394,12 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
} // end True16Predicate = NotHasTrue16BitInsts
let True16Predicate = UseRealTrue16Insts in {
+
+// Prevents a regression in fneg-modifier-casting.ll, along with the modifications to performXorCombine(), when v2i32 or is legal.
+def : AMDGPUPat <
+ (fneg (select i1:$src0, (f32 (bitconvert i32:$src1)), (f32 (bitconvert i32:$src2)))),
+ (V_CNDMASK_B32_e64 (i32 1), $src2, (i32 1), $src1, $src0)>;
+
def : GCNPat <
(rotr i32:$src0, i32:$src1),
(V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
@@ -2449,6 +2461,7 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
>;
} // end True16Predicate = UseFakeTrue16Insts
+
/********** ====================== **********/
/********** Indirect addressing **********/
/********** ====================== **********/
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 40b3dfb94ce2f..f2e1a27644afb 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1779,6 +1779,21 @@ def : GCNPat <
(S_MOV_B32 imm:$imm)
>;
+def : GCNPat <
+ (v2i32 (UniformBinFrag<and> v2i32:$x, v2i32:$y)),
+ (S_AND_B64 SReg_64:$x, SReg_64:$y)
+>;
+
+def : GCNPat <
+ (v2i32 (UniformBinFrag<or> v2i32:$x, v2i32:$y)),
+ (S_OR_B64 SReg_64:$x, SReg_64:$y)
+>;
+
+def : GCNPat <
+ (v2i32 (UniformBinFrag<xor> v2i32:$x, v2i32:$y)),
+ (S_XOR_B64 SReg_64:$x, SReg_64:$y)
+>;
+
// Same as a 32-bit inreg
def : GCNPat<
(i32 (UniformUnaryFrag<sext> i16:$src)),
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 0c7e20fc1ebf3..efa9c465f794e 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -954,9 +954,9 @@ def : DivergentClampingBinOp<sub, V_SUB_CO_U32_e64>;
def : DivergentBinOp<adde, V_ADDC_U32_e32>;
def : DivergentBinOp<sube, V_SUBB_U32_e32>;
-class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
+class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst, ValueType vt = i64> :
GCNPat<
- (DivergentBinFrag<Op> i64:$src0, i64:$src1),
+ (DivergentBinFrag<Op> vt:$src0, vt:$src1),
(REG_SEQUENCE VReg_64,
(Inst
(i32 (EXTRACT_SUBREG $src0, sub0)),
@@ -973,6 +973,10 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>;
def : divergent_i64_BinOp <or, V_OR_B32_e64>;
def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
+def : divergent_i64_BinOp <and, V_AND_B32_e64, v2i32>;
+def : divergent_i64_BinOp <or, V_OR_B32_e64, v2i32>;
+def : divergent_i64_BinOp <xor, V_XOR_B32_e64, v2i32>;
+
// mul24 w/ 64 bit output.
class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
(i64 (Op i32:$src0, i32:$src1)),
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index c6233642110ea..05402b3c89409 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -8,8 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
-; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b64
define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
%b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index a597faa028f22..ca8f7736f6093 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -151,25 +151,25 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v:
; GFX-950: ; %bb.0:
; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[2:3]
+; GFX-950-NEXT: v_and_b32_e32 v4, 1, v6
+; GFX-950-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]|
-; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5]
-; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
+; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], v[2:3], v[4:5]
+; GFX-950-NEXT: v_cvt_f32_f64_e32 v7, v[0:1]
; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3]
; GFX-950-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
-; GFX-950-NEXT: v_cvt_f32_f64_e32 v5, v[0:1]
+; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
-; GFX-950-NEXT: v_and_b32_e32 v6, 1, v5
+; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v7
+; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
-; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3]
-; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v6
+; GFX-950-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], v[0:1], v[2:3]
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
-; GFX-950-NEXT: v_add_u32_e32 v0, v5, v0
-; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
-; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX-950-NEXT: v_add_u32_e32 v0, v7, v0
+; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
+; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4
; GFX-950-NEXT: ; return to shader part epilog
%res = fptrunc <2 x double> %src to <2 x bfloat>
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 201b97d479c68..d76ecbd73fe6e 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -582,15 +582,15 @@ define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %
; GFX7-LABEL: v_bitselect_v2i32_pat1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX7-NEXT: v_bfi_b32 v1, v3, v1, v5
+; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bitselect_v2i32_pat1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX8-NEXT: v_bfi_b32 v1, v3, v1, v5
+; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bitselect_v2i32_pat1:
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index a01c2fa152ab3..2d73f17d74d8b 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -31,8 +31,8 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3
; GFX9-LABEL: test_pown_reduced_fast_v2f16_known_odd:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
; GFX9-NEXT: v_or_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0
diff --git a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
index d63a36c4b2958..7e2e8b577e085 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
@@ -28,12 +28,15 @@ define amdgpu_ps <2 x i32> @s_or_v2i32_disjoint(<2 x i32> inreg %a, <2 x i32> in
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY2]], [[COPY]], implicit-def dead $scc
- ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY3]], [[COPY1]], implicit-def dead $scc
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_1]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = disjoint S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec
; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
@@ -64,10 +67,23 @@ define <2 x i32> @v_or_v2i32_disjoint(<2 x i32> %a, <2 x i32> %b) {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
- ; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
- ; CHECK-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+ ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY5]], killed [[COPY4]], implicit $exec
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+ ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY7]], killed [[COPY6]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR_B32_e64_1]], %subreg.sub0, killed [[V_OR_B32_e64_]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY8]]
+ ; CHECK-NEXT: $vgpr1 = COPY [[COPY9]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
%result = or disjoint <2 x i32> %a, %b
ret <2 x i32> %result
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 924378eb2376d..b6f5cad48883d 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1759,4 +1759,4 @@ bb5: ; preds = %bb, %.entry
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #0
-attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 4a79096442c96..7afd99ddb0ef6 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -2010,61 +2010,61 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; SI-LABEL: v_fshr_v2i24:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xffffff, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; SI-NEXT: s_mov_b32 s4, 0xaaaaaab
-; SI-NEXT: v_mul_hi_u32 v6, v6, s4
-; SI-NEXT: v_and_b32_e32 v7, 0xffffff, v5
+; SI-NEXT: v_mul_hi_u32 v6, v4, s4
+; SI-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
-; SI-NEXT: v_mul_hi_u32 v6, v7, s4
+; SI-NEXT: v_mul_hi_u32 v6, v5, s4
; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4
; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
-; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v6
-; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3
+; SI-NEXT: v_mul_u32_u24_e32 v2, 24, v6
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2
+; SI-NEXT: v_alignbit_b32 v1, v1, v3, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i24:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v6, 0xffffff, v4
+; VI-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; VI-NEXT: s_mov_b32 s4, 0xaaaaaab
-; VI-NEXT: v_mul_hi_u32 v6, v6, s4
-; VI-NEXT: v_and_b32_e32 v7, 0xffffff, v5
+; VI-NEXT: v_mul_hi_u32 v6, v4, s4
+; VI-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; VI-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6
-; VI-NEXT: v_mul_hi_u32 v6, v7, s4
+; VI-NEXT: v_mul_hi_u32 v6, v5, s4
; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4
; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
-; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v6
-; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v3
-; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3
-; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3
+; VI-NEXT: v_mul_u32_u24_e32 v2, 24, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v5, v2
+; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2
+; VI-NEXT: v_alignbit_b32 v1, v1, v3, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaab
-; GFX9-NEXT: v_mul_hi_u32 v6, v6, s4
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff, v5
+; GFX9-NEXT: v_mul_hi_u32 v6, v4, s4
+; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX9-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6
-; GFX9-NEXT: v_mul_hi_u32 v6, v7, s4
+; GFX9-NEXT: v_mul_hi_u32 v6, v5, s4
; GFX9-NEXT: v_add_u32_e32 v4, 8, v4
; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3
-; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v6
-; GFX9-NEXT: v_sub_u32_e32 v3, v5, v3
-; GFX9-NEXT: v_add_u32_e32 v3, 8, v3
-; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3
+; GFX9-NEXT: v_mul_u32_u24_e32 v2, 24, v6
+; GFX9-NEXT: v_sub_u32_e32 v2, v5, v2
+; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
+; GFX9-NEXT: v_alignbit_b32 v1, v1, v3, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i24:
@@ -2075,12 +2075,12 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; GFX10-LABEL: v_fshr_v2i24:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v5
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
+; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4
+; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5
; GFX10-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; GFX10-NEXT: v_mul_u32_u24_e32 v7, 24, v7
; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6
@@ -2091,109 +2091,29 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: v_fshr_v2i24:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX11-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6
-; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6
-; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: v_fshr_v2i24:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX11-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6
-; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6
-; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-TRUE16-LABEL: v_fshr_v2i24:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX12-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6
-; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6
-; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4
-; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l
-; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-FAKE16-LABEL: v_fshr_v2i24:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
-; GFX12-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6
-; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6
-; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4
-; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5
-; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_fshr_v2i24:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4
+; GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_u32_u24_e32 v6, 24, v6
+; GFX11-NEXT: v_mul_u32_u24_e32 v7, 24, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6
+; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 8, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 8, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4
+; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
ret <2 x i24> %ret
}
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index cc9650b9a7309..6fac3494a8ced 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -18,8 +18,8 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
@@ -37,8 +37,8 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_mov_b32 s5, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index d6e361d6e297e..7322e2f239ee8 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -228,6 +228,134 @@ entry:
ret void
}
+define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i32> %y) {
+; R600-LABEL: rotr_v8i32:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[5].X, KC0[5].X, KC0[7].X,
+; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[4].W, KC0[4].W, KC0[6].W,
+; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[4].Z, KC0[4].Z, KC0[6].Z,
+; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[4].Y, KC0[4].Y, KC0[6].Y,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; R600-NEXT: BIT_ALIGN_INT * T2.W, KC0[6].X, KC0[6].X, KC0[8].X,
+; R600-NEXT: BIT_ALIGN_INT * T2.Z, KC0[5].W, KC0[5].W, KC0[7].W,
+; R600-NEXT: BIT_ALIGN_INT * T2.Y, KC0[5].Z, KC0[5].Z, KC0[7].Z,
+; R600-NEXT: BIT_ALIGN_INT * T2.X, KC0[5].Y, KC0[5].Y, KC0[7].Y,
+; R600-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; R600-NEXT: LSHR * T3.X, PV.W, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; SI-LABEL: rotr_v8i32:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x11
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s19
+; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0
+; SI-NEXT: v_mov_b32_e32 v0, s18
+; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0
+; SI-NEXT: v_mov_b32_e32 v0, s17
+; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0
+; SI-NEXT: v_mov_b32_e32 v0, s16
+; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; SI-NEXT: v_mov_b32_e32 v4, s23
+; SI-NEXT: v_alignbit_b32 v7, s15, s15, v4
+; SI-NEXT: v_mov_b32_e32 v4, s22
+; SI-NEXT: v_alignbit_b32 v6, s14, s14, v4
+; SI-NEXT: v_mov_b32_e32 v4, s21
+; SI-NEXT: v_alignbit_b32 v5, s13, s13, v4
+; SI-NEXT: v_mov_b32_e32 v4, s20
+; SI-NEXT: v_alignbit_b32 v4, s12, s12, v4
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX8-LABEL: rotr_v8i32:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s18
+; GFX8-NEXT: v_mov_b32_e32 v4, s17
+; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1
+; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, s23
+; GFX8-NEXT: v_alignbit_b32 v7, s15, s15, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, s22
+; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_alignbit_b32 v6, s14, s14, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, s21
+; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_alignbit_b32 v5, s13, s13, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
+; GFX8-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s19
+; GFX8-NEXT: v_alignbit_b32 v4, s12, s12, v4
+; GFX8-NEXT: v_mov_b32_e32 v8, s2
+; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s16
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: rotr_v8i32:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v8, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v7, s15, s15, s23
+; GFX10-NEXT: v_alignbit_b32 v6, s14, s14, s22
+; GFX10-NEXT: v_alignbit_b32 v5, s13, s13, s21
+; GFX10-NEXT: v_alignbit_b32 v4, s12, s12, s20
+; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s19
+; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s18
+; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s17
+; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s16
+; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: rotr_v8i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v8, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v7, s15, s15, s23
+; GFX11-NEXT: v_alignbit_b32 v6, s14, s14, s22
+; GFX11-NEXT: v_alignbit_b32 v5, s13, s13, s21
+; GFX11-NEXT: v_alignbit_b32 v4, s12, s12, s20
+; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s19
+; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s18
+; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s17
+; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s16
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-NEXT: s_endpgm
+entry:
+ %tmp0 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %y
+ %tmp1 = shl <8 x i32> %x, %tmp0
+ %tmp2 = lshr <8 x i32> %x, %y
+ %tmp3 = or <8 x i32> %tmp1, %tmp2
+ store <8 x i32> %tmp3, ptr addrspace(1) %in
+ ret void
+}
+
declare i16 @llvm.fshr.i16(i16, i16, i16)
define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr addrspace(1) nocapture readonly %sourceB, ptr addrspace(1) nocapture %destValues) {
diff --git a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
index d496634ae474f..8af4a8de7b266 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
@@ -18,11 +18,11 @@ define <2 x i32> @test_add2x32(ptr %a_ptr, ptr %b_ptr) {
; CHECK-LABEL: test_add2x32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_load_dword v4, v[2:3]
-; CHECK-NEXT: flat_load_dword v5, v[0:1]
-; CHECK-NEXT: v_mov_b32_e32 v1, 48
+; CHECK-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_or_b32_e32 v0, v5, v4
+; CHECK-NEXT: v_or_b32_e32 v1, v5, v7
+; CHECK-NEXT: v_or_b32_e32 v0, v4, v6
; CHECK-NEXT: s_setpc_b64 s[30:31]
%a = load <2 x i32>, ptr %a_ptr, !range !2, !noundef !{}
%b = load <2 x i32>, ptr %b_ptr, !range !3, !noundef !{}
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 00bb7b24786f5..26562adc908cd 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -21,8 +21,8 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v1, v3, v1
; SI-NEXT: v_xor_b32_e32 v0, v2, v0
+; SI-NEXT: v_xor_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -40,8 +40,8 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_xor_b32_e32 v1, v1, v3
; VI-NEXT: v_xor_b32_e32 v0, v0, v2
+; VI-NEXT: v_xor_b32_e32 v1, v1, v3
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %in0