[llvm] a521774 - DAG: Use poison for unused shuffle operands in legalizer (#177578)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 23 09:21:01 PST 2026
Author: Matt Arsenault
Date: 2026-01-23T18:20:56+01:00
New Revision: a521774217437ef87576ff275c80a92fd6e25cc0
URL: https://github.com/llvm/llvm-project/commit/a521774217437ef87576ff275c80a92fd6e25cc0
DIFF: https://github.com/llvm/llvm-project/commit/a521774217437ef87576ff275c80a92fd6e25cc0.diff
LOG: DAG: Use poison for unused shuffle operands in legalizer (#177578)
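The pattern repeated across all three legalizer files below is the same: wherever a single-input VECTOR_SHUFFLE needs a placeholder second operand that is never read, that operand is now created with getPOISON instead of getUNDEF. A minimal before/after sketch, assuming the usual legalizer context (a SelectionDAG &DAG, an SDLoc DL, the vector type VT, the real input Src, and a ShuffleMask that only indexes into the first operand):

    // Before: the unused second shuffle operand was an UNDEF node.
    SDValue Old =
        DAG.getVectorShuffle(VT, DL, Src, DAG.getUNDEF(VT), ShuffleMask);

    // After: a POISON node marks those lanes as carrying no defined value,
    // the stronger (and here sufficient) guarantee for later combines.
    SDValue New =
        DAG.getVectorShuffle(VT, DL, Src, DAG.getPOISON(VT), ShuffleMask);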
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
llvm/test/CodeGen/X86/x86-interleaved-access.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 09d79ae208a8f..d9a2409b35e4c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1981,7 +1981,7 @@ ExpandBVWithShuffles(SDNode *Node, SelectionDAG &DAG,
if (IntermedVals.size() > 1)
Vec2 = IntermedVals[1].first;
else if (Phase)
- Vec2 = DAG.getUNDEF(VT);
+ Vec2 = DAG.getPOISON(VT);
SmallVector<int, 16> ShuffleVec(NumElems, -1);
for (unsigned i = 0, e = IntermedVals[0].second.size(); i != e; ++i)
@@ -2097,7 +2097,7 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
if (Value2.getNode())
Vec2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value2);
else
- Vec2 = DAG.getUNDEF(VT);
+ Vec2 = DAG.getPOISON(VT);
// Return shuffle(LowValVec, undef, <0,0,0,0>)
return DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 8f6eddefa57ac..81184f709bd8c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1480,7 +1480,7 @@ SDValue VectorLegalizer::ExpandANY_EXTEND_VECTOR_INREG(SDNode *Node) {
return DAG.getNode(
ISD::BITCAST, DL, VT,
- DAG.getVectorShuffle(SrcVT, DL, Src, DAG.getUNDEF(SrcVT), ShuffleMask));
+ DAG.getVectorShuffle(SrcVT, DL, Src, DAG.getPOISON(SrcVT), ShuffleMask));
}
SDValue VectorLegalizer::ExpandSIGN_EXTEND_VECTOR_INREG(SDNode *Node) {
@@ -1565,7 +1565,8 @@ SDValue VectorLegalizer::ExpandBSWAP(SDNode *Node) {
if (TLI.isShuffleMaskLegal(ShuffleMask, ByteVT)) {
SDLoc DL(Node);
SDValue Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Node->getOperand(0));
- Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT), ShuffleMask);
+ Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getPOISON(ByteVT),
+ ShuffleMask);
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
@@ -1609,7 +1610,7 @@ SDValue VectorLegalizer::ExpandBITREVERSE(SDNode *Node) {
TLI.isOperationLegalOrCustomOrPromote(ISD::OR, ByteVT)))) {
SDLoc DL(Node);
SDValue Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Node->getOperand(0));
- Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT),
+ Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getPOISON(ByteVT),
BSWAPMask);
Op = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Op);
Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index c8b91751af159..e1e6252fd8abc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1949,7 +1949,7 @@ void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo,
SmallVector<int, 8> SplitHi(InNumElements, -1);
for (unsigned i = 0; i != OutNumElements; ++i)
SplitHi[i] = i + OutNumElements;
- InHi = DAG.getVectorShuffle(InLoVT, dl, InLo, DAG.getUNDEF(InLoVT), SplitHi);
+ InHi = DAG.getVectorShuffle(InLoVT, dl, InLo, DAG.getPOISON(InLoVT), SplitHi);
Lo = DAG.getNode(Opcode, dl, OutLoVT, InLo);
Hi = DAG.getNode(Opcode, dl, OutHiVT, InHi);
@@ -3235,14 +3235,14 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
processShuffleMasks(
Mask, std::size(Inputs), std::size(Inputs),
/*NumOfUsedRegs=*/1,
- [&Output, &DAG = DAG, NewVT]() { Output = DAG.getUNDEF(NewVT); },
+ [&Output, &DAG = DAG, NewVT]() { Output = DAG.getPOISON(NewVT); },
[&Output, &DAG = DAG, NewVT, &DL, &Inputs,
&BuildVector](ArrayRef<int> Mask, unsigned Idx, unsigned /*Unused*/) {
if (Inputs[Idx]->getOpcode() == ISD::BUILD_VECTOR)
Output = BuildVector(Inputs[Idx], Inputs[Idx], Mask);
else
Output = DAG.getVectorShuffle(NewVT, DL, Inputs[Idx],
- DAG.getUNDEF(NewVT), Mask);
+ DAG.getPOISON(NewVT), Mask);
Inputs[Idx] = Output;
},
[&AccumulateResults, &Output, &DAG = DAG, NewVT, &DL, &Inputs,
@@ -7050,7 +7050,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_REVERSE(SDNode *N) {
SmallVector<int, 16> Mask(WidenNumElts, -1);
std::iota(Mask.begin(), Mask.begin() + VTNumElts, IdxVal);
- return DAG.getVectorShuffle(WidenVT, dl, ReverseVal, DAG.getUNDEF(WidenVT),
+ return DAG.getVectorShuffle(WidenVT, dl, ReverseVal, DAG.getPOISON(WidenVT),
Mask);
}
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
index 73a3dba77a497..53506b530bd71 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
@@ -283,7 +283,7 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_or_v4i8:
@@ -304,7 +304,7 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX8-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: test_vector_reduce_or_v4i8:
@@ -325,7 +325,7 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
; GFX9-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: test_vector_reduce_or_v4i8:
@@ -347,7 +347,7 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
; GFX10-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: test_vector_reduce_or_v4i8:
@@ -368,7 +368,7 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, 0xff00, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v4i8:
@@ -378,7 +378,7 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_vector_reduce_or_v4i8:
@@ -405,7 +405,7 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, 0xff00, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v4i8:
@@ -419,7 +419,7 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: test_vector_reduce_or_v4i8:
@@ -454,7 +454,7 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_or_v8i8:
@@ -483,7 +483,7 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX8-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: test_vector_reduce_or_v8i8:
@@ -512,7 +512,7 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
; GFX9-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: test_vector_reduce_or_v8i8:
@@ -541,7 +541,7 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
; GFX10-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: test_vector_reduce_or_v8i8:
@@ -570,7 +570,7 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, 0xff00, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v8i8:
@@ -585,7 +585,7 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_vector_reduce_or_v8i8:
@@ -620,7 +620,7 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, 0xff00, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v8i8:
@@ -639,7 +639,7 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: test_vector_reduce_or_v8i8:
@@ -685,7 +685,7 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_or_v16i8:
@@ -730,7 +730,7 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX8-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: test_vector_reduce_or_v16i8:
@@ -775,7 +775,7 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
; GFX9-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: test_vector_reduce_or_v16i8:
@@ -817,7 +817,7 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
; GFX10-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: test_vector_reduce_or_v16i8:
@@ -860,7 +860,7 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, 0xff00, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v16i8:
@@ -884,7 +884,7 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_vector_reduce_or_v16i8:
@@ -934,7 +934,7 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, 0xff00, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v16i8:
@@ -962,7 +962,7 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, 0xffffff00, v0
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: test_vector_reduce_or_v16i8:
@@ -1001,7 +1001,7 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) {
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_or_v2i16:
@@ -1211,7 +1211,7 @@ define i16 @test_vector_reduce_or_v4i16(<4 x i16> %v) {
; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_or_v4i16:
@@ -1358,7 +1358,7 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) {
; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_or_v8i16:
@@ -1540,7 +1540,7 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) {
; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_or_v16i16:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
index b9b9e0848333c..4cc1109d60d21 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
@@ -320,7 +320,7 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
; GFX8-SDAG-NEXT: v_max_u16_sdwa v1, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-SDAG-NEXT: v_max_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX8-SDAG-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-SDAG-NEXT: v_max_u16_e32 v0, v0, v1
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: test_vector_reduce_umax_v4i8:
@@ -351,9 +351,8 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: v_max_u16_sdwa v1, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX9-SDAG-NEXT: v_max3_u16 v0, v0, v2, v1
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -388,9 +387,9 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX10-SDAG-NEXT: v_max_u16 v1, v1, v3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 8
; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX10-SDAG-NEXT: v_lshrrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-SDAG-NEXT: v_max3_u16 v0, v0, v2, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -426,13 +425,12 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v0.h, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -447,8 +445,8 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v1, v1, v3
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v2, v1
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -497,13 +495,12 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v0.h, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -522,8 +519,8 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v1, v1, v3
; GFX12-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v2, v1
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -629,7 +626,7 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
; GFX8-SDAG-NEXT: v_max_u16_sdwa v1, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_u16_e32 v0, v0, v2
; GFX8-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX8-SDAG-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-SDAG-NEXT: v_max_u16_e32 v0, v0, v1
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: test_vector_reduce_umax_v8i8:
@@ -671,7 +668,7 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
; GFX9-SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-SDAG-NEXT: v_max3_u16 v0, v0, v2, v6
; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX9-SDAG-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_max_u16_e32 v0, v0, v1
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: test_vector_reduce_umax_v8i8:
@@ -707,17 +704,17 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX10-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX10-SDAG-NEXT: v_max_u16 v1, v1, v5
+; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX10-SDAG-NEXT: v_max_u16 v0, v0, v4
; GFX10-SDAG-NEXT: v_max3_u16 v1, v1, v3, v7
-; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX10-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v6
+; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v6
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 8
; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SDAG-NEXT: v_max_u16 v0, v0, v3
-; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-SDAG-NEXT: v_max3_u16 v0, v0, v2, v4
-; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX10-SDAG-NEXT: v_max3_u16 v0, v0, v2, v3
+; GFX10-SDAG-NEXT: v_lshrrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-SDAG-NEXT: v_max_u16 v0, v0, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -762,21 +759,20 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v7.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v1.l, v1.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.h, v1.l, v3.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.h, v1.l, v1.h
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.h, v0.h, v3.l, v3.h
; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v3
-; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v1.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v2.l, v1.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3
+; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v8i8:
@@ -796,10 +792,10 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v0, v0, v3
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v2, v4
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v0, v0, v1
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -857,21 +853,20 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v7.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v1.l, v1.l, v1.h
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.h, v1.l, v3.l, v3.h
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.h, v1.l, v1.h
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v0.h
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.h, v0.h, v3.l, v3.h
; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v3
-; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v1.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v2.l, v1.h
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3
+; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v8i8:
@@ -895,10 +890,10 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v0, v0, v3
-; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v2, v4
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v0, v0, v1
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -1050,8 +1045,9 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
; GFX8-SDAG-NEXT: v_max_u16_e32 v0, v0, v2
; GFX8-SDAG-NEXT: v_max_u16_sdwa v1, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX8-SDAG-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v1, 8
+; GFX8-SDAG-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-SDAG-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: test_vector_reduce_umax_v16i8:
@@ -1107,8 +1103,9 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
; GFX9-SDAG-NEXT: v_max3_u16 v0, v0, v2, v6
; GFX9-SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX9-SDAG-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 8
+; GFX9-SDAG-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-SDAG-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: test_vector_reduce_umax_v16i8:
@@ -1176,10 +1173,10 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
; GFX10-SDAG-NEXT: v_max3_u16 v0, v0, v2, v5
; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX10-SDAG-NEXT: v_max_u16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 8
+; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v0
+; GFX10-SDAG-NEXT: v_lshrrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-SDAG-NEXT: v_max_u16 v0, v2, v0
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: test_vector_reduce_umax_v16i8:
@@ -1263,13 +1260,13 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v1.h, v1.l
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v1.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v16i8:
@@ -1306,10 +1303,10 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v0, v0, v1
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1406,13 +1403,13 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v1.h, v1.l
; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v1.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v16i8:
@@ -1453,10 +1450,10 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v0, v0, v1
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 7cddebdca5cca..7047ee53efd80 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1705,22 +1705,22 @@ define void @splat2_v4i64_load_store(ptr %s, ptr %d) nounwind {
define void @splat4_v8f32_load_store(ptr %s, ptr %d) nounwind {
; AVX1-LABEL: splat4_v8f32_load_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm0
-; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm2
-; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm3
-; AVX1-NEXT: vbroadcastss (%rdi), %xmm4
-; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm5
-; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm6
-; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm7
-; AVX1-NEXT: vmovups %xmm7, 48(%rsi)
-; AVX1-NEXT: vmovups %xmm6, 32(%rsi)
-; AVX1-NEXT: vmovups %xmm5, 16(%rsi)
-; AVX1-NEXT: vmovups %xmm4, (%rsi)
-; AVX1-NEXT: vmovups %xmm3, 112(%rsi)
-; AVX1-NEXT: vmovups %xmm2, 96(%rsi)
-; AVX1-NEXT: vmovups %xmm1, 80(%rsi)
-; AVX1-NEXT: vmovups %xmm0, 64(%rsi)
+; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
+; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm2
+; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm3
+; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm4
+; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm5
+; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm6
+; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm7
+; AVX1-NEXT: vmovups %xmm7, 112(%rsi)
+; AVX1-NEXT: vmovups %xmm6, 96(%rsi)
+; AVX1-NEXT: vmovups %xmm5, 80(%rsi)
+; AVX1-NEXT: vmovups %xmm4, 64(%rsi)
+; AVX1-NEXT: vmovups %xmm3, 48(%rsi)
+; AVX1-NEXT: vmovups %xmm2, 32(%rsi)
+; AVX1-NEXT: vmovups %xmm1, 16(%rsi)
+; AVX1-NEXT: vmovups %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat4_v8f32_load_store: