[llvm] [LLVM][SelectionDAG] Add poison/undef folds for signed/unsigned max/min. (PR #149334)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 17 08:20:51 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Paul Walker (paulwalker-arm)
Changes:
https://godbolt.org/z/nKG5d7rsG shows the IR equivalent.
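
For illustration, here is a minimal LLVM IR sketch of the new DAG-level folds, assuming scalar i32 operands (the function name is hypothetical); it summarises the behaviour the new AArch64 tests below check for:

```llvm
; New folds when one min/max operand is undef (the undef is refined to the
; value that makes the result a constant, independent of the other operand):
;   smax(x, undef) -> INT_MAX    smin(x, undef) -> INT_MIN
;   umax(x, undef) -> all-ones   umin(x, undef) -> 0
; A poison operand instead folds the whole node to poison.
declare i32 @llvm.smax.i32(i32, i32)

define i32 @smax_undef_example(i32 %a) {
  ; With this patch, SelectionDAG folds this to the constant 2147483647.
  %r = call i32 @llvm.smax.i32(i32 %a, i32 undef)
  ret i32 %r
}
```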
---
Patch is 183.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149334.diff
6 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+27-4)
- (modified) llvm/test/CodeGen/AArch64/min-max-combine.ll (+167)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll (+291-293)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll (+291-293)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll (+273-280)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll (+235-235)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 682d93d0abf3f..4d300e61150a9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7875,13 +7875,22 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// Fold a bunch of operators when the RHS is undef.
if (N2.isUndef()) {
- switch (Opcode) {
- case ISD::XOR:
- if (N1.isUndef())
+ if (N1.isUndef()) {
+ switch (Opcode) {
+ case ISD::XOR:
// Handle undef ^ undef -> 0 special case. This is a common
// idiom (misuse).
return getConstant(0, DL, VT);
- [[fallthrough]];
+ case ISD::SMAX:
+ case ISD::SMIN:
+ case ISD::UMAX:
+ case ISD::UMIN:
+ return N2.getOpcode() == ISD::POISON ? getPOISON(VT) : getUNDEF(VT);
+ }
+ }
+
+ switch (Opcode) {
+ case ISD::XOR:
case ISD::ADD:
case ISD::PTRADD:
case ISD::SUB:
@@ -7895,16 +7904,30 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::AND:
case ISD::SSUBSAT:
case ISD::USUBSAT:
+ case ISD::UMIN:
// fold op(arg1, undef) -> 0, fold op(arg1, poison) -> poison.
return N2.getOpcode() == ISD::POISON ? getPOISON(VT)
: getConstant(0, DL, VT);
case ISD::OR:
case ISD::SADDSAT:
case ISD::UADDSAT:
+ case ISD::UMAX:
// fold op(arg1, undef) -> an all-ones constant, fold op(arg1, poison) ->
// poison.
return N2.getOpcode() == ISD::POISON ? getPOISON(VT)
: getAllOnesConstant(DL, VT);
+ case ISD::SMAX:
+ return N2.getOpcode() == ISD::POISON
+ ? getPOISON(VT)
+ : getConstant(
+ APInt::getSignedMaxValue(VT.getScalarSizeInBits()), DL,
+ VT);
+ case ISD::SMIN:
+ return N2.getOpcode() == ISD::POISON
+ ? getPOISON(VT)
+ : getConstant(
+ APInt::getSignedMinValue(VT.getScalarSizeInBits()), DL,
+ VT);
}
}
diff --git a/llvm/test/CodeGen/AArch64/min-max-combine.ll b/llvm/test/CodeGen/AArch64/min-max-combine.ll
index 5111f838b73aa..8ae659c1abacf 100644
--- a/llvm/test/CodeGen/AArch64/min-max-combine.ll
+++ b/llvm/test/CodeGen/AArch64/min-max-combine.ll
@@ -197,3 +197,170 @@ define i64 @smini64_zero(i64 %a) {
%c = call i64 @llvm.smin.i64(i64 %a, i64 0)
ret i64 %c
}
+
+define i32 @smaxi32_poison(i32 %a) {
+; CHECK-ISEL-LABEL: smaxi32_poison:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: smaxi32_poison:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: cmp w0, w8
+; CHECK-GLOBAL-NEXT: csel w0, w0, w8, gt
+; CHECK-GLOBAL-NEXT: ret
+ %c = tail call i32 @llvm.smax.i32(i32 %a, i32 poison)
+ ret i32 %c
+}
+
+define i32 @smini32_poison(i32 %a) {
+; CHECK-ISEL-LABEL: smini32_poison:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: smini32_poison:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: cmp w0, w8
+; CHECK-GLOBAL-NEXT: csel w0, w0, w8, lt
+; CHECK-GLOBAL-NEXT: ret
+ %c = tail call i32 @llvm.smin.i32(i32 %a, i32 poison)
+ ret i32 %c
+}
+
+define i32 @umaxi32_poison(i32 %a) {
+; CHECK-ISEL-LABEL: umaxi32_poison:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: umaxi32_poison:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: cmp w0, w8
+; CHECK-GLOBAL-NEXT: csel w0, w0, w8, hi
+; CHECK-GLOBAL-NEXT: ret
+ %c = tail call i32 @llvm.umax.i32(i32 %a, i32 poison)
+ ret i32 %c
+}
+
+define i32 @umini32_poison(i32 %a) {
+; CHECK-ISEL-LABEL: umini32_poison:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: umini32_poison:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: cmp w0, w8
+; CHECK-GLOBAL-NEXT: csel w0, w0, w8, lo
+; CHECK-GLOBAL-NEXT: ret
+ %c = tail call i32 @llvm.umin.i32(i32 %a, i32 poison)
+ ret i32 %c
+}
+
+define i32 @smaxi32_undef(i32 %a) {
+; CHECK-ISEL-LABEL: smaxi32_undef:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: mov w0, #2147483647 // =0x7fffffff
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: smaxi32_undef:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: cmp w0, w8
+; CHECK-GLOBAL-NEXT: csel w0, w0, w8, gt
+; CHECK-GLOBAL-NEXT: ret
+ %c = tail call i32 @llvm.smax.i32(i32 %a, i32 undef)
+ ret i32 %c
+}
+
+define i32 @smini32_undef(i32 %a) {
+; CHECK-ISEL-LABEL: smini32_undef:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: mov w0, #-2147483648 // =0x80000000
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: smini32_undef:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: cmp w0, w8
+; CHECK-GLOBAL-NEXT: csel w0, w0, w8, lt
+; CHECK-GLOBAL-NEXT: ret
+ %c = tail call i32 @llvm.smin.i32(i32 %a, i32 undef)
+ ret i32 %c
+}
+
+define i32 @umaxi32_undef(i32 %a) {
+; CHECK-ISEL-LABEL: umaxi32_undef:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: mov w0, #-1 // =0xffffffff
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: umaxi32_undef:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: cmp w0, w8
+; CHECK-GLOBAL-NEXT: csel w0, w0, w8, hi
+; CHECK-GLOBAL-NEXT: ret
+ %c = tail call i32 @llvm.umax.i32(i32 %a, i32 undef)
+ ret i32 %c
+}
+
+define i32 @umini32_undef(i32 %a) {
+; CHECK-ISEL-LABEL: umini32_undef:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: mov w0, wzr
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: umini32_undef:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: cmp w0, w8
+; CHECK-GLOBAL-NEXT: csel w0, w0, w8, lo
+; CHECK-GLOBAL-NEXT: ret
+ %c = tail call i32 @llvm.umin.i32(i32 %a, i32 undef)
+ ret i32 %c
+}
+
+define i32 @smaxi32_undef_undef() {
+; CHECK-ISEL-LABEL: smaxi32_undef_undef:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: smaxi32_undef_undef:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: ret
+ %c = tail call i32 @llvm.smax.i32(i32 undef, i32 undef)
+ ret i32 %c
+}
+
+define i32 @smini32_undef_undef() {
+; CHECK-ISEL-LABEL: smini32_undef_undef:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: smini32_undef_undef:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: ret
+ %c = tail call i32 @llvm.smin.i32(i32 undef, i32 undef)
+ ret i32 %c
+}
+
+define i32 @umaxi32_undef_undef() {
+; CHECK-ISEL-LABEL: umaxi32_undef_undef:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: umaxi32_undef_undef:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: ret
+ %c = tail call i32 @llvm.umax.i32(i32 undef, i32 undef)
+ ret i32 %c
+}
+
+define i32 @umini32_undef_undef() {
+; CHECK-ISEL-LABEL: umini32_undef_undef:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: umini32_undef_undef:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: ret
+ %c = tail call i32 @llvm.umin.i32(i32 undef, i32 undef)
+ ret i32 %c
+}
+
+declare i32 @llvm.umax.i32(i32, i32) readnone
+declare i32 @llvm.umin.i32(i32, i32) readnone
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
index f0829b53168d9..4f00a8a99fac7 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
@@ -604,18 +604,18 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v8i8:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX7-SDAG-NEXT: v_max_i32_e32 v2, v2, v6
-; GFX7-SDAG-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v4
+; GFX7-SDAG-NEXT: v_max_i32_e32 v1, v1, v5
+; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -698,15 +698,15 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
; GFX9-SDAG-LABEL: test_vector_reduce_smax_v8i8:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX9-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX9-SDAG-NEXT: v_max_i16_sdwa v3, sext(v3), sext(v7) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_max3_i16 v1, v1, v5, v3
-; GFX9-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX9-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX9-SDAG-NEXT: v_max_i16_sdwa v2, sext(v2), sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX9-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX9-SDAG-NEXT: v_max_i16_sdwa v1, sext(v1), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_max3_i16 v1, v1, v3, v7
+; GFX9-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
+; GFX9-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX9-SDAG-NEXT: v_max_i16_sdwa v0, sext(v0), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-SDAG-NEXT: v_max3_i16 v0, v0, v4, v2
+; GFX9-SDAG-NEXT: v_max3_i16 v0, v0, v2, v6
; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX9-SDAG-NEXT: v_max_i16_sdwa v0, v0, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -741,20 +741,20 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
; GFX10-SDAG-LABEL: test_vector_reduce_smax_v8i8:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX10-SDAG-NEXT: v_max_i16 v3, v3, v7
+; GFX10-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX10-SDAG-NEXT: v_max3_i16 v1, v1, v5, v3
-; GFX10-SDAG-NEXT: v_bfe_i32 v3, v6, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX10-SDAG-NEXT: v_max_i16 v1, v1, v5
+; GFX10-SDAG-NEXT: v_max3_i16 v1, v1, v3, v7
+; GFX10-SDAG-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v4, v6, 0, 8
; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SDAG-NEXT: v_max_i16 v2, v2, v3
+; GFX10-SDAG-NEXT: v_max_i16 v0, v0, v3
; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-SDAG-NEXT: v_max3_i16 v0, v0, v4, v2
+; GFX10-SDAG-NEXT: v_max3_i16 v0, v0, v2, v4
; GFX10-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-SDAG-NEXT: v_max_i16 v0, v0, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -796,62 +796,62 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v8i8:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v3, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v1, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v8.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v1.l, v1.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v1.l, v5.l, v3.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v6, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.l, v5.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v7, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v5, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v6
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v5
; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.l, v2.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v8i8:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v1, v1, v5, v3
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v6, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v1, v1, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v6, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v2, v2, v3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v0, v0, v3
; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v0, v0, v2, v4
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v0, v0, v1
@@ -906,39 +906,39 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v3, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v1, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v8.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v1.l, v1.l, v3.l
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v1.l, v5.l, v3.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v6, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.l, v5.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v7, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v5, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
-; GFX12-SDAG-TRU...
[truncated]
``````````
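
The poison cases are worth a note: because op(x, poison) now folds straight to poison, the CHECK-ISEL output for the new `*_poison` tests above is just `ret`, i.e. ISel is free to leave w0 untouched. A minimal sketch, with a hypothetical function name:

```llvm
; With this patch, the umin below folds to poison during DAG construction, so
; the AArch64 SelectionDAG output is a bare return (see umini32_poison above).
declare i32 @llvm.umin.i32(i32, i32)

define i32 @umin_poison_example(i32 %a) {
  %r = call i32 @llvm.umin.i32(i32 %a, i32 poison)
  ret i32 %r
}
```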
https://github.com/llvm/llvm-project/pull/149334