[llvm] DAG: Pass flags to FoldConstantArithmetic (PR #93663)
via llvm-commits
llvm-commits at lists.llvm.org
Wed May 29 03:35:56 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
There is simply way too much going on inside getNode. The complicated vector
constant-folding path works by looking for build_vector operands, calling
getNode on the scalar elements, and then checking whether the results are
constants. As a side effect, this creates unused scalar operation nodes,
which previously carried no flags. If the vector operation was later
scalarized, it would find the flagless constant-folding temporary and lose
the flag. I don't think this is a reasonable way for constant folding to
operate, but for now fix it by ensuring the flags on the original operation
are preserved on the temporary.
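As a rough illustration of the change, here is a condensed sketch of the affected code paths (paraphrased from the diff below, with helper bodies and unrelated details elided): the node's SDNodeFlags are threaded through FoldConstantArithmetic and into the getNode call that builds the scalar temporary.

```cpp
// Condensed sketch of the patch (see the full diff below); elided parts
// are marked with "...".
SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
                                             EVT VT, ArrayRef<SDValue> Ops,
                                             SDNodeFlags Flags) {
  // ... gather the scalar elements of the BUILD_VECTOR operands ...

  // Constant fold the scalar operands. Forwarding Flags means the temporary
  // scalar node carries the same flags (e.g. nnan) as the vector operation,
  // so a later scalarization that finds this node no longer drops them.
  SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags);

  // ... legalize the scalar constant and rebuild the vector result ...
}

// Callers pass along the flags of the node being constant folded:
//   if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags))
//     return SV;
```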
This yields a clear code improvement for AMDGPU when f16 isn't legal. The
WebAssembly cases switch from a libcall to a compare-and-select sequence; we
are evidently missing the fcmp+select to fminimum/fmaximum handling, so
these cases would improve further once that is added. AArch64 also avoids
the libcall, but the output looks worse and uses a different call for some
reason.
---
Patch is 34.35 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93663.diff
7 Files Affected:
- (modified) llvm/include/llvm/CodeGen/SelectionDAG.h (+2-1)
- (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp (+2-1)
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+3-3)
- (modified) llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll (+13-1)
- (modified) llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll (+13-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll (+48-90)
- (modified) llvm/test/CodeGen/WebAssembly/simd-arith.ll (+122-98)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 0dc237301abb4..23358009c7ad0 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1880,7 +1880,8 @@ class SelectionDAG {
const SDNode *N2);
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
- ArrayRef<SDValue> Ops);
+ ArrayRef<SDValue> Ops,
+ SDNodeFlags Flags = SDNodeFlags());
/// Fold floating-point operations when all operands are constants and/or
/// undefined.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index bfc2273c9425c..51f2cf9017f85 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4159,7 +4159,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
"expanded.");
EVT CCVT = getSetCCResultType(CmpVT);
SDValue Cond = DAG.getNode(ISD::SETCC, dl, CCVT, Tmp1, Tmp2, CC, Node->getFlags());
- Results.push_back(DAG.getSelect(dl, VT, Cond, Tmp3, Tmp4));
+ Results.push_back(
+ DAG.getSelect(dl, VT, Cond, Tmp3, Tmp4, Node->getFlags()));
break;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b05649c6ce955..af29046040617 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6306,7 +6306,7 @@ bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
}
SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
- EVT VT, ArrayRef<SDValue> Ops) {
+ EVT VT, ArrayRef<SDValue> Ops, SDNodeFlags Flags) {
// If the opcode is a target-specific ISD node, there's nothing we can
// do here and the operand rules may not line up with the below, so
// bail early.
@@ -6663,7 +6663,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
}
// Constant fold the scalar operands.
- SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps);
+ SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags);
// Legalize the (integer) scalar constant if necessary.
if (LegalSVT != SVT)
@@ -7234,7 +7234,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
// Perform trivial constant folding.
- if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}))
+ if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags))
return SV;
// Canonicalize an UNDEF to the RHS, even over a constant.
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
index 4c02a5240ba6a..c993051ccebf7 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
@@ -648,7 +648,19 @@ define float @test_v3f32_ninf(<3 x float> %a) nounwind {
define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
; CHECK-LABEL: test_v2f128:
; CHECK: // %bb.0:
-; CHECK-NEXT: b fmaxl
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-NEXT: bl __gttf2
+; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: b.le .LBB18_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: .LBB18_2:
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
%b = call nnan fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a)
ret fp128 %b
}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
index 18d40cb18ba60..0116be51dd696 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
@@ -648,7 +648,19 @@ define float @test_v3f32_ninf(<3 x float> %a) nounwind {
define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
; CHECK-LABEL: test_v2f128:
; CHECK: // %bb.0:
-; CHECK-NEXT: b fminl
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: b.ge .LBB18_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: .LBB18_2:
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
%b = call nnan fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a)
ret fp128 %b
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 7d7a462597104..0c13c5348879b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -654,21 +654,16 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
; GFX7-LABEL: v_maximum_v2f16__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000
-; GFX7-NEXT: v_max_f32_e32 v4, v0, v2
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX7-NEXT: v_max_f32_e32 v2, v1, v3
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_v2f16__nnan:
@@ -847,21 +842,16 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
; GFX7-LABEL: v_maximum_v2f16__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000
-; GFX7-NEXT: v_max_f32_e32 v4, v0, v2
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX7-NEXT: v_max_f32_e32 v2, v1, v3
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_v2f16__nnan_nsz:
@@ -1216,28 +1206,21 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
; GFX7-LABEL: v_maximum_v3f16__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_max_f32_e32 v6, v0, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX7-NEXT: v_max_f32_e32 v3, v1, v4
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX7-NEXT: v_max_f32_e32 v3, v2, v5
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_v3f16__nnan:
@@ -1427,28 +1410,21 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
; GFX7-LABEL: v_maximum_v3f16__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_max_f32_e32 v6, v0, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX7-NEXT: v_max_f32_e32 v3, v1, v4
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX7-NEXT: v_max_f32_e32 v3, v2, v5
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_v3f16__nnan_nsz:
@@ -1671,35 +1647,26 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
; GFX7-LABEL: v_maximum_v4f16__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_max_f32_e32 v8, v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX7-NEXT: v_max_f32_e32 v4, v1, v5
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
-; GFX7-NEXT: v_max_f32_e32 v4, v2, v6
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc
-; GFX7-NEXT: v_max_f32_e32 v4, v3, v7
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_v4f16__nnan:
@@ -1924,35 +1891,26 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1)
; GFX7-LABEL: v_maximum_v4f16__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_max_f32_e32 v8, v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX7-NEXT: v_max_f32_e32 v4, v1, v5
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
-; GFX7-NEXT: v_max_f32_e32 v4, v2, v6
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc
-; GFX7-NEXT: v_max_f32_e32 v4, v3, v7
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_v4f16__nnan_nsz:
diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index 761a75418a00f..67388b688e3bb 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -11788,27 +11788,35 @@ define <4 x float> @minnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: minnum_intrinsic_v4f32:
; NO-SIMD128: .functype minnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: call $push0=, fminf, $4, $8
-; NO-SIMD128-NEXT: f32.store 12($0), $pop0
-; NO-SIMD128-NEXT: call $push1=, fminf, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: call $push2=, fminf, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop2
-; NO-SIMD128-NEXT: call $push3=, fminf, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop3
+; NO-SIMD128-NEXT: f32.lt $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.lt $push2=, $3, $7
+; NO-SIMD128-NEXT: f32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT: f32.store 8($0), $pop3
+; NO-SIMD128-NEXT: f32.lt $push4=, $2, $6
+; NO-SIMD128-NEXT: f32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT: f32.store 4($0), $pop5
+; NO-SIMD128-NEXT: f32.lt $push6=, $1, $5
+; NO-SIMD128-NEXT: f32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: minnum_intrinsic_v4f32:
; NO-SIMD128-FAST: .functype minnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
-; NO-SIMD128-FAST-NEXT: call $push0=, fminf, $1, $5
-; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop0
-; NO-SIMD128-FAST-NEXT: call $push1=, fminf, $2, $6
-; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $3, $7
-; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
+; NO-SIMD128-FAST-NEXT: f32.lt $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT: f32.select $push1=, $1, $5, $pop0
+; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT: f32.lt $push2=, $2, $6
+; NO-SIMD128-FAST-NEXT: f32.select $push3=, $2, $6, $pop2
+; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT: f32.lt $push4=, $3, $7
+; NO-SIMD128-FAST-NEXT: f32.select $push5=, $3, $7, $pop4
+; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT: f32.lt $push6=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.select $push7=, $4, $8, $pop6
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -11830,26 +11838,26 @@ define <4 x float> @minnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: minnum_nsz_intrinsic_v4f32:
; NO-SIMD128: .functype minnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: call $push0=, fminf, $4, $8
+; NO-SIMD128-NEXT: f32.min $push0=, $4, $8
; NO-SIMD128-NEXT: f32.store 12($0), $pop0
-; NO-SIMD128-NEXT: call $push1=, fminf, $3, $7
+; NO-SIMD128-NEXT: f32.min $push1=, $3, $7
; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: call $push2=, fminf, $2, $6
+; NO-SIMD128-NEXT: f32.min $push2=, $2, $6
; NO-SIMD128-NEXT: f32.store 4($0), $pop2
-; NO-SIMD128-NEXT: call $push3=, fminf, $1, $5
+; NO-SIMD128-NEXT: f32.min $push3=, $1, $5
; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: minnum_nsz_intrinsic_v4f32:
; NO-SIMD128-FAST: .functype minnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
-; NO-SIMD128-FAST-NEXT: call $push0=, fminf, $1, $5
+; NO-SIMD128-FAST-NEXT: f32.min $push0=, $1, $5
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop0
-; NO-SIMD128-FAST-NEXT: call $push1=, fminf, $2, $6
+; NO-SIMD128-FAST-NEXT: f32.min $push1=, $2, $6
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $3, $7
+; NO-SIMD128-FAST-NEXT: f32.min $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.min $push3=, $4, $8
; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call nnan nsz <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y)
@@ -11875,16 +11883,16 @@ define <4 x float> @fminnumv432_non_zero_intrinsic(<4 x float> %x) {
; NO-SIMD128: .functype fminnumv432_non_zero_intrinsic (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0
-; NO-SIMD128-NEXT: call $push1=, fminf, $4, $pop0
+; NO-SIMD128-NEXT: f32.min $push1=, $4, $pop0
; NO-SIMD128-NEXT: f32.store 12($0), $pop1
; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0
-; NO-SIMD128-NEXT: call $push2=, fminf, $3, $pop7
+; NO-SIMD128-NEXT: f32.min $push2=, $3, $pop7
; NO-SIMD128-NEXT: f32.store 8($0),...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/93663