[llvm] [WebAssembly] Fold fadd contract (fmul contract) to relaxed madd w/ -mattr=+simd128,+relaxed-simd (PR #147487)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 11 02:02:08 PDT 2025
https://github.com/badumbatish updated https://github.com/llvm/llvm-project/pull/147487
>From 683fae7878c6c9250bf7142a2fd16170aa734f71 Mon Sep 17 00:00:00 2001
From: Jasmine Tang <jjasmine at igalia.com>
Date: Tue, 8 Jul 2025 01:05:36 -0700
Subject: [PATCH 1/6] Precommit test for #121311
---
.../CodeGen/WebAssembly/simd-relaxed-fma.ll | 66 +++++++++++++++++++
1 file changed, 66 insertions(+)
create mode 100644 llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
new file mode 100644
index 0000000000000..ea3ee2a33cfa4
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128,+relaxed-simd | FileCheck %s
+target triple = "wasm32"
+define void @fma_seperate(ptr %a, ptr %b, ptr %c, ptr %dest) {
+; CHECK-LABEL: fma_seperate:
+; CHECK: .functype fma_seperate (i32, i32, i32, i32) -> ()
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: v128.load $push1=, 0($1):p2align=0
+; CHECK-NEXT: v128.load $push0=, 0($0):p2align=0
+; CHECK-NEXT: f32x4.mul $push2=, $pop1, $pop0
+; CHECK-NEXT: v128.load $push3=, 0($2):p2align=0
+; CHECK-NEXT: f32x4.add $push4=, $pop2, $pop3
+; CHECK-NEXT: v128.store 0($3):p2align=0, $pop4
+; CHECK-NEXT: return
+entry:
+ %0 = load <4 x float>, ptr %a, align 1
+ %1 = load <4 x float>, ptr %b, align 1
+ %2 = load <4 x float>, ptr %c, align 1
+ %mul.i = fmul fast <4 x float> %1, %0
+ %add.i = fadd fast <4 x float> %mul.i, %2
+ store <4 x float> %add.i, ptr %dest, align 1
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define void @fma_llvm(ptr %a, ptr %b, ptr %c, ptr %dest) {
+; CHECK-LABEL: fma_llvm:
+; CHECK: .functype fma_llvm (i32, i32, i32, i32) -> ()
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: v128.load $push25=, 0($0):p2align=0
+; CHECK-NEXT: local.tee $push24=, $6=, $pop25
+; CHECK-NEXT: f32x4.extract_lane $push2=, $pop24, 0
+; CHECK-NEXT: v128.load $push23=, 0($1):p2align=0
+; CHECK-NEXT: local.tee $push22=, $5=, $pop23
+; CHECK-NEXT: f32x4.extract_lane $push1=, $pop22, 0
+; CHECK-NEXT: v128.load $push21=, 0($2):p2align=0
+; CHECK-NEXT: local.tee $push20=, $4=, $pop21
+; CHECK-NEXT: f32x4.extract_lane $push0=, $pop20, 0
+; CHECK-NEXT: call $push3=, fmaf, $pop2, $pop1, $pop0
+; CHECK-NEXT: f32x4.splat $push4=, $pop3
+; CHECK-NEXT: f32x4.extract_lane $push7=, $6, 1
+; CHECK-NEXT: f32x4.extract_lane $push6=, $5, 1
+; CHECK-NEXT: f32x4.extract_lane $push5=, $4, 1
+; CHECK-NEXT: call $push8=, fmaf, $pop7, $pop6, $pop5
+; CHECK-NEXT: f32x4.replace_lane $push9=, $pop4, 1, $pop8
+; CHECK-NEXT: f32x4.extract_lane $push12=, $6, 2
+; CHECK-NEXT: f32x4.extract_lane $push11=, $5, 2
+; CHECK-NEXT: f32x4.extract_lane $push10=, $4, 2
+; CHECK-NEXT: call $push13=, fmaf, $pop12, $pop11, $pop10
+; CHECK-NEXT: f32x4.replace_lane $push14=, $pop9, 2, $pop13
+; CHECK-NEXT: f32x4.extract_lane $push17=, $6, 3
+; CHECK-NEXT: f32x4.extract_lane $push16=, $5, 3
+; CHECK-NEXT: f32x4.extract_lane $push15=, $4, 3
+; CHECK-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15
+; CHECK-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18
+; CHECK-NEXT: v128.store 0($3):p2align=0, $pop19
+; CHECK-NEXT: return
+entry:
+ %0 = load <4 x float>, ptr %a, align 1
+ %1 = load <4 x float>, ptr %b, align 1
+ %2 = load <4 x float>, ptr %c, align 1
+ %fma = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+ store <4 x float> %fma, ptr %dest, align 1
+ ret void
+}
>From b1c4c01dd18259980d8faae6a9e4f71cb30208c6 Mon Sep 17 00:00:00 2001
From: Jasmine Tang <jjasmine at igalia.com>
Date: Tue, 8 Jul 2025 01:49:37 -0700
Subject: [PATCH 2/6] [WASM] Optimize fma when relaxed and ffast-math
Fixes #121311, which folds a series of multiply and add to wasm.fma when
we have -mrelaxed-simd and -ffast-math.
Also attempted to use wasm.fma instead of the built-in llvm.fma
---
llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 1 +
.../WebAssembly/WebAssemblyISelLowering.cpp | 41 +++++++++++++++++++
.../CodeGen/WebAssembly/simd-relaxed-fma.ll | 39 ++++--------------
3 files changed, 50 insertions(+), 31 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index a3675eecfea3f..ec566b168bc3d 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -475,6 +475,7 @@ struct SDNodeFlags {
bool hasAllowReassociation() const { return Flags & AllowReassociation; }
bool hasNoFPExcept() const { return Flags & NoFPExcept; }
bool hasUnpredictable() const { return Flags & Unpredictable; }
+ bool hasFastMath() const { return Flags & FastMathFlags; }
bool operator==(const SDNodeFlags &Other) const {
return Flags == Other.Flags;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index bf2e04caa0a61..ef0146f28aba1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -182,6 +182,12 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// SIMD-specific configuration
if (Subtarget->hasSIMD128()) {
+ // Enable fma optimization for wasm relaxed simd
+ if (Subtarget->hasRelaxedSIMD()) {
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FMA);
+ }
+
// Combine partial.reduce.add before legalization gets confused.
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
@@ -3412,6 +3418,37 @@ static SDValue performSETCCCombine(SDNode *N,
return SDValue();
}
+static SDValue performFAddCombine(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == ISD::FADD);
+ using namespace llvm::SDPatternMatch;
+ if (!N->getFlags().hasFastMath())
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue A, B, C;
+ EVT VecVT = N->getValueType(0);
+ if (sd_match(N, m_FAdd(m_Value(A), m_FMul(m_Value(B), m_Value(C)))))
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, VecVT,
+ {DAG.getConstant(Intrinsic::wasm_relaxed_madd, DL, MVT::i32), A, B, C});
+
+ return SDValue();
+}
+
+static SDValue performFMACombine(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == ISD::FMA);
+ if (!N->getFlags().hasFastMath())
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue A = N->getOperand(0), B = N->getOperand(1), C = N->getOperand(2);
+ EVT VecVT = N->getValueType(0);
+
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, VecVT,
+ {DAG.getConstant(Intrinsic::wasm_relaxed_madd, DL, MVT::i32), A, B, C});
+}
+
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::MUL);
EVT VT = N->getValueType(0);
@@ -3529,6 +3566,10 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
return AnyAllCombine;
return performLowerPartialReduction(N, DCI.DAG);
}
+ case ISD::FADD:
+ return performFAddCombine(N, DCI.DAG);
+ case ISD::FMA:
+ return performFMACombine(N, DCI.DAG);
case ISD::MUL:
return performMulCombine(N, DCI.DAG);
}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
index ea3ee2a33cfa4..fe5e8573f12b4 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
@@ -6,12 +6,11 @@ define void @fma_seperate(ptr %a, ptr %b, ptr %c, ptr %dest) {
; CHECK-LABEL: fma_seperate:
; CHECK: .functype fma_seperate (i32, i32, i32, i32) -> ()
; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: v128.load $push2=, 0($2):p2align=0
; CHECK-NEXT: v128.load $push1=, 0($1):p2align=0
; CHECK-NEXT: v128.load $push0=, 0($0):p2align=0
-; CHECK-NEXT: f32x4.mul $push2=, $pop1, $pop0
-; CHECK-NEXT: v128.load $push3=, 0($2):p2align=0
-; CHECK-NEXT: f32x4.add $push4=, $pop2, $pop3
-; CHECK-NEXT: v128.store 0($3):p2align=0, $pop4
+; CHECK-NEXT: f32x4.relaxed_madd $push3=, $pop2, $pop1, $pop0
+; CHECK-NEXT: v128.store 0($3):p2align=0, $pop3
; CHECK-NEXT: return
entry:
%0 = load <4 x float>, ptr %a, align 1
@@ -28,33 +27,11 @@ define void @fma_llvm(ptr %a, ptr %b, ptr %c, ptr %dest) {
; CHECK-LABEL: fma_llvm:
; CHECK: .functype fma_llvm (i32, i32, i32, i32) -> ()
; CHECK-NEXT: # %bb.0: # %entry
-; CHECK-NEXT: v128.load $push25=, 0($0):p2align=0
-; CHECK-NEXT: local.tee $push24=, $6=, $pop25
-; CHECK-NEXT: f32x4.extract_lane $push2=, $pop24, 0
-; CHECK-NEXT: v128.load $push23=, 0($1):p2align=0
-; CHECK-NEXT: local.tee $push22=, $5=, $pop23
-; CHECK-NEXT: f32x4.extract_lane $push1=, $pop22, 0
-; CHECK-NEXT: v128.load $push21=, 0($2):p2align=0
-; CHECK-NEXT: local.tee $push20=, $4=, $pop21
-; CHECK-NEXT: f32x4.extract_lane $push0=, $pop20, 0
-; CHECK-NEXT: call $push3=, fmaf, $pop2, $pop1, $pop0
-; CHECK-NEXT: f32x4.splat $push4=, $pop3
-; CHECK-NEXT: f32x4.extract_lane $push7=, $6, 1
-; CHECK-NEXT: f32x4.extract_lane $push6=, $5, 1
-; CHECK-NEXT: f32x4.extract_lane $push5=, $4, 1
-; CHECK-NEXT: call $push8=, fmaf, $pop7, $pop6, $pop5
-; CHECK-NEXT: f32x4.replace_lane $push9=, $pop4, 1, $pop8
-; CHECK-NEXT: f32x4.extract_lane $push12=, $6, 2
-; CHECK-NEXT: f32x4.extract_lane $push11=, $5, 2
-; CHECK-NEXT: f32x4.extract_lane $push10=, $4, 2
-; CHECK-NEXT: call $push13=, fmaf, $pop12, $pop11, $pop10
-; CHECK-NEXT: f32x4.replace_lane $push14=, $pop9, 2, $pop13
-; CHECK-NEXT: f32x4.extract_lane $push17=, $6, 3
-; CHECK-NEXT: f32x4.extract_lane $push16=, $5, 3
-; CHECK-NEXT: f32x4.extract_lane $push15=, $4, 3
-; CHECK-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15
-; CHECK-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18
-; CHECK-NEXT: v128.store 0($3):p2align=0, $pop19
+; CHECK-NEXT: v128.load $push2=, 0($0):p2align=0
+; CHECK-NEXT: v128.load $push1=, 0($1):p2align=0
+; CHECK-NEXT: v128.load $push0=, 0($2):p2align=0
+; CHECK-NEXT: f32x4.relaxed_madd $push3=, $pop2, $pop1, $pop0
+; CHECK-NEXT: v128.store 0($3):p2align=0, $pop3
; CHECK-NEXT: return
entry:
%0 = load <4 x float>, ptr %a, align 1
>From 4aa43a19f628d5518b0cd9775b70be850ee316a8 Mon Sep 17 00:00:00 2001
From: Jasmine Tang <jjasmine at igalia.com>
Date: Tue, 8 Jul 2025 17:05:44 -0700
Subject: [PATCH 3/6] [WASM] Fix nits for PR 147487
- Fix inefficient wasm test case.
- Added scalar test case and more floating type.
- Remove total ffast checking -> allowContract
---
llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 1 -
.../WebAssembly/WebAssemblyISelLowering.cpp | 22 ++-
.../CodeGen/WebAssembly/simd-relaxed-fma.ll | 135 +++++++++++++-----
3 files changed, 114 insertions(+), 44 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index ec566b168bc3d..a3675eecfea3f 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -475,7 +475,6 @@ struct SDNodeFlags {
bool hasAllowReassociation() const { return Flags & AllowReassociation; }
bool hasNoFPExcept() const { return Flags & NoFPExcept; }
bool hasUnpredictable() const { return Flags & Unpredictable; }
- bool hasFastMath() const { return Flags & FastMathFlags; }
bool operator==(const SDNodeFlags &Other) const {
return Flags == Other.Flags;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index ef0146f28aba1..a79fb0781cd4d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3421,12 +3421,21 @@ static SDValue performSETCCCombine(SDNode *N,
static SDValue performFAddCombine(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::FADD);
using namespace llvm::SDPatternMatch;
- if (!N->getFlags().hasFastMath())
+
+ EVT VecVT = N->getValueType(0);
+
+ // WebAssembly doesn't have scalar fma yet
+ // https://github.com/WebAssembly/design/issues/1391
+ if (!VecVT.isVector())
+ return SDValue();
+
+ // Allows fp fusing
+ if (!N->getFlags().hasAllowContract())
return SDValue();
SDLoc DL(N);
SDValue A, B, C;
- EVT VecVT = N->getValueType(0);
+
if (sd_match(N, m_FAdd(m_Value(A), m_FMul(m_Value(B), m_Value(C)))))
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, VecVT,
@@ -3437,12 +3446,17 @@ static SDValue performFAddCombine(SDNode *N, SelectionDAG &DAG) {
static SDValue performFMACombine(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::FMA);
- if (!N->getFlags().hasFastMath())
+
+ EVT VecVT = N->getValueType(0);
+ if (!VecVT.isVector())
+ return SDValue();
+
+ // Allows fp fusing
+ if (!N->getFlags().hasAllowContract())
return SDValue();
SDLoc DL(N);
SDValue A = N->getOperand(0), B = N->getOperand(1), C = N->getOperand(2);
- EVT VecVT = N->getValueType(0);
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, VecVT,
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
index fe5e8573f12b4..882b0538af74f 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
@@ -1,43 +1,100 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128,+relaxed-simd | FileCheck %s
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128,+relaxed-simd | FileCheck %s
target triple = "wasm32"
-define void @fma_seperate(ptr %a, ptr %b, ptr %c, ptr %dest) {
-; CHECK-LABEL: fma_seperate:
-; CHECK: .functype fma_seperate (i32, i32, i32, i32) -> ()
-; CHECK-NEXT: # %bb.0: # %entry
-; CHECK-NEXT: v128.load $push2=, 0($2):p2align=0
-; CHECK-NEXT: v128.load $push1=, 0($1):p2align=0
-; CHECK-NEXT: v128.load $push0=, 0($0):p2align=0
-; CHECK-NEXT: f32x4.relaxed_madd $push3=, $pop2, $pop1, $pop0
-; CHECK-NEXT: v128.store 0($3):p2align=0, $pop3
-; CHECK-NEXT: return
-entry:
- %0 = load <4 x float>, ptr %a, align 1
- %1 = load <4 x float>, ptr %b, align 1
- %2 = load <4 x float>, ptr %c, align 1
- %mul.i = fmul fast <4 x float> %1, %0
- %add.i = fadd fast <4 x float> %mul.i, %2
- store <4 x float> %add.i, ptr %dest, align 1
- ret void
-}
-
-; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define void @fma_llvm(ptr %a, ptr %b, ptr %c, ptr %dest) {
-; CHECK-LABEL: fma_llvm:
-; CHECK: .functype fma_llvm (i32, i32, i32, i32) -> ()
-; CHECK-NEXT: # %bb.0: # %entry
-; CHECK-NEXT: v128.load $push2=, 0($0):p2align=0
-; CHECK-NEXT: v128.load $push1=, 0($1):p2align=0
-; CHECK-NEXT: v128.load $push0=, 0($2):p2align=0
-; CHECK-NEXT: f32x4.relaxed_madd $push3=, $pop2, $pop1, $pop0
-; CHECK-NEXT: v128.store 0($3):p2align=0, $pop3
-; CHECK-NEXT: return
-entry:
- %0 = load <4 x float>, ptr %a, align 1
- %1 = load <4 x float>, ptr %b, align 1
- %2 = load <4 x float>, ptr %c, align 1
- %fma = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
- store <4 x float> %fma, ptr %dest, align 1
- ret void
+define <4 x float> @fma_vector_4xf32_seperate(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: fma_vector_4xf32_seperate:
+; CHECK: .functype fma_vector_4xf32_seperate (v128, v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: f32x4.relaxed_madd $push0=, $2, $1, $0
+; CHECK-NEXT: return $pop0
+entry:
+ %mul.i = fmul fast <4 x float> %b, %a
+ %add.i = fadd fast <4 x float> %mul.i, %c
+ ret <4 x float> %add.i
+}
+
+define <4 x float> @fma_vector_4xf32_llvm(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: fma_vector_4xf32_llvm:
+; CHECK: .functype fma_vector_4xf32_llvm (v128, v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: f32x4.relaxed_madd $push0=, $0, $1, $2
+; CHECK-NEXT: return $pop0
+entry:
+ %fma = tail call fast <4 x float> @llvm.fma(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+ ret <4 x float> %fma
+}
+
+
+define <2 x double> @fma_vector_2xf64_seperate(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: fma_vector_2xf64_seperate:
+; CHECK: .functype fma_vector_2xf64_seperate (v128, v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: f64x2.relaxed_madd $push0=, $2, $1, $0
+; CHECK-NEXT: return $pop0
+entry:
+ %mul.i = fmul fast <2 x double> %b, %a
+ %add.i = fadd fast <2 x double> %mul.i, %c
+ ret <2 x double> %add.i
+}
+
+define <2 x double> @fma_vector_2xf64_llvm(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: fma_vector_2xf64_llvm:
+; CHECK: .functype fma_vector_2xf64_llvm (v128, v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: f64x2.relaxed_madd $push0=, $0, $1, $2
+; CHECK-NEXT: return $pop0
+entry:
+ %fma = tail call fast <2 x double> @llvm.fma(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+ ret <2 x double> %fma
+}
+
+
+define float @fma_scalar_f32_seperate(float %a, float %b, float %c) {
+; CHECK-LABEL: fma_scalar_f32_seperate:
+; CHECK: .functype fma_scalar_f32_seperate (f32, f32, f32) -> (f32)
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: f32.mul $push0=, $1, $0
+; CHECK-NEXT: f32.add $push1=, $pop0, $2
+; CHECK-NEXT: return $pop1
+entry:
+ %mul.i = fmul fast float %b, %a
+ %add.i = fadd fast float %mul.i, %c
+ ret float %add.i
+}
+
+define float @fma_scalar_f32_llvm(float %a, float %b, float %c) {
+; CHECK-LABEL: fma_scalar_f32_llvm:
+; CHECK: .functype fma_scalar_f32_llvm (f32, f32, f32) -> (f32)
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: call $push0=, fmaf, $0, $1, $2
+; CHECK-NEXT: return $pop0
+entry:
+ %fma = tail call fast float @llvm.fma(float %a, float %b, float %c)
+ ret float %fma
+}
+
+
+define double @fma_scalar_f64_seperate(double %a, double %b, double %c) {
+; CHECK-LABEL: fma_scalar_f64_seperate:
+; CHECK: .functype fma_scalar_f64_seperate (f64, f64, f64) -> (f64)
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: f64.mul $push0=, $1, $0
+; CHECK-NEXT: f64.add $push1=, $pop0, $2
+; CHECK-NEXT: return $pop1
+entry:
+ %mul.i = fmul fast double %b, %a
+ %add.i = fadd fast double %mul.i, %c
+ ret double %add.i
+}
+
+define double @fma_scalar_f64_llvm(double %a, double %b, double %c) {
+; CHECK-LABEL: fma_scalar_f64_llvm:
+; CHECK: .functype fma_scalar_f64_llvm (f64, f64, f64) -> (f64)
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: call $push0=, fma, $0, $1, $2
+; CHECK-NEXT: return $pop0
+entry:
+ %fma = tail call fast double @llvm.fma(double %a, double %b, double %c)
+ ret double %fma
}
>From 02569b69a380994e9a9182f904660e4096a3df21 Mon Sep 17 00:00:00 2001
From: Jasmine Tang <jjasmine at igalia.com>
Date: Tue, 8 Jul 2025 17:37:26 -0700
Subject: [PATCH 4/6] [WASM] Add more vector widths for PR 147487
- Added support for <8 x f32>.
- Refactored out condition for relaxed simd to a separate function.
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 48 ++++++++++++-------
.../CodeGen/WebAssembly/simd-relaxed-fma.ll | 30 ++++++++++++
2 files changed, 60 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index a79fb0781cd4d..15038df6d5f6c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -27,6 +27,7 @@
#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
@@ -3417,25 +3418,40 @@ static SDValue performSETCCCombine(SDNode *N,
}
return SDValue();
}
+static bool canRelaxSimd(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VecVT = N->getValueType(0);
-static SDValue performFAddCombine(SDNode *N, SelectionDAG &DAG) {
+ // INFO: WebAssembly doesn't have scalar fma yet
+ // https://github.com/WebAssembly/design/issues/1391
+ if (!VecVT.isVector())
+ return false;
+
+ // Allows fp fusing
+ if (!N->getFlags().hasAllowContract())
+ return false;
+
+ if (N->getValueType(0).bitsGT(MVT::f128))
+ return false;
+
+ return true;
+}
+static SDValue performFAddCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
assert(N->getOpcode() == ISD::FADD);
using namespace llvm::SDPatternMatch;
- EVT VecVT = N->getValueType(0);
-
- // WebAssembly doesn't have scalar fma yet
+ // INFO: WebAssembly doesn't have scalar fma yet
// https://github.com/WebAssembly/design/issues/1391
+ EVT VecVT = N->getValueType(0);
if (!VecVT.isVector())
return SDValue();
- // Allows fp fusing
- if (!N->getFlags().hasAllowContract())
+ if (!canRelaxSimd(N, DCI))
return SDValue();
SDLoc DL(N);
SDValue A, B, C;
-
+ SelectionDAG &DAG = DCI.DAG;
if (sd_match(N, m_FAdd(m_Value(A), m_FMul(m_Value(B), m_Value(C)))))
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, VecVT,
@@ -3444,22 +3460,18 @@ static SDValue performFAddCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-static SDValue performFMACombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performFMACombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
assert(N->getOpcode() == ISD::FMA);
- EVT VecVT = N->getValueType(0);
- if (!VecVT.isVector())
- return SDValue();
-
- // Allows fp fusing
- if (!N->getFlags().hasAllowContract())
+ if (!canRelaxSimd(N, DCI))
return SDValue();
SDLoc DL(N);
SDValue A = N->getOperand(0), B = N->getOperand(1), C = N->getOperand(2);
-
+ SelectionDAG &DAG = DCI.DAG;
return DAG.getNode(
- ISD::INTRINSIC_WO_CHAIN, DL, VecVT,
+ ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
{DAG.getConstant(Intrinsic::wasm_relaxed_madd, DL, MVT::i32), A, B, C});
}
@@ -3581,9 +3593,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
return performLowerPartialReduction(N, DCI.DAG);
}
case ISD::FADD:
- return performFAddCombine(N, DCI.DAG);
+ return performFAddCombine(N, DCI);
case ISD::FMA:
- return performFMACombine(N, DCI.DAG);
+ return performFMACombine(N, DCI);
case ISD::MUL:
return performMulCombine(N, DCI.DAG);
}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
index 882b0538af74f..e4bd6a3a8cda6 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
@@ -26,6 +26,36 @@ entry:
}
+define <8 x float> @fma_vector_8xf32_seperate(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; CHECK-LABEL: fma_vector_8xf32_seperate:
+; CHECK: .functype fma_vector_8xf32_seperate (i32, v128, v128, v128, v128, v128, v128) -> ()
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: f32x4.relaxed_madd $push0=, $6, $4, $2
+; CHECK-NEXT: v128.store 16($0), $pop0
+; CHECK-NEXT: f32x4.relaxed_madd $push1=, $5, $3, $1
+; CHECK-NEXT: v128.store 0($0), $pop1
+; CHECK-NEXT: return
+entry:
+ %mul.i = fmul fast <8 x float> %b, %a
+ %add.i = fadd fast <8 x float> %mul.i, %c
+ ret <8 x float> %add.i
+}
+
+define <8 x float> @fma_vector_8xf32_llvm(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; CHECK-LABEL: fma_vector_8xf32_llvm:
+; CHECK: .functype fma_vector_8xf32_llvm (i32, v128, v128, v128, v128, v128, v128) -> ()
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: f32x4.relaxed_madd $push0=, $2, $4, $6
+; CHECK-NEXT: v128.store 16($0), $pop0
+; CHECK-NEXT: f32x4.relaxed_madd $push1=, $1, $3, $5
+; CHECK-NEXT: v128.store 0($0), $pop1
+; CHECK-NEXT: return
+entry:
+ %fma = tail call fast <8 x float> @llvm.fma(<8 x float> %a, <8 x float> %b, <8 x float> %c)
+ ret <8 x float> %fma
+}
+
+
define <2 x double> @fma_vector_2xf64_seperate(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: fma_vector_2xf64_seperate:
; CHECK: .functype fma_vector_2xf64_seperate (v128, v128, v128) -> (v128)
>From 63dc901bee2bd654b85a89cfe3cd37e2ce341faa Mon Sep 17 00:00:00 2001
From: Jasmine Tang <jjasmine at igalia.com>
Date: Thu, 10 Jul 2025 11:44:56 -0700
Subject: [PATCH 5/6] Use tablegen instead of cpp, relax only separate fmul
fadd
- Use tablegen instead of cpp
- Relax only separate fmul fadd instead of fma.
- Fix test name, add negative testing, add fmuladd tests, both fast and
nonfast
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 67 ----
.../WebAssembly/WebAssemblyInstrSIMD.td | 23 ++
.../CodeGen/WebAssembly/simd-relaxed-fma.ll | 294 +++++++++++++-----
3 files changed, 243 insertions(+), 141 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 15038df6d5f6c..bf2e04caa0a61 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -27,7 +27,6 @@
#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
@@ -183,12 +182,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// SIMD-specific configuration
if (Subtarget->hasSIMD128()) {
- // Enable fma optimization for wasm relaxed simd
- if (Subtarget->hasRelaxedSIMD()) {
- setTargetDAGCombine(ISD::FADD);
- setTargetDAGCombine(ISD::FMA);
- }
-
// Combine partial.reduce.add before legalization gets confused.
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
@@ -3418,62 +3411,6 @@ static SDValue performSETCCCombine(SDNode *N,
}
return SDValue();
}
-static bool canRelaxSimd(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
- EVT VecVT = N->getValueType(0);
-
- // INFO: WebAssembly doesn't have scalar fma yet
- // https://github.com/WebAssembly/design/issues/1391
- if (!VecVT.isVector())
- return false;
-
- // Allows fp fusing
- if (!N->getFlags().hasAllowContract())
- return false;
-
- if (N->getValueType(0).bitsGT(MVT::f128))
- return false;
-
- return true;
-}
-static SDValue performFAddCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- assert(N->getOpcode() == ISD::FADD);
- using namespace llvm::SDPatternMatch;
-
- // INFO: WebAssembly doesn't have scalar fma yet
- // https://github.com/WebAssembly/design/issues/1391
- EVT VecVT = N->getValueType(0);
- if (!VecVT.isVector())
- return SDValue();
-
- if (!canRelaxSimd(N, DCI))
- return SDValue();
-
- SDLoc DL(N);
- SDValue A, B, C;
- SelectionDAG &DAG = DCI.DAG;
- if (sd_match(N, m_FAdd(m_Value(A), m_FMul(m_Value(B), m_Value(C)))))
- return DAG.getNode(
- ISD::INTRINSIC_WO_CHAIN, DL, VecVT,
- {DAG.getConstant(Intrinsic::wasm_relaxed_madd, DL, MVT::i32), A, B, C});
-
- return SDValue();
-}
-
-static SDValue performFMACombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- assert(N->getOpcode() == ISD::FMA);
-
- if (!canRelaxSimd(N, DCI))
- return SDValue();
-
- SDLoc DL(N);
- SDValue A = N->getOperand(0), B = N->getOperand(1), C = N->getOperand(2);
- SelectionDAG &DAG = DCI.DAG;
- return DAG.getNode(
- ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
- {DAG.getConstant(Intrinsic::wasm_relaxed_madd, DL, MVT::i32), A, B, C});
-}
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::MUL);
@@ -3592,10 +3529,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
return AnyAllCombine;
return performLowerPartialReduction(N, DCI.DAG);
}
- case ISD::FADD:
- return performFAddCombine(N, DCI);
- case ISD::FMA:
- return performFMACombine(N, DCI);
case ISD::MUL:
return performMulCombine(N, DCI.DAG);
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index c591e5ef181a4..0e5546fa96fb2 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1542,6 +1542,29 @@ defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
defm "" : SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>;
defm "" : SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>;
+def fmul_fast : PatFrag<(ops node:$a, node:$b), (fmul node:$a, node:$b),[{
+ return N->getFlags().hasAllowContract();
+}]>;
+
+
+def fadd_fast : PatFrag<(ops node:$a, node:$b), (fadd node:$a, node:$b),[{
+ return N->getFlags().hasAllowContract();
+}]>;
+
+def : Pat<(fadd (v4f32 V128:$a), (fmul_fast (v4f32 V128:$b), (v4f32 V128:$c))),
+ (MADD_F32x4 V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+
+
+def : Pat<(fadd (v2f64 V128:$a), (fmul_fast (v2f64 V128:$b), (v2f64 V128:$c))),
+ (MADD_F64x2 V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+
+def : Pat<(fadd_fast (v4f32 V128:$a), (fmul (v4f32 V128:$b), (v4f32 V128:$c))),
+ (MADD_F32x4 V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+
+
+def : Pat<(fadd_fast (v2f64 V128:$a), (fmul (v2f64 V128:$b), (v2f64 V128:$c))),
+ (MADD_F64x2 V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+
//===----------------------------------------------------------------------===//
// Laneselect
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
index e4bd6a3a8cda6..cc6e713b67656 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
@@ -1,129 +1,275 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128,+relaxed-simd | FileCheck %s
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128,+relaxed-simd | FileCheck %s --check-prefix=RELAXED
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128, | FileCheck %s --check-prefix=STRICT
+
target triple = "wasm32"
-define <4 x float> @fma_vector_4xf32_seperate(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; CHECK-LABEL: fma_vector_4xf32_seperate:
-; CHECK: .functype fma_vector_4xf32_seperate (v128, v128, v128) -> (v128)
-; CHECK-NEXT: # %bb.0: # %entry
-; CHECK-NEXT: f32x4.relaxed_madd $push0=, $2, $1, $0
-; CHECK-NEXT: return $pop0
+define <4 x float> @fadd_fmul_vector_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fadd_fmul_vector_4xf32:
+; RELAXED: .functype fadd_fmul_vector_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0: # %entry
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fadd_fmul_vector_4xf32:
+; STRICT: .functype fadd_fmul_vector_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-NEXT: f32x4.mul $push0=, $1, $0
+; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
entry:
%mul.i = fmul fast <4 x float> %b, %a
%add.i = fadd fast <4 x float> %mul.i, %c
ret <4 x float> %add.i
}
-define <4 x float> @fma_vector_4xf32_llvm(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; CHECK-LABEL: fma_vector_4xf32_llvm:
-; CHECK: .functype fma_vector_4xf32_llvm (v128, v128, v128) -> (v128)
-; CHECK-NEXT: # %bb.0: # %entry
-; CHECK-NEXT: f32x4.relaxed_madd $push0=, $0, $1, $2
-; CHECK-NEXT: return $pop0
+
+define <4 x float> @fmuladd_fast_vector_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fmuladd_fast_vector_4xf32:
+; RELAXED: .functype fmuladd_fast_vector_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0: # %entry
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $0, $1
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_fast_vector_4xf32:
+; STRICT: .functype fmuladd_fast_vector_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-NEXT: f32x4.mul $push0=, $0, $1
+; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+entry:
+ %fma = tail call fast <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+ ret <4 x float> %fma
+}
+
+
+define <4 x float> @fmuladd_vector_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fmuladd_vector_4xf32:
+; RELAXED: .functype fmuladd_vector_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0: # %entry
+; RELAXED-NEXT: f32x4.mul $push0=, $0, $1
+; RELAXED-NEXT: f32x4.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fmuladd_vector_4xf32:
+; STRICT: .functype fmuladd_vector_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-NEXT: f32x4.mul $push0=, $0, $1
+; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+entry:
+ %fma = tail call <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+ ret <4 x float> %fma
+}
+
+define <4 x float> @fma_vector_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fma_vector_4xf32:
+; RELAXED: .functype fma_vector_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0: # %entry
+; RELAXED-NEXT: f32x4.extract_lane $push2=, $0, 0
+; RELAXED-NEXT: f32x4.extract_lane $push1=, $1, 0
+; RELAXED-NEXT: f32x4.extract_lane $push0=, $2, 0
+; RELAXED-NEXT: call $push3=, fmaf, $pop2, $pop1, $pop0
+; RELAXED-NEXT: f32x4.splat $push4=, $pop3
+; RELAXED-NEXT: f32x4.extract_lane $push7=, $0, 1
+; RELAXED-NEXT: f32x4.extract_lane $push6=, $1, 1
+; RELAXED-NEXT: f32x4.extract_lane $push5=, $2, 1
+; RELAXED-NEXT: call $push8=, fmaf, $pop7, $pop6, $pop5
+; RELAXED-NEXT: f32x4.replace_lane $push9=, $pop4, 1, $pop8
+; RELAXED-NEXT: f32x4.extract_lane $push12=, $0, 2
+; RELAXED-NEXT: f32x4.extract_lane $push11=, $1, 2
+; RELAXED-NEXT: f32x4.extract_lane $push10=, $2, 2
+; RELAXED-NEXT: call $push13=, fmaf, $pop12, $pop11, $pop10
+; RELAXED-NEXT: f32x4.replace_lane $push14=, $pop9, 2, $pop13
+; RELAXED-NEXT: f32x4.extract_lane $push17=, $0, 3
+; RELAXED-NEXT: f32x4.extract_lane $push16=, $1, 3
+; RELAXED-NEXT: f32x4.extract_lane $push15=, $2, 3
+; RELAXED-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15
+; RELAXED-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18
+; RELAXED-NEXT: return $pop19
+;
+; STRICT-LABEL: fma_vector_4xf32:
+; STRICT: .functype fma_vector_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-NEXT: f32x4.extract_lane $push2=, $0, 0
+; STRICT-NEXT: f32x4.extract_lane $push1=, $1, 0
+; STRICT-NEXT: f32x4.extract_lane $push0=, $2, 0
+; STRICT-NEXT: call $push3=, fmaf, $pop2, $pop1, $pop0
+; STRICT-NEXT: f32x4.splat $push4=, $pop3
+; STRICT-NEXT: f32x4.extract_lane $push7=, $0, 1
+; STRICT-NEXT: f32x4.extract_lane $push6=, $1, 1
+; STRICT-NEXT: f32x4.extract_lane $push5=, $2, 1
+; STRICT-NEXT: call $push8=, fmaf, $pop7, $pop6, $pop5
+; STRICT-NEXT: f32x4.replace_lane $push9=, $pop4, 1, $pop8
+; STRICT-NEXT: f32x4.extract_lane $push12=, $0, 2
+; STRICT-NEXT: f32x4.extract_lane $push11=, $1, 2
+; STRICT-NEXT: f32x4.extract_lane $push10=, $2, 2
+; STRICT-NEXT: call $push13=, fmaf, $pop12, $pop11, $pop10
+; STRICT-NEXT: f32x4.replace_lane $push14=, $pop9, 2, $pop13
+; STRICT-NEXT: f32x4.extract_lane $push17=, $0, 3
+; STRICT-NEXT: f32x4.extract_lane $push16=, $1, 3
+; STRICT-NEXT: f32x4.extract_lane $push15=, $2, 3
+; STRICT-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15
+; STRICT-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18
+; STRICT-NEXT: return $pop19
entry:
%fma = tail call fast <4 x float> @llvm.fma(<4 x float> %a, <4 x float> %b, <4 x float> %c)
ret <4 x float> %fma
}
-define <8 x float> @fma_vector_8xf32_seperate(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
-; CHECK-LABEL: fma_vector_8xf32_seperate:
-; CHECK: .functype fma_vector_8xf32_seperate (i32, v128, v128, v128, v128, v128, v128) -> ()
-; CHECK-NEXT: # %bb.0: # %entry
-; CHECK-NEXT: f32x4.relaxed_madd $push0=, $6, $4, $2
-; CHECK-NEXT: v128.store 16($0), $pop0
-; CHECK-NEXT: f32x4.relaxed_madd $push1=, $5, $3, $1
-; CHECK-NEXT: v128.store 0($0), $pop1
-; CHECK-NEXT: return
+define <8 x float> @fadd_fmul_vector_8xf32(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; RELAXED-LABEL: fadd_fmul_vector_8xf32:
+; RELAXED: .functype fadd_fmul_vector_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; RELAXED-NEXT: # %bb.0: # %entry
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $6, $4, $2
+; RELAXED-NEXT: v128.store 16($0), $pop0
+; RELAXED-NEXT: f32x4.relaxed_madd $push1=, $5, $3, $1
+; RELAXED-NEXT: v128.store 0($0), $pop1
+; RELAXED-NEXT: return
+;
+; STRICT-LABEL: fadd_fmul_vector_8xf32:
+; STRICT: .functype fadd_fmul_vector_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-NEXT: f32x4.mul $push0=, $4, $2
+; STRICT-NEXT: f32x4.add $push1=, $pop0, $6
+; STRICT-NEXT: v128.store 16($0), $pop1
+; STRICT-NEXT: f32x4.mul $push2=, $3, $1
+; STRICT-NEXT: f32x4.add $push3=, $pop2, $5
+; STRICT-NEXT: v128.store 0($0), $pop3
+; STRICT-NEXT: return
entry:
%mul.i = fmul fast <8 x float> %b, %a
%add.i = fadd fast <8 x float> %mul.i, %c
ret <8 x float> %add.i
}
-define <8 x float> @fma_vector_8xf32_llvm(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
-; CHECK-LABEL: fma_vector_8xf32_llvm:
-; CHECK: .functype fma_vector_8xf32_llvm (i32, v128, v128, v128, v128, v128, v128) -> ()
-; CHECK-NEXT: # %bb.0: # %entry
-; CHECK-NEXT: f32x4.relaxed_madd $push0=, $2, $4, $6
-; CHECK-NEXT: v128.store 16($0), $pop0
-; CHECK-NEXT: f32x4.relaxed_madd $push1=, $1, $3, $5
-; CHECK-NEXT: v128.store 0($0), $pop1
-; CHECK-NEXT: return
-entry:
- %fma = tail call fast <8 x float> @llvm.fma(<8 x float> %a, <8 x float> %b, <8 x float> %c)
- ret <8 x float> %fma
-}
-
-define <2 x double> @fma_vector_2xf64_seperate(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; CHECK-LABEL: fma_vector_2xf64_seperate:
-; CHECK: .functype fma_vector_2xf64_seperate (v128, v128, v128) -> (v128)
-; CHECK-NEXT: # %bb.0: # %entry
-; CHECK-NEXT: f64x2.relaxed_madd $push0=, $2, $1, $0
-; CHECK-NEXT: return $pop0
+define <2 x double> @fadd_fmul_vector(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fadd_fmul_vector:
+; RELAXED: .functype fadd_fmul_vector (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0: # %entry
+; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fadd_fmul_vector:
+; STRICT: .functype fadd_fmul_vector (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-NEXT: f64x2.mul $push0=, $1, $0
+; STRICT-NEXT: f64x2.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
entry:
%mul.i = fmul fast <2 x double> %b, %a
%add.i = fadd fast <2 x double> %mul.i, %c
ret <2 x double> %add.i
}
-define <2 x double> @fma_vector_2xf64_llvm(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; CHECK-LABEL: fma_vector_2xf64_llvm:
-; CHECK: .functype fma_vector_2xf64_llvm (v128, v128, v128) -> (v128)
-; CHECK-NEXT: # %bb.0: # %entry
-; CHECK-NEXT: f64x2.relaxed_madd $push0=, $0, $1, $2
-; CHECK-NEXT: return $pop0
+define <2 x double> @fma_vector_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fma_vector_2xf64:
+; RELAXED: .functype fma_vector_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0: # %entry
+; RELAXED-NEXT: f64x2.extract_lane $push2=, $0, 0
+; RELAXED-NEXT: f64x2.extract_lane $push1=, $1, 0
+; RELAXED-NEXT: f64x2.extract_lane $push0=, $2, 0
+; RELAXED-NEXT: call $push3=, fma, $pop2, $pop1, $pop0
+; RELAXED-NEXT: f64x2.splat $push4=, $pop3
+; RELAXED-NEXT: f64x2.extract_lane $push7=, $0, 1
+; RELAXED-NEXT: f64x2.extract_lane $push6=, $1, 1
+; RELAXED-NEXT: f64x2.extract_lane $push5=, $2, 1
+; RELAXED-NEXT: call $push8=, fma, $pop7, $pop6, $pop5
+; RELAXED-NEXT: f64x2.replace_lane $push9=, $pop4, 1, $pop8
+; RELAXED-NEXT: return $pop9
+;
+; STRICT-LABEL: fma_vector_2xf64:
+; STRICT: .functype fma_vector_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-NEXT: f64x2.extract_lane $push2=, $0, 0
+; STRICT-NEXT: f64x2.extract_lane $push1=, $1, 0
+; STRICT-NEXT: f64x2.extract_lane $push0=, $2, 0
+; STRICT-NEXT: call $push3=, fma, $pop2, $pop1, $pop0
+; STRICT-NEXT: f64x2.splat $push4=, $pop3
+; STRICT-NEXT: f64x2.extract_lane $push7=, $0, 1
+; STRICT-NEXT: f64x2.extract_lane $push6=, $1, 1
+; STRICT-NEXT: f64x2.extract_lane $push5=, $2, 1
+; STRICT-NEXT: call $push8=, fma, $pop7, $pop6, $pop5
+; STRICT-NEXT: f64x2.replace_lane $push9=, $pop4, 1, $pop8
+; STRICT-NEXT: return $pop9
entry:
%fma = tail call fast <2 x double> @llvm.fma(<2 x double> %a, <2 x double> %b, <2 x double> %c)
ret <2 x double> %fma
}
-define float @fma_scalar_f32_seperate(float %a, float %b, float %c) {
-; CHECK-LABEL: fma_scalar_f32_seperate:
-; CHECK: .functype fma_scalar_f32_seperate (f32, f32, f32) -> (f32)
-; CHECK-NEXT: # %bb.0: # %entry
-; CHECK-NEXT: f32.mul $push0=, $1, $0
-; CHECK-NEXT: f32.add $push1=, $pop0, $2
-; CHECK-NEXT: return $pop1
+define float @fadd_fmul_scalar_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fadd_fmul_scalar_f32:
+; RELAXED: .functype fadd_fmul_scalar_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0: # %entry
+; RELAXED-NEXT: f32.mul $push0=, $1, $0
+; RELAXED-NEXT: f32.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fadd_fmul_scalar_f32:
+; STRICT: .functype fadd_fmul_scalar_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-NEXT: f32.mul $push0=, $1, $0
+; STRICT-NEXT: f32.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
entry:
%mul.i = fmul fast float %b, %a
%add.i = fadd fast float %mul.i, %c
ret float %add.i
}
-define float @fma_scalar_f32_llvm(float %a, float %b, float %c) {
-; CHECK-LABEL: fma_scalar_f32_llvm:
-; CHECK: .functype fma_scalar_f32_llvm (f32, f32, f32) -> (f32)
-; CHECK-NEXT: # %bb.0: # %entry
-; CHECK-NEXT: call $push0=, fmaf, $0, $1, $2
-; CHECK-NEXT: return $pop0
+define float @fma_scalar_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fma_scalar_f32:
+; RELAXED: .functype fma_scalar_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0: # %entry
+; RELAXED-NEXT: call $push0=, fmaf, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fma_scalar_f32:
+; STRICT: .functype fma_scalar_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-NEXT: call $push0=, fmaf, $0, $1, $2
+; STRICT-NEXT: return $pop0
entry:
%fma = tail call fast float @llvm.fma(float %a, float %b, float %c)
ret float %fma
}
-define double @fma_scalar_f64_seperate(double %a, double %b, double %c) {
-; CHECK-LABEL: fma_scalar_f64_seperate:
-; CHECK: .functype fma_scalar_f64_seperate (f64, f64, f64) -> (f64)
-; CHECK-NEXT: # %bb.0: # %entry
-; CHECK-NEXT: f64.mul $push0=, $1, $0
-; CHECK-NEXT: f64.add $push1=, $pop0, $2
-; CHECK-NEXT: return $pop1
+define double @fadd_fmul_scalar_f64(double %a, double %b, double %c) {
+; RELAXED-LABEL: fadd_fmul_scalar_f64:
+; RELAXED: .functype fadd_fmul_scalar_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT: # %bb.0: # %entry
+; RELAXED-NEXT: f64.mul $push0=, $1, $0
+; RELAXED-NEXT: f64.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fadd_fmul_scalar_f64:
+; STRICT: .functype fadd_fmul_scalar_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-NEXT: f64.mul $push0=, $1, $0
+; STRICT-NEXT: f64.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
entry:
%mul.i = fmul fast double %b, %a
%add.i = fadd fast double %mul.i, %c
ret double %add.i
}
-define double @fma_scalar_f64_llvm(double %a, double %b, double %c) {
-; CHECK-LABEL: fma_scalar_f64_llvm:
-; CHECK: .functype fma_scalar_f64_llvm (f64, f64, f64) -> (f64)
-; CHECK-NEXT: # %bb.0: # %entry
-; CHECK-NEXT: call $push0=, fma, $0, $1, $2
-; CHECK-NEXT: return $pop0
+define double @fma_scalar_f64(double %a, double %b, double %c) {
+; RELAXED-LABEL: fma_scalar_f64:
+; RELAXED: .functype fma_scalar_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT: # %bb.0: # %entry
+; RELAXED-NEXT: call $push0=, fma, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fma_scalar_f64:
+; STRICT: .functype fma_scalar_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-NEXT: call $push0=, fma, $0, $1, $2
+; STRICT-NEXT: return $pop0
entry:
%fma = tail call fast double @llvm.fma(double %a, double %b, double %c)
ret double %fma
>From 19b72e4882cdcc4b14d484e1b38aa7e813839232 Mon Sep 17 00:00:00 2001
From: Jasmine Tang <jjasmine at igalia.com>
Date: Fri, 11 Jul 2025 02:00:45 -0700
Subject: [PATCH 6/6] Resolve PR review
- Move PatFrag from ARM.td and WebAssembly.td to TargetSelectionDAG.td
- Rename and reformat tests to a more consistent pattern.
- Added todos for test.
---
.../include/llvm/Target/TargetSelectionDAG.td | 12 +
llvm/lib/Target/ARM/ARMInstrInfo.td | 5 -
.../WebAssembly/WebAssemblyInstrSIMD.td | 20 +-
.../CodeGen/WebAssembly/simd-relaxed-fma.ll | 257 ++++++++----------
4 files changed, 129 insertions(+), 165 deletions(-)
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 9ac228110eb9c..4ff0aba5f6066 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1136,6 +1136,18 @@ def immAllOnesV : SDPatternOperator; // ISD::isConstantSplatVectorAllOnes
def immAllZerosV : SDPatternOperator; // ISD::isConstantSplatVectorAllZeros
// Other helper fragments.
+
+// An 'fmul' node which has the contract flag set
+def fmul_contract : PatFrag<(ops node:$a, node:$b), (fmul node:$a, node:$b),[{
+ return N->getFlags().hasAllowContract();
+}]>;
+
+// An 'fadd' node which can be contracted with fmul_contract into an fma or other relaxed instruction
+def fadd_contract : PatFrag<(ops node:$a, node:$b), (fadd node:$a, node:$b),[{
+ return N->getFlags().hasAllowContract();
+}]>;
+
+
def not : PatFrag<(ops node:$in), (xor node:$in, -1)>;
def vnot : PatFrag<(ops node:$in), (xor node:$in, immAllOnesV)>;
def ineg : PatFrag<(ops node:$in), (sub 0, node:$in)>;
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 1f5ba998970fc..46c776c0fafc4 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -486,11 +486,6 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
return hasNoVMLxHazardUse(N);
}]>;
-// An 'fadd' node which can be contracted into a fma
-def fadd_contract : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
- return N->getFlags().hasAllowContract();
-}]>;
-
def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>;
def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 0e5546fa96fb2..7cbfd15d238fa 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1542,27 +1542,11 @@ defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
defm "" : SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>;
defm "" : SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>;
-def fmul_fast : PatFrag<(ops node:$a, node:$b), (fmul node:$a, node:$b),[{
- return N->getFlags().hasAllowContract();
-}]>;
-
-
-def fadd_fast : PatFrag<(ops node:$a, node:$b), (fadd node:$a, node:$b),[{
- return N->getFlags().hasAllowContract();
-}]>;
-
-def : Pat<(fadd (v4f32 V128:$a), (fmul_fast (v4f32 V128:$b), (v4f32 V128:$c))),
- (MADD_F32x4 V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
-
-
-def : Pat<(fadd (v2f64 V128:$a), (fmul_fast (v2f64 V128:$b), (v2f64 V128:$c))),
- (MADD_F64x2 V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
-
-def : Pat<(fadd_fast (v4f32 V128:$a), (fmul (v4f32 V128:$b), (v4f32 V128:$c))),
+def : Pat<(fadd_contract (v4f32 V128:$a), (fmul_contract (v4f32 V128:$b), (v4f32 V128:$c))),
(MADD_F32x4 V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
-def : Pat<(fadd_fast (v2f64 V128:$a), (fmul (v2f64 V128:$b), (v2f64 V128:$c))),
+def : Pat<(fadd_contract (v2f64 V128:$a), (fmul_contract (v2f64 V128:$b), (v2f64 V128:$c))),
(MADD_F64x2 V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
index cc6e713b67656..ab3edb49c5faf 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
@@ -4,68 +4,104 @@
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128, | FileCheck %s --check-prefix=STRICT
target triple = "wasm32"
-define <4 x float> @fadd_fmul_vector_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; RELAXED-LABEL: fadd_fmul_vector_4xf32:
-; RELAXED: .functype fadd_fmul_vector_4xf32 (v128, v128, v128) -> (v128)
-; RELAXED-NEXT: # %bb.0: # %entry
+
+define double @fadd_fmul_contract_f64(double %a, double %b, double %c) {
+; RELAXED-LABEL: fadd_fmul_contract_f64:
+; RELAXED: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64.mul $push0=, $1, $0
+; RELAXED-NEXT: f64.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fadd_fmul_contract_f64:
+; STRICT: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64.mul $push0=, $1, $0
+; STRICT-NEXT: f64.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+ %mul.i = fmul contract double %b, %a
+ %add.i = fadd contract double %mul.i, %c
+ ret double %add.i
+}
+
+define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fadd_fmul_contract_4xf32:
+; RELAXED: .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $1, $0
; RELAXED-NEXT: return $pop0
;
-; STRICT-LABEL: fadd_fmul_vector_4xf32:
-; STRICT: .functype fadd_fmul_vector_4xf32 (v128, v128, v128) -> (v128)
-; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-LABEL: fadd_fmul_contract_4xf32:
+; STRICT: .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: f32x4.mul $push0=, $1, $0
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
-entry:
- %mul.i = fmul fast <4 x float> %b, %a
- %add.i = fadd fast <4 x float> %mul.i, %c
+ %mul.i = fmul contract <4 x float> %b, %a
+ %add.i = fadd contract <4 x float> %mul.i, %c
ret <4 x float> %add.i
}
-define <4 x float> @fmuladd_fast_vector_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; RELAXED-LABEL: fmuladd_fast_vector_4xf32:
-; RELAXED: .functype fmuladd_fast_vector_4xf32 (v128, v128, v128) -> (v128)
-; RELAXED-NEXT: # %bb.0: # %entry
+define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fadd_fmul_4xf32:
+; RELAXED: .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32x4.mul $push0=, $1, $0
+; RELAXED-NEXT: f32x4.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fadd_fmul_4xf32:
+; STRICT: .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32x4.mul $push0=, $1, $0
+; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+ %mul.i = fmul <4 x float> %b, %a
+ %add.i = fadd contract <4 x float> %mul.i, %c
+ ret <4 x float> %add.i
+}
+
+define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fmuladd_contract_4xf32:
+; RELAXED: .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $0, $1
; RELAXED-NEXT: return $pop0
;
-; STRICT-LABEL: fmuladd_fast_vector_4xf32:
-; STRICT: .functype fmuladd_fast_vector_4xf32 (v128, v128, v128) -> (v128)
-; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-LABEL: fmuladd_contract_4xf32:
+; STRICT: .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: f32x4.mul $push0=, $0, $1
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
-entry:
- %fma = tail call fast <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+ %fma = tail call contract <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
ret <4 x float> %fma
}
-
-define <4 x float> @fmuladd_vector_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; RELAXED-LABEL: fmuladd_vector_4xf32:
-; RELAXED: .functype fmuladd_vector_4xf32 (v128, v128, v128) -> (v128)
-; RELAXED-NEXT: # %bb.0: # %entry
+; TODO: This should also have relaxed_madd in the RELAXED case
+define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fmuladd_4xf32:
+; RELAXED: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
; RELAXED-NEXT: f32x4.mul $push0=, $0, $1
; RELAXED-NEXT: f32x4.add $push1=, $pop0, $2
; RELAXED-NEXT: return $pop1
;
-; STRICT-LABEL: fmuladd_vector_4xf32:
-; STRICT: .functype fmuladd_vector_4xf32 (v128, v128, v128) -> (v128)
-; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-LABEL: fmuladd_4xf32:
+; STRICT: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: f32x4.mul $push0=, $0, $1
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
-entry:
%fma = tail call <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
ret <4 x float> %fma
}
-define <4 x float> @fma_vector_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; RELAXED-LABEL: fma_vector_4xf32:
-; RELAXED: .functype fma_vector_4xf32 (v128, v128, v128) -> (v128)
-; RELAXED-NEXT: # %bb.0: # %entry
+define <4 x float> @fma_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fma_contract_4xf32:
+; RELAXED: .functype fma_contract_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
; RELAXED-NEXT: f32x4.extract_lane $push2=, $0, 0
; RELAXED-NEXT: f32x4.extract_lane $push1=, $1, 0
; RELAXED-NEXT: f32x4.extract_lane $push0=, $2, 0
@@ -88,9 +124,9 @@ define <4 x float> @fma_vector_4xf32(<4 x float> %a, <4 x float> %b, <4 x float>
; RELAXED-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18
; RELAXED-NEXT: return $pop19
;
-; STRICT-LABEL: fma_vector_4xf32:
-; STRICT: .functype fma_vector_4xf32 (v128, v128, v128) -> (v128)
-; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-LABEL: fma_contract_4xf32:
+; STRICT: .functype fma_contract_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: f32x4.extract_lane $push2=, $0, 0
; STRICT-NEXT: f32x4.extract_lane $push1=, $1, 0
; STRICT-NEXT: f32x4.extract_lane $push0=, $2, 0
@@ -112,25 +148,24 @@ define <4 x float> @fma_vector_4xf32(<4 x float> %a, <4 x float> %b, <4 x float>
; STRICT-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15
; STRICT-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18
; STRICT-NEXT: return $pop19
-entry:
- %fma = tail call fast <4 x float> @llvm.fma(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+ %fma = tail call contract <4 x float> @llvm.fma(<4 x float> %a, <4 x float> %b, <4 x float> %c)
ret <4 x float> %fma
}
-define <8 x float> @fadd_fmul_vector_8xf32(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
-; RELAXED-LABEL: fadd_fmul_vector_8xf32:
-; RELAXED: .functype fadd_fmul_vector_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
-; RELAXED-NEXT: # %bb.0: # %entry
+define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; RELAXED-LABEL: fadd_fmul_contract_8xf32:
+; RELAXED: .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; RELAXED-NEXT: # %bb.0:
; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $6, $4, $2
; RELAXED-NEXT: v128.store 16($0), $pop0
; RELAXED-NEXT: f32x4.relaxed_madd $push1=, $5, $3, $1
; RELAXED-NEXT: v128.store 0($0), $pop1
; RELAXED-NEXT: return
;
-; STRICT-LABEL: fadd_fmul_vector_8xf32:
-; STRICT: .functype fadd_fmul_vector_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
-; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-LABEL: fadd_fmul_contract_8xf32:
+; STRICT: .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: f32x4.mul $push0=, $4, $2
; STRICT-NEXT: f32x4.add $push1=, $pop0, $6
; STRICT-NEXT: v128.store 16($0), $pop1
@@ -138,139 +173,77 @@ define <8 x float> @fadd_fmul_vector_8xf32(<8 x float> %a, <8 x float> %b, <8 x
; STRICT-NEXT: f32x4.add $push3=, $pop2, $5
; STRICT-NEXT: v128.store 0($0), $pop3
; STRICT-NEXT: return
-entry:
- %mul.i = fmul fast <8 x float> %b, %a
- %add.i = fadd fast <8 x float> %mul.i, %c
+ %mul.i = fmul contract <8 x float> %b, %a
+ %add.i = fadd contract <8 x float> %mul.i, %c
ret <8 x float> %add.i
}
-define <2 x double> @fadd_fmul_vector(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; RELAXED-LABEL: fadd_fmul_vector:
-; RELAXED: .functype fadd_fmul_vector (v128, v128, v128) -> (v128)
-; RELAXED-NEXT: # %bb.0: # %entry
+define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fadd_fmul_contract_2xf64:
+; RELAXED: .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $2, $1, $0
; RELAXED-NEXT: return $pop0
;
-; STRICT-LABEL: fadd_fmul_vector:
-; STRICT: .functype fadd_fmul_vector (v128, v128, v128) -> (v128)
-; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-LABEL: fadd_fmul_contract_2xf64:
+; STRICT: .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: f64x2.mul $push0=, $1, $0
; STRICT-NEXT: f64x2.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
-entry:
- %mul.i = fmul fast <2 x double> %b, %a
- %add.i = fadd fast <2 x double> %mul.i, %c
+ %mul.i = fmul contract <2 x double> %b, %a
+ %add.i = fadd contract <2 x double> %mul.i, %c
ret <2 x double> %add.i
}
-define <2 x double> @fma_vector_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; RELAXED-LABEL: fma_vector_2xf64:
-; RELAXED: .functype fma_vector_2xf64 (v128, v128, v128) -> (v128)
-; RELAXED-NEXT: # %bb.0: # %entry
-; RELAXED-NEXT: f64x2.extract_lane $push2=, $0, 0
-; RELAXED-NEXT: f64x2.extract_lane $push1=, $1, 0
-; RELAXED-NEXT: f64x2.extract_lane $push0=, $2, 0
-; RELAXED-NEXT: call $push3=, fma, $pop2, $pop1, $pop0
-; RELAXED-NEXT: f64x2.splat $push4=, $pop3
-; RELAXED-NEXT: f64x2.extract_lane $push7=, $0, 1
-; RELAXED-NEXT: f64x2.extract_lane $push6=, $1, 1
-; RELAXED-NEXT: f64x2.extract_lane $push5=, $2, 1
-; RELAXED-NEXT: call $push8=, fma, $pop7, $pop6, $pop5
-; RELAXED-NEXT: f64x2.replace_lane $push9=, $pop4, 1, $pop8
-; RELAXED-NEXT: return $pop9
-;
-; STRICT-LABEL: fma_vector_2xf64:
-; STRICT: .functype fma_vector_2xf64 (v128, v128, v128) -> (v128)
-; STRICT-NEXT: # %bb.0: # %entry
-; STRICT-NEXT: f64x2.extract_lane $push2=, $0, 0
-; STRICT-NEXT: f64x2.extract_lane $push1=, $1, 0
-; STRICT-NEXT: f64x2.extract_lane $push0=, $2, 0
-; STRICT-NEXT: call $push3=, fma, $pop2, $pop1, $pop0
-; STRICT-NEXT: f64x2.splat $push4=, $pop3
-; STRICT-NEXT: f64x2.extract_lane $push7=, $0, 1
-; STRICT-NEXT: f64x2.extract_lane $push6=, $1, 1
-; STRICT-NEXT: f64x2.extract_lane $push5=, $2, 1
-; STRICT-NEXT: call $push8=, fma, $pop7, $pop6, $pop5
-; STRICT-NEXT: f64x2.replace_lane $push9=, $pop4, 1, $pop8
-; STRICT-NEXT: return $pop9
-entry:
- %fma = tail call fast <2 x double> @llvm.fma(<2 x double> %a, <2 x double> %b, <2 x double> %c)
- ret <2 x double> %fma
-}
-
-
-define float @fadd_fmul_scalar_f32(float %a, float %b, float %c) {
-; RELAXED-LABEL: fadd_fmul_scalar_f32:
-; RELAXED: .functype fadd_fmul_scalar_f32 (f32, f32, f32) -> (f32)
-; RELAXED-NEXT: # %bb.0: # %entry
+define float @fadd_fmul_contract_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fadd_fmul_contract_f32:
+; RELAXED: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
; RELAXED-NEXT: f32.mul $push0=, $1, $0
; RELAXED-NEXT: f32.add $push1=, $pop0, $2
; RELAXED-NEXT: return $pop1
;
-; STRICT-LABEL: fadd_fmul_scalar_f32:
-; STRICT: .functype fadd_fmul_scalar_f32 (f32, f32, f32) -> (f32)
-; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-LABEL: fadd_fmul_contract_f32:
+; STRICT: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: f32.mul $push0=, $1, $0
; STRICT-NEXT: f32.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
-entry:
- %mul.i = fmul fast float %b, %a
- %add.i = fadd fast float %mul.i, %c
+ %mul.i = fmul contract float %b, %a
+ %add.i = fadd contract float %mul.i, %c
ret float %add.i
}
-define float @fma_scalar_f32(float %a, float %b, float %c) {
-; RELAXED-LABEL: fma_scalar_f32:
-; RELAXED: .functype fma_scalar_f32 (f32, f32, f32) -> (f32)
-; RELAXED-NEXT: # %bb.0: # %entry
+define float @fma_contract_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fma_contract_f32:
+; RELAXED: .functype fma_contract_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
; RELAXED-NEXT: call $push0=, fmaf, $0, $1, $2
; RELAXED-NEXT: return $pop0
;
-; STRICT-LABEL: fma_scalar_f32:
-; STRICT: .functype fma_scalar_f32 (f32, f32, f32) -> (f32)
-; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-LABEL: fma_contract_f32:
+; STRICT: .functype fma_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: call $push0=, fmaf, $0, $1, $2
; STRICT-NEXT: return $pop0
-entry:
- %fma = tail call fast float @llvm.fma(float %a, float %b, float %c)
+ %fma = tail call contract float @llvm.fma(float %a, float %b, float %c)
ret float %fma
}
-
-define double @fadd_fmul_scalar_f64(double %a, double %b, double %c) {
-; RELAXED-LABEL: fadd_fmul_scalar_f64:
-; RELAXED: .functype fadd_fmul_scalar_f64 (f64, f64, f64) -> (f64)
-; RELAXED-NEXT: # %bb.0: # %entry
-; RELAXED-NEXT: f64.mul $push0=, $1, $0
-; RELAXED-NEXT: f64.add $push1=, $pop0, $2
-; RELAXED-NEXT: return $pop1
-;
-; STRICT-LABEL: fadd_fmul_scalar_f64:
-; STRICT: .functype fadd_fmul_scalar_f64 (f64, f64, f64) -> (f64)
-; STRICT-NEXT: # %bb.0: # %entry
-; STRICT-NEXT: f64.mul $push0=, $1, $0
-; STRICT-NEXT: f64.add $push1=, $pop0, $2
-; STRICT-NEXT: return $pop1
-entry:
- %mul.i = fmul fast double %b, %a
- %add.i = fadd fast double %mul.i, %c
- ret double %add.i
-}
-
-define double @fma_scalar_f64(double %a, double %b, double %c) {
-; RELAXED-LABEL: fma_scalar_f64:
-; RELAXED: .functype fma_scalar_f64 (f64, f64, f64) -> (f64)
-; RELAXED-NEXT: # %bb.0: # %entry
+define double @fma_contract_f64(double %a, double %b, double %c) {
+; RELAXED-LABEL: fma_contract_f64:
+; RELAXED: .functype fma_contract_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT: # %bb.0:
; RELAXED-NEXT: call $push0=, fma, $0, $1, $2
; RELAXED-NEXT: return $pop0
;
-; STRICT-LABEL: fma_scalar_f64:
-; STRICT: .functype fma_scalar_f64 (f64, f64, f64) -> (f64)
-; STRICT-NEXT: # %bb.0: # %entry
+; STRICT-LABEL: fma_contract_f64:
+; STRICT: .functype fma_contract_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: call $push0=, fma, $0, $1, $2
; STRICT-NEXT: return $pop0
-entry:
- %fma = tail call fast double @llvm.fma(double %a, double %b, double %c)
+ %fma = tail call contract double @llvm.fma(double %a, double %b, double %c)
ret double %fma
}
More information about the llvm-commits
mailing list