[llvm] b670da7 - [AArch64] Allow strict opcodes in indexed fmul and fma patterns

John Brawn via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 17 05:12:32 PST 2022


Author: John Brawn
Date: 2022-02-17T13:11:54Z
New Revision: b670da798d352c2edcee1d5ad832905b3923c8f3

URL: https://github.com/llvm/llvm-project/commit/b670da798d352c2edcee1d5ad832905b3923c8f3
DIFF: https://github.com/llvm/llvm-project/commit/b670da798d352c2edcee1d5ad832905b3923c8f3.diff

LOG: [AArch64] Allow strict opcodes in indexed fmul and fma patterns

Using an indexed version of an instruction instead of the non-indexed
version doesn't change anything with regard to exceptions or rounding,
so these patterns are safe to use for the strict (constrained) opcodes
as well.

Differential Revision: https://reviews.llvm.org/D118487
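
As an illustration, a sketch in the style of the added tests (the
function name here is made up and not part of the commit): a strict
fmul of a vector by a splatted scalar, which the updated patterns can
now select as an indexed fmul (fmul.2s v0, v0, v1[0]) instead of
materialising the splat separately.

  define <2 x float> @fmul_indexed_2s_strict(<2 x float> %a, float %c) strictfp {
    ; Splat %c into both lanes; this becomes an AArch64dup that the
    ; changed patterns fold into the indexed operand.
    %v1 = insertelement <2 x float> undef, float %c, i32 0
    %v2 = insertelement <2 x float> %v1, float %c, i32 1
    %r = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %a, <2 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
    ret <2 x float> %r
  }
  declare <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float>, <2 x float>, metadata, metadata)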

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/test/CodeGen/AArch64/arm64-vmul.ll
    llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 0f88fc950eb4..53a06c2b9e8e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6280,18 +6280,18 @@ let hasSideEffects = 0 in {
 // On the other hand, there are quite a few valid combinatorial options due to
 // the commutativity of multiplication and the fact that (-x) * y = x * (-y).
 defm : SIMDFPIndexedTiedPatterns<"FMLA",
-           TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
+           TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)>>;
 defm : SIMDFPIndexedTiedPatterns<"FMLA",
-           TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;
+           TriOpFrag<(any_fma node:$MHS, node:$RHS, node:$LHS)>>;
 
 defm : SIMDFPIndexedTiedPatterns<"FMLS",
-           TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+           TriOpFrag<(any_fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
 defm : SIMDFPIndexedTiedPatterns<"FMLS",
-           TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
+           TriOpFrag<(any_fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
 defm : SIMDFPIndexedTiedPatterns<"FMLS",
-           TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
+           TriOpFrag<(any_fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
 defm : SIMDFPIndexedTiedPatterns<"FMLS",
-           TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
+           TriOpFrag<(any_fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
 
 multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
   // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit
@@ -6370,22 +6370,22 @@ multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
 }
 
 defm : FMLSIndexedAfterNegPatterns<
-           TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+           TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)> >;
 defm : FMLSIndexedAfterNegPatterns<
-           TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;
+           TriOpFrag<(any_fma node:$MHS, node:$RHS, node:$LHS)> >;
 
 defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
 defm FMUL  : SIMDFPIndexed<0, 0b1001, "fmul", any_fmul>;
 
-def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+def : Pat<(v2f32 (any_fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
           (FMULv2i32_indexed V64:$Rn,
             (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
             (i64 0))>;
-def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+def : Pat<(v4f32 (any_fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
           (FMULv4i32_indexed V128:$Rn,
             (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
             (i64 0))>;
-def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
+def : Pat<(v2f64 (any_fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
           (FMULv2i64_indexed V128:$Rn,
             (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub),
             (i64 0))>;

diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index 30a1bc5d8c1d..482a1c5941e2 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -845,6 +845,90 @@ entry:
   ret <2 x double> %fmla1
 }
 
+define <2 x float> @fmls_indexed_2s_strict(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
+; CHECK-LABEL: fmls_indexed_2s_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmls.2s v0, v2, v1[0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = fneg <2 x float> %c
+  %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
+  %fmls1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <2 x float> %fmls1
+}
+
+define <4 x float> @fmls_indexed_4s_strict(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
+; CHECK-LABEL: fmls_indexed_4s_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls.4s v0, v2, v1[0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = fneg <4 x float> %c
+  %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+  %fmls1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <4 x float> %fmls1
+}
+
+define <2 x double> @fmls_indexed_2d_strict(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
+; CHECK-LABEL: fmls_indexed_2d_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls.2d v0, v2, v1[0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = fneg <2 x double> %c
+  %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
+  %fmls1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <2 x double> %fmls1
+}
+
+define <2 x float> @fmla_indexed_scalar_2s_strict(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
+; CHECK-LABEL: fmla_indexed_scalar_2s_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $s2 killed $s2 def $q2
+; CHECK-NEXT:    fmla.2s v0, v1, v2[0]
+; CHECK-NEXT:    ret
+entry:
+  %v1 = insertelement <2 x float> undef, float %c, i32 0
+  %v2 = insertelement <2 x float> %v1, float %c, i32 1
+  %fmla1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <2 x float> %fmla1
+}
+
+define <4 x float> @fmla_indexed_scalar_4s_strict(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
+; CHECK-LABEL: fmla_indexed_scalar_4s_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $s2 killed $s2 def $q2
+; CHECK-NEXT:    fmla.4s v0, v1, v2[0]
+; CHECK-NEXT:    ret
+entry:
+  %v1 = insertelement <4 x float> undef, float %c, i32 0
+  %v2 = insertelement <4 x float> %v1, float %c, i32 1
+  %v3 = insertelement <4 x float> %v2, float %c, i32 2
+  %v4 = insertelement <4 x float> %v3, float %c, i32 3
+  %fmla1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <4 x float> %fmla1
+}
+
+define <2 x double> @fmla_indexed_scalar_2d_strict(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fmla_indexed_scalar_2d_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmla.2d v0, v1, v2[0]
+; CHECK-NEXT:    ret
+entry:
+  %v1 = insertelement <2 x double> undef, double %c, i32 0
+  %v2 = insertelement <2 x double> %v1, double %c, i32 1
+  %fmla1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <2 x double> %fmla1
+}
+
+attributes #0 = { strictfp }
+
+declare <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float>, <2 x float>, <2 x float>, metadata, metadata)
+declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
+
 define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ; CHECK-LABEL: mul_4h:
 ; CHECK:       // %bb.0:

diff --git a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
index 32f59626b381..5ae08cf20c39 100644
--- a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
+++ b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
@@ -1,7 +1,11 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
+attributes #0 = { strictfp }
+
 declare float @llvm.fma.f32(float, float, float)
 declare double @llvm.fma.f64(double, double, double)
+declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
+declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
 
 define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) {
   ; CHECK-LABEL: test_fmla_ss4S
@@ -106,3 +110,105 @@ define double @test_fmls_dd2D_swap(double %a, double %b, <2 x double> %v) {
   ret double %tmp3
 }
 
+define float @test_fmla_ss4S_strict(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmla_ss4S_strict
+  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %tmp2
+}
+
+define float @test_fmla_ss4S_swap_strict(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmla_ss4S_swap_strict
+  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %a, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %tmp2
+}
+
+define float @test_fmla_ss2S_strict(float %a, float %b, <2 x float> %v) {
+  ; CHECK-LABEL: test_fmla_ss2S_strict
+  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+  %tmp1 = extractelement <2 x float> %v, i32 1
+  %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %tmp2
+}
+
+define double @test_fmla_ddD_strict(double %a, double %b, <1 x double> %v) {
+  ; CHECK-LABEL: test_fmla_ddD_strict
+  ; CHECK: {{fmla d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmadd d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}
+  %tmp1 = extractelement <1 x double> %v, i32 0
+  %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %tmp2
+}
+
+define double @test_fmla_dd2D_strict(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmla_dd2D_strict
+  ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %tmp2
+}
+
+define double @test_fmla_dd2D_swap_strict(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmla_dd2D_swap_strict
+  ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %tmp2
+}
+
+define float @test_fmls_ss4S_strict(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmls_ss4S_strict
+  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = fneg float %tmp1
+  %tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp2, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %tmp3
+}
+
+define float @test_fmls_ss4S_swap_strict(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmls_ss4S_swap_strict
+  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = fneg float %tmp1
+  %tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %tmp2, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %tmp3
+}
+
+define float @test_fmls_ss2S_strict(float %a, float %b, <2 x float> %v) {
+  ; CHECK-LABEL: test_fmls_ss2S_strict
+  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+  %tmp1 = extractelement <2 x float> %v, i32 1
+  %tmp2 = fneg float %tmp1
+  %tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp2, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %tmp3
+}
+
+define double @test_fmls_ddD_strict(double %a, double %b, <1 x double> %v) {
+  ; CHECK-LABEL: test_fmls_ddD_strict
+  ; CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmsub d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}
+  %tmp1 = extractelement <1 x double> %v, i32 0
+  %tmp2 = fneg double %tmp1
+  %tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp2, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %tmp3
+}
+
+define double @test_fmls_dd2D_strict(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmls_dd2D_strict
+  ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = fneg double %tmp1
+  %tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp2, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %tmp3
+}
+
+define double @test_fmls_dd2D_swap_strict(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmls_dd2D_swap_strict
+  ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = fneg double %tmp1
+  %tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %tmp2, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %tmp3
+}
+


More information about the llvm-commits mailing list