[llvm] ea045b9 - [AArch64] Add patterns for scalar FMUL, FMULX
David Green via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 30 00:34:25 PDT 2023
Author: OverMighty
Date: 2023-06-30T08:34:20+01:00
New Revision: ea045b99da8ee236076fddb256bdac98681441fa
URL: https://github.com/llvm/llvm-project/commit/ea045b99da8ee236076fddb256bdac98681441fa
DIFF: https://github.com/llvm/llvm-project/commit/ea045b99da8ee236076fddb256bdac98681441fa.diff
LOG: [AArch64] Add patterns for scalar FMUL, FMULX
Scalar FMUL and FMULX instructions perform the same as or better than their
indexed (by-element) counterparts.
For example, the Arm Cortex-A55 Software Optimization Guide lists the following
instructions with a throughput of 2 IPC:
- "FP multiply" FMUL
- "ASIMD FP multiply" FMULX
whereas it lists the following with a throughput of 1 IPC:
- "ASIMD FP multiply, by element" FMUL, FMULX
The Arm Cortex-A510 Software Optimization Guide, however, does not separately
list "by element" variants of the "ASIMD FP multiply" instructions, which are
listed with the same throughput as the non-ASIMD ones.
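As an illustration, here is a minimal IR sketch (modelled on the updated tests
below; the function name is only illustrative): when the second multiply operand
is a lane-0 extract, the new patterns select the plain scalar form, e.g.
"fmul s0, s0, s1", instead of the by-element form "fmul s0, s0, v1.s[0]".

define float @fmul_lane0(float %a, <4 x float> %v) {
  ; Lane 0 extract feeding a scalar multiply; with these patterns this is
  ; selected as "fmul s0, s0, s1" rather than "fmul s0, s0, v1.s[0]".
  %e = extractelement <4 x float> %v, i32 0
  %m = fmul float %a, %e
  ret float %m
}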
Fixes #60817.
Differential Revision: https://reviews.llvm.org/D153207
Added:
Modified:
llvm/lib/Target/AArch64/AArch64InstrFormats.td
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
llvm/test/CodeGen/AArch64/arm64-fml-combines.ll
llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 362bade748828a..f16bb0b33b8721 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -8427,9 +8427,9 @@ multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm,
V128, v4f32, v8f16, OpNode>;
}
-let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
+ let mayRaiseFPException = 1, Uses = [FPCR] in {
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc,
V64, V64,
@@ -8532,6 +8532,29 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
let Inst{11} = idx{0};
let Inst{21} = 0;
}
+ } // mayRaiseFPException = 1, Uses = [FPCR]
+
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def : Pat<(f16 (OpNode
+ (f16 (vector_extract (v8f16 V128:$Rn), (i64 0))),
+ (f16 (vector_extract (v8f16 V128:$Rm), VectorIndexH:$idx)))),
+ (!cast<Instruction>(NAME # v1i16_indexed)
+ (EXTRACT_SUBREG V128:$Rn, hsub), V128:$Rm, VectorIndexH:$idx)>;
+ }
+
+ let Predicates = [HasNEON] in {
+ def : Pat<(f32 (OpNode
+ (f32 (vector_extract (v4f32 V128:$Rn), (i64 0))),
+ (f32 (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx)))),
+ (!cast<Instruction>(NAME # v1i32_indexed)
+ (EXTRACT_SUBREG V128:$Rn, ssub), V128:$Rm, VectorIndexS:$idx)>;
+
+ def : Pat<(f64 (OpNode
+ (f64 (vector_extract (v2f64 V128:$Rn), (i64 0))),
+ (f64 (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx)))),
+ (!cast<Instruction>(NAME # v1i64_indexed)
+ (EXTRACT_SUBREG V128:$Rn, dsub), V128:$Rm, VectorIndexD:$idx)>;
+ }
}
multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ec5f840e857e22..7199b80826d3a2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4443,6 +4443,33 @@ defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", any_fmul>;
}
defm FSUB : TwoOperandFPData<0b0011, "fsub", any_fsub>;
+multiclass FMULScalarFromIndexedLane0Patterns<string inst,
+ string inst_f16_suffix,
+ string inst_f32_suffix,
+ string inst_f64_suffix,
+ SDPatternOperator OpNode,
+ list<Predicate> preds = []> {
+ let Predicates = !listconcat(preds, [HasFullFP16]) in {
+ def : Pat<(f16 (OpNode (f16 FPR16:$Rn),
+ (f16 (vector_extract (v8f16 V128:$Rm), (i64 0))))),
+ (!cast<Instruction>(inst # inst_f16_suffix)
+ FPR16:$Rn, (EXTRACT_SUBREG V128:$Rm, hsub))>;
+ }
+ let Predicates = preds in {
+ def : Pat<(f32 (OpNode (f32 FPR32:$Rn),
+ (f32 (vector_extract (v4f32 V128:$Rm), (i64 0))))),
+ (!cast<Instruction>(inst # inst_f32_suffix)
+ FPR32:$Rn, (EXTRACT_SUBREG V128:$Rm, ssub))>;
+ def : Pat<(f64 (OpNode (f64 FPR64:$Rn),
+ (f64 (vector_extract (v2f64 V128:$Rm), (i64 0))))),
+ (!cast<Instruction>(inst # inst_f64_suffix)
+ FPR64:$Rn, (EXTRACT_SUBREG V128:$Rm, dsub))>;
+ }
+}
+
+defm : FMULScalarFromIndexedLane0Patterns<"FMUL", "Hrr", "Srr", "Drr",
+ any_fmul>;
+
// Match reassociated forms of FNMUL.
def : Pat<(fmul (fneg FPR16:$a), (f16 FPR16:$b)),
(FNMULHrr FPR16:$a, FPR16:$b)>,
@@ -5248,6 +5275,10 @@ let Predicates = [HasRDM] in {
(SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
}
+defm : FMULScalarFromIndexedLane0Patterns<"FMULX", "16", "32", "64",
+ int_aarch64_neon_fmulx,
+ [HasNEONorSME]>;
+
def : InstAlias<"cmls $dst, $src1, $src2",
(CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
def : InstAlias<"cmle $dst, $src1, $src2",
diff --git a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
index f12f3719e10cf9..e17a0a96955b19 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
@@ -17,7 +17,7 @@ entry:
; CHECK-LABEL: %for.body
; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
-; CHECK: fmla.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
@@ -59,7 +59,7 @@ entry:
; CHECK-LABEL: %for.body
; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
-; CHECK: fmla.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll b/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll
index c9a7ebb7c98bea..ce3581030646dc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll
@@ -10,7 +10,7 @@ entry:
; CHECK-LABEL: %for.body
; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
-; CHECK: fmls.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%indvars.iv.next = sub nuw nsw i64 %indvars.iv, 1
@@ -52,7 +52,7 @@ entry:
; CHECK-LABEL: %for.body
; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
-; CHECK: fmls.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
index 1f1d2326492b22..cb87ba9a4ed6c2 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -8,6 +8,8 @@ declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
+declare double @llvm.aarch64.neon.fmulx.f64(double, double)
+
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32>, <2 x i32>, i32)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32>, <4 x i32>, i32)
@@ -2066,6 +2068,19 @@ entry:
ret <4 x float> %vmulx2.i
}
+define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %v) {
+; CHECK-LABEL: test_vmulx_lane_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmulx d0, d0, d1
+; CHECK-NEXT: ret
+entry:
+ %vget_lane = extractelement <1 x double> %a, i64 0
+ %vget_lane3 = extractelement <1 x double> %v, i64 0
+ %vmulxd_f64.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %vget_lane, double %vget_lane3)
+ %vset_lane = insertelement <1 x double> poison, double %vmulxd_f64.i, i64 0
+ ret <1 x double> %vset_lane
+}
+
define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
; CHECK-LABEL: test_vmulxq_lane_f64:
; CHECK: // %bb.0: // %entry
@@ -2100,6 +2115,19 @@ entry:
ret <4 x float> %vmulx2.i
}
+define <1 x double> @test_vmulx_laneq_f64(<1 x double> %a, <2 x double> %v) {
+; CHECK-LABEL: test_vmulx_laneq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmulx d0, d0, v1.d[1]
+; CHECK-NEXT: ret
+entry:
+ %vget_lane = extractelement <1 x double> %a, i64 0
+ %vgetq_lane = extractelement <2 x double> %v, i64 1
+ %vmulxd_f64.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %vget_lane, double %vgetq_lane)
+ %vset_lane = insertelement <1 x double> poison, double %vmulxd_f64.i, i64 0
+ ret <1 x double> %vset_lane
+}
+
define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmulxq_laneq_f64:
; CHECK: // %bb.0: // %entry
@@ -3560,7 +3588,7 @@ entry:
define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmul_laneq_f64_0:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmul d0, d0, v1.d[0]
+; CHECK-NEXT: fmul d0, d0, d1
; CHECK-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
@@ -3651,6 +3679,19 @@ entry:
ret <4 x float> %vmulx2.i
}
+define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
+; CHECK-LABEL: test_vmulx_laneq_f64_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmulx d0, d0, d1
+; CHECK-NEXT: ret
+entry:
+ %vget_lane = extractelement <1 x double> %a, i64 0
+ %vgetq_lane = extractelement <2 x double> %v, i64 0
+ %vmulxd_f64.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %vget_lane, double %vgetq_lane)
+ %vset_lane = insertelement <1 x double> poison, double %vmulxd_f64.i, i64 0
+ ret <1 x double> %vset_lane
+}
+
define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmulxq_laneq_f64_0:
; CHECK: // %bb.0: // %entry
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll b/llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
index 5d5b940174c4b0..091cda89bfe403 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
@@ -1,8 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
-define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
-; CHECK-LABEL: test_fmul_lane_ss2S:
+define float @test_fmul_lane_ss2S_0(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_fmul_lane_ss2S_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: ret
+ %tmp1 = extractelement <2 x float> %v, i32 0
+ %tmp2 = fmul float %a, %tmp1
+ ret float %tmp2
+}
+
+define float @test_fmul_lane_ss2S_1(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_fmul_lane_ss2S_1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmul s0, s0, v1.s[1]
@@ -12,8 +23,8 @@ define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
ret float %tmp2;
}
-define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
-; CHECK-LABEL: test_fmul_lane_ss2S_swap:
+define float @test_fmul_lane_ss2S_1_swap(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_fmul_lane_ss2S_1_swap:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmul s0, s0, v1.s[1]
@@ -23,9 +34,18 @@ define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
ret float %tmp2;
}
+define float @test_fmul_lane_ss4S_0(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmul_lane_ss4S_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: ret
+ %tmp1 = extractelement <4 x float> %v, i32 0
+ %tmp2 = fmul float %a, %tmp1
+ ret float %tmp2
+}
-define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
-; CHECK-LABEL: test_fmul_lane_ss4S:
+define float @test_fmul_lane_ss4S_3(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmul_lane_ss4S_3:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul s0, s0, v1.s[3]
; CHECK-NEXT: ret
@@ -34,8 +54,8 @@ define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
ret float %tmp2;
}
-define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) {
-; CHECK-LABEL: test_fmul_lane_ss4S_swap:
+define float @test_fmul_lane_ss4S_3_swap(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmul_lane_ss4S_3_swap:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul s0, s0, v1.s[3]
; CHECK-NEXT: ret
@@ -56,9 +76,18 @@ define double @test_fmul_lane_ddD(double %a, <1 x double> %v) {
}
+define double @test_fmul_lane_dd2D_0(double %a, <2 x double> %v) {
+; CHECK-LABEL: test_fmul_lane_dd2D_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul d0, d0, d1
+; CHECK-NEXT: ret
+ %tmp1 = extractelement <2 x double> %v, i32 0
+ %tmp2 = fmul double %a, %tmp1
+ ret double %tmp2
+}
-define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
-; CHECK-LABEL: test_fmul_lane_dd2D:
+define double @test_fmul_lane_dd2D_1(double %a, <2 x double> %v) {
+; CHECK-LABEL: test_fmul_lane_dd2D_1:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul d0, d0, v1.d[1]
; CHECK-NEXT: ret
@@ -68,8 +97,8 @@ define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
}
-define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) {
-; CHECK-LABEL: test_fmul_lane_dd2D_swap:
+define double @test_fmul_lane_dd2D_1_swap(double %a, <2 x double> %v) {
+; CHECK-LABEL: test_fmul_lane_dd2D_1_swap:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul d0, d0, v1.d[1]
; CHECK-NEXT: ret
@@ -80,8 +109,19 @@ define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) {
declare float @llvm.aarch64.neon.fmulx.f32(float, float)
-define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
-; CHECK-LABEL: test_fmulx_lane_f32:
+define float @test_fmulx_lane_f32_0(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_fmulx_lane_f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: fmulx s0, s0, s1
+; CHECK-NEXT: ret
+ %tmp1 = extractelement <2 x float> %v, i32 0
+ %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1)
+ ret float %tmp2;
+}
+
+define float @test_fmulx_lane_f32_1(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_fmulx_lane_f32_1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmulx s0, s0, v1.s[1]
@@ -91,8 +131,18 @@ define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
ret float %tmp2;
}
-define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
-; CHECK-LABEL: test_fmulx_laneq_f32:
+define float @test_fmulx_laneq_f32_0(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmulx_laneq_f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmulx s0, s0, s1
+; CHECK-NEXT: ret
+ %tmp1 = extractelement <4 x float> %v, i32 0
+ %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1)
+ ret float %tmp2;
+}
+
+define float @test_fmulx_laneq_f32_3(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmulx_laneq_f32_3:
; CHECK: // %bb.0:
; CHECK-NEXT: fmulx s0, s0, v1.s[3]
; CHECK-NEXT: ret
@@ -101,8 +151,8 @@ define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
ret float %tmp2;
}
-define float @test_fmulx_laneq_f32_swap(float %a, <4 x float> %v) {
-; CHECK-LABEL: test_fmulx_laneq_f32_swap:
+define float @test_fmulx_laneq_f32_3_swap(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmulx_laneq_f32_3_swap:
; CHECK: // %bb.0:
; CHECK-NEXT: fmulx s0, s0, v1.s[3]
; CHECK-NEXT: ret
@@ -126,7 +176,7 @@ define double @test_fmulx_lane_f64(double %a, <1 x double> %v) {
define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) {
; CHECK-LABEL: test_fmulx_laneq_f64_0:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmulx d0, d0, v1.d[0]
+; CHECK-NEXT: fmulx d0, d0, d1
; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 0
%tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1)
@@ -154,3 +204,27 @@ define double @test_fmulx_laneq_f64_1_swap(double %a, <2 x double> %v) {
ret double %tmp2;
}
+define float @test_fmulx_horizontal_f32(<2 x float> %v) {
+; CHECK-LABEL: test_fmulx_horizontal_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmulx s0, s0, v0.s[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = extractelement <2 x float> %v, i32 0
+ %1 = extractelement <2 x float> %v, i32 1
+ %2 = call float @llvm.aarch64.neon.fmulx.f32(float %0, float %1)
+ ret float %2
+}
+
+define double @test_fmulx_horizontal_f64(<2 x double> %v) {
+; CHECK-LABEL: test_fmulx_horizontal_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmulx d0, d0, v0.d[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = extractelement <2 x double> %v, i32 0
+ %1 = extractelement <2 x double> %v, i32 1
+ %2 = call double @llvm.aarch64.neon.fmulx.f64(double %0, double %1)
+ ret double %2
+}
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
index 72e5b0eef9d02b..cb0a9f5236b5d1 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
@@ -11,7 +11,7 @@ define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov h3, v0.h[1]
; CHECK-NEXT: mov h2, v1.h[1]
-; CHECK-NEXT: fmul h4, h2, v0.h[0]
+; CHECK-NEXT: fmul h4, h0, v1.h[1]
; CHECK-NEXT: fnmul h2, h3, h2
; CHECK-NEXT: fmla h4, h3, v1.h[0]
; CHECK-NEXT: fmla h2, h0, v1.h[0]
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
index c92ea2fcfe6a82..5d956332de977a 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -228,11 +228,11 @@ entry:
ret <8 x half> %mul
}
-define dso_local half @t_vmulh_lane_f16(half %a, <4 x half> %c, i32 %lane) {
-; CHECK-LABEL: t_vmulh_lane_f16:
+define dso_local half @t_vmulh_lane0_f16(half %a, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vmulh_lane0_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: fmul h0, h0, v1.h[0]
+; CHECK-NEXT: fmul h0, h0, h1
; CHECK-NEXT: ret
entry:
%0 = extractelement <4 x half> %c, i32 0
@@ -240,10 +240,22 @@ entry:
ret half %1
}
-define dso_local half @t_vmulh_laneq_f16(half %a, <8 x half> %c, i32 %lane) {
-; CHECK-LABEL: t_vmulh_laneq_f16:
+define dso_local half @t_vmulh_lane3_f16(half %a, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vmulh_lane3_f16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmul h0, h0, v1.h[0]
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: fmul h0, h0, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %0 = extractelement <4 x half> %c, i32 3
+ %1 = fmul half %0, %a
+ ret half %1
+}
+
+define dso_local half @t_vmulh_laneq0_f16(half %a, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vmulh_laneq0_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmul h0, h0, h1
; CHECK-NEXT: ret
entry:
%0 = extractelement <8 x half> %c, i32 0
@@ -251,6 +263,17 @@ entry:
ret half %1
}
+define dso_local half @t_vmulh_laneq7_f16(half %a, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vmulh_laneq7_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmul h0, h0, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+ %0 = extractelement <8 x half> %c, i32 7
+ %1 = fmul half %0, %a
+ ret half %1
+}
+
define dso_local half @t_vmulx_f16(half %a, half %b) {
; CHECK-LABEL: t_vmulx_f16:
; CHECK: // %bb.0: // %entry
@@ -261,8 +284,20 @@ entry:
ret half %fmulx.i
}
-define dso_local half @t_vmulxh_lane_f16(half %a, <4 x half> %b, i32 %lane) {
-; CHECK-LABEL: t_vmulxh_lane_f16:
+define dso_local half @t_vmulxh_lane0_f16(half %a, <4 x half> %b) {
+; CHECK-LABEL: t_vmulxh_lane0_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: fmulx h0, h0, h1
+; CHECK-NEXT: ret
+entry:
+ %extract = extractelement <4 x half> %b, i32 0
+ %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
+ ret half %fmulx.i
+}
+
+define dso_local half @t_vmulxh_lane3_f16(half %a, <4 x half> %b, i32 %lane) {
+; CHECK-LABEL: t_vmulxh_lane3_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmulx h0, h0, v1.h[3]
@@ -319,8 +354,19 @@ entry:
ret <8 x half> %vmulx2.i
}
-define dso_local half @t_vmulxh_laneq_f16(half %a, <8 x half> %b, i32 %lane) {
-; CHECK-LABEL: t_vmulxh_laneq_f16:
+define dso_local half @t_vmulxh_laneq0_f16(half %a, <8 x half> %b) {
+; CHECK-LABEL: t_vmulxh_laneq0_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmulx h0, h0, h1
+; CHECK-NEXT: ret
+entry:
+ %extract = extractelement <8 x half> %b, i32 0
+ %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
+ ret half %fmulx.i
+}
+
+define dso_local half @t_vmulxh_laneq7_f16(half %a, <8 x half> %b, i32 %lane) {
+; CHECK-LABEL: t_vmulxh_laneq7_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmulx h0, h0, v1.h[7]
; CHECK-NEXT: ret
@@ -418,3 +464,16 @@ entry:
%1 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
ret half %1
}
+
+define half @test_fmulx_horizontal_f16(<2 x half> %v) {
+; CHECK-LABEL: test_fmulx_horizontal_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmulx h0, h0, v0.h[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = extractelement <2 x half> %v, i32 0
+ %1 = extractelement <2 x half> %v, i32 1
+ %2 = call half @llvm.aarch64.neon.fmulx.f16(half %0, half %1)
+ ret half %2
+}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
index 854e340a4ea015..ce7ae1e426bdac 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
@@ -70,15 +70,15 @@ define float @test_v16f32(<16 x float> %a) nounwind {
; CHECK-NEXT: fmul s4, s0, v0.s[1]
; CHECK-NEXT: fmul s4, s4, v0.s[2]
; CHECK-NEXT: fmul s0, s4, v0.s[3]
-; CHECK-NEXT: fmul s0, s0, v1.s[0]
+; CHECK-NEXT: fmul s0, s0, s1
; CHECK-NEXT: fmul s0, s0, v1.s[1]
; CHECK-NEXT: fmul s0, s0, v1.s[2]
; CHECK-NEXT: fmul s0, s0, v1.s[3]
-; CHECK-NEXT: fmul s0, s0, v2.s[0]
+; CHECK-NEXT: fmul s0, s0, s2
; CHECK-NEXT: fmul s0, s0, v2.s[1]
; CHECK-NEXT: fmul s0, s0, v2.s[2]
; CHECK-NEXT: fmul s0, s0, v2.s[3]
-; CHECK-NEXT: fmul s0, s0, v3.s[0]
+; CHECK-NEXT: fmul s0, s0, s3
; CHECK-NEXT: fmul s0, s0, v3.s[1]
; CHECK-NEXT: fmul s0, s0, v3.s[2]
; CHECK-NEXT: fmul s0, s0, v3.s[3]