[llvm] be881e2 - [AArch64] FMLA/FMLS patterns improvement.
Pavel Iliin via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 21 10:24:55 PDT 2020
Author: Pavel Iliin
Date: 2020-04-21T18:23:21+01:00
New Revision: be881e2831735d6879ee43710f5a4d1c8d50c615
URL: https://github.com/llvm/llvm-project/commit/be881e2831735d6879ee43710f5a4d1c8d50c615
DIFF: https://github.com/llvm/llvm-project/commit/be881e2831735d6879ee43710f5a4d1c8d50c615.diff
LOG: [AArch64] FMLA/FMLS patterns improvement.
FMLA/FMLS f16 indexed patterns added.
Fixes https://bugs.llvm.org/show_bug.cgi?id=45467
Removed redundant v2f32 vector_extract indexed pattern since
Instruction Selection is able to match v4f32 instead.
Added:
Modified:
clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c
llvm/lib/Target/AArch64/AArch64InstrFormats.td
llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
Removed:
################################################################################
diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c
index d7830f71e2de..b72bd3f977dd 100644
--- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c
+++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c
@@ -89,7 +89,7 @@ float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// COMMONIR: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
// UNCONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]])
// CONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}]
// COMMONIR: ret <4 x half> [[FMLA]]
float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
return vfma_lane_f16(a, b, c, 3);
@@ -105,7 +105,7 @@ float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]])
// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
// COMMONIR: ret <8 x half> [[FMLA]]
float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
return vfmaq_lane_f16(a, b, c, 3);
@@ -137,7 +137,7 @@ float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
// COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]])
// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
// COMMONIR: ret <8 x half> [[FMLA]]
float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
return vfmaq_laneq_f16(a, b, c, 7);
@@ -150,7 +150,7 @@ float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// COMMONIR: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %c, i32 3
// UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> [[TMP3]], <4 x half> %a)
// CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> [[TMP3]], <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}]
// COMMONIR: ret <4 x half> [[FMA]]
float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
return vfma_n_f16(a, b, c);
@@ -167,7 +167,7 @@ float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
// COMMONIR: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %c, i32 7
// UNCONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> [[TMP7]], <8 x half> %a)
// CONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> [[TMP7]], <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
// COMMONIR: ret <8 x half> [[FMA]]
float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
return vfmaq_n_f16(a, b, c);
@@ -177,7 +177,7 @@ float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
// COMMONIR: [[EXTR:%.*]] = extractelement <4 x half> %c, i32 3
// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half [[EXTR]], half %a)
// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half %b, half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmadd h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}}
+// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}]
// COMMONIR: ret half [[FMA]]
float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
return vfmah_lane_f16(a, b, c, 3);
@@ -187,7 +187,7 @@ float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
// COMMONIR: [[EXTR:%.*]] = extractelement <8 x half> %c, i32 7
// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half [[EXTR]], half %a)
// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half %b, half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmadd h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}}
+// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}]
// COMMONIR: ret half [[FMA]]
float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
return vfmah_laneq_f16(a, b, c, 7);
@@ -195,7 +195,6 @@ float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
// COMMON-LABEL: test_vfms_lane_f16
// COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b
-// CHECK-ASM: fneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
// COMMONIR: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// COMMONIR: [[TMP1:%.*]] = bitcast <4 x half> [[SUB]] to <8 x i8>
// COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
@@ -205,7 +204,7 @@ float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
// COMMONIR: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
// UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]])
// CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}]
// COMMONIR: ret <4 x half> [[FMA]]
float16x4_t test_vfms_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
return vfms_lane_f16(a, b, c, 3);
@@ -213,7 +212,6 @@ float16x4_t test_vfms_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// COMMON-LABEL: test_vfmsq_lane_f16
// COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b
-// CHECK-ASM: fneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
// COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
// COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> [[SUB]] to <16 x i8>
// COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
@@ -223,7 +221,7 @@ float16x4_t test_vfms_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]])
// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
// COMMONIR: ret <8 x half> [[FMLA]]
float16x8_t test_vfmsq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
return vfmsq_lane_f16(a, b, c, 3);
@@ -259,7 +257,7 @@ float16x4_t test_vfms_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
// COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]])
// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
// COMMONIR: ret <8 x half> [[FMLA]]
float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
return vfmsq_laneq_f16(a, b, c, 7);
@@ -267,14 +265,13 @@ float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// COMMON-LABEL: test_vfms_n_f16
// COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b
-// CHECK-ASM: fneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
// COMMONIR: [[TMP0:%.*]] = insertelement <4 x half> undef, half %c, i32 0
// COMMONIR: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %c, i32 1
// COMMONIR: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %c, i32 2
// COMMONIR: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %c, i32 3
// UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> [[TMP3]], <4 x half> %a)
// CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[SUB]], <4 x half> [[TMP3]], <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}]
// COMMONIR: ret <4 x half> [[FMA]]
float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
return vfms_n_f16(a, b, c);
@@ -282,7 +279,6 @@ float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
// COMMON-LABEL: test_vfmsq_n_f16
// COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b
-// CHECK-ASM: fneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
// COMMONIR: [[TMP0:%.*]] = insertelement <8 x half> undef, half %c, i32 0
// COMMONIR: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %c, i32 1
// COMMONIR: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %c, i32 2
@@ -293,7 +289,7 @@ float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
// COMMONIR: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %c, i32 7
// UNCONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> [[TMP7]], <8 x half> %a)
// CONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[SUB]], <8 x half> [[TMP7]], <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
// COMMONIR: ret <8 x half> [[FMA]]
float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
return vfmsq_n_f16(a, b, c);
@@ -311,7 +307,7 @@ float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
// COMMONIR: [[EXTR:%.*]] = extractelement <4 x half> %c, i32 3
// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half [[SUB]], half [[EXTR]], half %a)
// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[SUB]], half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmadd h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}}
+// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}]
// COMMONIR: ret half [[FMA]]
float16_t test_vfmsh_lane_f16(float16_t a, float16_t b, float16x4_t c) {
return vfmsh_lane_f16(a, b, c, 3);
@@ -329,7 +325,7 @@ float16_t test_vfmsh_lane_f16(float16_t a, float16_t b, float16x4_t c) {
// COMMONIR: [[EXTR:%.*]] = extractelement <8 x half> %c, i32 7
// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half [[SUB]], half [[EXTR]], half %a)
// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[SUB]], half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmadd h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}}
+// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}]
// COMMONIR: ret half [[FMA]]
float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
return vfmsh_laneq_f16(a, b, c, 7);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 6b23c7cfd17a..9d254bdf6940 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -8052,6 +8052,34 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
}
multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ // Patterns for f16: DUPLANE, DUP scalar and vector_extract.
+ def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
+ (AArch64duplane16 (v8f16 V128:$Rm),
+ VectorIndexH:$idx))),
+ (!cast<Instruction>(INST # "v8i16_indexed")
+ V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexH:$idx)>;
+ def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
+ (AArch64dup (f16 FPR16Op:$Rm)))),
+ (!cast<Instruction>(INST # "v8i16_indexed") V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR16Op:$Rm, hsub), (i64 0))>;
+
+ def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn),
+ (AArch64duplane16 (v8f16 V128:$Rm),
+ VectorIndexH:$idx))), // 16-bit lane index, same operand type as the v8f16 pattern
+ (!cast<Instruction>(INST # "v4i16_indexed")
+ V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexH:$idx)>;
+ def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn),
+ (AArch64dup (f16 FPR16Op:$Rm)))),
+ (!cast<Instruction>(INST # "v4i16_indexed") V64:$Rd, V64:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR16Op:$Rm, hsub), (i64 0))>;
+
+ def : Pat<(f16 (OpNode (f16 FPR16:$Rd), (f16 FPR16:$Rn),
+ (vector_extract (v8f16 V128:$Rm), VectorIndexH:$idx))),
+ (!cast<Instruction>(INST # "v1i16_indexed") FPR16:$Rd, FPR16:$Rn,
+ V128:$Rm, VectorIndexH:$idx)>;
+ } // Predicates = [HasNEON, HasFullFP16]
+
// 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
(AArch64duplane32 (v4f32 V128:$Rm),
@@ -8086,15 +8114,11 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
(!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn,
(SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
- // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+ // Covers 2 variants for 32-bit scalar version: extract from .2s or from .4s
def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
(vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
(!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
V128:$Rm, VectorIndexS:$idx)>;
- def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
- (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))),
- (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
- (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
// 1 variant for 64-bit scalar version: extract from .1d or from .2d
def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
index 847a51f2aed4..1b0c7c346887 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -14,8 +14,7 @@ define dso_local <4 x half> @t_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: dup v2.4h, v2.h[0]
-; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0]
; CHECK-NEXT: ret
entry:
%lane1 = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
@@ -29,8 +28,7 @@ define dso_local <8 x half> @t_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: dup v2.8h, v2.h[0]
-; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
; CHECK-NEXT: ret
entry:
%lane1 = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> zeroinitializer
@@ -43,8 +41,7 @@ define dso_local <4 x half> @t_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x
; CHECK: .Lt_vfma_laneq_f16$local:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: dup v2.4h, v2.h[0]
-; CHECK-NEXT: fmla v0.4h, v1.4h, v2.4h
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0]
; CHECK-NEXT: ret
entry:
%lane1 = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> zeroinitializer
@@ -57,8 +54,7 @@ define dso_local <8 x half> @t_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8
; CHECK: .Lt_vfmaq_laneq_f16$local:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: dup v2.8h, v2.h[0]
-; CHECK-NEXT: fmla v0.8h, v1.8h, v2.8h
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
; CHECK-NEXT: ret
entry:
%lane1 = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> zeroinitializer
@@ -72,8 +68,7 @@ define dso_local <4 x half> @t_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c)
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
-; CHECK-NEXT: dup v2.4h, v2.h[0]
-; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0]
; CHECK-NEXT: ret
entry:
%vecinit = insertelement <4 x half> undef, half %c, i32 0
@@ -88,8 +83,7 @@ define dso_local <8 x half> @t_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
-; CHECK-NEXT: dup v2.8h, v2.h[0]
-; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
; CHECK-NEXT: ret
entry:
%vecinit = insertelement <8 x half> undef, half %c, i32 0
@@ -104,7 +98,7 @@ define dso_local half @t_vfmah_lane_f16(half %a, half %b, <4 x half> %c, i32 %la
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: fmadd h0, h1, h2, h0
+; CHECK-NEXT: fmla h0, h1, v2.h[0]
; CHECK-NEXT: ret
entry:
%extract = extractelement <4 x half> %c, i32 0
@@ -117,7 +111,7 @@ define dso_local half @t_vfmah_laneq_f16(half %a, half %b, <8 x half> %c, i32 %l
; CHECK: .Lt_vfmah_laneq_f16$local:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: fmadd h0, h1, h2, h0
+; CHECK-NEXT: fmla h0, h1, v2.h[0]
; CHECK-NEXT: ret
entry:
%extract = extractelement <8 x half> %c, i32 0
@@ -131,9 +125,7 @@ define dso_local <4 x half> @t_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: fneg v1.4h, v1.4h
-; CHECK-NEXT: dup v2.4h, v2.h[0]
-; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0]
; CHECK-NEXT: ret
entry:
%sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
@@ -148,9 +140,7 @@ define dso_local <8 x half> @t_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: fneg v1.8h, v1.8h
-; CHECK-NEXT: dup v2.8h, v2.h[0]
-; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
; CHECK-NEXT: ret
entry:
%sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
@@ -164,8 +154,7 @@ define dso_local <4 x half> @t_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x
; CHECK: .Lt_vfms_laneq_f16$local:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: dup v2.4h, v2.h[0]
-; CHECK-NEXT: fmls v0.4h, v2.4h, v1.4h
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0]
; CHECK-NEXT: ret
entry:
%sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
@@ -179,8 +168,7 @@ define dso_local <8 x half> @t_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8
; CHECK: .Lt_vfmsq_laneq_f16$local:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: dup v2.8h, v2.h[0]
-; CHECK-NEXT: fmls v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
; CHECK-NEXT: ret
entry:
%sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
@@ -195,9 +183,7 @@ define dso_local <4 x half> @t_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c)
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
-; CHECK-NEXT: fneg v1.4h, v1.4h
-; CHECK-NEXT: dup v2.4h, v2.h[0]
-; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0]
; CHECK-NEXT: ret
entry:
%sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
@@ -213,9 +199,7 @@ define dso_local <8 x half> @t_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
-; CHECK-NEXT: fneg v1.8h, v1.8h
-; CHECK-NEXT: dup v2.8h, v2.h[0]
-; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
; CHECK-NEXT: ret
entry:
%sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
@@ -231,7 +215,7 @@ define dso_local half @t_vfmsh_lane_f16(half %a, half %b, <4 x half> %c, i32 %la
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: fmsub h0, h1, h2, h0
+; CHECK-NEXT: fmls h0, h1, v2.h[0]
; CHECK-NEXT: ret
entry:
%0 = fsub half 0xH8000, %b
@@ -245,7 +229,7 @@ define dso_local half @t_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c, i32 %l
; CHECK: .Lt_vfmsh_laneq_f16$local:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: fmsub h0, h1, h2, h0
+; CHECK-NEXT: fmls h0, h1, v2.h[0]
; CHECK-NEXT: ret
entry:
%0 = fsub half 0xH8000, %b
@@ -438,8 +422,7 @@ define dso_local half @t_vfmah_lane3_f16(half %a, half %b, <4 x half> %c) {
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: mov h2, v2.h[3]
-; CHECK-NEXT: fmadd h0, h1, h2, h0
+; CHECK-NEXT: fmla h0, h1, v2.h[3]
; CHECK-NEXT: ret
entry:
%extract = extractelement <4 x half> %c, i32 3
@@ -452,8 +435,7 @@ define dso_local half @t_vfmah_laneq7_f16(half %a, half %b, <8 x half> %c) {
; CHECK: .Lt_vfmah_laneq7_f16$local:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: mov h2, v2.h[7]
-; CHECK-NEXT: fmadd h0, h1, h2, h0
+; CHECK-NEXT: fmla h0, h1, v2.h[7]
; CHECK-NEXT: ret
entry:
%extract = extractelement <8 x half> %c, i32 7
@@ -467,8 +449,7 @@ define dso_local half @t_vfmsh_lane3_f16(half %a, half %b, <4 x half> %c) {
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: mov h2, v2.h[3]
-; CHECK-NEXT: fmsub h0, h1, h2, h0
+; CHECK-NEXT: fmls h0, h1, v2.h[3]
; CHECK-NEXT: ret
entry:
%0 = fsub half 0xH8000, %b
@@ -482,8 +463,7 @@ define dso_local half @t_vfmsh_laneq7_f16(half %a, half %b, <8 x half> %c) {
; CHECK: .Lt_vfmsh_laneq7_f16$local:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: mov h2, v2.h[7]
-; CHECK-NEXT: fmsub h0, h1, h2, h0
+; CHECK-NEXT: fmls h0, h1, v2.h[7]
; CHECK-NEXT: ret
entry:
%0 = fsub half 0xH8000, %b
@@ -498,8 +478,7 @@ define dso_local half @t_fadd_vfmah_f16(half %a, half %b, <4 x half> %c, <4 x ha
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: fadd v2.4h, v2.4h, v3.4h
-; CHECK-NEXT: mov h2, v2.h[3]
-; CHECK-NEXT: fmadd h0, h1, h2, h0
+; CHECK-NEXT: fmla h0, h1, v2.h[3]
; CHECK-NEXT: ret
entry:
%0 = fadd <4 x half> %c, %d
More information about the llvm-commits
mailing list