[llvm] 7c3cda5 - [AArch64][SVE] Prefer SIMD&FP variant of clast[ab]
Cullen Rhodes via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 13 01:53:57 PDT 2022
Author: Cullen Rhodes
Date: 2022-07-13T08:53:36Z
New Revision: 7c3cda551ac702de4eb8899180aa715896020d43
URL: https://github.com/llvm/llvm-project/commit/7c3cda551ac702de4eb8899180aa715896020d43
DIFF: https://github.com/llvm/llvm-project/commit/7c3cda551ac702de4eb8899180aa715896020d43.diff
LOG: [AArch64][SVE] Prefer SIMD&FP variant of clast[ab]
The scalar variant with GPR source/dest has considerably higher latency
than the SIMD&FP scalar variant across a variety of micro-architectures:
Core           Scalar    SIMD&FP
--------------------------------
Neoverse V1    9 cyc     3 cyc
Neoverse N2    8 cyc     3 cyc
Cortex A510    8 cyc     4 cyc
A64FX          29 cyc    6 cyc
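The latency difference matters most when the conditional extract feeds back into itself across loop iterations. The C sketch below (illustrative only; the function, names, and loop shape are not part of this patch, and it assumes a compiler invocation with SVE enabled, e.g. -march=armv8-a+sve) shows such a loop-carried use of the ACLE clastb intrinsic, where the scalar result of one iteration is the fallback of the next, so CLASTB latency sits on the critical path:

  #include <arm_sve.h>
  #include <stdint.h>

  // Returns the last positive element seen while scanning 'src' with SVE.
  // The clastb result of one iteration is the fallback of the next, so the
  // CLASTB latency is a loop-carried dependency.
  int32_t last_positive(const int32_t *src, int64_t n) {
    int32_t last = 0;
    for (int64_t i = 0; i < n; i += svcntw()) {
      svbool_t pg = svwhilelt_b32(i, n);              // active lanes for this chunk
      svint32_t data = svld1_s32(pg, src + i);        // load, zeroing inactive lanes
      svbool_t pos = svcmpgt_n_s32(pg, data, 0);      // lanes holding positive values
      // Before this change the i32 clastb intrinsic selects the GPR form of
      // CLASTB; with it, instcombine rewrites the call to the f32 form via
      // bitcasts so the faster SIMD&FP register variant can be selected.
      last = svclastb_n_s32(pos, last, data);
    }
    return last;
  }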
Added:
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-clast.ll
Modified:
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c
clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Removed:
################################################################################
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c
index c6cf55accf820..d3d273a6fcbc4 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c
@@ -215,14 +215,20 @@ int8_t test_svclasta_n_s8(svbool_t pg, int8_t fallback, svint8_t data)
// CHECK-LABEL: @test_svclasta_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clasta.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CHECK-NEXT: ret i16 [[TMP1]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = call half @llvm.aarch64.sve.clasta.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CHECK-NEXT: ret i16 [[TMP4]]
//
// CPP-CHECK-LABEL: @_Z19test_svclasta_n_s16u10__SVBool_tsu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clasta.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CPP-CHECK-NEXT: ret i16 [[TMP1]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call half @llvm.aarch64.sve.clasta.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CPP-CHECK-NEXT: ret i16 [[TMP4]]
//
int16_t test_svclasta_n_s16(svbool_t pg, int16_t fallback, svint16_t data)
{
@@ -232,14 +238,20 @@ int16_t test_svclasta_n_s16(svbool_t pg, int16_t fallback, svint16_t data)
// CHECK-LABEL: @test_svclasta_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CHECK-NEXT: ret i32 [[TMP1]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CHECK-NEXT: ret i32 [[TMP4]]
//
// CPP-CHECK-LABEL: @_Z19test_svclasta_n_s32u10__SVBool_tiu11__SVInt32_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CPP-CHECK-NEXT: ret i32 [[TMP1]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CPP-CHECK-NEXT: ret i32 [[TMP4]]
//
int32_t test_svclasta_n_s32(svbool_t pg, int32_t fallback, svint32_t data)
{
@@ -249,14 +261,20 @@ int32_t test_svclasta_n_s32(svbool_t pg, int32_t fallback, svint32_t data)
// CHECK-LABEL: @test_svclasta_n_s64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clasta.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CHECK-NEXT: ret i64 [[TMP1]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.aarch64.sve.clasta.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CHECK-NEXT: ret i64 [[TMP4]]
//
// CPP-CHECK-LABEL: @_Z19test_svclasta_n_s64u10__SVBool_tlu11__SVInt64_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clasta.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CPP-CHECK-NEXT: ret i64 [[TMP1]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.aarch64.sve.clasta.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CPP-CHECK-NEXT: ret i64 [[TMP4]]
//
int64_t test_svclasta_n_s64(svbool_t pg, int64_t fallback, svint64_t data)
{
@@ -281,14 +299,20 @@ uint8_t test_svclasta_n_u8(svbool_t pg, uint8_t fallback, svuint8_t data)
// CHECK-LABEL: @test_svclasta_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clasta.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CHECK-NEXT: ret i16 [[TMP1]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = call half @llvm.aarch64.sve.clasta.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CHECK-NEXT: ret i16 [[TMP4]]
//
// CPP-CHECK-LABEL: @_Z19test_svclasta_n_u16u10__SVBool_ttu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clasta.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CPP-CHECK-NEXT: ret i16 [[TMP1]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call half @llvm.aarch64.sve.clasta.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CPP-CHECK-NEXT: ret i16 [[TMP4]]
//
uint16_t test_svclasta_n_u16(svbool_t pg, uint16_t fallback, svuint16_t data)
{
@@ -298,14 +322,20 @@ uint16_t test_svclasta_n_u16(svbool_t pg, uint16_t fallback, svuint16_t data)
// CHECK-LABEL: @test_svclasta_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CHECK-NEXT: ret i32 [[TMP1]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CHECK-NEXT: ret i32 [[TMP4]]
//
// CPP-CHECK-LABEL: @_Z19test_svclasta_n_u32u10__SVBool_tju12__SVUint32_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CPP-CHECK-NEXT: ret i32 [[TMP1]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CPP-CHECK-NEXT: ret i32 [[TMP4]]
//
uint32_t test_svclasta_n_u32(svbool_t pg, uint32_t fallback, svuint32_t data)
{
@@ -315,14 +345,20 @@ uint32_t test_svclasta_n_u32(svbool_t pg, uint32_t fallback, svuint32_t data)
// CHECK-LABEL: @test_svclasta_n_u64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clasta.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CHECK-NEXT: ret i64 [[TMP1]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.aarch64.sve.clasta.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CHECK-NEXT: ret i64 [[TMP4]]
//
// CPP-CHECK-LABEL: @_Z19test_svclasta_n_u64u10__SVBool_tmu12__SVUint64_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clasta.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CPP-CHECK-NEXT: ret i64 [[TMP1]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.aarch64.sve.clasta.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CPP-CHECK-NEXT: ret i64 [[TMP4]]
//
uint64_t test_svclasta_n_u64(svbool_t pg, uint64_t fallback, svuint64_t data)
{
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
index 1473d0ab62070..1b866c43ae167 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
@@ -215,14 +215,20 @@ int8_t test_svclastb_n_s8(svbool_t pg, int8_t fallback, svint8_t data)
// CHECK-LABEL: @test_svclastb_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CHECK-NEXT: ret i16 [[TMP1]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = call half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CHECK-NEXT: ret i16 [[TMP4]]
//
// CPP-CHECK-LABEL: @_Z19test_svclastb_n_s16u10__SVBool_tsu11__SVInt16_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CPP-CHECK-NEXT: ret i16 [[TMP1]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CPP-CHECK-NEXT: ret i16 [[TMP4]]
//
int16_t test_svclastb_n_s16(svbool_t pg, int16_t fallback, svint16_t data)
{
@@ -232,14 +238,20 @@ int16_t test_svclastb_n_s16(svbool_t pg, int16_t fallback, svint16_t data)
// CHECK-LABEL: @test_svclastb_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CHECK-NEXT: ret i32 [[TMP1]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CHECK-NEXT: ret i32 [[TMP4]]
//
// CPP-CHECK-LABEL: @_Z19test_svclastb_n_s32u10__SVBool_tiu11__SVInt32_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CPP-CHECK-NEXT: ret i32 [[TMP1]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CPP-CHECK-NEXT: ret i32 [[TMP4]]
//
int32_t test_svclastb_n_s32(svbool_t pg, int32_t fallback, svint32_t data)
{
@@ -249,14 +261,20 @@ int32_t test_svclastb_n_s32(svbool_t pg, int32_t fallback, svint32_t data)
// CHECK-LABEL: @test_svclastb_n_s64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CHECK-NEXT: ret i64 [[TMP1]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CHECK-NEXT: ret i64 [[TMP4]]
//
// CPP-CHECK-LABEL: @_Z19test_svclastb_n_s64u10__SVBool_tlu11__SVInt64_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CPP-CHECK-NEXT: ret i64 [[TMP1]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CPP-CHECK-NEXT: ret i64 [[TMP4]]
//
int64_t test_svclastb_n_s64(svbool_t pg, int64_t fallback, svint64_t data)
{
@@ -281,14 +299,20 @@ uint8_t test_svclastb_n_u8(svbool_t pg, uint8_t fallback, svuint8_t data)
// CHECK-LABEL: @test_svclastb_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CHECK-NEXT: ret i16 [[TMP1]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = call half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CHECK-NEXT: ret i16 [[TMP4]]
//
// CPP-CHECK-LABEL: @_Z19test_svclastb_n_u16u10__SVBool_ttu12__SVUint16_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CPP-CHECK-NEXT: ret i16 [[TMP1]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CPP-CHECK-NEXT: ret i16 [[TMP4]]
//
uint16_t test_svclastb_n_u16(svbool_t pg, uint16_t fallback, svuint16_t data)
{
@@ -298,14 +322,20 @@ uint16_t test_svclastb_n_u16(svbool_t pg, uint16_t fallback, svuint16_t data)
// CHECK-LABEL: @test_svclastb_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CHECK-NEXT: ret i32 [[TMP1]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CHECK-NEXT: ret i32 [[TMP4]]
//
// CPP-CHECK-LABEL: @_Z19test_svclastb_n_u32u10__SVBool_tju12__SVUint32_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CPP-CHECK-NEXT: ret i32 [[TMP1]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CPP-CHECK-NEXT: ret i32 [[TMP4]]
//
uint32_t test_svclastb_n_u32(svbool_t pg, uint32_t fallback, svuint32_t data)
{
@@ -315,14 +345,20 @@ uint32_t test_svclastb_n_u32(svbool_t pg, uint32_t fallback, svuint32_t data)
// CHECK-LABEL: @test_svclastb_n_u64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CHECK-NEXT: ret i64 [[TMP1]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CHECK-NEXT: ret i64 [[TMP4]]
//
// CPP-CHECK-LABEL: @_Z19test_svclastb_n_u64u10__SVBool_tmu12__SVUint64_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CPP-CHECK-NEXT: ret i64 [[TMP1]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CPP-CHECK-NEXT: ret i64 [[TMP4]]
//
uint64_t test_svclastb_n_u64(svbool_t pg, uint64_t fallback, svuint64_t data)
{
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 41c7a8c5042ff..274a025e82a09 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -796,6 +796,50 @@ static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
return IC.replaceInstUsesWith(II, Extract);
}
+static Optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
+ IntrinsicInst &II) {
+ // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
+ // integer variant across a variety of micro-architectures. Replace scalar
+ // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
+ // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
+ // depending on the micro-architecture, but has been observed as generally
+ // being faster, particularly when the CLAST[AB] op is a loop-carried
+ // dependency.
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+ Value *Pg = II.getArgOperand(0);
+ Value *Fallback = II.getArgOperand(1);
+ Value *Vec = II.getArgOperand(2);
+ Type *Ty = II.getType();
+
+ if (!Ty->isIntegerTy())
+ return None;
+
+ Type *FPTy;
+ switch (cast<IntegerType>(Ty)->getBitWidth()) {
+ default:
+ return None;
+ case 16:
+ FPTy = Builder.getHalfTy();
+ break;
+ case 32:
+ FPTy = Builder.getFloatTy();
+ break;
+ case 64:
+ FPTy = Builder.getDoubleTy();
+ break;
+ }
+
+ Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy);
+ auto *FPVTy = VectorType::get(
+ FPTy, cast<VectorType>(Vec->getType())->getElementCount());
+ Value *FPVec = Builder.CreateBitCast(Vec, FPVTy);
+ auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()},
+ {Pg, FPFallBack, FPVec});
+ Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType());
+ return IC.replaceInstUsesWith(II, FPIItoInt);
+}
+
static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
IntrinsicInst &II) {
LLVMContext &Ctx = II.getContext();
@@ -1294,6 +1338,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
case Intrinsic::aarch64_sve_lasta:
case Intrinsic::aarch64_sve_lastb:
return instCombineSVELast(IC, II);
+ case Intrinsic::aarch64_sve_clasta_n:
+ case Intrinsic::aarch64_sve_clastb_n:
+ return instCombineSVECondLast(IC, II);
case Intrinsic::aarch64_sve_cntd:
return instCombineSVECntElts(IC, II, 2);
case Intrinsic::aarch64_sve_cntw:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-clast.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-clast.ll
new file mode 100644
index 0000000000000..61d93b25eda61
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-clast.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+target triple = "aarch64"
+
+define i16 @clastb_n_i16(<vscale x 8 x i1> %pg, i16 %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: @clastb_n_i16(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[A:%.*]] to half
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[B:%.*]] to <vscale x 8 x half>
+; CHECK-NEXT: [[TMP3:%.*]] = call half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1> [[PG:%.*]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+; CHECK-NEXT: ret i16 [[TMP4]]
+;
+ %out = call i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1> %pg, i16 %a, <vscale x 8 x i16> %b)
+ ret i16 %out
+}
+
+define i32 @clastb_n_i32(<vscale x 4 x i1> %pg, i32 %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: @clastb_n_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[A:%.*]] to float
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[B:%.*]] to <vscale x 4 x float>
+; CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1> [[PG:%.*]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+; CHECK-NEXT: ret i32 [[TMP4]]
+;
+ %out = call i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1> %pg, i32 %a, <vscale x 4 x i32> %b)
+ ret i32 %out
+}
+
+define i64 @clastb_n_i64(<vscale x 2 x i1> %pg, i64 %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: @clastb_n_i64(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[A:%.*]] to double
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[B:%.*]] to <vscale x 2 x double>
+; CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+; CHECK-NEXT: ret i64 [[TMP4]]
+;
+ %out = call i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1> %pg, i64 %a, <vscale x 2 x i64> %b)
+ ret i64 %out
+}
+
+declare i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1>, i16, <vscale x 8 x i16>)
+declare i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1>, i32, <vscale x 4 x i32>)
+declare i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1>, i64, <vscale x 2 x i64>)