[llvm] 7c3cda5 - [AArch64][SVE] Prefer SIMD&FP variant of clast[ab]

Cullen Rhodes via llvm-commits <llvm-commits at lists.llvm.org>
Wed Jul 13 01:53:57 PDT 2022


Author: Cullen Rhodes
Date: 2022-07-13T08:53:36Z
New Revision: 7c3cda551ac702de4eb8899180aa715896020d43

URL: https://github.com/llvm/llvm-project/commit/7c3cda551ac702de4eb8899180aa715896020d43
DIFF: https://github.com/llvm/llvm-project/commit/7c3cda551ac702de4eb8899180aa715896020d43.diff

LOG: [AArch64][SVE] Prefer SIMD&FP variant of clast[ab]

The scalar variant with GPR source/dest has considerably higher latency
than the SIMD&FP scalar variant across a variety of micro-architectures:

  Core           Scalar    SIMD&FP
  --------------------------------
  Neoverse V1     9 cyc      3 cyc
  Neoverse N2     8 cyc      3 cyc
  Cortex A510     8 cyc      4 cyc
  A64FX          29 cyc      6 cyc
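
For illustration, a minimal ACLE sketch of the kind of loop-carried use
where this latency dominates (the function and helper names here are
hypothetical, not from this patch):

  #include <arm_sve.h>
  #include <stdint.h>

  // Return the last element of buf[0..n) that is > 0, or init if none is.
  // Each iteration's fallback is the previous clastb result, so the
  // CLASTB latency sits directly on the loop-carried critical path.
  int32_t last_positive(const int32_t *buf, int64_t n, int32_t init) {
    int32_t last = init;
    for (int64_t i = 0; i < n; i += svcntw()) {
      svbool_t pg = svwhilelt_b32(i, n);
      svint32_t data = svld1_s32(pg, &buf[i]);
      svbool_t pos = svcmpgt_n_s32(pg, data, 0);
      last = svclastb_n_s32(pos, last, data);
    }
    return last;
  }

With this change, the clastb above is expected to select the SIMD&FP form
(roughly clastb s0, p0, s0, z0.s, plus fmovs to move to/from the GPR)
instead of the GPR form clastb w0, p0, w0, z0.s.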

Added: 
    llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-clast.ll

Modified: 
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Removed: 
    


################################################################################
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c
index c6cf55accf820..d3d273a6fcbc4 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c
@@ -215,14 +215,20 @@ int8_t test_svclasta_n_s8(svbool_t pg, int8_t fallback, svint8_t data)
 // CHECK-LABEL: @test_svclasta_n_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clasta.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CHECK-NEXT:    ret i16 [[TMP1]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = call half @llvm.aarch64.sve.clasta.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CHECK-NEXT:    ret i16 [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svclasta_n_s16u10__SVBool_tsu11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clasta.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CPP-CHECK-NEXT:    ret i16 [[TMP1]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call half @llvm.aarch64.sve.clasta.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CPP-CHECK-NEXT:    ret i16 [[TMP4]]
 //
 int16_t test_svclasta_n_s16(svbool_t pg, int16_t fallback, svint16_t data)
 {
@@ -232,14 +238,20 @@ int16_t test_svclasta_n_s16(svbool_t pg, int16_t fallback, svint16_t data)
 // CHECK-LABEL: @test_svclasta_n_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CHECK-NEXT:    ret i32 [[TMP1]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CHECK-NEXT:    ret i32 [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svclasta_n_s32u10__SVBool_tiu11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CPP-CHECK-NEXT:    ret i32 [[TMP1]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CPP-CHECK-NEXT:    ret i32 [[TMP4]]
 //
 int32_t test_svclasta_n_s32(svbool_t pg, int32_t fallback, svint32_t data)
 {
@@ -249,14 +261,20 @@ int32_t test_svclasta_n_s32(svbool_t pg, int32_t fallback, svint32_t data)
 // CHECK-LABEL: @test_svclasta_n_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clasta.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CHECK-NEXT:    ret i64 [[TMP1]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CHECK-NEXT:    [[TMP3:%.*]] = call double @llvm.aarch64.sve.clasta.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CHECK-NEXT:    ret i64 [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svclasta_n_s64u10__SVBool_tlu11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clasta.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CPP-CHECK-NEXT:    ret i64 [[TMP1]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call double @llvm.aarch64.sve.clasta.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CPP-CHECK-NEXT:    ret i64 [[TMP4]]
 //
 int64_t test_svclasta_n_s64(svbool_t pg, int64_t fallback, svint64_t data)
 {
@@ -281,14 +299,20 @@ uint8_t test_svclasta_n_u8(svbool_t pg, uint8_t fallback, svuint8_t data)
 // CHECK-LABEL: @test_svclasta_n_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clasta.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CHECK-NEXT:    ret i16 [[TMP1]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = call half @llvm.aarch64.sve.clasta.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CHECK-NEXT:    ret i16 [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svclasta_n_u16u10__SVBool_ttu12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clasta.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CPP-CHECK-NEXT:    ret i16 [[TMP1]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call half @llvm.aarch64.sve.clasta.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CPP-CHECK-NEXT:    ret i16 [[TMP4]]
 //
 uint16_t test_svclasta_n_u16(svbool_t pg, uint16_t fallback, svuint16_t data)
 {
@@ -298,14 +322,20 @@ uint16_t test_svclasta_n_u16(svbool_t pg, uint16_t fallback, svuint16_t data)
 // CHECK-LABEL: @test_svclasta_n_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CHECK-NEXT:    ret i32 [[TMP1]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CHECK-NEXT:    ret i32 [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svclasta_n_u32u10__SVBool_tju12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CPP-CHECK-NEXT:    ret i32 [[TMP1]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CPP-CHECK-NEXT:    ret i32 [[TMP4]]
 //
 uint32_t test_svclasta_n_u32(svbool_t pg, uint32_t fallback, svuint32_t data)
 {
@@ -315,14 +345,20 @@ uint32_t test_svclasta_n_u32(svbool_t pg, uint32_t fallback, svuint32_t data)
 // CHECK-LABEL: @test_svclasta_n_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clasta.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CHECK-NEXT:    ret i64 [[TMP1]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CHECK-NEXT:    [[TMP3:%.*]] = call double @llvm.aarch64.sve.clasta.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CHECK-NEXT:    ret i64 [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svclasta_n_u64u10__SVBool_tmu12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clasta.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CPP-CHECK-NEXT:    ret i64 [[TMP1]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call double @llvm.aarch64.sve.clasta.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CPP-CHECK-NEXT:    ret i64 [[TMP4]]
 //
 uint64_t test_svclasta_n_u64(svbool_t pg, uint64_t fallback, svuint64_t data)
 {

diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
index 1473d0ab62070..1b866c43ae167 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
@@ -215,14 +215,20 @@ int8_t test_svclastb_n_s8(svbool_t pg, int8_t fallback, svint8_t data)
 // CHECK-LABEL: @test_svclastb_n_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CHECK-NEXT:    ret i16 [[TMP1]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = call half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CHECK-NEXT:    ret i16 [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svclastb_n_s16u10__SVBool_tsu11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CPP-CHECK-NEXT:    ret i16 [[TMP1]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CPP-CHECK-NEXT:    ret i16 [[TMP4]]
 //
 int16_t test_svclastb_n_s16(svbool_t pg, int16_t fallback, svint16_t data)
 {
@@ -232,14 +238,20 @@ int16_t test_svclastb_n_s16(svbool_t pg, int16_t fallback, svint16_t data)
 // CHECK-LABEL: @test_svclastb_n_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CHECK-NEXT:    ret i32 [[TMP1]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CHECK-NEXT:    ret i32 [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svclastb_n_s32u10__SVBool_tiu11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CPP-CHECK-NEXT:    ret i32 [[TMP1]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CPP-CHECK-NEXT:    ret i32 [[TMP4]]
 //
 int32_t test_svclastb_n_s32(svbool_t pg, int32_t fallback, svint32_t data)
 {
@@ -249,14 +261,20 @@ int32_t test_svclastb_n_s32(svbool_t pg, int32_t fallback, svint32_t data)
 // CHECK-LABEL: @test_svclastb_n_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CHECK-NEXT:    ret i64 [[TMP1]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CHECK-NEXT:    [[TMP3:%.*]] = call double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CHECK-NEXT:    ret i64 [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svclastb_n_s64u10__SVBool_tlu11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CPP-CHECK-NEXT:    ret i64 [[TMP1]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CPP-CHECK-NEXT:    ret i64 [[TMP4]]
 //
 int64_t test_svclastb_n_s64(svbool_t pg, int64_t fallback, svint64_t data)
 {
@@ -281,14 +299,20 @@ uint8_t test_svclastb_n_u8(svbool_t pg, uint8_t fallback, svuint8_t data)
 // CHECK-LABEL: @test_svclastb_n_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CHECK-NEXT:    ret i16 [[TMP1]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = call half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CHECK-NEXT:    ret i16 [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svclastb_n_u16u10__SVBool_ttu12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1> [[TMP0]], i16 [[FALLBACK:%.*]], <vscale x 8 x i16> [[DATA:%.*]])
-// CPP-CHECK-NEXT:    ret i16 [[TMP1]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[FALLBACK:%.*]] to half
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x half>
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1> [[TMP0]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+// CPP-CHECK-NEXT:    ret i16 [[TMP4]]
 //
 uint16_t test_svclastb_n_u16(svbool_t pg, uint16_t fallback, svuint16_t data)
 {
@@ -298,14 +322,20 @@ uint16_t test_svclastb_n_u16(svbool_t pg, uint16_t fallback, svuint16_t data)
 // CHECK-LABEL: @test_svclastb_n_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CHECK-NEXT:    ret i32 [[TMP1]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CHECK-NEXT:    ret i32 [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svclastb_n_u32u10__SVBool_tju12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1> [[TMP0]], i32 [[FALLBACK:%.*]], <vscale x 4 x i32> [[DATA:%.*]])
-// CPP-CHECK-NEXT:    ret i32 [[TMP1]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[FALLBACK:%.*]] to float
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x float>
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1> [[TMP0]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CPP-CHECK-NEXT:    ret i32 [[TMP4]]
 //
 uint32_t test_svclastb_n_u32(svbool_t pg, uint32_t fallback, svuint32_t data)
 {
@@ -315,14 +345,20 @@ uint32_t test_svclastb_n_u32(svbool_t pg, uint32_t fallback, svuint32_t data)
 // CHECK-LABEL: @test_svclastb_n_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CHECK-NEXT:    ret i64 [[TMP1]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CHECK-NEXT:    [[TMP3:%.*]] = call double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CHECK-NEXT:    ret i64 [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svclastb_n_u64u10__SVBool_tmu12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1> [[TMP0]], i64 [[FALLBACK:%.*]], <vscale x 2 x i64> [[DATA:%.*]])
-// CPP-CHECK-NEXT:    ret i64 [[TMP1]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[FALLBACK:%.*]] to double
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x double>
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = call double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1> [[TMP0]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CPP-CHECK-NEXT:    ret i64 [[TMP4]]
 //
 uint64_t test_svclastb_n_u64(svbool_t pg, uint64_t fallback, svuint64_t data)
 {

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 41c7a8c5042ff..274a025e82a09 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -796,6 +796,50 @@ static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
   return IC.replaceInstUsesWith(II, Extract);
 }
 
+static Optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
+                                                      IntrinsicInst &II) {
+  // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
+  // integer variant across a variety of micro-architectures, so replace the
+  // scalar integer CLAST[AB] intrinsic with the SIMD&FP variant. The extra
+  // bitcast-to-fp + clast[ab] + bitcast-to-int sequence costs a cycle or two
+  // more, depending on the micro-architecture, but has been observed to be
+  // generally faster, particularly when the CLAST[AB] op is a loop-carried
+  // dependency.
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+  Value *Pg = II.getArgOperand(0);
+  Value *Fallback = II.getArgOperand(1);
+  Value *Vec = II.getArgOperand(2);
+  Type *Ty = II.getType();
+
+  if (!Ty->isIntegerTy())
+    return None;
+
+  Type *FPTy;
+  switch (cast<IntegerType>(Ty)->getBitWidth()) {
+  default:
+    return None;
+  case 16:
+    FPTy = Builder.getHalfTy();
+    break;
+  case 32:
+    FPTy = Builder.getFloatTy();
+    break;
+  case 64:
+    FPTy = Builder.getDoubleTy();
+    break;
+  }
+
+  Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy);
+  auto *FPVTy = VectorType::get(
+      FPTy, cast<VectorType>(Vec->getType())->getElementCount());
+  Value *FPVec = Builder.CreateBitCast(Vec, FPVTy);
+  auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()},
+                                       {Pg, FPFallBack, FPVec});
+  Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType());
+  return IC.replaceInstUsesWith(II, FPIItoInt);
+}
+
 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
                                                 IntrinsicInst &II) {
   LLVMContext &Ctx = II.getContext();
@@ -1294,6 +1338,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   case Intrinsic::aarch64_sve_lasta:
   case Intrinsic::aarch64_sve_lastb:
     return instCombineSVELast(IC, II);
+  case Intrinsic::aarch64_sve_clasta_n:
+  case Intrinsic::aarch64_sve_clastb_n:
+    return instCombineSVECondLast(IC, II);
   case Intrinsic::aarch64_sve_cntd:
     return instCombineSVECntElts(IC, II, 2);
   case Intrinsic::aarch64_sve_cntw:

diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-clast.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-clast.ll
new file mode 100644
index 0000000000000..61d93b25eda61
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-clast.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+target triple = "aarch64"
+
+define i16 @clastb_n_i16(<vscale x 8 x i1> %pg, i16 %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: @clastb_n_i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[A:%.*]] to half
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[B:%.*]] to <vscale x 8 x half>
+; CHECK-NEXT:    [[TMP3:%.*]] = call half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1> [[PG:%.*]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+; CHECK-NEXT:    ret i16 [[TMP4]]
+;
+  %out = call i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1> %pg, i16 %a, <vscale x 8 x i16> %b)
+  ret i16 %out
+}
+
+define i32 @clastb_n_i32(<vscale x 4 x i1> %pg, i32 %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: @clastb_n_i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[A:%.*]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[B:%.*]] to <vscale x 4 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1> [[PG:%.*]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
+  %out = call i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1> %pg, i32 %a, <vscale x 4 x i32> %b)
+  ret i32 %out
+}
+
+define i64 @clastb_n_i64(<vscale x 2 x i1> %pg, i64 %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: @clastb_n_i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[A:%.*]] to double
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[B:%.*]] to <vscale x 2 x double>
+; CHECK-NEXT:    [[TMP3:%.*]] = call double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+  %out = call i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1> %pg, i64 %a, <vscale x 2 x i64> %b)
+  ret i64 %out
+}
+
+declare i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1>, i16, <vscale x 8 x i16>)
+declare i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1>, i32, <vscale x 4 x i32>)
+declare i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1>, i64, <vscale x 2 x i64>)


        

