[clang] [llvm] [AArch64] Improve bfcvtn2 and remove aarch64_neon_bfcvt intrinsics (PR #120363)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 17 22:20:43 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: David Green (davemgreen)
<details>
<summary>Changes</summary>
This started out as trying to combine bf16 fpround to BFCVTN2 instructions, but ended up removing the aarch64.neon.bfcvt intrinsics in favour of generating fpround instructions directly. This simplifies the patterns and can lead to other optimizations. The BFCVTN2 instruction is adjusted to make sure the types are valid, and a bfcvtn2 is now generated in more places. The old intrinsics are auto-upgraded to fptrunc instructions too.
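For illustration, this is roughly what the scalar bfcvt and narrowing bfcvtn forms now become — a sketch based on the CHECK lines in the patch below; the function names are made up:

```llvm
; Previously:  %r = call bfloat @llvm.aarch64.neon.bfcvt(float %a)
define bfloat @cvth(float %a) {
  %r = fptrunc float %a to bfloat
  ret bfloat %r
}

; Previously:  %v = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> %b)
define <8 x bfloat> @cvtn_low(<4 x float> %b) {
  ; Truncate, then concatenate with zeroes to fill the high half.
  %t = fptrunc <4 x float> %b to <4 x bfloat>
  %v = shufflevector <4 x bfloat> %t, <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x bfloat> %v
}
```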
---
Patch is 34.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/120363.diff
10 Files Affected:
- (modified) clang/include/clang/Basic/arm_neon.td (+2-8)
- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+38-3)
- (modified) clang/test/CodeGen/arm-bf16-convert-intrinsics.c (+11-12)
- (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (-11)
- (modified) llvm/lib/IR/AutoUpgrade.cpp (+61-25)
- (modified) llvm/lib/Target/AArch64/AArch64InstrFormats.td (+4-7)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+13-11)
- (modified) llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll (+3)
- (modified) llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll (-14)
- (modified) llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll (+58-74)
``````````diff
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ef89fa4358dfeb..ddc5391eb3fa23 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -259,11 +259,6 @@ def OP_VCVT_F32_BF16_LO
def OP_VCVT_F32_BF16_HI
: Op<(call "vcvt_f32_bf16", (call "vget_high", $p0))>;
-def OP_VCVT_BF16_F32_LO_A64
- : Op<(call "__a64_vcvtq_low_bf16", $p0)>;
-def OP_VCVT_BF16_F32_A64
- : Op<(call "vget_low", (call "__a64_vcvtq_low_bf16", $p0))>;
-
def OP_VCVT_BF16_F32_A32
: Op<(call "__a32_vcvt_bf16", $p0)>;
@@ -2061,10 +2056,9 @@ let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard =
}
let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
- def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">;
- def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>;
+ def VCVT_LOW_BF16_F32_A64 : SInst<"vcvt_low_bf16", "BQ", "Qf">;
def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">;
- def VCVT_BF16_F32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A64>;
+ def VCVT_BF16_F32 : SInst<"vcvt_bf16", "BQ", "f">;
def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>;
def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4d4b7428abd505..47e4a10addc167 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -7277,7 +7277,6 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
};
static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
- NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0),
NEONMAP0(splat_lane_v),
NEONMAP0(splat_laneq_v),
NEONMAP0(splatq_lane_v),
@@ -7377,7 +7376,8 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP0(vcvtq_f16_s16),
NEONMAP0(vcvtq_f16_u16),
NEONMAP0(vcvtq_f32_v),
- NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0),
+ NEONMAP0(vcvtq_high_bf16_f32),
+ NEONMAP0(vcvtq_low_bf16_f32),
NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
@@ -7586,7 +7586,7 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
- NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0),
+ NEONMAP0(vcvth_bf16_f32),
NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
@@ -12040,6 +12040,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return ConstantInt::get(Builder.getInt32Ty(), 0);
}
+ if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32)
+ return Builder.CreateFPTrunc(
+ Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
+ Builder.getFloatTy()),
+ Builder.getBFloatTy());
+
// Handle MSVC intrinsics before argument evaluation to prevent double
// evaluation.
if (std::optional<MSVCIntrin> MsvcIntId =
@@ -12765,6 +12771,35 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
"vgetq_lane");
}
+ case NEON::BI__builtin_neon_vcvt_bf16_f32: {
+ llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
+ llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
+ return Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
+ }
+ case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: {
+ SmallVector<int, 16> ConcatMask(8);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
+ llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
+ llvm::Value *Trunc =
+ Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
+ return Builder.CreateShuffleVector(
+ Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
+ }
+ case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: {
+ SmallVector<int, 16> ConcatMask(8);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ SmallVector<int, 16> LoMask(4);
+ std::iota(LoMask.begin(), LoMask.end(), 0);
+ llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
+ llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
+ llvm::Type *V8BF16 = FixedVectorType::get(Builder.getBFloatTy(), 8);
+ llvm::Value *Inactive = Builder.CreateShuffleVector(
+ Builder.CreateBitCast(Ops[0], V8BF16), LoMask);
+ llvm::Value *Trunc =
+ Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[1], V4F32), V4BF16);
+ return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
+ }
case clang::AArch64::BI_InterlockedAdd:
case clang::AArch64::BI_InterlockedAdd64: {
diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index 51aa5aa758f0c3..93f54c70c340d6 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -223,10 +223,8 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
// CHECK-A64-LABEL: @test_vcvt_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
-// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
+// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP1]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
@@ -263,9 +261,9 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
-// CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]]
+// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
@@ -323,9 +321,10 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
-// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] to <16 x i8>
-// CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]]
+// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
@@ -404,8 +403,8 @@ bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvth_bf16_f32(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]])
-// CHECK-A64-NEXT: ret bfloat [[VCVTH_BF16_F32_I]]
+// CHECK-A64-NEXT: [[TMP0:%.*]] = fptrunc float [[A:%.*]] to bfloat
+// CHECK-A64-NEXT: ret bfloat [[TMP0]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvth_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 53a66099a92bda..763bf31f378e98 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -538,17 +538,6 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;
-
- // v8.6-A Bfloat Intrinsics
- def int_aarch64_neon_bfcvt
- : DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
- def int_aarch64_neon_bfcvtn
- : DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty], [IntrNoMem]>;
- def int_aarch64_neon_bfcvtn2
- : DefaultAttrsIntrinsic<[llvm_v8bf16_ty],
- [llvm_v8bf16_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
-
// v8.2-A FP16 Fused Multiply-Add Long
def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 06e62bf7f9f757..be67bed087b81e 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -45,6 +45,7 @@
#include "llvm/Support/Regex.h"
#include "llvm/TargetParser/Triple.h"
#include <cstring>
+#include <numeric>
using namespace llvm;
@@ -828,6 +829,13 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
return true;
}
}
+
+ // Changed in 20.0: bfcvt/bfcvtn/bfcvtn2 have been replaced with fptrunc.
+ if (Name.starts_with("bfcvt")) {
+ NewFn = nullptr;
+ return true;
+ }
+
return false; // No other 'aarch64.neon.*'.
}
if (Name.consume_front("sve.")) {
@@ -4064,31 +4072,59 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI,
Function *F, IRBuilder<> &Builder) {
- Intrinsic::ID NewID =
- StringSwitch<Intrinsic::ID>(Name)
- .Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
- .Case("sve.fcvtnt.bf16f32", Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
- .Default(Intrinsic::not_intrinsic);
- if (NewID == Intrinsic::not_intrinsic)
- llvm_unreachable("Unhandled Intrinsic!");
-
- SmallVector<Value *, 3> Args(CI->args());
-
- // The original intrinsics incorrectly used a predicate based on the smallest
- // element type rather than the largest.
- Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
- Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
-
- if (Args[1]->getType() != BadPredTy)
- llvm_unreachable("Unexpected predicate type!");
-
- Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
- BadPredTy, Args[1]);
- Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
- GoodPredTy, Args[1]);
-
- return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
- CI->getName());
+ if (Name.starts_with("neon.bfcvt")) {
+ if (Name.starts_with("neon.bfcvtn2")) {
+ SmallVector<int, 32> LoMask(4);
+ std::iota(LoMask.begin(), LoMask.end(), 0);
+ SmallVector<int, 32> ConcatMask(8);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ Value *Inactive = Builder.CreateShuffleVector(CI->getOperand(0), LoMask);
+ Value *Trunc =
+ Builder.CreateFPTrunc(CI->getOperand(1), Inactive->getType());
+ return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
+ } else if (Name.starts_with("neon.bfcvtn")) {
+ SmallVector<int, 32> ConcatMask(8);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ Type *V4BF16 =
+ FixedVectorType::get(Type::getBFloatTy(F->getContext()), 4);
+ Value *Trunc = Builder.CreateFPTrunc(CI->getOperand(0), V4BF16);
+ return Builder.CreateShuffleVector(
+ Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
+ } else {
+ return Builder.CreateFPTrunc(CI->getOperand(0),
+ Type::getBFloatTy(F->getContext()));
+ }
+ } else if (Name.starts_with("sve.fcvt")) {
+ Intrinsic::ID NewID =
+ StringSwitch<Intrinsic::ID>(Name)
+ .Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
+ .Case("sve.fcvtnt.bf16f32",
+ Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
+ .Default(Intrinsic::not_intrinsic);
+ if (NewID == Intrinsic::not_intrinsic)
+ llvm_unreachable("Unhandled Intrinsic!");
+
+ SmallVector<Value *, 3> Args(CI->args());
+
+ // The original intrinsics incorrectly used a predicate based on the
+ // smallest element type rather than the largest.
+ Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
+ Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
+
+ if (Args[1]->getType() != BadPredTy)
+ llvm_unreachable("Unexpected predicate type!");
+
+ Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
+ BadPredTy, Args[1]);
+ Args[1] = Builder.CreateIntrinsic(
+ Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]);
+
+ return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
+ CI->getName());
+ }
+
+ llvm_unreachable("Unhandled Intrinsic!");
}
static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 56ff7b0d3a280d..a03d97cd81d0a0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -9045,22 +9045,19 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>
let mayRaiseFPException = 1, Uses = [FPCR] in
class SIMD_BFCVTN
- : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
+ : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V64,
"bfcvtn", ".4h", ".4s",
- [(set (v8bf16 V128:$Rd),
- (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
+ [(set (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn)))]>;
let mayRaiseFPException = 1, Uses = [FPCR] in
class SIMD_BFCVTN2
: BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
- "bfcvtn2", ".8h", ".4s",
- [(set (v8bf16 V128:$dst),
- (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
+ "bfcvtn2", ".8h", ".4s", []>;
let mayRaiseFPException = 1, Uses = [FPCR] in
class BF16ToSinglePrecision<string asm>
: I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
- [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
+ [(set (bf16 FPR16:$Rd), (any_fpround (f32 FPR32:$Rn)))]>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d015cc15581ad0..825bf130e9baa7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1446,8 +1446,8 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
def BFCVTN : SIMD_BFCVTN;
def BFCVTN2 : SIMD_BFCVTN2;
-def : Pat<(v4bf16 (any_fpround (v4f32 V128:$Rn))),
- (EXTRACT_SUBREG (BFCVTN V128:$Rn), dsub)>;
+def : Pat<(concat_vectors (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn))),
+ (BFCVTN2 (v8bf16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub)), V128:$Rn)>;
// Vector-scalar BFDOT:
// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
@@ -1469,8 +1469,6 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in {
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
-// Round FP32 to BF16.
-def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
}
// ARMv8.6A AArch64 matrix multiplication
@@ -10425,9 +10423,11 @@ multiclass PromoteUnaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst>
let Predicates = [HasBF16] in
def : Pat<(InOp (v8bf16 V128:$Rn)),
(v8bf16 (BFCVTN2
- (v8bf16 (BFCVTN
- (v4f32 (OutInst
- (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
+ (INSERT_SUBREG (IMPLICIT_DEF),
+ (v4bf16 (BFCVTN
+ (v4f32 (OutInst
+ (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
+ dsub),
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>;
let Predicates = [HasNoBF16] in
@@ -10462,10 +10462,12 @@ multiclass PromoteBinaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst
let Predicates = [HasBF16] in
def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)),
(v8bf16 (BFCVTN2
- (v8bf16 (BFCVTN
- (v4f32 (OutInst
- (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
- (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
+ (INSERT_SUBREG (IMPLICIT_DEF),
+ (v4bf16 (BFCVTN
+ (v4f32 (OutInst
+ (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
+ (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
+ dsub),
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn)),
(v4f32 (SHLLv8i16 V128:$Rm))))))>;
diff --git a/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll b/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll
index 9d4e79d38d5d1a..64bc95f2f38906 100644
--- a/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll
@@ -1,5 +1,8 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64 -mattr=+neon -mattr=+bf16 | FileCheck %s
+; This tests the old neon.bfcvt intrinsics, which are now
+; auto-upgraded to fptrunc operations.
+
declare bfloat @llvm.aarch64.neon.bfcvt(float)
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float>)
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat>, <4 x float>)
diff --git a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
index 9b6e19eba3f4e6..1cd0294b0083eb 100644
--- a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
@@ -22,7 +22,6 @@ define <4 x bfloat> @add_h(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16
; CHECK-BF16-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
-; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-BF16-NEXT: ret
entry:
@@ -62,7 +61,6 @@ define <4 x bfloat> @sub_h(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-BF16-NEXT: ...
[truncated]
``````````
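To make the bfcvtn2 upgrade concrete: the tied intrinsic becomes "keep the low half of the passthru operand, fptrunc the f32 operand, and concatenate the halves", which the new concat_vectors pattern then selects back to a bfcvtn2. A sketch of the upgraded IR, following the CHECK-A64 lines above (the function name is made up):

```llvm
; Previously:  %r = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> %inactive, <4 x float> %a)
define <8 x bfloat> @cvtn_high(<8 x bfloat> %inactive, <4 x float> %a) {
  ; The low half of the tied operand is preserved.
  %lo = shufflevector <8 x bfloat> %inactive, <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t = fptrunc <4 x float> %a to <4 x bfloat>
  ; Concatenate the preserved low half with the truncated lanes.
  %r = shufflevector <4 x bfloat> %lo, <4 x bfloat> %t, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x bfloat> %r
}
```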
</details>
https://github.com/llvm/llvm-project/pull/120363