[llvm] 9c57954 - [ARM] BFloat MatMul Intrinsics&CodeGen
Mikhail Maltsev via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 23 05:06:52 PDT 2020
Author: Mikhail Maltsev
Date: 2020-06-23T12:06:37Z
New Revision: 9c579540ff6945d126a680d01db38548db9a972d
URL: https://github.com/llvm/llvm-project/commit/9c579540ff6945d126a680d01db38548db9a972d
DIFF: https://github.com/llvm/llvm-project/commit/9c579540ff6945d126a680d01db38548db9a972d.diff
LOG: [ARM] BFloat MatMul Intrinsics&CodeGen
Summary:
This patch adds support for BFloat Matrix Multiplication Intrinsics
and Code Generation from __bf16 to AArch32. This includes IR intrinsics. Tests are
provided as needed.
This patch is part of a series implementing the Bfloat16 extension of the
Armv8.6-a architecture, as detailed here:
https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/arm-architecture-developments-armv8-6-a
The bfloat type and its properties are specified in the Arm Architecture
Reference Manual:
https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile
The following people contributed to this patch:
- Luke Geeson
- Momchil Velikov
- Mikhail Maltsev
- Luke Cheeseman
- Simon Tatham
Reviewers: stuij, t.p.northover, SjoerdMeijer, sdesmalen, fpetrogalli, LukeGeeson, simon_tatham, dmgreen, MarkMurrayARM
Reviewed By: MarkMurrayARM
Subscribers: MarkMurrayARM, danielkiss, kristof.beyls, hiraditya, cfe-commits, llvm-commits, chill, miyuki
Tags: #clang, #llvm
Differential Revision: https://reviews.llvm.org/D81740
Added:
clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll
Modified:
clang/lib/CodeGen/CGBuiltin.cpp
llvm/include/llvm/IR/IntrinsicsARM.td
llvm/lib/Target/ARM/ARMInstrNEON.td
Removed:
################################################################################
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 32e43c295d23..6cbc5c20ad36 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4748,6 +4748,11 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
NEONMAP1(vaeseq_v, arm_neon_aese, 0),
NEONMAP1(vaesimcq_v, arm_neon_aesimc, 0),
NEONMAP1(vaesmcq_v, arm_neon_aesmc, 0),
+ NEONMAP1(vbfdot_v, arm_neon_bfdot, 0),
+ NEONMAP1(vbfdotq_v, arm_neon_bfdot, 0),
+ NEONMAP1(vbfmlalbq_v, arm_neon_bfmlalb, 0),
+ NEONMAP1(vbfmlaltq_v, arm_neon_bfmlalt, 0),
+ NEONMAP1(vbfmmlaq_v, arm_neon_bfmmla, 0),
NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
NEONMAP1(vcadd_rot270_v, arm_neon_vcadd_rot270, Add1ArgType),
diff --git a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
new file mode 100644
index 000000000000..0eb130a377bd
--- /dev/null
+++ b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
@@ -0,0 +1,166 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple armv8-arm-none-eabi \
+// RUN: -target-feature +neon -target-feature +bf16 -mfloat-abi soft \
+// RUN: -disable-O0-optnone -S -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg -instcombine | FileCheck %s
+// RUN: %clang_cc1 -triple armv8-arm-none-eabi \
+// RUN: -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
+// RUN: -disable-O0-optnone -S -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg -instcombine | FileCheck %s
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: @test_vbfdot_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #3
+// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
+//
+float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
+ return vbfdot_f32(r, a, b);
+}
+
+// CHECK-LABEL: @test_vbfdotq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
+// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
+//
+float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
+ return vbfdotq_f32(r, a, b);
+}
+
+// CHECK-LABEL: @test_vbfdot_lane_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3
+// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
+//
+float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
+ return vbfdot_lane_f32(r, a, b, 0);
+}
+
+// CHECK-LABEL: @test_vbfdotq_laneq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3
+// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
+//
+float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
+ return vbfdotq_laneq_f32(r, a, b, 3);
+}
+
+// CHECK-LABEL: @test_vbfdot_laneq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3
+// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
+//
+float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
+ return vbfdot_laneq_f32(r, a, b, 3);
+}
+
+// CHECK-LABEL: @test_vbfdotq_lane_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3
+// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
+//
+float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
+ return vbfdotq_lane_f32(r, a, b, 0);
+}
+
+// CHECK-LABEL: @test_vbfmmlaq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VBFMMLA1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
+// CHECK-NEXT: ret <4 x float> [[VBFMMLA1_I]]
+//
+float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
+ return vbfmmlaq_f32(r, a, b);
+}
+
+// CHECK-LABEL: @test_vbfmlalbq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
+// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
+//
+float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
+ return vbfmlalbq_f32(r, a, b);
+}
+
+// CHECK-LABEL: @test_vbfmlaltq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
+// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
+//
+float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
+ return vbfmlaltq_f32(r, a, b);
+}
+
+// CHECK-LABEL: @test_vbfmlalbq_lane_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
+// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
+// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
+//
+float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
+ return vbfmlalbq_lane_f32(r, a, b, 0);
+}
+
+// CHECK-LABEL: @test_vbfmlalbq_laneq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
+// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
+// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
+//
+float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
+ return vbfmlalbq_laneq_f32(r, a, b, 3);
+}
+
+// CHECK-LABEL: @test_vbfmlaltq_lane_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
+// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
+// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
+//
+float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
+ return vbfmlaltq_lane_f32(r, a, b, 0);
+}
+
+// CHECK-LABEL: @test_vbfmlaltq_laneq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
+// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
+// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
+//
+float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
+ return vbfmlaltq_laneq_f32(r, a, b, 3);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index adeafbb267b2..c9a46350cb9c 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -785,6 +785,15 @@ def int_arm_neon_usmmla : Neon_MatMul_Intrinsic;
def int_arm_neon_usdot : Neon_Dot_Intrinsic;
// v8.6-A Bfloat Intrinsics
+def int_arm_neon_bfdot : Neon_Dot_Intrinsic;
+def int_arm_neon_bfmmla : Neon_MatMul_Intrinsic;
+
+class Neon_FML_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
+ [IntrNoMem]>;
+def int_arm_neon_bfmlalb : Neon_FML_Intrinsic;
+def int_arm_neon_bfmlalt : Neon_FML_Intrinsic;
def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index ac609600d92a..17ad098f1f32 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -9099,15 +9099,20 @@ def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
// ARMv8.6a BFloat16 instructions.
let Predicates = [HasBF16, HasNEON] in {
class BF16VDOT<bits<5> op27_23, bits<2> op21_20, bit op6,
- dag oops, dag iops>
+ dag oops, dag iops, list<dag> pattern>
: N3Vnp<op27_23, op21_20, 0b1101, op6, 0, oops, iops,
- N3RegFrm, IIC_VDOTPROD, "", "", []> {
+ N3RegFrm, IIC_VDOTPROD, "", "", pattern>
+{
let DecoderNamespace = "VFPV8";
}
class BF16VDOTS<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy, ValueType InputTy>
: BF16VDOT<0b11000, 0b00, Q, (outs RegTy:$dst),
- (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm)> {
+ (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
+ [(set (AccumTy RegTy:$dst),
+ (int_arm_neon_bfdot (AccumTy RegTy:$Vd),
+ (InputTy RegTy:$Vn),
+ (InputTy RegTy:$Vm)))]> {
let Constraints = "$dst = $Vd";
let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
let DecoderNamespace = "VFPV8";
@@ -9118,7 +9123,7 @@ multiclass BF16VDOTI<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy,
def "" : BF16VDOT<0b11100, 0b00, Q, (outs RegTy:$dst),
(ins RegTy:$Vd, RegTy:$Vn,
- DPR_VFP2:$Vm, VectorIndex32:$lane)> {
+ DPR_VFP2:$Vm, VectorIndex32:$lane), []> {
bit lane;
let Inst{5} = lane;
let Constraints = "$dst = $Vd";
@@ -9126,6 +9131,13 @@ multiclass BF16VDOTI<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy,
let DecoderNamespace = "VFPV8";
}
+ def : Pat<
+ (AccumTy (int_arm_neon_bfdot (AccumTy RegTy:$Vd),
+ (InputTy RegTy:$Vn),
+ (InputTy (bitconvert (AccumTy
+ (ARMvduplane (AccumTy RegTy:$Vm),
+ VectorIndex32:$lane)))))),
+ (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
}
def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>;
@@ -9138,7 +9150,10 @@ class BF16MM<bit Q, RegisterClass RegTy,
string opc>
: N3Vnp<0b11000, 0b00, 0b1100, Q, 0,
(outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
- N3RegFrm, IIC_VDOTPROD, "", "", []> {
+ N3RegFrm, IIC_VDOTPROD, "", "",
+ [(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd),
+ (v16i8 QPR:$Vn),
+ (v16i8 QPR:$Vm)))]> {
let Constraints = "$dst = $Vd";
let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
let DecoderNamespace = "VFPV8";
@@ -9146,19 +9161,22 @@ class BF16MM<bit Q, RegisterClass RegTy,
def VMMLA : BF16MM<1, QPR, "vmmla">;
-class VBF16MALQ<bit T, string suffix>
+class VBF16MALQ<bit T, string suffix, SDPatternOperator OpNode>
: N3VCP8<0b00, 0b11, T, 1,
(outs QPR:$dst), (ins QPR:$Vd, QPR:$Vn, QPR:$Vm),
NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "",
- []> { // TODO: Add intrinsics
+ [(set (v4f32 QPR:$dst),
+ (OpNode (v4f32 QPR:$Vd),
+ (v16i8 QPR:$Vn),
+ (v16i8 QPR:$Vm)))]> {
let Constraints = "$dst = $Vd";
let DecoderNamespace = "VFPV8";
}
-def VBF16MALTQ: VBF16MALQ<1, "t">;
-def VBF16MALBQ: VBF16MALQ<0, "b">;
+def VBF16MALTQ: VBF16MALQ<1, "t", int_arm_neon_bfmlalt>;
+def VBF16MALBQ: VBF16MALQ<0, "b", int_arm_neon_bfmlalb>;
-multiclass VBF16MALQI<bit T, string suffix> {
+multiclass VBF16MALQI<bit T, string suffix, SDPatternOperator OpNode> {
def "" : N3VLaneCP8<0, 0b11, T, 1, (outs QPR:$dst),
(ins QPR:$Vd, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$idx),
IIC_VMACD, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm$idx", "", []> {
@@ -9169,10 +9187,20 @@ multiclass VBF16MALQI<bit T, string suffix> {
let DecoderNamespace = "VFPV8";
}
-}
-
-defm VBF16MALTQI: VBF16MALQI<1, "t">;
-defm VBF16MALBQI: VBF16MALQI<0, "b">;
+ def : Pat<
+ (v4f32 (OpNode (v4f32 QPR:$Vd),
+ (v16i8 QPR:$Vn),
+ (v16i8 (bitconvert (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
+ VectorIndex16:$lane)))))),
+ (!cast<Instruction>(NAME) QPR:$Vd,
+ QPR:$Vn,
+ (EXTRACT_SUBREG QPR:$Vm,
+ (DSubReg_i16_reg VectorIndex16:$lane)),
+ (SubReg_i16_lane VectorIndex16:$lane))>;
+}
+
+defm VBF16MALTQI: VBF16MALQI<1, "t", int_arm_neon_bfmlalt>;
+defm VBF16MALBQI: VBF16MALQI<0, "b", int_arm_neon_bfmlalb>;
def BF16_VCVT : N2V<0b11, 0b11, 0b01, 0b10, 0b01100, 1, 0,
(outs DPR:$Vd), (ins QPR:$Vm),
diff --git a/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll b/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll
new file mode 100644
index 000000000000..da6e1274e50a
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll
@@ -0,0 +1,194 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple armv8.6a-arm-none-eabi -mattr=+neon,+bf16 -float-abi=hard -verify-machineinstrs < %s -o - | FileCheck %s
+
+define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: test_vbfdot_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vdot.bf16 d0, d1, d2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <4 x bfloat> %a to <8 x i8>
+ %1 = bitcast <4 x bfloat> %b to <8 x i8>
+ %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
+ ret <2 x float> %vbfdot1.i
+}
+
+define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: test_vbfdotq_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vdot.bf16 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <8 x bfloat> %a to <16 x i8>
+ %1 = bitcast <8 x bfloat> %b to <16 x i8>
+ %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+ ret <4 x float> %vbfdot1.i
+}
+
+define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: test_vbfdot_lane_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vdot.bf16 d0, d1, d2[0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <4 x bfloat> %b to <2 x float>
+ %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
+ %1 = bitcast <4 x bfloat> %a to <8 x i8>
+ %2 = bitcast <2 x float> %shuffle to <8 x i8>
+ %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
+ ret <2 x float> %vbfdot1.i
+}
+
+define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: test_vbfdotq_laneq_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vdup.32 q8, d5[1]
+; CHECK-NEXT: vdot.bf16 q0, q1, q8
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <8 x bfloat> %b to <4 x float>
+ %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %1 = bitcast <8 x bfloat> %a to <16 x i8>
+ %2 = bitcast <4 x float> %shuffle to <16 x i8>
+ %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
+ ret <4 x float> %vbfdot1.i
+}
+
+define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: test_vbfdot_laneq_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vdot.bf16 d0, d1, d3[1]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <8 x bfloat> %b to <4 x float>
+ %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+ %1 = bitcast <4 x bfloat> %a to <8 x i8>
+ %2 = bitcast <2 x float> %shuffle to <8 x i8>
+ %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
+ ret <2 x float> %vbfdot1.i
+}
+
+define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: test_vbfdotq_lane_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2
+; CHECK-NEXT: vdot.bf16 q0, q1, d4[0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <4 x bfloat> %b to <2 x float>
+ %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
+ %1 = bitcast <8 x bfloat> %a to <16 x i8>
+ %2 = bitcast <4 x float> %shuffle to <16 x i8>
+ %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
+ ret <4 x float> %vbfdot1.i
+}
+
+define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: test_vbfmmlaq_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmmla.bf16 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <8 x bfloat> %a to <16 x i8>
+ %1 = bitcast <8 x bfloat> %b to <16 x i8>
+ %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+ ret <4 x float> %vbfmmla1.i
+}
+
+define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: test_vbfmlalbq_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vfmab.bf16 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <8 x bfloat> %a to <16 x i8>
+ %1 = bitcast <8 x bfloat> %b to <16 x i8>
+ %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+ ret <4 x float> %vbfmlalb1.i
+}
+
+define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: test_vbfmlaltq_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vfmat.bf16 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <8 x bfloat> %a to <16 x i8>
+ %1 = bitcast <8 x bfloat> %b to <16 x i8>
+ %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+ ret <4 x float> %vbfmlalt1.i
+}
+
+define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: test_vbfmlalbq_lane_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2
+; CHECK-NEXT: vfmab.bf16 q0, q1, d4[0]
+; CHECK-NEXT: bx lr
+entry:
+ %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
+ %0 = bitcast <8 x bfloat> %a to <16 x i8>
+ %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
+ %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+ ret <4 x float> %vbfmlalb1.i
+}
+
+define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: test_vbfmlalbq_laneq_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vfmab.bf16 q0, q1, d4[3]
+; CHECK-NEXT: bx lr
+entry:
+ %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %0 = bitcast <8 x bfloat> %a to <16 x i8>
+ %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
+ %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+ ret <4 x float> %vbfmlalb1.i
+}
+
+define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: test_vbfmlaltq_lane_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2
+; CHECK-NEXT: vfmat.bf16 q0, q1, d4[0]
+; CHECK-NEXT: bx lr
+entry:
+ %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
+ %0 = bitcast <8 x bfloat> %a to <16 x i8>
+ %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
+ %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+ ret <4 x float> %vbfmlalt1.i
+}
+
+define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: test_vbfmlaltq_laneq_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vfmat.bf16 q0, q1, d4[3]
+; CHECK-NEXT: bx lr
+entry:
+ %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %0 = bitcast <8 x bfloat> %a to <16 x i8>
+ %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
+ %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+ ret <4 x float> %vbfmlalt1.i
+}
+
+define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: test_vbfmlaltq_laneq_f32_v2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vdup.16 q8, d5[2]
+; CHECK-NEXT: vfmat.bf16 q0, q1, q8
+; CHECK-NEXT: bx lr
+entry:
+ %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+ %0 = bitcast <8 x bfloat> %a to <16 x i8>
+ %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
+ %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+ ret <4 x float> %vbfmlalt1.i
+}
+
+declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
+declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
More information about the llvm-commits
mailing list