[llvm] r331032 - [ARM] Codegen for v8.2A dot product intrinsics
Oliver Stannard via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 27 05:50:41 PDT 2018
Author: olista01
Date: Fri Apr 27 05:50:40 2018
New Revision: 331032
URL: http://llvm.org/viewvc/llvm-project?rev=331032&view=rev
Log:
[ARM] Codegen for v8.2A dot product intrinsics
This adds IR intrinsics for the ARM dot-product instructions introduced in
v8.2-A.
Differential revision: https://reviews.llvm.org/D46106
Added:
llvm/trunk/test/CodeGen/ARM/neon-dot-product.ll
Modified:
llvm/trunk/include/llvm/IR/IntrinsicsARM.td
llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
Modified: llvm/trunk/include/llvm/IR/IntrinsicsARM.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsARM.td?rev=331032&r1=331031&r2=331032&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsARM.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsARM.td Fri Apr 27 05:50:40 2018
@@ -717,4 +717,14 @@ def int_arm_neon_sha256h: SHA_3Arg_v4i32
def int_arm_neon_sha256h2: SHA_3Arg_v4i32_Intrinsic;
def int_arm_neon_sha256su1: SHA_3Arg_v4i32_Intrinsic;
+// Armv8.2-A dot product instructions
+class Neon_Dot_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_anyvector_ty,
+ LLVMMatchType<1>],
+ [IntrNoMem]>;
+def int_arm_neon_udot : Neon_Dot_Intrinsic;
+def int_arm_neon_sdot : Neon_Dot_Intrinsic;
+
+
} // end TargetPrefix
Modified: llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMInstrNEON.td?rev=331032&r1=331031&r2=331032&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrNEON.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMInstrNEON.td Fri Apr 27 05:50:40 2018
@@ -4682,37 +4682,59 @@ def : Pat<(v4f32 (fma (fneg QPR:$Vn), QP
// We put them in the VFPV8 decoder namespace because the ARM and Thumb
// encodings are the same and thus no further bit twiddling is necessary
// in the disassembler.
-let Predicates = [HasDotProd], DecoderNamespace = "VFPV8" in {
+class VDOT<bit op6, bit op4, RegisterClass RegTy, string Asm, string AsmTy,
+ ValueType AccumTy, ValueType InputTy,
+ SDPatternOperator OpNode> :
+ N3Vnp<0b11000, 0b10, 0b1101, op6, op4, (outs RegTy:$dst),
+ (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), N3RegFrm, IIC_VDOTPROD,
+ Asm, AsmTy,
+ [(set (AccumTy RegTy:$dst),
+ (OpNode (AccumTy RegTy:$Vd),
+ (InputTy RegTy:$Vn),
+ (InputTy RegTy:$Vm)))]> {
+ let Predicates = [HasDotProd];
+ let DecoderNamespace = "VFPV8";
+ let Constraints = "$dst = $Vd";
+}
-def VUDOTD : N3Vnp<0b11000, 0b10, 0b1101, 0b0, 0b1,
- (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm),
- N3RegFrm, IIC_VDOTPROD, "vudot", "u8", []>;
-def VSDOTD : N3Vnp<0b11000, 0b10, 0b1101, 0b0, 0b0,
- (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm),
- N3RegFrm, IIC_VDOTPROD, "vsdot", "s8", []>;
-def VUDOTQ : N3Vnp<0b11000, 0b10, 0b1101, 0b1, 0b1,
- (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm),
- N3RegFrm, IIC_VDOTPROD, "vudot", "u8", []>;
-def VSDOTQ : N3Vnp<0b11000, 0b10, 0b1101, 0b1, 0b0,
- (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm),
- N3RegFrm, IIC_VDOTPROD, "vsdot", "s8", []>;
+def VUDOTD : VDOT<0, 1, DPR, "vudot", "u8", v2i32, v8i8, int_arm_neon_udot>;
+def VSDOTD : VDOT<0, 0, DPR, "vsdot", "s8", v2i32, v8i8, int_arm_neon_sdot>;
+def VUDOTQ : VDOT<1, 1, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>;
+def VSDOTQ : VDOT<1, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>;
// Indexed dot product instructions:
-class DOTI<string opc, string dt, bit Q, bit U, RegisterClass Ty> :
- N3Vnp<0b11100, 0b10, 0b1101, Q, U,
- (outs Ty:$Vd), (ins Ty:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
- N3RegFrm, IIC_VDOTPROD, opc, dt, []> {
- bit lane;
- let Inst{5} = lane;
- let AsmString = !strconcat(opc, ".", dt, "\t$Vd, $Vn, $Vm$lane");
+multiclass DOTI<string opc, string dt, bit Q, bit U, RegisterClass Ty,
+ ValueType AccumType, ValueType InputType, SDPatternOperator OpNode,
+ dag RHS> {
+ def "" : N3Vnp<0b11100, 0b10, 0b1101, Q, U, (outs Ty:$dst),
+ (ins Ty:$Vd, Ty:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+ N3RegFrm, IIC_VDOTPROD, opc, dt, []> {
+ bit lane;
+ let Inst{5} = lane;
+ let AsmString = !strconcat(opc, ".", dt, "\t$Vd, $Vn, $Vm$lane");
+ let Constraints = "$dst = $Vd";
+ let Predicates = [HasDotProd];
+ let DecoderNamespace = "VFPV8";
+ }
+
+ def : Pat<
+ (AccumType (OpNode (AccumType Ty:$Vd),
+ (InputType Ty:$Vn),
+ (InputType (bitconvert (AccumType
+ (NEONvduplane (AccumType Ty:$Vm),
+ VectorIndex32:$lane)))))),
+ (!cast<Instruction>(NAME) Ty:$Vd, Ty:$Vn, RHS, VectorIndex32:$lane)>;
}
-def VUDOTDI : DOTI<"vudot", "u8", 0b0, 0b1, DPR>;
-def VSDOTDI : DOTI<"vsdot", "s8", 0b0, 0b0, DPR>;
-def VUDOTQI : DOTI<"vudot", "u8", 0b1, 0b1, QPR>;
-def VSDOTQI : DOTI<"vsdot", "s8", 0b1, 0b0, QPR>;
+defm VUDOTDI : DOTI<"vudot", "u8", 0b0, 0b1, DPR, v2i32, v8i8,
+ int_arm_neon_udot, (v2i32 DPR_VFP2:$Vm)>;
+defm VSDOTDI : DOTI<"vsdot", "s8", 0b0, 0b0, DPR, v2i32, v8i8,
+ int_arm_neon_sdot, (v2i32 DPR_VFP2:$Vm)>;
+defm VUDOTQI : DOTI<"vudot", "u8", 0b1, 0b1, QPR, v4i32, v16i8,
+ int_arm_neon_udot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+defm VSDOTQI : DOTI<"vsdot", "s8", 0b1, 0b0, QPR, v4i32, v16i8,
+ int_arm_neon_sdot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
-} // HasDotProd
// ARMv8.3 complex operations
class BaseN3VCP8ComplexTied<bit op21, bit op4, bit s, bit q,
Added: llvm/trunk/test/CodeGen/ARM/neon-dot-product.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/neon-dot-product.ll?rev=331032&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/neon-dot-product.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/neon-dot-product.ll Fri Apr 27 05:50:40 2018
@@ -0,0 +1,82 @@
+; RUN: llc -mtriple armv8a-none-linux-gnu -mattr=+dotprod -float-abi=hard < %s | FileCheck %s
+
+declare <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
+declare <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
+declare <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
+
+define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+entry:
+; CHECK-LABEL: test_vdot_u32:
+; CHECK: vudot.u8 d0, d1, d2
+ %vdot1.i = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
+ ret <2 x i32> %vdot1.i
+}
+
+define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+entry:
+; CHECK-LABEL: test_vdotq_u32:
+; CHECK: vudot.u8 q0, q1, q2
+ %vdot1.i = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
+ ret <4 x i32> %vdot1.i
+}
+
+define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+entry:
+; CHECK-LABEL: test_vdot_s32:
+; CHECK: vsdot.s8 d0, d1, d2
+ %vdot1.i = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
+ ret <2 x i32> %vdot1.i
+}
+
+define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+entry:
+; CHECK-LABEL: test_vdotq_s32:
+; CHECK: vsdot.s8 q0, q1, q2
+ %vdot1.i = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
+ ret <4 x i32> %vdot1.i
+}
+
+define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
+entry:
+; CHECK-LABEL: test_vdot_lane_u32:
+; CHECK: vudot.u8 d0, d1, d2[1]
+ %.cast = bitcast <8 x i8> %c to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
+ %vdot1.i = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
+ ret <2 x i32> %vdot1.i
+}
+
+define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
+entry:
+; CHECK-LABEL: test_vdotq_lane_u32:
+; CHECK: vudot.u8 q0, q1, d4[1]
+ %.cast = bitcast <8 x i8> %c to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
+ %vdot1.i = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
+ ret <4 x i32> %vdot1.i
+}
+
+define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
+entry:
+; CHECK-LABEL: test_vdot_lane_s32:
+; CHECK: vsdot.s8 d0, d1, d2[1]
+ %.cast = bitcast <8 x i8> %c to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
+ %vdot1.i = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
+ ret <2 x i32> %vdot1.i
+}
+
+define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
+entry:
+; CHECK-LABEL: test_vdotq_lane_s32:
+; CHECK: vsdot.s8 q0, q1, d4[1]
+ %.cast = bitcast <8 x i8> %c to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
+ %vdot1.i = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
+ ret <4 x i32> %vdot1.i
+}
More information about the llvm-commits mailing list