commit 10194e87b47c60eb43197b68441185b60fa8f1a3 Author: Ana Pazos Date: Wed Nov 13 11:57:09 2013 -0800 Implemented aarch64 vmul_lane intrinsics Implemented aarch64 vmulx_lane intrinsics Implemented aarch64 vmul_n_f64 intrinsic, mapping it to Neon scalar operation Implemented aarch64 vmul_lane_f64 and vmul_laneq_f64 intrinsics, mapping them to Neon scalar operation Added codegen patterns for scalar copy (DUP) with FP types Added Scalar Copy (DUP) MOV aliases Assumed the following prototypes (ACLE document version 5): float64x1_t vmul_n_f64(float64x1_t a, float64_t b) float64x1_t vmul_lane_f64(float64x1_t a, float64x1_t b) float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t b) To force vmul_n_f64, vmul_lane_f64, vmul_laneq_f64 intrinsics to map to Neon scalar operations an aarch64 IR intrinsic was created. Otherwise the casting of result to float64x1_t might cause non-vector code to be created and a different instruction to be invoked. diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td index cfde227..a877921 100644 --- a/include/clang/Basic/arm_neon.td +++ b/include/clang/Basic/arm_neon.td @@ -107,6 +107,12 @@ def OP_MOVL_HI : Op; def OP_COPY_LN : Op; def OP_COPYQ_LN : Op; def OP_COPY_LNQ : Op; +def OP_SCALAR_MUL_LN : Op; +def OP_SCALAR_MUL_LNQ : Op; +def OP_SCALAR_MULX_LN : Op; +def OP_SCALAR_MULX_LNQ : Op; +def OP_SCALAR_VMULX_LN : Op; +def OP_SCALAR_VMULX_LNQ : Op; class Inst { string Name = n; @@ -116,6 +122,7 @@ class Inst { bit isShift = 0; bit isScalarShift = 0; bit isVCVT_N = 0; + bit isVMUL_LANEQ = 0; bit isA64 = 0; // Certain intrinsics have different names than their representative @@ -744,10 +751,12 @@ def VQDMLSL_HIGH_LANEQ : SOpInst<"vqdmlsl_high_laneq", "wwkki", "si", OP_QDMLSLHi_LN>; // Newly add double parameter for vmul_lane in aarch64 -def VMUL_LANE_A64 : IOpInst<"vmul_lane", "ddgi", "dQd", OP_MUL_LN>; +// Note: d type is mapped to scalar vmul_lane +def VMUL_LANE_A64 : IOpInst<"vmul_lane", "ddgi", "Qd", OP_MUL_LN>; +// Note: d type is mapped to scalar vmul_lane def VMUL_LANEQ : IOpInst<"vmul_laneq", "ddji", - "sifdUsUiQsQiQfQUsQUiQfQd", OP_MUL_LN>; + "sifUsUiQsQiQfQUsQUiQfQd", OP_MUL_LN>; def VMULL_LANEQ : SOpInst<"vmull_laneq", "wdki", "siUsUi", OP_MULL_LN>; def VMULL_HIGH_LANE : SOpInst<"vmull_high_lane", "wkdi", "siUsUi", OP_MULLHi_LN>; @@ -763,8 +772,10 @@ def VQDMULL_HIGH_LANEQ : SOpInst<"vqdmull_high_laneq", "wkki", "si", def VQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "ddji", "siQsQi", OP_QDMULH_LN>; def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "ddji", "siQsQi", OP_QRDMULH_LN>; -def VMULX_LANE : IOpInst<"vmulx_lane", "ddgi", "fdQfQd", OP_MULX_LN>; -def VMULX_LANEQ : IOpInst<"vmulx_laneq", "ddji", "fdQfQd", OP_MULX_LN>; +// Note: d type is mapped to scalar vmulx_lane +def VMULX_LANE : IOpInst<"vmulx_lane", "ddgi", "fQfQd", OP_MULX_LN>; +// Note: d type is mapped to scalar vmulx_lane +def VMULX_LANEQ : IOpInst<"vmulx_laneq", "ddji", "fQfQd", OP_MULX_LN>; //////////////////////////////////////////////////////////////////////////////// // Across vectors class @@ -1028,4 +1039,27 @@ def SCALAR_SQXTN : SInst<"vqmovn", "zs", "SsSiSl">; //////////////////////////////////////////////////////////////////////////////// // Scalar Unsigned Saturating Extract Narrow def SCALAR_UQXTN : SInst<"vqmovn", "zs", "SUsSUiSUl">; + +def SCALAR_FMUL_LANE : IOpInst<"vmul_lane", "ssdi", "SfSd", OP_SCALAR_MUL_LN>; +def SCALAR_FMUL_LANEQ : IOpInst<"vmul_laneq", "ssji", "SfSd", OP_SCALAR_MUL_LNQ>; + +def SCALAR_FMULX_LANE : IOpInst<"vmulx_lane", "ssdi", "SfSd", 
OP_SCALAR_MULX_LN>; +def SCALAR_FMULX_LANEQ : IOpInst<"vmulx_laneq", "ssji", "SfSd", OP_SCALAR_MULX_LNQ>; + +def SCALAR_VMUL_N : IInst<"vmul_n", "dds", "d">; + +// VMUL_LANE_A64 d type implemented using scalar vmul_lane +def SCALAR_VMUL_LANE : IInst<"vmul_lane", "ddgi", "d">; + +// VMULX_LANE d type implemented using scalar vmulx_lane +def SCALAR_VMULX_LANE : IOpInst<"vmulx_lane", "ddgi", "d", OP_SCALAR_VMULX_LN>; + +// VMULX_LANEQ d type implemented using scalar vmulx_laneq +def SCALAR_VMULX_LANEQ : IOpInst<"vmulx_laneq", "ddji", "d", OP_SCALAR_VMULX_LNQ>; + } + +let isVMUL_LANEQ = 1, isA64 = 1 in { +// VMUL_LANEQ d type implemented using scalar vmul_laneq +def SCALAR_VMUL_LANEQ : IInst<"vmul_laneq", "ddji", "d">; +} \ No newline at end of file diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp index 5ced543..915ec39 100644 --- a/lib/CodeGen/CGBuiltin.cpp +++ b/lib/CodeGen/CGBuiltin.cpp @@ -1770,6 +1770,30 @@ static Value *EmitAArch64ScalarBuiltinExpr(CodeGenFunction &CGF, // argument that specifies the vector type, need to handle each case. switch (BuiltinID) { default: break; + // Scalar Floating-point Multiply Extended + case AArch64::BI__builtin_neon_vmulxs_f32: + case AArch64::BI__builtin_neon_vmulxd_f64: { + Int = Intrinsic::aarch64_neon_vmulx; + llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType()); + return CGF.EmitNeonCall(CGF.CGM.getIntrinsic(Int, Ty), Ops, "vmulx"); + } + case AArch64::BI__builtin_neon_vmul_n_f64: { + // Builtin has types v1f64 = (v1f64, f64) + // while the intrinsic has types f64 = f64, v1f64, 0) + std::swap(Ops[0], Ops[1]); + // Add imm 0 operand + llvm::Constant *CI = ConstantInt::get(CGF.Int64Ty, 0); + Ops.push_back(CI); + llvm::Type *Ty = Ops[0]->getType(); + llvm::Type *VTy = Ops[1]->getType(); + llvm::Type *Tys[3] = { Ty, Ty, VTy }; + Function *F = CGF.CGM.getIntrinsic(Intrinsic::aarch64_neon_vmullane, Tys); + s = "vmul_n"; + Value *Result = CGF.EmitNeonCall(F, Ops, s); + // cast result + llvm::Type *ResultType = CGF.ConvertType(E->getType()); + return CGF.Builder.CreateBitCast(Result, ResultType, s); + } case AArch64::BI__builtin_neon_vget_lane_i8: case AArch64::BI__builtin_neon_vget_lane_i16: case AArch64::BI__builtin_neon_vget_lane_i32: @@ -2004,11 +2028,6 @@ static Value *EmitAArch64ScalarBuiltinExpr(CodeGenFunction &CGF, case AArch64::BI__builtin_neon_vqrdmulhs_s32: Int = Intrinsic::arm_neon_vqrdmulh; s = "vqrdmulh"; OverloadInt = true; break; - // Scalar Floating-point Multiply Extended - case AArch64::BI__builtin_neon_vmulxs_f32: - case AArch64::BI__builtin_neon_vmulxd_f64: - Int = Intrinsic::aarch64_neon_vmulx; - s = "vmulx"; OverloadInt = true; break; // Scalar Floating-point Reciprocal Step and case AArch64::BI__builtin_neon_vrecpss_f32: case AArch64::BI__builtin_neon_vrecpsd_f64: @@ -2839,7 +2858,48 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case AArch64::BI__builtin_neon_vsha256su1q_v: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha256su1, Ty), Ops, "sha256su1"); - + case AArch64::BI__builtin_neon_vmul_lane_v: + case AArch64::BI__builtin_neon_vmul_laneq_v: { + // BI__builtin_neon_vmul_lane_v Builtin has + // types v1f64 = (v1f64, v8i8, i32, 8) + // while the intrinsic has types f64 = f64, v1f64, i64=0) + // first argument should be cast to f64 + // BI__builtin_neon_vmul_laneq_v Builtin has + // types v1f64 = (v1f64, v16i8, imm, 8) + // while the intrinsic has types f64 = f64, v2f64, imm0/imm1) + bool Quad = false; + if (AArch64::BI__builtin_neon_vmul_laneq_v == BuiltinID) + Quad = 
true; + Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); + // second argument should be cast to v1f64 or v2f64 if Quad + llvm::Type *FloatTy; + if (Quad) + FloatTy = GetNeonType(this, + NeonTypeFlags(NeonTypeFlags::Float64, false, true)); + else + FloatTy = Ty; + Ops[1] = Builder.CreateBitCast(Ops[1], FloatTy); + // third argument should be i64 0 or i64 [0, 1] if Quad + int Lane = cast(Ops[2])->getZExtValue(); + if ((Quad && Lane != 0 && Lane != 1) || + (!Quad && Lane != 0)) + ErrorUnsupported(E, "range error builtin function"); + llvm::Constant *CI = ConstantInt::get(Int64Ty, Lane); + Ops[2] = CI; + llvm::Type *STy = Ops[0]->getType(); + llvm::Type *VTy = Ops[1]->getType(); + llvm::Type *Tys[3] = { STy, STy, VTy }; + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_vmullane, Tys); + const char *s; + if (Quad) + s = "vmul_laneq"; + else + s ="vmul_lane"; + Value *Result = EmitNeonCall(F, Ops, s); + + // cast result + return Builder.CreateBitCast(Result, Ty, s); + } // AArch64-only builtins case AArch64::BI__builtin_neon_vfma_lane_v: case AArch64::BI__builtin_neon_vfmaq_laneq_v: { diff --git a/test/CodeGen/aarch64-neon-2velem.c b/test/CodeGen/aarch64-neon-2velem.c index f34e11a..c19e5bc 100644 --- a/test/CodeGen/aarch64-neon-2velem.c +++ b/test/CodeGen/aarch64-neon-2velem.c @@ -722,6 +722,14 @@ float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) { // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } + +float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) { + // CHECK: test_vmul_lane_f64 + return vmul_lane_f64(a, v, 0); + // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] +} + + float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) { // CHECK: test_vmulq_lane_f32 return vmulq_lane_f32(a, v, 1); @@ -740,6 +748,20 @@ float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) { // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) { + // CHECK: test_vmul_laneq_f64_0 + return vmul_laneq_f64(a, v, 0); + // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] +} + +float64x1_t test_vmul_laneq_f64_1(float64x1_t a, float64x2_t v) { + // CHECK: test_vmul_laneq_f64_1 + return vmul_laneq_f64(a, v, 1); + // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] +} + + + float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) { // CHECK: test_vmulq_laneq_f32 return vmulq_laneq_f32(a, v, 1); diff --git a/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c b/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c new file mode 100644 index 0000000..f7b4536 --- /dev/null +++ b/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c @@ -0,0 +1,81 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \ +// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s + +// Test new aarch64 intrinsics and types + +#include + +float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) { + // CHECK: test_vmuls_lane_f32 + return vmuls_lane_f32(a, b, 1); + // CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] +} + +float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) { + // CHECK: test_vmuld_lane_f64 + return vmuld_lane_f64(a, b, 0); + // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] +} + +float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) { + // CHECK: test_vmuls_laneq_f32 + return vmuls_laneq_f32(a, b, 3); + // CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] +} + +float64_t 
test_vmuld_laneq_f64(float64_t a, float64x2_t b) { + // CHECK: test_vmuld_laneq_f64 + return vmuld_laneq_f64(a, b, 1); + // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] +} + +float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) { + // CHECK: test_vmul_n_f64 + return vmul_n_f64(a, b); + // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] +} + +float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) { +// CHECK: test_vmulxs_lane_f32 + return vmulxs_lane_f32(a, b, 1); +// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] +} + +float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) { +// CHECK: test_vmulxs_laneq_f32 + return vmulxs_laneq_f32(a, b, 3); +// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] +} + +float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) { +// CHECK: test_vmulxd_lane_f64 + return vmulxd_lane_f64(a, b, 0); +// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] +} + +float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) { +// CHECK: test_vmulxd_laneq_f64 + return vmulxd_laneq_f64(a, b, 1); +// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] +} + +// CHECK_AARCH64: test_vmulx_lane_f64 +float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) { + return vmulx_lane_f64(a, b, 0); + // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] +} + + +// CHECK_AARCH64: test_vmulx_laneq_f64_0 +float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) { + return vmulx_laneq_f64(a, b, 0); + // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] +} + +// CHECK_AARCH64: test_vmulx_laneq_f64_1 +float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) { + return vmulx_laneq_f64(a, b, 1); + // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] +} + diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp index 4fd94a9..60cfae2 100644 --- a/utils/TableGen/NeonEmitter.cpp +++ b/utils/TableGen/NeonEmitter.cpp @@ -128,7 +128,13 @@ enum OpKind { OpMovlHi, OpCopyLane, OpCopyQLane, - OpCopyLaneQ + OpCopyLaneQ, + OpScalarMulLane, + OpScalarMulLaneQ, + OpScalarMulXLane, + OpScalarMulXLaneQ, + OpScalarVMulXLane, + OpScalarVMulXLaneQ }; enum ClassKind { @@ -282,6 +288,12 @@ public: OpMap["OP_COPY_LN"] = OpCopyLane; OpMap["OP_COPYQ_LN"] = OpCopyQLane; OpMap["OP_COPY_LNQ"] = OpCopyLaneQ; + OpMap["OP_SCALAR_MUL_LN"]= OpScalarMulLane; + OpMap["OP_SCALAR_MUL_LNQ"]= OpScalarMulLaneQ; + OpMap["OP_SCALAR_MULX_LN"]= OpScalarMulXLane; + OpMap["OP_SCALAR_MULX_LNQ"]= OpScalarMulXLaneQ; + OpMap["OP_SCALAR_VMULX_LN"]= OpScalarVMulXLane; + OpMap["OP_SCALAR_VMULX_LNQ"]= OpScalarVMulXLaneQ; Record *SI = R.getClass("SInst"); Record *II = R.getClass("IInst"); @@ -1928,6 +1940,79 @@ static std::string GenOpString(const std::string &name, OpKind op, "(__c1, __d1); \\\n vset_lane_" + typeCode + "(__c2, __a1, __b1);"; break; } + case OpScalarMulLane: { + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += TypeString('s', typestr) + " __d1 = vget_lane_" + typeCode + + "(__b, __c);\\\n __a * __d1;"; + break; + } + case OpScalarMulLaneQ: { + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += TypeString('s', typestr) + " __d1 = vgetq_lane_" + typeCode + + "(__b, __c);\\\n __a * __d1;"; + break; + } + case OpScalarMulXLane: { + bool dummy = false; + char type = ClassifyType(typestr, dummy, dummy, dummy); + if (type == 'f') type = 's'; + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += 
TypeString('s', typestr) + " __d1 = vget_lane_" + typeCode + + "(__b, __c);\\\n vmulx" + type + "_" + + typeCode + "(__a, __d1);"; + break; + } + case OpScalarMulXLaneQ: { + bool dummy = false; + char type = ClassifyType(typestr, dummy, dummy, dummy); + if (type == 'f') type = 's'; + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += TypeString('s', typestr) + " __d1 = vgetq_lane_" + typeCode + + "(__b, __c);\\\n vmulx" + type + + "_" + typeCode + "(__a, __d1);"; + break; + } + + case OpScalarVMulXLane: { + bool dummy = false; + char type = ClassifyType(typestr, dummy, dummy, dummy); + if (type == 'f') type = 's'; + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += TypeString('s', typestr) + " __d1 = vget_lane_" + + typeCode + "(__a, 0);\\\n" + + " " + TypeString('s', typestr) + " __e1 = vget_lane_" + + typeCode + "(__b, __c);\\\n" + + " " + TypeString('s', typestr) + " __f1 = vmulx" + type + "_" + + typeCode + "(__d1, __e1);\\\n" + + " " + TypeString('d', typestr) + " __g1;\\\n" + + " vset_lane_" + typeCode + "(__f1, __g1, __c);\\\n" + + " __g1;"; + break; + } + + case OpScalarVMulXLaneQ: { + bool dummy = false; + char type = ClassifyType(typestr, dummy, dummy, dummy); + if (type == 'f') type = 's'; + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += TypeString('s', typestr) + " __d1 = vget_lane_" + + typeCode + "(__a, 0);\\\n" + + " " + TypeString('s', typestr) + " __e1 = vgetq_lane_" + + typeCode + "(__b, __c);\\\n" + + " " + TypeString('s', typestr) + " __f1 = vmulx" + type + "_" + + typeCode + "(__d1, __e1);\\\n" + + " " + TypeString('d', typestr) + " __g1;\\\n" + + " vset_lane_" + typeCode + "(__f1, __g1, 0);\\\n" + + " __g1;"; + break; + } + default: PrintFatalError("unknown OpKind!"); } @@ -2548,6 +2633,11 @@ NeonEmitter::genIntrinsicRangeCheckCode(raw_ostream &OS, PrintFatalError(R->getLoc(), "Fixed point convert name should contains \"32\" or \"64\""); + } else if (R->getValueAsBit("isVMUL_LANEQ")) { + // VMUL_LANEQ is implemented as scalar + // vmul_lane which takes an immediate [0, 1]. + ck = ClassB; + rangestr = "l = 0; u = 1"; // upper bound = l + u } else if (R->getValueAsBit("isScalarShift")) { // Right shifts have an 'r' in the name, left shifts do not. Convert // instructions have the same bounds and right shifts.
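
Note on the generated header (illustrative, not part of the patch): for the float64 scalar cases, the OP_SCALAR_MUL_LN, OP_SCALAR_MUL_LNQ and OP_SCALAR_MULX_LN bodies built in GenOpString above expand into arm_neon.h wrappers roughly like the sketch below. The statement-expression framing and the exact generated parameter names are assumptions about NeonEmitter's usual macro wrapping; only the per-op bodies (a vget_lane/vgetq_lane of the requested lane followed by either a plain multiply or a call to the scalar vmulx intrinsic) are taken from the strings constructed in this diff.

```c
/* Sketch of the arm_neon.h expansions produced by the new scalar lane ops.
 * The __extension__ ({ ... }) wrapper and parameter naming are assumed;
 * the bodies mirror the GenOpString cases added in NeonEmitter.cpp. */

/* OP_SCALAR_MUL_LN: pull the requested lane out of the vector operand,
 * then use an ordinary scalar multiply. */
#define vmuld_lane_f64(__a, __b, __c) __extension__ ({   \
  float64_t __d1 = vget_lane_f64(__b, __c);              \
  __a * __d1; })

/* OP_SCALAR_MUL_LNQ: identical, except the lane comes from a 128-bit
 * vector, hence vgetq_lane. */
#define vmuld_laneq_f64(__a, __b, __c) __extension__ ({  \
  float64_t __d1 = vgetq_lane_f64(__b, __c);             \
  __a * __d1; })

/* OP_SCALAR_MULX_LN: forward to the scalar vmulxd_f64 intrinsic so the
 * result is an FMULX rather than an ordinary FMUL. */
#define vmulxd_lane_f64(__a, __b, __c) __extension__ ({  \
  float64_t __d1 = vget_lane_f64(__b, __c);              \
  vmulxd_f64(__a, __d1); })
```

The d-type vector forms (vmul_lane_f64, vmul_laneq_f64, vmul_n_f64) deliberately do not expand this way: as the commit message explains, CGBuiltin.cpp routes them through the aarch64_neon_vmullane IR intrinsic, called with a scalar f64, a v1f64/v2f64 vector, and an i64 lane index, so that the cast of the result back to float64x1_t cannot cause the scalar FMUL (by element) to be lowered as different, non-vector code.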