commit 8c710523c72d7493480c97957747011e3b8f2752
Author: Ana Pazos
Date:   Wed Nov 13 12:07:08 2013 -0800

    Implemented aarch64 vmul_lane intrinsics.
    Implemented aarch64 vmulx_lane intrinsics.

    Implemented aarch64 vmul_n_f64 intrinsic, mapping it to a Neon scalar
    operation.

    Implemented aarch64 vmul_lane_f64 and vmul_laneq_f64 intrinsics, mapping
    them to Neon scalar operations.

    Added codegen patterns for scalar copy (DUP) with FP types.
    Added Scalar Copy (DUP) MOV aliases.

    Assumed the following prototypes (ACLE document version 5):

        float64x1_t vmul_n_f64(float64x1_t a, float64_t b)
        float64x1_t vmul_lane_f64(float64x1_t a, float64x1_t b)
        float64x1_t vmul_laneq_f64(float64x1_t a, float64x2_t b)

    To force the vmul_n_f64, vmul_lane_f64 and vmul_laneq_f64 intrinsics to map
    to Neon scalar operations, an aarch64 IR intrinsic was created. Otherwise,
    casting the result to float64x1_t might cause non-vector code to be
    generated and a different instruction to be selected.

diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td
index 4d54a23..78860f3 100644
--- a/include/llvm/IR/IntrinsicsAArch64.td
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@@ -34,8 +34,9 @@ def int_aarch64_neon_vpmaxnm : Neon_2Arg_Intrinsic;
 // Vector Pairwise minNum (Floating Point)
 def int_aarch64_neon_vpminnm : Neon_2Arg_Intrinsic;

-// Vector Multiply Extended (Floating Point)
-def int_aarch64_neon_vmulx : Neon_2Arg_Intrinsic;
+// Vector Multiply Extended and Scalar Multiply Extended (Floating Point)
+def int_aarch64_neon_vmulx :
+  Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>;

 class Neon_N2V_Intrinsic
   : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_i32_ty],
@@ -291,4 +292,8 @@ class Neon_SHA_Intrinsic
 def int_aarch64_neon_sha1c : Neon_SHA_Intrinsic;
 def int_aarch64_neon_sha1m : Neon_SHA_Intrinsic;
 def int_aarch64_neon_sha1p : Neon_SHA_Intrinsic;
+
+// Scalar Floating Point multiply (scalar, by element)
+def int_aarch64_neon_vmullane :
+  Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyvector_ty, llvm_i64_ty], [IntrNoMem]>;
 }
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
index 83bb1fa..0b3c015 100644
--- a/lib/Target/AArch64/AArch64InstrNEON.td
+++ b/lib/Target/AArch64/AArch64InstrNEON.td
@@ -4339,8 +4339,17 @@ defm : Neon_Scalar3Same_SD_size_patterns;
+multiclass Neon_Scalar3Same_MULX_SD_size_patterns {
+  def : Pat<(f32 (opnode (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+            (INSTS FPR32:$Rn, FPR32:$Rm)>;
+  def : Pat<(f64 (opnode (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+            (INSTD FPR64:$Rn, FPR64:$Rm)>;
+}
+
+defm : Neon_Scalar3Same_MULX_SD_size_patterns;

 // Scalar Integer Shift Left (Signed, Unsigned)
 def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">;
@@ -4764,6 +4773,74 @@ def FMULXddv_2D : NeonI_ScalarXIndexedElemArith<"fmulx",
   let Inst{20-16} = MRm;
 }

+multiclass Neon_ScalarXIndexedElem_MUL_MULX_Patterns<
+  SDPatternOperator opnode,
+  Instruction INST,
+  ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm,
+  ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
+
+  def : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
+              (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)))),
+            (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
+              (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)))),
+            (ResTy (INST (ResTy FPRC:$Rn),
+              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+              OpNImm:$Imm))>;
+
+  // swapped operands
+  def : Pat<(ResTy (opnode
+              (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
+              (ResTy FPRC:$Rn))),
+            (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def : Pat<(ResTy (opnode
+              (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
+              (ResTy FPRC:$Rn))),
+            (ResTy (INST (ResTy FPRC:$Rn),
+              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+              OpNImm:$Imm))>;
+}
+
+multiclass Neon_ScalarXIndexedElem_VMULLANE_Intrinsic_Patterns<
+  SDPatternOperator opnode,
+  Instruction INST,
+  ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm,
+  ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
+
+  def : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
+              (OpTy VPR128:$MRm), OpImm:$Imm)),
+            (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
+              (OpNTy VPR64:$MRm), OpNImm:$Imm)),
+            (ResTy (INST (ResTy FPRC:$Rn),
+              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+              OpNImm:$Imm))>;
+}
+
+// Patterns for Scalar Floating Point multiply (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns;
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Floating Point multiply (scalar, by element)
+defm : Neon_ScalarXIndexedElem_VMULLANE_Intrinsic_Patterns<
+         int_aarch64_neon_vmullane, FMULddv_2D,
+         f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
+
+// Patterns for Scalar Floating Point multiply extended (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns;
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns;
+
+
 // Scalar Floating Point fused multiply-add (scalar, by element)
 def FMLAssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla", 0b0001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
@@ -4960,6 +5037,40 @@ def DUPdv_D : NeonI_Scalar_DUP<"dup", "d", FPR64, VPR128, neon_uimm1_bare> {
   let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
 }

+multiclass NeonI_Scalar_DUP_Elt_pattern {
+
+  def : Pat<(ResTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)),
+            (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
+
+  def : Pat<(ResTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)),
+            (ResTy (DUPI
+              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              OpNImm:$Imm))>;
+}
+
+// Patterns for vector extract of FP data using scalar DUP instructions
+defm : NeonI_Scalar_DUP_Elt_pattern;
+defm : NeonI_Scalar_DUP_Elt_pattern;
+
+multiclass NeonI_Scalar_DUP_alias {
+  def : NeonInstAlias;
+}
+
+// Aliases for Scalar copy - DUP element (scalar)
+// FIXME: This is actually the preferred syntax but TableGen can't deal with
+// custom printing of aliases.
+defm : NeonI_Scalar_DUP_alias<"mov", ".b", DUPbv_B, neon_uimm4_bare, FPR8>;
+defm : NeonI_Scalar_DUP_alias<"mov", ".h", DUPhv_H, neon_uimm3_bare, FPR16>;
+defm : NeonI_Scalar_DUP_alias<"mov", ".s", DUPsv_S, neon_uimm2_bare, FPR32>;
+defm : NeonI_Scalar_DUP_alias<"mov", ".d", DUPdv_D, neon_uimm1_bare, FPR64>;
+
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll
new file mode 100644
index 0000000..5ff3267
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll
@@ -0,0 +1,150 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
+  ; CHECK: test_fmul_lane_ss2S
+  ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+  %tmp1 = extractelement <2 x float> %v, i32 1
+  %tmp2 = fmul float %a, %tmp1;
+  ret float %tmp2;
+}
+
+define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
+  ; CHECK: test_fmul_lane_ss2S_swap
+  ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+  %tmp1 = extractelement <2 x float> %v, i32 1
+  %tmp2 = fmul float %tmp1, %a;
+  ret float %tmp2;
+}
+
+
+define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
+  ; CHECK: test_fmul_lane_ss4S
+  ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = fmul float %a, %tmp1;
+  ret float %tmp2;
+}
+
+define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) {
+  ; CHECK: test_fmul_lane_ss4S_swap
+  ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = fmul float %tmp1, %a;
+  ret float %tmp2;
+}
+
+
+define double @test_fmul_lane_ddD(double %a, <1 x double> %v) {
+  ; CHECK: test_fmul_lane_ddD
+  ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+  %tmp1 = extractelement <1 x double> %v, i32 0
+  %tmp2 = fmul double %a, %tmp1;
+  ret double %tmp2;
+}
+
+
+
+define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
+  ; CHECK: test_fmul_lane_dd2D
+  ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = fmul double %a, %tmp1;
+  ret double %tmp2;
+}
+
+
+define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) {
+  ; CHECK: test_fmul_lane_dd2D_swap
+  ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = fmul double %tmp1, %a;
+  ret double %tmp2;
+}
+
+declare double @llvm.aarch64.neon.vmullane.f64.f64.v1f64(double, <1 x double>, i64)
+
+define double @test_vmullane_D(double %a, <1 x double> %v) {
+  ; CHECK: @test_vmullane_D
+  ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+  %tmp1 = call double @llvm.aarch64.neon.vmullane.f64.f64.v1f64(double %a, <1 x double> %v, i64 0);
+  ret double %tmp1;
+}
+
+declare double @llvm.aarch64.neon.vmullane.f64.f64.v2f64(double, <2 x double>, i64)
+
+define double @test_vmullane_2D_0(double %a, <2 x double> %v) {
+  ; CHECK: test_vmullane_2D_0
+  ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+  %tmp1 = call double @llvm.aarch64.neon.vmullane.f64.f64.v2f64(double %a, <2 x double> %v, i64 0);
+  ret double %tmp1;
+}
+
+define double @test_vmullane_2D_1(double %a, <2 x double> %v) {
+  ; CHECK: test_vmullane_2D_1
+  ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+  %tmp1 = call double @llvm.aarch64.neon.vmullane.f64.f64.v2f64(double %a, <2 x double> %v, i64 1);
+  ret double %tmp1;
+}
+
+
+declare float @llvm.aarch64.neon.vmulx.f32(float, float)
+
+define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
+  ; CHECK: test_fmulx_lane_f32
+  ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+  %tmp1 = extractelement <2 x float> %v, i32 1
+  %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1)
+  ret float %tmp2;
+}
+
+define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
+  ; CHECK: test_fmulx_laneq_f32
+  ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1)
+  ret float %tmp2;
+}
+
+define float @test_fmulx_laneq_f32_swap(float %a, <4 x float> %v) {
+  ; CHECK: test_fmulx_laneq_f32_swap
+  ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %tmp1, float %a)
+  ret float %tmp2;
+}
+
+declare double @llvm.aarch64.neon.vmulx.f64(double, double)
+
+define double @test_fmulx_lane_f64(double %a, <1 x double> %v) {
+  ; CHECK: test_fmulx_lane_f64
+  ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+  %tmp1 = extractelement <1 x double> %v, i32 0
+  %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+  ret double %tmp2;
+}
+
+define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) {
+  ; CHECK: test_fmulx_laneq_f64_0
+  ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+  %tmp1 = extractelement <2 x double> %v, i32 0
+  %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+  ret double %tmp2;
+}
+
+
+define double @test_fmulx_laneq_f64_1(double %a, <2 x double> %v) {
+  ; CHECK: test_fmulx_laneq_f64_1
+  ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+  ret double %tmp2;
+}
+
+define double @test_fmulx_laneq_f64_1_swap(double %a, <2 x double> %v) {
+  ; CHECK: test_fmulx_laneq_f64_1_swap
+  ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %tmp1, double %a)
+  ret double %tmp2;
+}
+
diff --git a/test/CodeGen/AArch64/neon-scalar-mul.ll b/test/CodeGen/AArch64/neon-scalar-mul.ll
index a58294b..4992a51 100644
--- a/test/CodeGen/AArch64/neon-scalar-mul.ll
+++ b/test/CodeGen/AArch64/neon-scalar-mul.ll
@@ -49,25 +49,19 @@ declare <1 x i32> @llvm.arm.neon.vqrdmulh.v1i32(<1 x i32>, <1 x i32>)
 define float @test_vmulxs_f32(float %a, float %b) {
 ; CHECK: test_vmulxs_f32
 ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-  %1 = insertelement <1 x float> undef, float %a, i32 0
-  %2 = insertelement <1 x float> undef, float %b, i32 0
-  %3 = call <1 x float> @llvm.aarch64.neon.vmulx.v1f32(<1 x float> %1, <1 x float> %2)
-  %4 = extractelement <1 x float> %3, i32 0
-  ret float %4
+  %1 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %b)
+  ret float %1
 }

 define double @test_vmulxd_f64(double %a, double %b) {
 ; CHECK: test_vmulxd_f64
 ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-  %1 = insertelement <1 x double> undef, double %a, i32 0
-  %2 = insertelement <1 x double> undef, double %b, i32 0
-  %3 = call <1 x double> @llvm.aarch64.neon.vmulx.v1f64(<1 x double> %1, <1 x double> %2)
-  %4 = extractelement <1 x double> %3, i32 0
-  ret double %4
+  %1 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %b)
+  ret double %1
 }

-declare <1 x float> @llvm.aarch64.neon.vmulx.v1f32(<1 x float>, <1 x float>)
-declare <1 x double> @llvm.aarch64.neon.vmulx.v1f64(<1 x double>, <1 x double>)
+declare float @llvm.aarch64.neon.vmulx.f32(float, float)
+declare double @llvm.aarch64.neon.vmulx.f64(double, double)

 define i32 @test_vqdmlalh_s16(i32 %a, i16 %b, i16 %c) {
 ; CHECK: test_vqdmlalh_s16
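
For reference, a minimal C-level sketch of how the new intrinsics could be exercised, written against the prototypes assumed in the commit message above (ACLE document version 5). The helper scale_by_lanes is illustrative only, and a shipping arm_neon.h may declare the lane variants with an additional explicit lane-index parameter:

#include <arm_neon.h>

/* Illustrative sketch only, assuming the prototypes listed in the commit
 * message. Each call is expected to select a Neon scalar multiply
 * (FMUL, plain or by element, on D registers) rather than vector code. */
float64x1_t scale_by_lanes(float64x1_t acc, float64x1_t d,
                           float64x2_t q, float64_t s) {
  float64x1_t r = vmul_lane_f64(acc, d); /* multiply by the D-register element */
  r = vmul_laneq_f64(r, q);              /* multiply by a Q-register lane      */
  return vmul_n_f64(r, s);               /* multiply by a scalar value         */
}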