[llvm-commits] [llvm] r112941 - in /llvm/trunk: include/llvm/IntrinsicsARM.td lib/Target/ARM/ARMISelLowering.cpp lib/Target/ARM/ARMInstrNEON.td lib/VMCore/AutoUpgrade.cpp test/Bitcode/neon-intrinsics.ll test/Bitcode/neon-intrinsics.ll.bc test/CodeGen/ARM/vaba.ll test/CodeGen/ARM/vabd.ll
Bob Wilson
bob.wilson at apple.com
Thu Sep 2 18:35:08 PDT 2010
Author: bwilson
Date: Thu Sep 2 20:35:08 2010
New Revision: 112941
URL: http://llvm.org/viewvc/llvm-project?rev=112941&view=rev
Log:
Replace NEON vabdl, vaba, and vabal intrinsics with combinations of the
vabd intrinsic and add and/or zext operations. In the case of vaba, this
also avoids the need for a DAG combine pattern to combine vabd with add.
Update tests. Auto-upgrade the old intrinsics.
Modified:
llvm/trunk/include/llvm/IntrinsicsARM.td
llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
llvm/trunk/lib/VMCore/AutoUpgrade.cpp
llvm/trunk/test/Bitcode/neon-intrinsics.ll
llvm/trunk/test/Bitcode/neon-intrinsics.ll.bc
llvm/trunk/test/CodeGen/ARM/vaba.ll
llvm/trunk/test/CodeGen/ARM/vabd.ll
Modified: llvm/trunk/include/llvm/IntrinsicsARM.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IntrinsicsARM.td?rev=112941&r1=112940&r2=112941&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IntrinsicsARM.td (original)
+++ llvm/trunk/include/llvm/IntrinsicsARM.td Thu Sep 2 20:35:08 2010
@@ -176,14 +176,6 @@
// Vector Absolute Differences.
def int_arm_neon_vabds : Neon_2Arg_Intrinsic;
def int_arm_neon_vabdu : Neon_2Arg_Intrinsic;
-def int_arm_neon_vabdls : Neon_2Arg_Long_Intrinsic;
-def int_arm_neon_vabdlu : Neon_2Arg_Long_Intrinsic;
-
-// Vector Absolute Difference and Accumulate.
-def int_arm_neon_vabas : Neon_3Arg_Intrinsic;
-def int_arm_neon_vabau : Neon_3Arg_Intrinsic;
-def int_arm_neon_vabals : Neon_3Arg_Long_Intrinsic;
-def int_arm_neon_vabalu : Neon_3Arg_Long_Intrinsic;
// Vector Pairwise Add.
def int_arm_neon_vpadd : Neon_2Arg_Intrinsic;
Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp?rev=112941&r1=112940&r2=112941&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp Thu Sep 2 20:35:08 2010
@@ -4293,28 +4293,11 @@
/// operands.
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
TargetLowering::DAGCombinerInfo &DCI) {
- SelectionDAG &DAG = DCI.DAG;
-
// fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) {
SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
if (Result.getNode()) return Result;
}
-
- // fold (add (arm_neon_vabd a, b) c) -> (arm_neon_vaba c, a, b)
- EVT VT = N->getValueType(0);
- if (N0.getOpcode() == ISD::INTRINSIC_WO_CHAIN && VT.isInteger()) {
- unsigned IntNo = cast<ConstantSDNode>(N0.getOperand(0))->getZExtValue();
- if (IntNo == Intrinsic::arm_neon_vabds)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), VT,
- DAG.getConstant(Intrinsic::arm_neon_vabas, MVT::i32),
- N1, N0.getOperand(1), N0.getOperand(2));
- if (IntNo == Intrinsic::arm_neon_vabdu)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), VT,
- DAG.getConstant(Intrinsic::arm_neon_vabau, MVT::i32),
- N1, N0.getOperand(1), N0.getOperand(2));
- }
-
return SDValue();
}
Modified: llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMInstrNEON.td?rev=112941&r1=112940&r2=112941&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrNEON.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMInstrNEON.td Thu Sep 2 20:35:08 2010
@@ -1288,6 +1288,24 @@
(ResTy (NEONvduplane (OpTy DPR_8:$src3),
imm:$lane)))))))]>;
+// Neon Intrinsic-Op instructions (VABA): double- and quad-register.
+class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType Ty, Intrinsic IntOp, SDNode OpNode>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
+ OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
+ [(set DPR:$dst, (Ty (OpNode DPR:$src1,
+ (Ty (IntOp (Ty DPR:$src2), (Ty DPR:$src3))))))]>;
+class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType Ty, Intrinsic IntOp, SDNode OpNode>
+ : N3V<op24, op23, op21_20, op11_8, 1, op4,
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin,
+ OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
+ [(set QPR:$dst, (Ty (OpNode QPR:$src1,
+ (Ty (IntOp (Ty QPR:$src2), (Ty QPR:$src3))))))]>;
+
// Neon 3-argument intrinsics, both double- and quad-register.
// The destination register is also used as the first source operand register.
class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
@@ -1342,6 +1360,17 @@
(TyD (NEONvduplane (TyD DPR_8:$src3),
imm:$lane))))))]>;
+// Long Intrinsic-Op vector operations with explicit extend (VABAL).
+class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp,
+ SDNode OpNode>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
+ OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
+ [(set QPR:$dst, (OpNode (TyQ QPR:$src1),
+ (TyQ (ExtOp (TyD (IntOp (TyD DPR:$src2),
+ (TyD DPR:$src3)))))))]>;
// Neon Long 3-argument intrinsic. The destination register is
// a quad-register and is also used as the first source operand register.
@@ -1433,6 +1462,19 @@
let isCommutable = Commutable;
}
+// Long 3-register intrinsics with explicit extend (VABDL).
+class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp,
+ bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
+ OpcodeStr, Dt, "$dst, $src1, $src2", "",
+ [(set QPR:$dst, (TyQ (ExtOp (TyD (IntOp (TyD DPR:$src1),
+ (TyD DPR:$src2))))))]> {
+ let isCommutable = Commutable;
+}
+
// Long 3-register intrinsics.
class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
@@ -1918,6 +1960,21 @@
v8i16, v8i8, IntOp, Commutable>;
}
+// ....with explicit extend (VABDL).
+multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ Intrinsic IntOp, SDNode ExtOp, bit Commutable = 0> {
+ def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin,
+ OpcodeStr, !strconcat(Dt, "8"),
+ v8i16, v8i8, IntOp, ExtOp, Commutable>;
+ def v4i32 : N3VLIntExt<op24, op23, 0b01, op11_8, op4, itin,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v4i32, v4i16, IntOp, ExtOp, Commutable>;
+ def v2i64 : N3VLIntExt<op24, op23, 0b10, op11_8, op4, itin,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v2i64, v2i32, IntOp, ExtOp, Commutable>;
+}
+
// Neon Wide 3-register vector intrinsics,
// source operand element sizes of 8, 16 and 32 bits:
@@ -1975,6 +2032,29 @@
mul, ShOp>;
}
+// Neon Intrinsic-Op vector operations,
+// element sizes of 8, 16 and 32 bits:
+multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD, InstrItinClass itinQ,
+ string OpcodeStr, string Dt, Intrinsic IntOp,
+ SDNode OpNode> {
+ // 64-bit vector types.
+ def v8i8 : N3VDIntOp<op24, op23, 0b00, op11_8, op4, itinD,
+ OpcodeStr, !strconcat(Dt, "8"), v8i8, IntOp, OpNode>;
+ def v4i16 : N3VDIntOp<op24, op23, 0b01, op11_8, op4, itinD,
+ OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp, OpNode>;
+ def v2i32 : N3VDIntOp<op24, op23, 0b10, op11_8, op4, itinD,
+ OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp, OpNode>;
+
+ // 128-bit vector types.
+ def v16i8 : N3VQIntOp<op24, op23, 0b00, op11_8, op4, itinQ,
+ OpcodeStr, !strconcat(Dt, "8"), v16i8, IntOp, OpNode>;
+ def v8i16 : N3VQIntOp<op24, op23, 0b01, op11_8, op4, itinQ,
+ OpcodeStr, !strconcat(Dt, "16"), v8i16, IntOp, OpNode>;
+ def v4i32 : N3VQIntOp<op24, op23, 0b10, op11_8, op4, itinQ,
+ OpcodeStr, !strconcat(Dt, "32"), v4i32, IntOp, OpNode>;
+}
+
// Neon 3-argument intrinsics,
// element sizes of 8, 16 and 32 bits:
multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
@@ -2050,6 +2130,21 @@
OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>;
}
+// ....with explicit extend (VABAL).
+multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ Intrinsic IntOp, SDNode ExtOp, SDNode OpNode> {
+ def v8i16 : N3VLIntExtOp<op24, op23, 0b00, op11_8, op4, itin,
+ OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8,
+ IntOp, ExtOp, OpNode>;
+ def v4i32 : N3VLIntExtOp<op24, op23, 0b01, op11_8, op4, itin,
+ OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16,
+ IntOp, ExtOp, OpNode>;
+ def v2i64 : N3VLIntExtOp<op24, op23, 0b10, op11_8, op4, itin,
+ OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32,
+ IntOp, ExtOp, OpNode>;
+}
+
// Neon 2-register vector intrinsics,
// element sizes of 8, 16 and 32 bits:
@@ -2765,32 +2860,32 @@
// VABD : Vector Absolute Difference
defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm,
IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
- "vabd", "s", int_arm_neon_vabds, 0>;
+ "vabd", "s", int_arm_neon_vabds, 1>;
defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm,
IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
- "vabd", "u", int_arm_neon_vabdu, 0>;
+ "vabd", "u", int_arm_neon_vabdu, 1>;
def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND,
- "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 0>;
+ "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>;
def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ,
- "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 0>;
+ "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>;
// VABDL : Vector Absolute Difference Long (Q = | D - D |)
-defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q,
- "vabdl", "s", int_arm_neon_vabdls, 0>;
-defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q,
- "vabdl", "u", int_arm_neon_vabdlu, 0>;
+defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q,
+ "vabdl", "s", int_arm_neon_vabds, zext, 1>;
+defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q,
+ "vabdl", "u", int_arm_neon_vabdu, zext, 1>;
// VABA : Vector Absolute Difference and Accumulate
-defm VABAs : N3VInt3_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
- "vaba", "s", int_arm_neon_vabas>;
-defm VABAu : N3VInt3_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
- "vaba", "u", int_arm_neon_vabau>;
+defm VABAs : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
+ "vaba", "s", int_arm_neon_vabds, add>;
+defm VABAu : N3VIntOp_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
+ "vaba", "u", int_arm_neon_vabdu, add>;
// VABAL : Vector Absolute Difference and Accumulate Long (Q += | D - D |)
-defm VABALs : N3VLInt3_QHS<0,1,0b0101,0, IIC_VABAD, IIC_VABAD,
- "vabal", "s", int_arm_neon_vabals>;
-defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, IIC_VABAD, IIC_VABAD,
- "vabal", "u", int_arm_neon_vabalu>;
+defm VABALs : N3VLIntExtOp_QHS<0,1,0b0101,0, IIC_VABAD,
+ "vabal", "s", int_arm_neon_vabds, zext, add>;
+defm VABALu : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD,
+ "vabal", "u", int_arm_neon_vabdu, zext, add>;
// Vector Maximum and Minimum.
Modified: llvm/trunk/lib/VMCore/AutoUpgrade.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/VMCore/AutoUpgrade.cpp?rev=112941&r1=112940&r2=112941&view=diff
==============================================================================
--- llvm/trunk/lib/VMCore/AutoUpgrade.cpp (original)
+++ llvm/trunk/lib/VMCore/AutoUpgrade.cpp Thu Sep 2 20:35:08 2010
@@ -81,21 +81,21 @@
} else if (Name.compare(5, 9, "arm.neon.", 9) == 0) {
if (((Name.compare(14, 5, "vmovl", 5) == 0 ||
Name.compare(14, 5, "vaddl", 5) == 0 ||
- Name.compare(14, 5, "vsubl", 5) == 0) &&
- (Name.compare(19, 2, "s.", 2) == 0 ||
- Name.compare(19, 2, "u.", 2) == 0)) ||
-
- ((Name.compare(14, 5, "vaddw", 5) == 0 ||
- Name.compare(14, 5, "vsubw", 5) == 0) &&
- (Name.compare(19, 2, "s.", 2) == 0 ||
- Name.compare(19, 2, "u.", 2) == 0)) ||
-
- ((Name.compare(14, 5, "vmull", 5) == 0 ||
+ Name.compare(14, 5, "vsubl", 5) == 0 ||
+ Name.compare(14, 5, "vaddw", 5) == 0 ||
+ Name.compare(14, 5, "vsubw", 5) == 0 ||
+ Name.compare(14, 5, "vmull", 5) == 0 ||
Name.compare(14, 5, "vmlal", 5) == 0 ||
- Name.compare(14, 5, "vmlsl", 5) == 0) &&
+ Name.compare(14, 5, "vmlsl", 5) == 0 ||
+ Name.compare(14, 5, "vabdl", 5) == 0 ||
+ Name.compare(14, 5, "vabal", 5) == 0) &&
(Name.compare(19, 2, "s.", 2) == 0 ||
Name.compare(19, 2, "u.", 2) == 0)) ||
+ (Name.compare(14, 4, "vaba", 4) == 0 &&
+ (Name.compare(18, 2, "s.", 2) == 0 ||
+ Name.compare(18, 2, "u.", 2) == 0)) ||
+
(Name.compare(14, 6, "vmovn.", 6) == 0)) {
// Calls to these are transformed into IR without intrinsics.
@@ -391,6 +391,35 @@
}
}
+/// CallVABD - As part of expanding a call to one of the old NEON vabdl, vaba,
+/// or vabal intrinsics, construct a call to a vabd intrinsic. Examine the
+/// name of the old intrinsic to determine whether to use a signed or unsigned
+/// vabd intrinsic. Get the type from the old call instruction, adjusted for
+/// half-size vector elements if the old intrinsic was vabdl or vabal.
+static Instruction *CallVABD(CallInst *CI, Value *Arg0, Value *Arg1) {
+ Function *F = CI->getCalledFunction();
+ const std::string& Name = F->getName();
+ bool isLong = (Name.at(18) == 'l');
+ bool isSigned = (Name.at(isLong ? 19 : 18) == 's');
+
+ Intrinsic::ID intID;
+ if (isSigned)
+ intID = Intrinsic::arm_neon_vabds;
+ else
+ intID = Intrinsic::arm_neon_vabdu;
+
+ const Type *Ty = CI->getType();
+ if (isLong)
+ Ty = VectorType::getTruncatedElementVectorType(cast<const VectorType>(Ty));
+
+ Function *VABD = Intrinsic::getDeclaration(F->getParent(), intID, &Ty, 1);
+ Value *Operands[2];
+ Operands[0] = Arg0;
+ Operands[1] = Arg1;
+ return CallInst::Create(VABD, Operands, Operands+2,
+ "upgraded."+CI->getName(), CI);
+}
+
// UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the
// upgraded intrinsic. All argument and return casting must be provided in
// order to seamlessly integrate with existing context.
@@ -434,6 +463,15 @@
Instruction *MulI = BinaryOperator::CreateMul(V0, V1, "", CI);
NewI = BinaryOperator::CreateSub(CI->getArgOperand(0), MulI,
"upgraded."+CI->getName(), CI);
+ } else if (Name.compare(14, 4, "vabd", 4) == 0) {
+ NewI = CallVABD(CI, CI->getArgOperand(0), CI->getArgOperand(1));
+ NewI = new ZExtInst(NewI, CI->getType(), "upgraded."+CI->getName(), CI);
+ } else if (Name.compare(14, 4, "vaba", 4) == 0) {
+ NewI = CallVABD(CI, CI->getArgOperand(1), CI->getArgOperand(2));
+ if (Name.at(18) == 'l')
+ NewI = new ZExtInst(NewI, CI->getType(), "", CI);
+ NewI = BinaryOperator::CreateAdd(CI->getArgOperand(0), NewI,
+ "upgraded."+CI->getName(), CI);
} else if (Name.compare(14, 6, "vmovn.", 6) == 0) {
NewI = new TruncInst(CI->getArgOperand(0), CI->getType(),
"upgraded." + CI->getName(), CI);
@@ -675,7 +713,7 @@
}
switch (NewFn->getIntrinsicID()) {
- default: llvm_unreachable("Unknown function for CallInst upgrade.");
+ default: llvm_unreachable("Unknown function for CallInst upgrade.");
case Intrinsic::arm_neon_vld1:
case Intrinsic::arm_neon_vld2:
case Intrinsic::arm_neon_vld3:
Modified: llvm/trunk/test/Bitcode/neon-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Bitcode/neon-intrinsics.ll?rev=112941&r1=112940&r2=112941&view=diff
==============================================================================
--- llvm/trunk/test/Bitcode/neon-intrinsics.ll (original)
+++ llvm/trunk/test/Bitcode/neon-intrinsics.ll Thu Sep 2 20:35:08 2010
@@ -126,6 +126,44 @@
; CHECK-NEXT: mul <2 x i64>
; CHECK-NEXT: sub <2 x i64>
+; vaba should be auto-upgraded to vabd + add
+
+; CHECK: vabas32
+; CHECK-NOT: arm.neon.vabas.v2i32
+; CHECK: arm.neon.vabds.v2i32
+; CHECK-NEXT: add <2 x i32>
+
+; CHECK: vabaQu8
+; CHECK-NOT: arm.neon.vabau.v16i8
+; CHECK: arm.neon.vabdu.v16i8
+; CHECK-NEXT: add <16 x i8>
+
+; vabal should be auto-upgraded to vabd with zext + add
+
+; CHECK: vabals16
+; CHECK-NOT: arm.neon.vabals.v4i32
+; CHECK: arm.neon.vabds.v4i16
+; CHECK-NEXT: zext <4 x i16>
+; CHECK-NEXT: add <4 x i32>
+
+; CHECK: vabalu32
+; CHECK-NOT: arm.neon.vabalu.v2i64
+; CHECK: arm.neon.vabdu.v2i32
+; CHECK-NEXT: zext <2 x i32>
+; CHECK-NEXT: add <2 x i64>
+
+; vabdl should be auto-upgraded to vabd with zext
+
+; CHECK: vabdls8
+; CHECK-NOT: arm.neon.vabdls.v8i16
+; CHECK: arm.neon.vabds.v8i8
+; CHECK-NEXT: zext <8 x i8>
+
+; CHECK: vabdlu16
+; CHECK-NOT: arm.neon.vabdlu.v4i32
+; CHECK: arm.neon.vabdu.v4i16
+; CHECK-NEXT: zext <4 x i16>
+
; vmovn should be auto-upgraded to trunc
; CHECK: vmovni16
Modified: llvm/trunk/test/Bitcode/neon-intrinsics.ll.bc
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Bitcode/neon-intrinsics.ll.bc?rev=112941&r1=112940&r2=112941&view=diff
==============================================================================
Binary files - no diff available.
Modified: llvm/trunk/test/CodeGen/ARM/vaba.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vaba.ll?rev=112941&r1=112940&r2=112941&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/vaba.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/vaba.ll Thu Sep 2 20:35:08 2010
@@ -6,8 +6,9 @@
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
- %tmp4 = call <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
- ret <8 x i8> %tmp4
+ %tmp4 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp2, <8 x i8> %tmp3)
+ %tmp5 = add <8 x i8> %tmp1, %tmp4
+ ret <8 x i8> %tmp5
}
define <4 x i16> @vabas16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
@@ -16,8 +17,9 @@
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
- %tmp4 = call <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
- ret <4 x i16> %tmp4
+ %tmp4 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp2, <4 x i16> %tmp3)
+ %tmp5 = add <4 x i16> %tmp1, %tmp4
+ ret <4 x i16> %tmp5
}
define <2 x i32> @vabas32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
@@ -26,8 +28,9 @@
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
- %tmp4 = call <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
- ret <2 x i32> %tmp4
+ %tmp4 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp2, <2 x i32> %tmp3)
+ %tmp5 = add <2 x i32> %tmp1, %tmp4
+ ret <2 x i32> %tmp5
}
define <8 x i8> @vabau8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
@@ -36,8 +39,9 @@
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
- %tmp4 = call <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
- ret <8 x i8> %tmp4
+ %tmp4 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp2, <8 x i8> %tmp3)
+ %tmp5 = add <8 x i8> %tmp1, %tmp4
+ ret <8 x i8> %tmp5
}
define <4 x i16> @vabau16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
@@ -46,8 +50,9 @@
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
- %tmp4 = call <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
- ret <4 x i16> %tmp4
+ %tmp4 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp2, <4 x i16> %tmp3)
+ %tmp5 = add <4 x i16> %tmp1, %tmp4
+ ret <4 x i16> %tmp5
}
define <2 x i32> @vabau32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
@@ -56,8 +61,9 @@
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
- %tmp4 = call <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
- ret <2 x i32> %tmp4
+ %tmp4 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp2, <2 x i32> %tmp3)
+ %tmp5 = add <2 x i32> %tmp1, %tmp4
+ ret <2 x i32> %tmp5
}
define <16 x i8> @vabaQs8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
@@ -66,8 +72,9 @@
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
- %tmp4 = call <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3)
- ret <16 x i8> %tmp4
+ %tmp4 = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %tmp2, <16 x i8> %tmp3)
+ %tmp5 = add <16 x i8> %tmp1, %tmp4
+ ret <16 x i8> %tmp5
}
define <8 x i16> @vabaQs16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
@@ -76,8 +83,9 @@
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
- %tmp4 = call <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3)
- ret <8 x i16> %tmp4
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp3)
+ %tmp5 = add <8 x i16> %tmp1, %tmp4
+ ret <8 x i16> %tmp5
}
define <4 x i32> @vabaQs32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
@@ -86,8 +94,9 @@
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
- %tmp4 = call <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3)
- ret <4 x i32> %tmp4
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %tmp2, <4 x i32> %tmp3)
+ %tmp5 = add <4 x i32> %tmp1, %tmp4
+ ret <4 x i32> %tmp5
}
define <16 x i8> @vabaQu8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
@@ -96,8 +105,9 @@
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
- %tmp4 = call <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3)
- ret <16 x i8> %tmp4
+ %tmp4 = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %tmp2, <16 x i8> %tmp3)
+ %tmp5 = add <16 x i8> %tmp1, %tmp4
+ ret <16 x i8> %tmp5
}
define <8 x i16> @vabaQu16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
@@ -106,8 +116,9 @@
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
- %tmp4 = call <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3)
- ret <8 x i16> %tmp4
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp3)
+ %tmp5 = add <8 x i16> %tmp1, %tmp4
+ ret <8 x i16> %tmp5
}
define <4 x i32> @vabaQu32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
@@ -116,25 +127,26 @@
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
- %tmp4 = call <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3)
- ret <4 x i32> %tmp4
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %tmp2, <4 x i32> %tmp3)
+ %tmp5 = add <4 x i32> %tmp1, %tmp4
+ ret <4 x i32> %tmp5
}
-declare <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
-
-declare <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @vabals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: vabals8:
@@ -142,8 +154,10 @@
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
- %tmp4 = call <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
- ret <8 x i16> %tmp4
+ %tmp4 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp2, <8 x i8> %tmp3)
+ %tmp5 = zext <8 x i8> %tmp4 to <8 x i16>
+ %tmp6 = add <8 x i16> %tmp1, %tmp5
+ ret <8 x i16> %tmp6
}
define <4 x i32> @vabals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
@@ -152,8 +166,10 @@
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
- %tmp4 = call <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
- ret <4 x i32> %tmp4
+ %tmp4 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp2, <4 x i16> %tmp3)
+ %tmp5 = zext <4 x i16> %tmp4 to <4 x i32>
+ %tmp6 = add <4 x i32> %tmp1, %tmp5
+ ret <4 x i32> %tmp6
}
define <2 x i64> @vabals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
@@ -162,8 +178,10 @@
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
- %tmp4 = call <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
- ret <2 x i64> %tmp4
+ %tmp4 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp2, <2 x i32> %tmp3)
+ %tmp5 = zext <2 x i32> %tmp4 to <2 x i64>
+ %tmp6 = add <2 x i64> %tmp1, %tmp5
+ ret <2 x i64> %tmp6
}
define <8 x i16> @vabalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
@@ -172,8 +190,10 @@
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
- %tmp4 = call <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
- ret <8 x i16> %tmp4
+ %tmp4 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp2, <8 x i8> %tmp3)
+ %tmp5 = zext <8 x i8> %tmp4 to <8 x i16>
+ %tmp6 = add <8 x i16> %tmp1, %tmp5
+ ret <8 x i16> %tmp6
}
define <4 x i32> @vabalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
@@ -182,8 +202,10 @@
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
- %tmp4 = call <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
- ret <4 x i32> %tmp4
+ %tmp4 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp2, <4 x i16> %tmp3)
+ %tmp5 = zext <4 x i16> %tmp4 to <4 x i32>
+ %tmp6 = add <4 x i32> %tmp1, %tmp5
+ ret <4 x i32> %tmp6
}
define <2 x i64> @vabalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
@@ -192,38 +214,8 @@
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
- %tmp4 = call <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
- ret <2 x i64> %tmp4
-}
-
-declare <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
-
-declare <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
-
-define <8 x i8> @vabd_combine_s8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK: vabd_combine_s8:
-;CHECK: vaba.s8
- %tmp1 = load <8 x i8>* %A
- %tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
- %tmp4 = add <8 x i8> %tmp2, %tmp3
- ret <8 x i8> %tmp4
+ %tmp4 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp2, <2 x i32> %tmp3)
+ %tmp5 = zext <2 x i32> %tmp4 to <2 x i64>
+ %tmp6 = add <2 x i64> %tmp1, %tmp5
+ ret <2 x i64> %tmp6
}
-
-define <4 x i16> @vabd_combine_u16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK: vabd_combine_u16:
-;CHECK: vaba.u16
- %tmp1 = load <4 x i16>* %A
- %tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
- %tmp4 = add <4 x i16> %tmp3, %tmp1
- ret <4 x i16> %tmp4
-}
-
-declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-
Modified: llvm/trunk/test/CodeGen/ARM/vabd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vabd.ll?rev=112941&r1=112940&r2=112941&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/vabd.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/vabd.ll Thu Sep 2 20:35:08 2010
@@ -151,8 +151,9 @@
;CHECK: vabdl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
- ret <8 x i16> %tmp3
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
}
define <4 x i32> @vabdls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
@@ -160,8 +161,9 @@
;CHECK: vabdl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
- ret <4 x i32> %tmp3
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
}
define <2 x i64> @vabdls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
@@ -169,8 +171,9 @@
;CHECK: vabdl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
- ret <2 x i64> %tmp3
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
}
define <8 x i16> @vabdlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@@ -178,8 +181,9 @@
;CHECK: vabdl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
- ret <8 x i16> %tmp3
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
}
define <4 x i32> @vabdlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
@@ -187,8 +191,9 @@
;CHECK: vabdl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
- ret <4 x i32> %tmp3
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
}
define <2 x i64> @vabdlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
@@ -196,14 +201,7 @@
;CHECK: vabdl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
- ret <2 x i64> %tmp3
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
}
-
-declare <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
-
-declare <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
More information about the llvm-commits
mailing list