On 1 September 2010 16:50, Bob Wilson <bob.wilson@apple.com> wrote:

Author: bwilson
Date: Wed Sep 1 18:50:19 2010
New Revision: 112773

URL: http://llvm.org/viewvc/llvm-project?rev=112773&view=rev
Log:
Remove NEON vmull, vmlal, and vmlsl intrinsics, replacing them with multiply,
add, and subtract operations with zero-extended or sign-extended vectors.
Update tests. Add auto-upgrade support for the old intrinsics.

Fantastic! Thanks Bob!

Nick

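For readers skimming the patch below, the net effect at the IR level looks roughly like this (a hand-written sketch in the style of the updated tests, not code taken from the commit): a widening multiply that used to be an opaque intrinsic call is now expressed with ordinary sext/zext and mul instructions, and instruction selection folds the pattern back into a single vmull.

; Before the patch, bitcode carried an opaque intrinsic call:
;   %r = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b)
; After it, the same operation is plain IR:
define <8 x i16> @vmulls8_sketch(<8 x i8> %a, <8 x i8> %b) nounwind {
  %sa = sext <8 x i8> %a to <8 x i16>
  %sb = sext <8 x i8> %b to <8 x i16>
  %m = mul <8 x i16> %sa, %sb          ; selected as one vmull.s8
  ret <8 x i16> %m
}

Note that the polynomial vmullp and the saturating vqdmull/vqdmlal/vqdmlsl operations keep their intrinsics, since they have no equivalent in plain IR.
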
Modified:
    llvm/trunk/include/llvm/IntrinsicsARM.td
    llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
    llvm/trunk/lib/Target/ARM/ARMISelLowering.h
    llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
    llvm/trunk/lib/VMCore/AutoUpgrade.cpp
    llvm/trunk/test/Bitcode/neon-intrinsics.ll
    llvm/trunk/test/Bitcode/neon-intrinsics.ll.bc
    llvm/trunk/test/CodeGen/ARM/vmla.ll
    llvm/trunk/test/CodeGen/ARM/vmls.ll
    llvm/trunk/test/CodeGen/ARM/vmul.ll

Modified: llvm/trunk/include/llvm/IntrinsicsARM.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IntrinsicsARM.td?rev=112773&r1=112772&r2=112773&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IntrinsicsARM.td (original)
+++ llvm/trunk/include/llvm/IntrinsicsARM.td Wed Sep 1 18:50:19 2010
@@ -129,16 +129,8 @@
def int_arm_neon_vmulp : Neon_2Arg_Intrinsic;
def int_arm_neon_vqdmulh : Neon_2Arg_Intrinsic;
def int_arm_neon_vqrdmulh : Neon_2Arg_Intrinsic;
- def int_arm_neon_vmulls : Neon_2Arg_Long_Intrinsic;
- def int_arm_neon_vmullu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vmullp : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vqdmull : Neon_2Arg_Long_Intrinsic;
-
- // Vector Multiply and Accumulate/Subtract.
- def int_arm_neon_vmlals : Neon_3Arg_Long_Intrinsic;
- def int_arm_neon_vmlalu : Neon_3Arg_Long_Intrinsic;
- def int_arm_neon_vmlsls : Neon_3Arg_Long_Intrinsic;
- def int_arm_neon_vmlslu : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vqdmlal : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic;

@@ -302,7 +294,7 @@
def int_arm_neon_vcvtfxs2fp : Neon_CvtFxToFP_Intrinsic;
def int_arm_neon_vcvtfxu2fp : Neon_CvtFxToFP_Intrinsic;

-// Narrowing and Lengthening Vector Moves.
+// Narrowing Saturating Vector Moves.
def int_arm_neon_vqmovns : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovnu : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovnsu : Neon_1Arg_Narrow_Intrinsic;

Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp?rev=112773&r1=112772&r2=112773&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp Wed Sep 1 18:50:19 2010
@@ -326,7 +326,10 @@

// Neon does not support some operations on v1i64 and v2i64 types.
setOperationAction(ISD::MUL, MVT::v1i64, Expand);
- setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+ // Custom handling for some quad-vector types to detect VMULL.
+ setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);

@@ -684,6 +687,8 @@
case ARMISD::VZIP: return "ARMISD::VZIP";
case ARMISD::VUZP: return "ARMISD::VUZP";
case ARMISD::VTRN: return "ARMISD::VTRN";
+ case ARMISD::VMULLs: return "ARMISD::VMULLs";
+ case ARMISD::VMULLu: return "ARMISD::VMULLu";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::FMAX: return "ARMISD::FMAX";
case ARMISD::FMIN: return "ARMISD::FMIN";
@@ -3751,6 +3756,51 @@
return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Val);
}

+/// SkipExtension - For a node that is either a SIGN_EXTEND, ZERO_EXTEND, or
+/// an extending load, return the unextended value.
+static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
+ if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+ return N->getOperand(0);
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(),
+ LD->getBasePtr(), LD->getSrcValue(),
+ LD->getSrcValueOffset(), LD->isVolatile(),
+ LD->isNonTemporal(), LD->getAlignment());
+}
+
+static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
+ // Multiplications are only custom-lowered for 128-bit vectors so that
+ // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
+ EVT VT = Op.getValueType();
+ assert(VT.is128BitVector() && "unexpected type for custom-lowering ISD::MUL");
+ SDNode *N0 = Op.getOperand(0).getNode();
+ SDNode *N1 = Op.getOperand(1).getNode();
+ unsigned NewOpc = 0;
+ if ((N0->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N0)) &&
+ (N1->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N1))) {
+ NewOpc = ARMISD::VMULLs;
+ } else if ((N0->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N0)) &&
+ (N1->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N1))) {
+ NewOpc = ARMISD::VMULLu;
+ } else if (VT.getSimpleVT().SimpleTy == MVT::v2i64) {
+ // Fall through to expand this. It is not legal.
+ return SDValue();
+ } else {
+ // Other vector multiplications are legal.
+ return Op;
+ }
+
+ // Legalize to a VMULL instruction.
+ DebugLoc DL = Op.getDebugLoc();
+ SDValue Op0 = SkipExtension(N0, DAG);
+ SDValue Op1 = SkipExtension(N1, DAG);
+
+ assert(Op0.getValueType().is64BitVector() &&
+ Op1.getValueType().is64BitVector() &&
+ "unexpected types for extended operands to VMULL");
+ return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+}
+
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Don't know how to custom lower this!");
@@ -3792,6 +3842,7 @@
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
+ case ISD::MUL: return LowerMUL(Op, DAG);
}
return SDValue();
}

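As the comments in LowerMUL above say, 128-bit integer multiplies are custom-lowered only so the VMULL pattern can be spotted: a wide mul whose operands are both sign-extended (or both zero-extended, or the matching extending loads) from 64-bit vectors. A minimal sketch of the pattern, mirroring the vmul.ll tests below (not part of the patch):

define <2 x i64> @mul_becomes_vmull(<2 x i32> %a, <2 x i32> %b) nounwind {
  %wa = zext <2 x i32> %a to <2 x i64>  ; both operands zero-extended from v2i32,
  %wb = zext <2 x i32> %b to <2 x i64>  ; so LowerMUL emits ARMISD::VMULLu and
  %m = mul <2 x i64> %wa, %wb           ; the whole thing becomes one vmull.u32
  ret <2 x i64> %m
}

A v2i64 mul without such operands still returns SDValue() and gets expanded, since NEON has no full 64x64-bit vector multiply.
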
Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.h?rev=112773&r1=112772&r2=112773&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.h (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.h Wed Sep 1 18:50:19 2010
@@ -145,6 +145,10 @@
VUZP, // unzip (deinterleave)
VTRN, // transpose

+ // Vector multiply long:
+ VMULLs, // ...signed
+ VMULLu, // ...unsigned
+
// Operands of the standard BUILD_VECTOR node are not legalized, which
// is fine if BUILD_VECTORs are always lowered to shuffles or other
// operations, but for ARM some BUILD_VECTORs are legal as-is and their

Modified: llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMInstrNEON.td?rev=112773&r1=112772&r2=112773&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrNEON.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMInstrNEON.td Wed Sep 1 18:50:19 2010
@@ -93,6 +93,11 @@
def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;

+def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisSameAs<1, 2>]>;
+def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
+def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
+
def SDTARMFMAX : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>]>;
def NEONfmax : SDNode<"ARMISD::FMAX", SDTARMFMAX>;
@@ -1255,6 +1260,42 @@
[(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1),
(OpTy QPR:$src2), (OpTy QPR:$src3))))]>;

+// Long Multiply-Add/Sub operations.
+class N3VLMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
+ OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
+ [(set QPR:$dst, (OpNode (TyQ QPR:$src1),
+ (TyQ (MulOp (TyD DPR:$src2),
+ (TyD DPR:$src3)))))]>;
+class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+ : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst),
+ (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane),
+ NVMulSLFrm, itin,
+ OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
+ [(set QPR:$dst,
+ (OpNode (TyQ QPR:$src1),
+ (TyQ (MulOp (TyD DPR:$src2),
+ (TyD (NEONvduplane (TyD DPR_VFP2:$src3),
+ imm:$lane))))))]>;
+class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+ : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst),
+ (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane),
+ NVMulSLFrm, itin,
+ OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
+ [(set QPR:$dst,
+ (OpNode (TyQ QPR:$src1),
+ (TyQ (MulOp (TyD DPR:$src2),
+ (TyD (NEONvduplane (TyD DPR_8:$src3),
+ imm:$lane))))))]>;
+
+
// Neon Long 3-argument intrinsic. The destination register is
// a quad-register and is also used as the first source operand register.
class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
@@ -1306,8 +1347,37 @@
// Long 3-register operations.
class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp,
- bit Commutable>
+ ValueType TyQ, ValueType TyD, SDNode OpNode, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
+ OpcodeStr, Dt, "$dst, $src1, $src2", "",
+ [(set QPR:$dst, (TyQ (OpNode (TyD DPR:$src1), (TyD DPR:$src2))))]> {
+ let isCommutable = Commutable;
+}
+class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDNode OpNode>
+ : N3V<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+ NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
+ [(set QPR:$dst,
+ (TyQ (OpNode (TyD DPR:$src1),
+ (TyD (NEONvduplane (TyD DPR_VFP2:$src2),imm:$lane)))))]>;
+class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDNode OpNode>
+ : N3V<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+ NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
+ [(set QPR:$dst,
+ (TyQ (OpNode (TyD DPR:$src1),
+ (TyD (NEONvduplane (TyD DPR_8:$src2), imm:$lane)))))]>;
+
+// Long 3-register operations with explicitly extended operands.
+class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp,
+ bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
OpcodeStr, Dt, "$dst, $src1, $src2", "",
@@ -1729,16 +1799,40 @@
multiclass N3VL_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itin16, InstrItinClass itin32,
string OpcodeStr, string Dt,
- SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
+ SDNode OpNode, bit Commutable = 0> {
+ def v8i16 : N3VL<op24, op23, 0b00, op11_8, op4, itin16,
+ OpcodeStr, !strconcat(Dt, "8"),
+ v8i16, v8i8, OpNode, Commutable>;
def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16,
OpcodeStr, !strconcat(Dt, "16"),
- v4i32, v4i16, OpNode, ExtOp, Commutable>;
+ v4i32, v4i16, OpNode, Commutable>;
def v2i64 : N3VL<op24, op23, 0b10, op11_8, op4, itin32,
OpcodeStr, !strconcat(Dt, "32"),
- v2i64, v2i32, OpNode, ExtOp, Commutable>;
- def v8i16 : N3VL<op24, op23, 0b00, op11_8, op4, itin16,
- OpcodeStr, !strconcat(Dt, "8"),
- v8i16, v8i8, OpNode, ExtOp, Commutable>;
+ v2i64, v2i32, OpNode, Commutable>;
+}
+
+multiclass N3VLSL_HS<bit op24, bits<4> op11_8,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ SDNode OpNode> {
+ def v4i16 : N3VLSL16<op24, 0b01, op11_8, itin, OpcodeStr,
+ !strconcat(Dt, "16"), v4i32, v4i16, OpNode>;
+ def v2i32 : N3VLSL<op24, 0b10, op11_8, itin, OpcodeStr,
+ !strconcat(Dt, "32"), v2i64, v2i32, OpNode>;
+}
+
+multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itin16, InstrItinClass itin32,
+ string OpcodeStr, string Dt,
+ SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
+ def v8i16 : N3VLExt<op24, op23, 0b00, op11_8, op4, itin16,
+ OpcodeStr, !strconcat(Dt, "8"),
+ v8i16, v8i8, OpNode, ExtOp, Commutable>;
+ def v4i32 : N3VLExt<op24, op23, 0b01, op11_8, op4, itin16,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v4i32, v4i16, OpNode, ExtOp, Commutable>;
+ def v2i64 : N3VLExt<op24, op23, 0b10, op11_8, op4, itin32,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v2i64, v2i32, OpNode, ExtOp, Commutable>;
}

// Neon Long 3-register vector intrinsics.
@@ -1857,6 +1951,29 @@
}


+// Neon Long Multiply-Op vector operations,
+// element sizes of 8, 16 and 32 bits:
+multiclass N3VLMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itin16, InstrItinClass itin32,
+ string OpcodeStr, string Dt, SDNode MulOp,
+ SDNode OpNode> {
+ def v8i16 : N3VLMulOp<op24, op23, 0b00, op11_8, op4, itin16, OpcodeStr,
+ !strconcat(Dt, "8"), v8i16, v8i8, MulOp, OpNode>;
+ def v4i32 : N3VLMulOp<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr,
+ !strconcat(Dt, "16"), v4i32, v4i16, MulOp, OpNode>;
+ def v2i64 : N3VLMulOp<op24, op23, 0b10, op11_8, op4, itin32, OpcodeStr,
+ !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>;
+}
+
+multiclass N3VLMulOpSL_HS<bit op24, bits<4> op11_8, string OpcodeStr,
+ string Dt, SDNode MulOp, SDNode OpNode> {
+ def v4i16 : N3VLMulOpSL16<op24, 0b01, op11_8, IIC_VMACi16D, OpcodeStr,
+ !strconcat(Dt,"16"), v4i32, v4i16, MulOp, OpNode>;
+ def v2i32 : N3VLMulOpSL<op24, 0b10, op11_8, IIC_VMACi32D, OpcodeStr,
+ !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>;
+}
+
+
// Neon Long 3-argument intrinsics.

// First with only element sizes of 16 and 32 bits:
@@ -2130,10 +2247,10 @@
def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32",
v4f32, v4f32, fadd, 1>;
// VADDL : Vector Add Long (Q = D + D)
-defm VADDLs : N3VL_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
- "vaddl", "s", add, sext, 1>;
-defm VADDLu : N3VL_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
- "vaddl", "u", add, zext, 1>;
+defm VADDLs : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
+ "vaddl", "s", add, sext, 1>;
+defm VADDLu : N3VLExt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
+ "vaddl", "u", add, zext, 1>;
// VADDW : Vector Add Wide (Q = Q + D)
defm VADDWs : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>;
defm VADDWu : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zext, 0>;
@@ -2247,16 +2364,14 @@
(SubReg_i32_lane imm:$lane)))>;

// VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D)
-defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
- "vmull", "s", int_arm_neon_vmulls, 1>;
-defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
- "vmull", "u", int_arm_neon_vmullu, 1>;
+defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+ "vmull", "s", NEONvmulls, 1>;
+defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+ "vmull", "u", NEONvmullu, 1>;
def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
v8i16, v8i8, int_arm_neon_vmullp, 1>;
-defm VMULLsls : N3VLIntSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s",
- int_arm_neon_vmulls>;
-defm VMULLslu : N3VLIntSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u",
- int_arm_neon_vmullu>;
+defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
+defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;

// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D)
defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D,
@@ -2306,13 +2421,13 @@
(SubReg_i32_lane imm:$lane)))>;

// VMLAL : Vector Multiply Accumulate Long (Q += D * D)
-defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlal", "s", int_arm_neon_vmlals>;
-defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlal", "u", int_arm_neon_vmlalu>;
+defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
+ "vmlal", "s", NEONvmulls, add>;
+defm VMLALu : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
+ "vmlal", "u", NEONvmullu, add>;

-defm VMLALsls : N3VLInt3SL_HS<0, 0b0010, "vmlal", "s", int_arm_neon_vmlals>;
-defm VMLALslu : N3VLInt3SL_HS<1, 0b0010, "vmlal", "u", int_arm_neon_vmlalu>;
+defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>;
+defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;

// VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
@@ -2358,13 +2473,13 @@
(SubReg_i32_lane imm:$lane)))>;

// VMLSL : Vector Multiply Subtract Long (Q -= D * D)
-defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlsl", "s", int_arm_neon_vmlsls>;
-defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlsl", "u", int_arm_neon_vmlslu>;
+defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
+ "vmlsl", "s", NEONvmulls, sub>;
+defm VMLSLu : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
+ "vmlsl", "u", NEONvmullu, sub>;

-defm VMLSLsls : N3VLInt3SL_HS<0, 0b0110, "vmlsl", "s", int_arm_neon_vmlsls>;
-defm VMLSLslu : N3VLInt3SL_HS<1, 0b0110, "vmlsl", "u", int_arm_neon_vmlslu>;
+defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>;
+defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;

// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
@@ -2381,10 +2496,10 @@
def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32",
v4f32, v4f32, fsub, 0>;
// VSUBL : Vector Subtract Long (Q = D - D)
-defm VSUBLs : N3VL_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
- "vsubl", "s", sub, sext, 0>;
-defm VSUBLu : N3VL_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
- "vsubl", "u", sub, zext, 0>;
+defm VSUBLs : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
+ "vsubl", "s", sub, sext, 0>;
+defm VSUBLu : N3VLExt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
+ "vsubl", "u", sub, zext, 0>;
// VSUBW : Vector Subtract Wide (Q = Q - D)
defm VSUBWs : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>;
defm VSUBWu : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zext, 0>;

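The "SL" (lane) variants above pattern-match a NEONvduplane feeding the widened multiply; at the IR level that is a splat shufflevector, as in this sketch distilled from the test_vmlal_lane tests further down (hand-written, not part of the patch):

define <4 x i32> @vmlal_lane_sketch(<4 x i32> %acc, <4 x i16> %x, <4 x i16> %y) nounwind {
  %splat = shufflevector <4 x i16> %y, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %wx = sext <4 x i16> %x to <4 x i32>
  %ws = sext <4 x i16> %splat to <4 x i32>
  %m = mul <4 x i32> %wx, %ws
  %r = add <4 x i32> %acc, %m           ; folds to: vmlal.s16 q0, d2, d3[1]
  ret <4 x i32> %r
}
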
Modified: llvm/trunk/lib/VMCore/AutoUpgrade.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/VMCore/AutoUpgrade.cpp?rev=112773&r1=112772&r2=112773&view=diff
==============================================================================
--- llvm/trunk/lib/VMCore/AutoUpgrade.cpp (original)
+++ llvm/trunk/lib/VMCore/AutoUpgrade.cpp Wed Sep 1 18:50:19 2010
@@ -90,6 +90,12 @@
(Name.compare(19, 2, "s.", 2) == 0 ||
Name.compare(19, 2, "u.", 2) == 0)) ||

+ ((Name.compare(14, 5, "vmull", 5) == 0 ||
+ Name.compare(14, 5, "vmlal", 5) == 0 ||
+ Name.compare(14, 5, "vmlsl", 5) == 0) &&
+ (Name.compare(19, 2, "s.", 2) == 0 ||
+ Name.compare(19, 2, "u.", 2) == 0)) ||
+
(Name.compare(14, 6, "vmovn.", 6) == 0)) {

// Calls to these are transformed into IR without intrinsics.
@@ -359,6 +365,32 @@
return Upgraded;
}

+/// ExtendNEONArgs - For NEON "long" and "wide" operations, where the results
+/// have vector elements twice as big as one or both source operands, do the
+/// sign- or zero-extension that used to be handled by intrinsics. The
+/// extended values are returned via V0 and V1.
+static void ExtendNEONArgs(CallInst *CI, Value *Arg0, Value *Arg1,
+ Value *&V0, Value *&V1) {
+ Function *F = CI->getCalledFunction();
+ const std::string& Name = F->getName();
+ bool isLong = (Name.at(18) == 'l');
+ bool isSigned = (Name.at(19) == 's');
+
+ if (isSigned) {
+ if (isLong)
+ V0 = new SExtInst(Arg0, CI->getType(), "", CI);
+ else
+ V0 = Arg0;
+ V1 = new SExtInst(Arg1, CI->getType(), "", CI);
+ } else {
+ if (isLong)
+ V0 = new ZExtInst(Arg0, CI->getType(), "", CI);
+ else
+ V0 = Arg0;
+ V1 = new ZExtInst(Arg1, CI->getType(), "", CI);
+ }
+}
+
// UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the
// upgraded intrinsic. All argument and return casting must be provided in
// order to seamlessly integrate with existing context.
@@ -376,33 +408,32 @@
// Upgrade ARM NEON intrinsics.
if (Name.compare(5, 9, "arm.neon.", 9) == 0) {
Instruction *NewI;
+ Value *V0, *V1;
if (Name.compare(14, 7, "vmovls.", 7) == 0) {
NewI = new SExtInst(CI->getArgOperand(0), CI->getType(),
"upgraded." + CI->getName(), CI);
} else if (Name.compare(14, 7, "vmovlu.", 7) == 0) {
NewI = new ZExtInst(CI->getArgOperand(0), CI->getType(),
"upgraded." + CI->getName(), CI);
-
- } else if (Name.compare(14, 4, "vadd", 4) == 0 ||
- Name.compare(14, 4, "vsub", 4) == 0) {
- // Extend one (vaddw/vsubw) or both (vaddl/vsubl) operands.
- Value *V0 = CI->getArgOperand(0);
- Value *V1 = CI->getArgOperand(1);
- if (Name.at(19) == 's') {
- if (Name.at(18) == 'l')
- V0 = new SExtInst(CI->getArgOperand(0), CI->getType(), "", CI);
- V1 = new SExtInst(CI->getArgOperand(1), CI->getType(), "", CI);
- } else {
- assert(Name.at(19) == 'u' && "unexpected vadd/vsub intrinsic");
- if (Name.at(18) == 'l')
- V0 = new ZExtInst(CI->getArgOperand(0), CI->getType(), "", CI);
- V1 = new ZExtInst(CI->getArgOperand(1), CI->getType(), "", CI);
- }
- if (Name.compare(14, 4, "vadd", 4) == 0)
- NewI = BinaryOperator::CreateAdd(V0, V1,"upgraded."+CI->getName(),CI);
- else
- NewI = BinaryOperator::CreateSub(V0, V1,"upgraded."+CI->getName(),CI);
-
+ } else if (Name.compare(14, 4, "vadd", 4) == 0) {
+ ExtendNEONArgs(CI, CI->getArgOperand(0), CI->getArgOperand(1), V0, V1);
+ NewI = BinaryOperator::CreateAdd(V0, V1, "upgraded."+CI->getName(), CI);
+ } else if (Name.compare(14, 4, "vsub", 4) == 0) {
+ ExtendNEONArgs(CI, CI->getArgOperand(0), CI->getArgOperand(1), V0, V1);
+ NewI = BinaryOperator::CreateSub(V0, V1,"upgraded."+CI->getName(),CI);
+ } else if (Name.compare(14, 4, "vmul", 4) == 0) {
+ ExtendNEONArgs(CI, CI->getArgOperand(0), CI->getArgOperand(1), V0, V1);
+ NewI = BinaryOperator::CreateMul(V0, V1,"upgraded."+CI->getName(),CI);
+ } else if (Name.compare(14, 4, "vmla", 4) == 0) {
+ ExtendNEONArgs(CI, CI->getArgOperand(1), CI->getArgOperand(2), V0, V1);
+ Instruction *MulI = BinaryOperator::CreateMul(V0, V1, "", CI);
+ NewI = BinaryOperator::CreateAdd(CI->getArgOperand(0), MulI,
+ "upgraded."+CI->getName(), CI);
+ } else if (Name.compare(14, 4, "vmls", 4) == 0) {
+ ExtendNEONArgs(CI, CI->getArgOperand(1), CI->getArgOperand(2), V0, V1);
+ Instruction *MulI = BinaryOperator::CreateMul(V0, V1, "", CI);
+ NewI = BinaryOperator::CreateSub(CI->getArgOperand(0), MulI,
+ "upgraded."+CI->getName(), CI);
} else if (Name.compare(14, 6, "vmovn.", 6) == 0) {
NewI = new TruncInst(CI->getArgOperand(0), CI->getType(),
"upgraded." + CI->getName(), CI);

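A note on the magic offsets in the Name.compare calls, since they are easy to misread: in a name like "llvm.arm.neon.vmlals.v4i32", position 5 is where "arm.neon." starts, position 14 starts the operation name, position 18 holds the 'l' that marks a long operation (versus 'w' for the wide vaddw/vsubw forms), and position 19 holds the 's'/'u' signedness letter that ExtendNEONArgs keys on. For a signed vmlal call the upgraded IR then looks like this sketch (matching the Bitcode test below; %acc, %x, %y are placeholder names):

; old: %r = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %acc, <4 x i16> %x, <4 x i16> %y)
%x.ext = sext <4 x i16> %x to <4 x i32>
%y.ext = sext <4 x i16> %y to <4 x i32>
%m = mul <4 x i32> %x.ext, %y.ext
%upgraded.r = add <4 x i32> %acc, %m
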
Modified: llvm/trunk/test/Bitcode/neon-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Bitcode/neon-intrinsics.ll?rev=112773&r1=112772&r2=112773&view=diff
==============================================================================
--- llvm/trunk/test/Bitcode/neon-intrinsics.ll (original)
+++ llvm/trunk/test/Bitcode/neon-intrinsics.ll Wed Sep 1 18:50:19 2010
@@ -52,7 +52,7 @@
; CHECK: zext <4 x i16>
; CHECK-NEXT: add <4 x i32>

-; vsubl/vsubw should be auto-upgraded to sub with sext/zext
+; vsubl/vsubw should be auto-upgraded to subtract with sext/zext

; CHECK: vsubls16
; CHECK-NOT: arm.neon.vsubls.v4i32
@@ -76,6 +76,56 @@
; CHECK: zext <4 x i16>
; CHECK-NEXT: sub <4 x i32>

+; vmull should be auto-upgraded to multiply with sext/zext
+; (but vmullp should remain an intrinsic)
+
+; CHECK: vmulls8
+; CHECK-NOT: arm.neon.vmulls.v8i16
+; CHECK: sext <8 x i8>
+; CHECK-NEXT: sext <8 x i8>
+; CHECK-NEXT: mul <8 x i16>
+
+; CHECK: vmullu16
+; CHECK-NOT: arm.neon.vmullu.v4i32
+; CHECK: zext <4 x i16>
+; CHECK-NEXT: zext <4 x i16>
+; CHECK-NEXT: mul <4 x i32>
+
+; CHECK: vmullp8
+; CHECK: arm.neon.vmullp.v8i16
+
+; vmlal should be auto-upgraded to multiply/add with sext/zext
+
+; CHECK: vmlals32
+; CHECK-NOT: arm.neon.vmlals.v2i64
+; CHECK: sext <2 x i32>
+; CHECK-NEXT: sext <2 x i32>
+; CHECK-NEXT: mul <2 x i64>
+; CHECK-NEXT: add <2 x i64>
+
+; CHECK: vmlalu8
+; CHECK-NOT: arm.neon.vmlalu.v8i16
+; CHECK: zext <8 x i8>
+; CHECK-NEXT: zext <8 x i8>
+; CHECK-NEXT: mul <8 x i16>
+; CHECK-NEXT: add <8 x i16>
+
+; vmlsl should be auto-upgraded to multiply/sub with sext/zext
+
+; CHECK: vmlsls16
+; CHECK-NOT: arm.neon.vmlsls.v4i32
+; CHECK: sext <4 x i16>
+; CHECK-NEXT: sext <4 x i16>
+; CHECK-NEXT: mul <4 x i32>
+; CHECK-NEXT: sub <4 x i32>
+
+; CHECK: vmlslu32
+; CHECK-NOT: arm.neon.vmlslu.v2i64
+; CHECK: zext <2 x i32>
+; CHECK-NEXT: zext <2 x i32>
+; CHECK-NEXT: mul <2 x i64>
+; CHECK-NEXT: sub <2 x i64>
+
; vmovn should be auto-upgraded to trunc

; CHECK: vmovni16

Modified: llvm/trunk/test/Bitcode/neon-intrinsics.ll.bc
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Bitcode/neon-intrinsics.ll.bc?rev=112773&r1=112772&r2=112773&view=diff
==============================================================================
Binary files - no diff available.

Modified: llvm/trunk/test/CodeGen/ARM/vmla.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vmla.ll?rev=112773&r1=112772&r2=112773&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/vmla.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/vmla.ll Wed Sep 1 18:50:19 2010
@@ -94,8 +94,11 @@
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
- %tmp4 = call <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
- ret <8 x i16> %tmp4
+ %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
+ %tmp6 = mul <8 x i16> %tmp4, %tmp5
+ %tmp7 = add <8 x i16> %tmp1, %tmp6
+ ret <8 x i16> %tmp7
}

define <4 x i32> @vmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
@@ -104,8 +107,11 @@
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
- %tmp4 = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
- ret <4 x i32> %tmp4
+ %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
+ %tmp6 = mul <4 x i32> %tmp4, %tmp5
+ %tmp7 = add <4 x i32> %tmp1, %tmp6
+ ret <4 x i32> %tmp7
}

define <2 x i64> @vmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
@@ -114,8 +120,11 @@
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
- %tmp4 = call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
- ret <2 x i64> %tmp4
+ %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
+ %tmp6 = mul <2 x i64> %tmp4, %tmp5
+ %tmp7 = add <2 x i64> %tmp1, %tmp6
+ ret <2 x i64> %tmp7
}

define <8 x i16> @vmlalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
@@ -124,8 +133,11 @@
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
- %tmp4 = call <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
- ret <8 x i16> %tmp4
+ %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
+ %tmp6 = mul <8 x i16> %tmp4, %tmp5
+ %tmp7 = add <8 x i16> %tmp1, %tmp6
+ ret <8 x i16> %tmp7
}

define <4 x i32> @vmlalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
@@ -134,8 +146,11 @@
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
- %tmp4 = call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
- ret <4 x i32> %tmp4
+ %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
+ %tmp6 = mul <4 x i32> %tmp4, %tmp5
+ %tmp7 = add <4 x i32> %tmp1, %tmp6
+ ret <4 x i32> %tmp7
}

define <2 x i64> @vmlalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
@@ -144,8 +159,11 @@
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
- %tmp4 = call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
- ret <2 x i64> %tmp4
+ %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
+ %tmp6 = mul <2 x i64> %tmp4, %tmp5
+ %tmp7 = add <2 x i64> %tmp1, %tmp6
+ ret <2 x i64> %tmp7
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
@@ -153,8 +171,11 @@
; CHECK: test_vmlal_lanes16
; CHECK: vmlal.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
- %1 = tail call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %1
+ %1 = sext <4 x i16> %arg1_int16x4_t to <4 x i32>
+ %2 = sext <4 x i16> %0 to <4 x i32>
+ %3 = mul <4 x i32> %1, %2
+ %4 = add <4 x i32> %arg0_int32x4_t, %3
+ ret <4 x i32> %4
}

define arm_aapcs_vfpcc <2 x i64> @test_vmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
@@ -162,8 +183,11 @@
; CHECK: test_vmlal_lanes32
; CHECK: vmlal.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
- %1 = tail call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %1
+ %1 = sext <2 x i32> %arg1_int32x2_t to <2 x i64>
+ %2 = sext <2 x i32> %0 to <2 x i64>
+ %3 = mul <2 x i64> %1, %2
+ %4 = add <2 x i64> %arg0_int64x2_t, %3
+ ret <2 x i64> %4
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlal_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
@@ -171,8 +195,11 @@
; CHECK: test_vmlal_laneu16
; CHECK: vmlal.u16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
- %1 = tail call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %1
+ %1 = zext <4 x i16> %arg1_uint16x4_t to <4 x i32>
+ %2 = zext <4 x i16> %0 to <4 x i32>
+ %3 = mul <4 x i32> %1, %2
+ %4 = add <4 x i32> %arg0_uint32x4_t, %3
+ ret <4 x i32> %4
}

define arm_aapcs_vfpcc <2 x i64> @test_vmlal_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
@@ -180,14 +207,9 @@
; CHECK: test_vmlal_laneu32
; CHECK: vmlal.u32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
- %1 = tail call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %1
+ %1 = zext <2 x i32> %arg1_uint32x2_t to <2 x i64>
+ %2 = zext <2 x i32> %0 to <2 x i64>
+ %3 = mul <2 x i64> %1, %2
+ %4 = add <2 x i64> %arg0_uint64x2_t, %3
+ ret <2 x i64> %4
}
-
-declare <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
-
-declare <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

Modified: llvm/trunk/test/CodeGen/ARM/vmls.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vmls.ll?rev=112773&r1=112772&r2=112773&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/vmls.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/vmls.ll Wed Sep 1 18:50:19 2010
@@ -94,8 +94,11 @@
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
- %tmp4 = call <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
- ret <8 x i16> %tmp4
+ %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
+ %tmp6 = mul <8 x i16> %tmp4, %tmp5
+ %tmp7 = sub <8 x i16> %tmp1, %tmp6
+ ret <8 x i16> %tmp7
}

define <4 x i32> @vmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
@@ -104,8 +107,11 @@
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
- %tmp4 = call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
- ret <4 x i32> %tmp4
+ %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
+ %tmp6 = mul <4 x i32> %tmp4, %tmp5
+ %tmp7 = sub <4 x i32> %tmp1, %tmp6
+ ret <4 x i32> %tmp7
}

define <2 x i64> @vmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
@@ -114,8 +120,11 @@
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
- %tmp4 = call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
- ret <2 x i64> %tmp4
+ %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
+ %tmp6 = mul <2 x i64> %tmp4, %tmp5
+ %tmp7 = sub <2 x i64> %tmp1, %tmp6
+ ret <2 x i64> %tmp7
}

define <8 x i16> @vmlslu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
@@ -124,8 +133,11 @@
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
- %tmp4 = call <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
- ret <8 x i16> %tmp4
+ %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
+ %tmp6 = mul <8 x i16> %tmp4, %tmp5
+ %tmp7 = sub <8 x i16> %tmp1, %tmp6
+ ret <8 x i16> %tmp7
}

define <4 x i32> @vmlslu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
@@ -134,8 +146,11 @@
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
- %tmp4 = call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
- ret <4 x i32> %tmp4
+ %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
+ %tmp6 = mul <4 x i32> %tmp4, %tmp5
+ %tmp7 = sub <4 x i32> %tmp1, %tmp6
+ ret <4 x i32> %tmp7
}

define <2 x i64> @vmlslu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
@@ -144,8 +159,11 @@
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
- %tmp4 = call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
- ret <2 x i64> %tmp4
+ %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
+ %tmp6 = mul <2 x i64> %tmp4, %tmp5
+ %tmp7 = sub <2 x i64> %tmp1, %tmp6
+ ret <2 x i64> %tmp7
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
@@ -153,8 +171,11 @@
; CHECK: test_vmlsl_lanes16
; CHECK: vmlsl.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
- %1 = tail call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %1
+ %1 = sext <4 x i16> %arg1_int16x4_t to <4 x i32>
+ %2 = sext <4 x i16> %0 to <4 x i32>
+ %3 = mul <4 x i32> %1, %2
+ %4 = sub <4 x i32> %arg0_int32x4_t, %3
+ ret <4 x i32> %4
}

define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
@@ -162,8 +183,11 @@
; CHECK: test_vmlsl_lanes32
; CHECK: vmlsl.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
- %1 = tail call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %1
+ %1 = sext <2 x i32> %arg1_int32x2_t to <2 x i64>
+ %2 = sext <2 x i32> %0 to <2 x i64>
+ %3 = mul <2 x i64> %1, %2
+ %4 = sub <2 x i64> %arg0_int64x2_t, %3
+ ret <2 x i64> %4
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
@@ -171,8 +195,11 @@
; CHECK: test_vmlsl_laneu16
; CHECK: vmlsl.u16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
- %1 = tail call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %1
+ %1 = zext <4 x i16> %arg1_uint16x4_t to <4 x i32>
+ %2 = zext <4 x i16> %0 to <4 x i32>
+ %3 = mul <4 x i32> %1, %2
+ %4 = sub <4 x i32> %arg0_uint32x4_t, %3
+ ret <4 x i32> %4
}

define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
@@ -180,14 +207,9 @@
; CHECK: test_vmlsl_laneu32
; CHECK: vmlsl.u32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
- %1 = tail call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %1
+ %1 = zext <2 x i32> %arg1_uint32x2_t to <2 x i64>
+ %2 = zext <2 x i32> %0 to <2 x i64>
+ %3 = mul <2 x i64> %1, %2
+ %4 = sub <2 x i64> %arg0_uint64x2_t, %3
+ ret <2 x i64> %4
}
-
-declare <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
-
-declare <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

Modified: llvm/trunk/test/CodeGen/ARM/vmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vmul.ll?rev=112773&r1=112772&r2=112773&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/vmul.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/vmul.ll Wed Sep 1 18:50:19 2010
@@ -152,8 +152,10 @@
;CHECK: vmull.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
- ret <8 x i16> %tmp3
+ %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
+ %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = mul <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
}

define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
@@ -161,8 +163,10 @@
;CHECK: vmull.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
- ret <4 x i32> %tmp3
+ %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
+ %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = mul <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
}

define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
@@ -170,8 +174,10 @@
;CHECK: vmull.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
- ret <2 x i64> %tmp3
+ %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = mul <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
}

define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@@ -179,8 +185,10 @@
;CHECK: vmull.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
- ret <8 x i16> %tmp3
+ %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
+ %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = mul <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
}

define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
@@ -188,8 +196,10 @@
;CHECK: vmull.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
- ret <4 x i32> %tmp3
+ %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
+ %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = mul <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
}

define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
@@ -197,8 +207,10 @@
;CHECK: vmull.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
- ret <2 x i64> %tmp3
+ %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = mul <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
}

define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@@ -215,8 +227,10 @@
; CHECK: test_vmull_lanes16
; CHECK: vmull.s16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
- %1 = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %1
+ %1 = sext <4 x i16> %arg0_int16x4_t to <4 x i32>
+ %2 = sext <4 x i16> %0 to <4 x i32>
+ %3 = mul <4 x i32> %1, %2
+ ret <4 x i32> %3
}

define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
@@ -224,8 +238,10 @@
; CHECK: test_vmull_lanes32
; CHECK: vmull.s32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
- %1 = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %1
+ %1 = sext <2 x i32> %arg0_int32x2_t to <2 x i64>
+ %2 = sext <2 x i32> %0 to <2 x i64>
+ %3 = mul <2 x i64> %1, %2
+ ret <2 x i64> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
@@ -233,8 +249,10 @@
; CHECK: test_vmull_laneu16
; CHECK: vmull.u16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
- %1 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %arg0_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %1
+ %1 = zext <4 x i16> %arg0_uint16x4_t to <4 x i32>
+ %2 = zext <4 x i16> %0 to <4 x i32>
+ %3 = mul <4 x i32> %1, %2
+ ret <4 x i32> %3
}

define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
@@ -242,16 +260,10 @@
; CHECK: test_vmull_laneu32
; CHECK: vmull.u32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
- %1 = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %arg0_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %1
+ %1 = zext <2 x i32> %arg0_uint32x2_t to <2 x i64>
+ %2 = zext <2 x i32> %0 to <2 x i64>
+ %3 = mul <2 x i64> %1, %2
+ ret <2 x i64> %3
}

-declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
-
-declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
-
declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone