[llvm-commits] [llvm] r81220 - in /llvm/trunk: lib/Target/ARM/ARMBaseInstrInfo.cpp lib/Target/ARM/ARMInstrNEON.td lib/Target/ARM/ARMRegisterInfo.td test/CodeGen/ARM/vmlal_lane.ll test/CodeGen/ARM/vmlsl_lane.ll test/CodeGen/ARM/vmul_lane.ll test/CodeGen/ARM/vmull_lane.ll test/CodeGen/ARM/vqRdmulh_lane.ll test/CodeGen/ARM/vqdmlal_lanes.ll test/CodeGen/ARM/vqdmlsl_lanes.ll test/CodeGen/ARM/vqdmulh_lane.ll test/CodeGen/ARM/vqdmull_lane.ll

Tue Sep 8 11:37:39 PDT 2009

Hi Anton!

I believe this changeset caused about 230 CodeGen regression failures on
the llvm-arm-linux buildbot.

Compare the builds:
http://google1.osuosl.org:8011/builders/llvm-arm-linux/builds/111   
and
http://google1.osuosl.org:8011/builders/llvm-arm-linux/builds/110

Could you look into this?
Cheers,
Xerxes

Den 2009-09-08 17:22, Anton Korobeynikov skrev:
> Author: asl
> Date: Tue Sep  8 10:22:32 2009
> New Revision: 81220
>
> URL: http://llvm.org/viewvc/llvm-project?rev=81220&view=rev
> Log:
> Add NEON 'laned' operations. This fixes another bunch of gcc testsuite fails and
> makes the code faster.
>
> Added:
>     llvm/trunk/test/CodeGen/ARM/vmlal_lane.ll
>     llvm/trunk/test/CodeGen/ARM/vmlsl_lane.ll
>     llvm/trunk/test/CodeGen/ARM/vmul_lane.ll
>     llvm/trunk/test/CodeGen/ARM/vmull_lane.ll
>     llvm/trunk/test/CodeGen/ARM/vqRdmulh_lane.ll
>     llvm/trunk/test/CodeGen/ARM/vqdmlal_lanes.ll
>     llvm/trunk/test/CodeGen/ARM/vqdmlsl_lanes.ll
>     llvm/trunk/test/CodeGen/ARM/vqdmulh_lane.ll
>     llvm/trunk/test/CodeGen/ARM/vqdmull_lane.ll
> Modified:
>     llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
>     llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
>     llvm/trunk/lib/Target/ARM/ARMRegisterInfo.td
>
> Modified: llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp?rev=81220&r1=81219&r2=81220&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp (original)
> +++ llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp Tue Sep  8 10:22:32 2009
> @@ -612,14 +612,24 @@
>    if (I != MBB.end()) DL = I->getDebugLoc();
>  
>    if (DestRC != SrcRC) {
> -    if (((DestRC == ARM::DPRRegisterClass) &&
> -         (SrcRC == ARM::DPR_VFP2RegisterClass)) ||
> -        ((SrcRC == ARM::DPRRegisterClass) &&
> -         (DestRC == ARM::DPR_VFP2RegisterClass))) {
> -      // Allow copy between DPR and DPR_VFP2.
> -    } else {
> +    // Allow DPR / DPR_VFP2 / DPR_8 cross-class copies
> +    if (DestRC == ARM::DPRRegisterClass) {
> +      if (SrcRC == ARM::DPR_VFP2RegisterClass ||
> +          SrcRC == ARM::DPR_8RegisterClass) {
> +      } else
> +        return false;
> +    } else if (DestRC == ARM::DPR_VFP2RegisterClass) {
> +      if (SrcRC == ARM::DPRRegisterClass ||
> +          SrcRC == ARM::DPR_8RegisterClass) {
> +      } else
> +        return false;
> +    } else if (DestRC == ARM::DPR_8RegisterClass) {
> +      if (SrcRC == ARM::DPRRegisterClass ||
> +          SrcRC == ARM::DPR_VFP2RegisterClass) {
> +      } else
> +        return false;
> +    } else
>        return false;
> -    }
>    }
>  
>    if (DestRC == ARM::GPRRegisterClass) {
> @@ -629,7 +639,8 @@
>      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYS), DestReg)
>                     .addReg(SrcReg));
>    } else if ((DestRC == ARM::DPRRegisterClass) ||
> -             (DestRC == ARM::DPR_VFP2RegisterClass)) {
> +             (DestRC == ARM::DPR_VFP2RegisterClass) ||
> +             (DestRC == ARM::DPR_8RegisterClass)) {
>      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg)
>                     .addReg(SrcReg));
>    } else if (DestRC == ARM::QPRRegisterClass) {
> @@ -652,7 +663,9 @@
>      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR))
>                     .addReg(SrcReg, getKillRegState(isKill))
>                     .addFrameIndex(FI).addReg(0).addImm(0));
> -  } else if (RC == ARM::DPRRegisterClass || RC == ARM::DPR_VFP2RegisterClass) {
> +  } else if (RC == ARM::DPRRegisterClass ||
> +             RC == ARM::DPR_VFP2RegisterClass ||
> +             RC == ARM::DPR_8RegisterClass) {
>      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTD))
>                     .addReg(SrcReg, getKillRegState(isKill))
>                     .addFrameIndex(FI).addImm(0));
> @@ -678,7 +691,9 @@
>    if (RC == ARM::GPRRegisterClass) {
>      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg)
>                     .addFrameIndex(FI).addReg(0).addImm(0));
> -  } else if (RC == ARM::DPRRegisterClass || RC == ARM::DPR_VFP2RegisterClass) {
> +  } else if (RC == ARM::DPRRegisterClass ||
> +             RC == ARM::DPR_VFP2RegisterClass ||
> +             RC == ARM::DPR_8RegisterClass) {
>      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDD), DestReg)
>                     .addFrameIndex(FI).addImm(0));
>    } else if (RC == ARM::SPRRegisterClass) {
>
> Modified: llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMInstrNEON.td?rev=81220&r1=81219&r2=81220&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/ARM/ARMInstrNEON.td (original)
> +++ llvm/trunk/lib/Target/ARM/ARMInstrNEON.td Tue Sep  8 10:22:32 2009
> @@ -475,6 +475,31 @@
>          [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
>    let isCommutable = Commutable;
>  }
> +class N3VDSL<bits<2> op21_20, bits<4> op11_8, 
> +             string OpcodeStr, ValueType Ty, SDNode ShOp>
> +  : N3V<0, 1, op21_20, op11_8, 1, 0,
> +        (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
> +        [(set (Ty DPR:$dst),
> +              (Ty (ShOp (Ty DPR:$src1),
> +                        (Ty (NEONvduplane (Ty DPR_VFP2:$src2),
> +                                          imm:$lane)))))]> {
> +  let isCommutable = 0;
> +}
> +class N3VDSL16<bits<2> op21_20, bits<4> op11_8, 
> +               string OpcodeStr, ValueType Ty, SDNode ShOp>
> +  : N3V<0, 1, op21_20, op11_8, 1, 0,
> +        (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
> +        [(set (Ty DPR:$dst),
> +              (Ty (ShOp (Ty DPR:$src1),
> +                        (Ty (NEONvduplane (Ty DPR_8:$src2),
> +                                          imm:$lane)))))]> {
> +  let isCommutable = 0;
> +}
> +
>  class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
>             string OpcodeStr, ValueType ResTy, ValueType OpTy,
>             SDNode OpNode, bit Commutable>
> @@ -484,6 +509,30 @@
>          [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
>    let isCommutable = Commutable;
>  }
> +class N3VQSL<bits<2> op21_20, bits<4> op11_8, 
> +             string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode ShOp>
> +  : N3V<1, 1, op21_20, op11_8, 1, 0,
> +        (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
> +        [(set (ResTy QPR:$dst),
> +              (ResTy (ShOp (ResTy QPR:$src1),
> +                           (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2),
> +                                                imm:$lane)))))]> {
> +  let isCommutable = 0;
> +}
> +class N3VQSL16<bits<2> op21_20, bits<4> op11_8, 
> +               string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode ShOp>
> +  : N3V<1, 1, op21_20, op11_8, 1, 0,
> +        (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane),
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
> +        [(set (ResTy QPR:$dst),
> +              (ResTy (ShOp (ResTy QPR:$src1),
> +                           (ResTy (NEONvduplane (OpTy DPR_8:$src2),
> +                                                imm:$lane)))))]> {
> +  let isCommutable = 0;
> +}
>  
>  // Basic 3-register operations, scalar single-precision
>  class N3VDs<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
> @@ -511,6 +560,31 @@
>          [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
>    let isCommutable = Commutable;
>  }
> +class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, 
> +                string OpcodeStr, ValueType Ty, Intrinsic IntOp>
> +  : N3V<0, 1, op21_20, op11_8, 1, 0,
> +        (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
> +        [(set (Ty DPR:$dst),
> +              (Ty (IntOp (Ty DPR:$src1),
> +                         (Ty (NEONvduplane (Ty DPR_VFP2:$src2),
> +                                           imm:$lane)))))]> {
> +  let isCommutable = 0;
> +}
> +class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, 
> +                  string OpcodeStr, ValueType Ty, Intrinsic IntOp>
> +  : N3V<0, 1, op21_20, op11_8, 1, 0,
> +        (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
> +        [(set (Ty DPR:$dst),
> +              (Ty (IntOp (Ty DPR:$src1),
> +                         (Ty (NEONvduplane (Ty DPR_8:$src2),
> +                                           imm:$lane)))))]> {
> +  let isCommutable = 0;
> +}
> +
>  class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
>                string OpcodeStr, ValueType ResTy, ValueType OpTy,
>                Intrinsic IntOp, bit Commutable>
> @@ -520,6 +594,30 @@
>          [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
>    let isCommutable = Commutable;
>  }
> +class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, 
> +                string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
> +  : N3V<1, 1, op21_20, op11_8, 1, 0,
> +        (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
> +        [(set (ResTy QPR:$dst),
> +              (ResTy (IntOp (ResTy QPR:$src1),
> +                            (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2),
> +                                                 imm:$lane)))))]> {
> +  let isCommutable = 0;
> +}
> +class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, 
> +                  string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
> +  : N3V<1, 1, op21_20, op11_8, 1, 0,
> +        (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane),
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
> +        [(set (ResTy QPR:$dst),
> +              (ResTy (IntOp (ResTy QPR:$src1),
> +                            (ResTy (NEONvduplane (OpTy DPR_8:$src2),
> +                                                 imm:$lane)))))]> {
> +  let isCommutable = 0;
> +}
>  
>  // Multiply-Add/Sub operations, both double- and quad-register.
>  class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
> @@ -529,6 +627,31 @@
>          !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
>          [(set DPR:$dst, (Ty (OpNode DPR:$src1,
>                               (Ty (MulOp DPR:$src2, DPR:$src3)))))]>;
> +class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8,
> +                  string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode ShOp>
> +  : N3V<0, 1, op21_20, op11_8, 1, 0,
> +        (outs DPR:$dst),
> +        (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane),
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
> +        [(set (Ty DPR:$dst),
> +              (Ty (ShOp (Ty DPR:$src1),
> +                        (Ty (MulOp DPR:$src2,
> +                                   (Ty (NEONvduplane (Ty DPR_VFP2:$src3),
> +                                                     imm:$lane)))))))]>;
> +class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8,
> +                    string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode ShOp>
> +  : N3V<0, 1, op21_20, op11_8, 1, 0,
> +        (outs DPR:$dst),
> +        (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane),
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
> +        [(set (Ty DPR:$dst),
> +              (Ty (ShOp (Ty DPR:$src1),
> +                        (Ty (MulOp DPR:$src2,
> +                                   (Ty (NEONvduplane (Ty DPR_8:$src3),
> +                                                     imm:$lane)))))))]>;
> +
>  class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
>                  string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode>
>    : N3V<op24, op23, op21_20, op11_8, 1, op4,
> @@ -536,6 +659,32 @@
>          !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
>          [(set QPR:$dst, (Ty (OpNode QPR:$src1,
>                               (Ty (MulOp QPR:$src2, QPR:$src3)))))]>;
> +class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8,
> +                  string OpcodeStr, ValueType ResTy, ValueType OpTy,
> +                  SDNode MulOp, SDNode ShOp>
> +  : N3V<1, 1, op21_20, op11_8, 1, 0,
> +        (outs QPR:$dst),
> +        (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane),
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
> +        [(set (ResTy QPR:$dst),
> +              (ResTy (ShOp (ResTy QPR:$src1),
> +                           (ResTy (MulOp QPR:$src2,
> +                                         (ResTy (NEONvduplane (OpTy DPR_VFP2:$src3),
> +                                                              imm:$lane)))))))]>;
> +class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8,
> +                    string OpcodeStr, ValueType ResTy, ValueType OpTy,
> +                    SDNode MulOp, SDNode ShOp>
> +  : N3V<1, 1, op21_20, op11_8, 1, 0,
> +        (outs QPR:$dst),
> +        (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane),
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
> +        [(set (ResTy QPR:$dst),
> +              (ResTy (ShOp (ResTy QPR:$src1),
> +                           (ResTy (MulOp QPR:$src2,
> +                                         (ResTy (NEONvduplane (OpTy DPR_8:$src3),
> +                                                              imm:$lane)))))))]>;
>  
>  // Multiply-Add/Sub operations, scalar single-precision
>  class N3VDMulOps<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
> @@ -581,6 +730,32 @@
>          !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
>          [(set QPR:$dst,
>            (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>;
> +class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8,
> +                 string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
> +  : N3V<op24, 1, op21_20, op11_8, 1, 0,
> +        (outs QPR:$dst),
> +        (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), 
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
> +        [(set (ResTy QPR:$dst),
> +              (ResTy (IntOp (ResTy QPR:$src1),
> +                            (OpTy DPR:$src2),
> +                            (OpTy (NEONvduplane (OpTy DPR_VFP2:$src3),
> +                                                imm:$lane)))))]>;
> +class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8,
> +                   string OpcodeStr, ValueType ResTy, ValueType OpTy,
> +                   Intrinsic IntOp>
> +  : N3V<op24, 1, op21_20, op11_8, 1, 0,
> +        (outs QPR:$dst),
> +        (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), 
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
> +        [(set (ResTy QPR:$dst),
> +              (ResTy (IntOp (ResTy QPR:$src1),
> +                            (OpTy DPR:$src2),
> +                            (OpTy (NEONvduplane (OpTy DPR_8:$src3),
> +                                                imm:$lane)))))]>;
> +
>  
>  // Narrowing 3-register intrinsics.
>  class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
> @@ -603,6 +778,27 @@
>          [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> {
>    let isCommutable = Commutable;
>  }
> +class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8,
> +                string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
> +  : N3V<op24, 1, op21_20, op11_8, 1, 0,
> +        (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), 
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
> +        [(set (ResTy QPR:$dst),
> +              (ResTy (IntOp (OpTy DPR:$src1),
> +                            (OpTy (NEONvduplane (OpTy DPR_VFP2:$src2),
> +                                                imm:$lane)))))]>;
> +class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
> +                  string OpcodeStr, ValueType ResTy, ValueType OpTy, 
> +                  Intrinsic IntOp>
> +  : N3V<op24, 1, op21_20, op11_8, 1, 0,
> +        (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), 
> +        NoItinerary,
> +        !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
> +        [(set (ResTy QPR:$dst),
> +              (ResTy (IntOp (OpTy DPR:$src1),
> +                            (OpTy (NEONvduplane (OpTy DPR_8:$src2),
> +                                                imm:$lane)))))]>;
>  
>  // Wide 3-register intrinsics.
>  class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
> @@ -761,6 +957,13 @@
>                     v4i32, v4i32, OpNode, Commutable>;
>  }
>  
> +multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, SDNode ShOp> {
> +  def v4i16 : N3VDSL16<0b01, op11_8, !strconcat(OpcodeStr, "16"), v4i16, ShOp>;
> +  def v2i32 : N3VDSL<0b10, op11_8, !strconcat(OpcodeStr, "32"), v2i32, ShOp>;
> +  def v8i16 : N3VQSL16<0b01, op11_8, !strconcat(OpcodeStr, "16"), v8i16, v4i16, ShOp>;
> +  def v4i32 : N3VQSL<0b10, op11_8, !strconcat(OpcodeStr, "32"), v4i32, v2i32, ShOp>;
> +}
> +
>  // ....then also with element size 64 bits:
>  multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
>                      string OpcodeStr, SDNode OpNode, bit Commutable = 0>
> @@ -817,6 +1020,13 @@
>                        v4i32, v4i32, IntOp, Commutable>;
>  }
>  
> +multiclass N3VIntSL_HS<bits<4> op11_8, string OpcodeStr, Intrinsic IntOp> {
> +  def v4i16 : N3VDIntSL16<0b01, op11_8, !strconcat(OpcodeStr, "16"), v4i16, IntOp>;
> +  def v2i32 : N3VDIntSL<0b10, op11_8, !strconcat(OpcodeStr, "32"), v2i32, IntOp>;
> +  def v8i16 : N3VQIntSL16<0b01, op11_8, !strconcat(OpcodeStr, "16"), v8i16, v4i16, IntOp>;
> +  def v4i32 : N3VQIntSL<0b10, op11_8, !strconcat(OpcodeStr, "32"), v4i32, v2i32, IntOp>;
> +}
> +
>  // ....then also with element size of 8 bits:
>  multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
>                        string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
> @@ -862,6 +1072,14 @@
>                        v2i64, v2i32, IntOp, Commutable>;
>  }
>  
> +multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8,
> +                        string OpcodeStr, Intrinsic IntOp> {
> +  def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, 
> +                          !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
> +  def v2i32 : N3VLIntSL<op24, 0b10, op11_8, 
> +                        !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
> +}
> +
>  // ....then also with element size of 8 bits:
>  multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
>                         string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
> @@ -905,6 +1123,16 @@
>                          !strconcat(OpcodeStr, "32"), v4i32, mul, OpNode>;
>  }
>  
> +multiclass N3VMulOpSL_HS<bits<4> op11_8, string OpcodeStr, SDNode ShOp> {
> +  def v4i16 : N3VDMulOpSL16<0b01, op11_8,
> +                            !strconcat(OpcodeStr, "16"), v4i16, mul, ShOp>;
> +  def v2i32 : N3VDMulOpSL<0b10, op11_8,
> +                          !strconcat(OpcodeStr, "32"), v2i32, mul, ShOp>;
> +  def v8i16 : N3VQMulOpSL16<0b01, op11_8,
> +                            !strconcat(OpcodeStr, "16"), v8i16, v4i16, mul, ShOp>;
> +  def v4i32 : N3VQMulOpSL<0b10, op11_8,
> +                          !strconcat(OpcodeStr, "32"), v4i32, v2i32, mul, ShOp>;
> +}
>  
>  // Neon 3-argument intrinsics,
>  //   element sizes of 8, 16 and 32 bits:
> @@ -939,6 +1167,14 @@
>                         !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
>  }
>  
> +multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
> +                         string OpcodeStr, Intrinsic IntOp> {
> +  def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8,
> +                           !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
> +  def v2i32 : N3VLInt3SL<op24, 0b10, op11_8,
> +                         !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
> +}
> +
>  // ....then also with element size of 8 bits:
>  multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
>                          string OpcodeStr, Intrinsic IntOp>
> @@ -1134,17 +1370,71 @@
>                          int_arm_neon_vmulp, 1>;
>  def  VMULfd   : N3VD<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul, 1>;
>  def  VMULfq   : N3VQ<1, 0, 0b00, 0b1101, 1, "vmul.f32", v4f32, v4f32, fmul, 1>;
> +defm VMULsl  : N3VSL_HS<0b1000, "vmul.i", mul>;
> +def VMULslfd : N3VDSL<0b10, 0b1001, "vmul.f32", v2f32, fmul>;
> +def VMULslfq : N3VQSL<0b10, 0b1001, "vmul.f32", v4f32, v2f32, fmul>;
> +def : Pat<(v8i16 (mul (v8i16 QPR:$src1),
> +                      (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
> +          (v8i16 (VMULslv8i16 (v8i16 QPR:$src1),
> +                              (v4i16 (EXTRACT_SUBREG QPR:$src2,
> +                                                     (DSubReg_i16_reg imm:$lane))),
> +                              (SubReg_i16_lane imm:$lane)))>;
> +def : Pat<(v4i32 (mul (v4i32 QPR:$src1),
> +                      (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
> +          (v4i32 (VMULslv4i32 (v4i32 QPR:$src1),
> +                              (v2i32 (EXTRACT_SUBREG QPR:$src2,
> +                                                     (DSubReg_i32_reg imm:$lane))),
> +                              (SubReg_i32_lane imm:$lane)))>;
> +def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
> +                       (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))),
> +          (v4f32 (VMULslfq (v4f32 QPR:$src1),
> +                           (v2f32 (EXTRACT_SUBREG QPR:$src2,
> +                                                  (DSubReg_i32_reg imm:$lane))),
> +                           (SubReg_i32_lane imm:$lane)))>;
> +
>  //   VQDMULH  : Vector Saturating Doubling Multiply Returning High Half
>  defm VQDMULH  : N3VInt_HS<0,0,0b1011,0, "vqdmulh.s", int_arm_neon_vqdmulh, 1>;
> +defm VQDMULHsl: N3VIntSL_HS<0b1100, "vqdmulh.s",  int_arm_neon_vqdmulh>;
> +def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1),
> +                                       (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
> +          (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1),
> +                                 (v4i16 (EXTRACT_SUBREG QPR:$src2,
> +                                                        (DSubReg_i16_reg imm:$lane))),
> +                                 (SubReg_i16_lane imm:$lane)))>;
> +def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1),
> +                                       (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
> +          (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1),
> +                                 (v2i32 (EXTRACT_SUBREG QPR:$src2,
> +                                                        (DSubReg_i32_reg imm:$lane))),
> +                                 (SubReg_i32_lane imm:$lane)))>;
> +
>  //   VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half
> -defm VQRDMULH : N3VInt_HS<1,0,0b1011,0, "vqrdmulh.s", int_arm_neon_vqrdmulh, 1>;
> +defm VQRDMULH   : N3VInt_HS<1,0,0b1011,0, "vqrdmulh.s", int_arm_neon_vqrdmulh, 1>;
> +defm VQRDMULHsl : N3VIntSL_HS<0b1101, "vqrdmulh.s",  int_arm_neon_vqrdmulh>;
> +def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1),
> +                                        (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
> +          (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1),
> +                                  (v4i16 (EXTRACT_SUBREG QPR:$src2,
> +                                                         (DSubReg_i16_reg imm:$lane))),
> +                                  (SubReg_i16_lane imm:$lane)))>;
> +def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
> +                                        (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
> +          (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1),
> +                                  (v2i32 (EXTRACT_SUBREG QPR:$src2,
> +                                                         (DSubReg_i32_reg imm:$lane))),
> +                                  (SubReg_i32_lane imm:$lane)))>;
> +
>  //   VMULL    : Vector Multiply Long (integer and polynomial) (Q = D * D)
>  defm VMULLs   : N3VLInt_QHS<0,1,0b1100,0, "vmull.s", int_arm_neon_vmulls, 1>;
>  defm VMULLu   : N3VLInt_QHS<1,1,0b1100,0, "vmull.u", int_arm_neon_vmullu, 1>;
>  def  VMULLp   : N3VLInt<0, 1, 0b00, 0b1110, 0, "vmull.p8", v8i16, v8i8,
>                          int_arm_neon_vmullp, 1>;
> +defm VMULLsls : N3VLIntSL_HS<0, 0b1010, "vmull.s", int_arm_neon_vmulls>;
> +defm VMULLslu : N3VLIntSL_HS<1, 0b1010, "vmull.u", int_arm_neon_vmullu>;
> +
>  //   VQDMULL  : Vector Saturating Doubling Multiply Long (Q = D * D)
>  defm VQDMULL  : N3VLInt_HS<0,1,0b1101,0, "vqdmull.s", int_arm_neon_vqdmull, 1>;
> +defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, "vqdmull.s", int_arm_neon_vqdmull>;
>  
>  // Vector Multiply-Accumulate and Multiply-Subtract Operations.
>  
> @@ -1152,20 +1442,93 @@
>  defm VMLA     : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmla.i", add>;
>  def  VMLAfd   : N3VDMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v2f32, fmul, fadd>;
>  def  VMLAfq   : N3VQMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v4f32, fmul, fadd>;
> +defm VMLAsl   : N3VMulOpSL_HS<0b0000, "vmla.i", add>;
> +def  VMLAslfd : N3VDMulOpSL<0b10, 0b0001, "vmla.f32", v2f32, fmul, fadd>;
> +def  VMLAslfq : N3VQMulOpSL<0b10, 0b0001, "vmla.f32", v4f32, v2f32, fmul, fadd>;
> +
> +def : Pat<(v8i16 (add (v8i16 QPR:$src1),
> +                      (mul (v8i16 QPR:$src2),
> +                           (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
> +          (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1),
> +                              (v8i16 QPR:$src2),
> +                              (v4i16 (EXTRACT_SUBREG QPR:$src3,
> +                                                     (DSubReg_i16_reg imm:$lane))),
> +                              (SubReg_i16_lane imm:$lane)))>;
> +
> +def : Pat<(v4i32 (add (v4i32 QPR:$src1),
> +                      (mul (v4i32 QPR:$src2),
> +                           (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
> +          (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1),
> +                              (v4i32 QPR:$src2),
> +                              (v2i32 (EXTRACT_SUBREG QPR:$src3,
> +                                                     (DSubReg_i32_reg imm:$lane))),
> +                              (SubReg_i32_lane imm:$lane)))>;
> +
> +def : Pat<(v4f32 (fadd (v4f32 QPR:$src1),
> +                       (fmul (v4f32 QPR:$src2),
> +                             (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
> +          (v4f32 (VMLAslfq (v4f32 QPR:$src1),
> +                           (v4f32 QPR:$src2),
> +                           (v2f32 (EXTRACT_SUBREG QPR:$src3,
> +                                                  (DSubReg_i32_reg imm:$lane))),
> +                           (SubReg_i32_lane imm:$lane)))>;
> +
>  //   VMLAL    : Vector Multiply Accumulate Long (Q += D * D)
>  defm VMLALs   : N3VLInt3_QHS<0,1,0b1000,0, "vmlal.s", int_arm_neon_vmlals>;
>  defm VMLALu   : N3VLInt3_QHS<1,1,0b1000,0, "vmlal.u", int_arm_neon_vmlalu>;
> +
> +defm VMLALsls : N3VLInt3SL_HS<0, 0b0010, "vmlal.s", int_arm_neon_vmlals>;
> +defm VMLALslu : N3VLInt3SL_HS<1, 0b0010, "vmlal.u", int_arm_neon_vmlalu>;
> +
>  //   VQDMLAL  : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
>  defm VQDMLAL  : N3VLInt3_HS<0, 1, 0b1001, 0, "vqdmlal.s", int_arm_neon_vqdmlal>;
> +defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal.s", int_arm_neon_vqdmlal>;
> +
>  //   VMLS     : Vector Multiply Subtract (integer and floating-point)
>  defm VMLS     : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmls.i", sub>;
>  def  VMLSfd   : N3VDMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v2f32, fmul, fsub>;
>  def  VMLSfq   : N3VQMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v4f32, fmul, fsub>;
> +defm VMLSsl   : N3VMulOpSL_HS<0b0100, "vmls.i", sub>;
> +def  VMLSslfd : N3VDMulOpSL<0b10, 0b0101, "vmls.f32", v2f32, fmul, fsub>;
> +def  VMLSslfq : N3VQMulOpSL<0b10, 0b0101, "vmls.f32", v4f32, v2f32, fmul, fsub>;
> +
> +def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
> +                      (mul (v8i16 QPR:$src2),
> +                           (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
> +          (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1),
> +                              (v8i16 QPR:$src2),
> +                              (v4i16 (EXTRACT_SUBREG QPR:$src3,
> +                                                     (DSubReg_i16_reg imm:$lane))),
> +                              (SubReg_i16_lane imm:$lane)))>;
> +
> +def : Pat<(v4i32 (sub (v4i32 QPR:$src1),
> +                      (mul (v4i32 QPR:$src2),
> +                           (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
> +          (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1),
> +                              (v4i32 QPR:$src2),
> +                              (v2i32 (EXTRACT_SUBREG QPR:$src3,
> +                                                     (DSubReg_i32_reg imm:$lane))),
> +                              (SubReg_i32_lane imm:$lane)))>;
> +
> +def : Pat<(v4f32 (fsub (v4f32 QPR:$src1),
> +                       (fmul (v4f32 QPR:$src2),
> +                             (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
> +          (v4f32 (VMLSslfq (v4f32 QPR:$src1),
> +                           (v4f32 QPR:$src2),
> +                           (v2f32 (EXTRACT_SUBREG QPR:$src3,
> +                                                  (DSubReg_i32_reg imm:$lane))),
> +                           (SubReg_i32_lane imm:$lane)))>;
> +
>  //   VMLSL    : Vector Multiply Subtract Long (Q -= D * D)
>  defm VMLSLs   : N3VLInt3_QHS<0,1,0b1010,0, "vmlsl.s", int_arm_neon_vmlsls>;
>  defm VMLSLu   : N3VLInt3_QHS<1,1,0b1010,0, "vmlsl.u", int_arm_neon_vmlslu>;
> +
> +defm VMLSLsls : N3VLInt3SL_HS<0, 0b0110, "vmlsl.s", int_arm_neon_vmlsls>;
> +defm VMLSLslu : N3VLInt3SL_HS<1, 0b0110, "vmlsl.u", int_arm_neon_vmlslu>;
> +
>  //   VQDMLSL  : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
>  defm VQDMLSL  : N3VLInt3_HS<0, 1, 0b1011, 0, "vqdmlsl.s", int_arm_neon_vqdmlsl>;
> +defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl.s", int_arm_neon_vqdmlsl>;
>  
>  // Vector Subtract Operations.
>  
>
> Modified: llvm/trunk/lib/Target/ARM/ARMRegisterInfo.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMRegisterInfo.td?rev=81220&r1=81219&r2=81220&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/ARM/ARMRegisterInfo.td (original)
> +++ llvm/trunk/lib/Target/ARM/ARMRegisterInfo.td Tue Sep  8 10:22:32 2009
> @@ -61,7 +61,7 @@
>  
>  // Aliases of the F* registers used to hold 64-bit fp values (doubles)
>  def D0  : ARMReg< 0,  "d0", [S0,   S1]>;
> -def D1  : ARMReg< 1,  "d1", [S2,   S3]>; 
> +def D1  : ARMReg< 1,  "d1", [S2,   S3]>;
>  def D2  : ARMReg< 2,  "d2", [S4,   S5]>;
>  def D3  : ARMReg< 3,  "d3", [S6,   S7]>;
>  def D4  : ARMReg< 4,  "d4", [S8,   S9]>;
> @@ -89,7 +89,7 @@
>  
>  // Advanced SIMD (NEON) defines 16 quad-word aliases
>  def Q0  : ARMReg< 0,  "q0", [D0,   D1]>;
> -def Q1  : ARMReg< 1,  "q1", [D2,   D3]>; 
> +def Q1  : ARMReg< 1,  "q1", [D2,   D3]>;
>  def Q2  : ARMReg< 2,  "q2", [D4,   D5]>;
>  def Q3  : ARMReg< 3,  "q3", [D6,   D7]>;
>  def Q4  : ARMReg< 4,  "q4", [D8,   D9]>;
> @@ -269,16 +269,16 @@
>    }];
>    let MethodBodies = [{
>      // VFP2
> -    static const unsigned ARM_DPR_VFP2[] = { 
> -      ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3, 
> -      ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7, 
> -      ARM::D8,  ARM::D9,  ARM::D10, ARM::D11, 
> +    static const unsigned ARM_DPR_VFP2[] = {
> +      ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3,
> +      ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7,
> +      ARM::D8,  ARM::D9,  ARM::D10, ARM::D11,
>        ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
>      // VFP3
>      static const unsigned ARM_DPR_VFP3[] = {
> -      ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3, 
> -      ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7, 
> -      ARM::D8,  ARM::D9,  ARM::D10, ARM::D11, 
> +      ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3,
> +      ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7,
> +      ARM::D8,  ARM::D9,  ARM::D10, ARM::D11,
>        ARM::D12, ARM::D13, ARM::D14, ARM::D15,
>        ARM::D16, ARM::D17, ARM::D18, ARM::D19,
>        ARM::D20, ARM::D21, ARM::D22, ARM::D23,
> @@ -307,12 +307,19 @@
>  
>  // Subset of DPR that are accessible with VFP2 (and so that also have
>  // 32-bit SPR subregs).
> -def DPR_VFP2 : RegisterClass<"ARM", [f64, v2f32], 64,
> +def DPR_VFP2 : RegisterClass<"ARM", [f64, v2i32, v2f32], 64,
>                               [D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
>                                D8,  D9,  D10, D11, D12, D13, D14, D15]> {
>    let SubRegClassList = [SPR, SPR];
>  }
>  
> +// Subset of DPR which can be used as a source of NEON scalars for 16-bit
> +// operations
> +def DPR_8 : RegisterClass<"ARM", [f64, v4i16, v2f32], 64,
> +                          [D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7]> {
> +  let SubRegClassList = [SPR, SPR];
> +}
> +
>  // Generic 128-bit vector register class.
>  def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
>                          [Q0,  Q1,  Q2,  Q3,  Q4,  Q5,  Q6,  Q7,
> @@ -364,4 +371,3 @@
>                      Q8,  Q9,  Q10, Q11, Q12, Q13, Q14, Q15],
>                     [D1,  D3,  D5,  D7,  D9,  D11, D13, D15,
>                      D17, D19, D21, D23, D25, D27, D29, D31]>;
> -
>
> Added: llvm/trunk/test/CodeGen/ARM/vmlal_lane.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vmlal_lane.ll?rev=81220&view=auto
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/vmlal_lane.ll (added)
> +++ llvm/trunk/test/CodeGen/ARM/vmlal_lane.ll Tue Sep  8 10:22:32 2009
> @@ -0,0 +1,47 @@
> +; RUN: llc -mattr=+neon < %s | FileCheck %s
> +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
> +target triple = "thumbv7-elf"
> +
> +define arm_aapcs_vfpcc <4 x i32> @test_vmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmlal_lanes16
> +; CHECK: vmlal.s16 q0, d2, d3[1]
> +  %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
> +  %1 = tail call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
> +  ret <4 x i32> %1
> +}
> +
> +declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <2 x i64> @test_vmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmlal_lanes32
> +; CHECK: vmlal.s32 q0, d2, d3[1]
> +  %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
> +  %1 = tail call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
> +  ret <2 x i64> %1
> +}
> +
> +declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <4 x i32> @test_vmlal_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmlal_laneu16
> +; CHECK: vmlal.u16 q0, d2, d3[1]
> +  %0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
> +  %1 = tail call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
> +  ret <4 x i32> %1
> +}
> +
> +declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <2 x i64> @test_vmlal_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmlal_laneu32
> +; CHECK: vmlal.u32 q0, d2, d3[1]
> +  %0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
> +  %1 = tail call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
> +  ret <2 x i64> %1
> +}
> +
> +declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
>
> Added: llvm/trunk/test/CodeGen/ARM/vmlsl_lane.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vmlsl_lane.ll?rev=81220&view=auto
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/vmlsl_lane.ll (added)
> +++ llvm/trunk/test/CodeGen/ARM/vmlsl_lane.ll Tue Sep  8 10:22:32 2009
> @@ -0,0 +1,47 @@
> +; RUN: llc -mattr=+neon < %s | FileCheck %s
> +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
> +target triple = "thumbv7-elf"
> +
> +define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmlsl_lanes16
> +; CHECK: vmlsl.s16 q0, d2, d3[1]
> +  %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
> +  %1 = tail call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
> +  ret <4 x i32> %1
> +}
> +
> +declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmlsl_lanes32
> +; CHECK: vmlsl.s32 q0, d2, d3[1]
> +  %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
> +  %1 = tail call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
> +  ret <2 x i64> %1
> +}
> +
> +declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmlsl_laneu16
> +; CHECK: vmlsl.u16 q0, d2, d3[1]
> +  %0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
> +  %1 = tail call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
> +  ret <4 x i32> %1
> +}
> +
> +declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmlsl_laneu32
> +; CHECK: vmlsl.u32 q0, d2, d3[1]
> +  %0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
> +  %1 = tail call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
> +  ret <2 x i64> %1
> +}
> +
> +declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
>
> Added: llvm/trunk/test/CodeGen/ARM/vmul_lane.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vmul_lane.ll?rev=81220&view=auto
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/vmul_lane.ll (added)
> +++ llvm/trunk/test/CodeGen/ARM/vmul_lane.ll Tue Sep  8 10:22:32 2009
> @@ -0,0 +1,57 @@
> +; RUN: llc -mattr=+neon < %s | FileCheck %s
> +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
> +target triple = "thumbv7-elf"
> +
> +define arm_aapcs_vfpcc <2 x float> @test_vmul_lanef32(<2 x float> %arg0_float32x2_t, <2 x float> %arg1_float32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmul_lanef32:
> +; CHECK: vmul.f32 d0, d0, d1[0]
> +  %0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <2 x i32> zeroinitializer ; <<2 x float>> [#uses=1]
> +  %1 = fmul <2 x float> %0, %arg0_float32x2_t     ; <<2 x float>> [#uses=1]
> +  ret <2 x float> %1
> +}
> +
> +define arm_aapcs_vfpcc <4 x i16> @test_vmul_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmul_lanes16:
> +; CHECK: vmul.i16 d0, d0, d1[1]
> +  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
> +  %1 = mul <4 x i16> %0, %arg0_int16x4_t          ; <<4 x i16>> [#uses=1]
> +  ret <4 x i16> %1
> +}
> +
> +define arm_aapcs_vfpcc <2 x i32> @test_vmul_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmul_lanes32:
> +; CHECK: vmul.i32 d0, d0, d1[1]
> +  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
> +  %1 = mul <2 x i32> %0, %arg0_int32x2_t          ; <<2 x i32>> [#uses=1]
> +  ret <2 x i32> %1
> +}
> +
> +define arm_aapcs_vfpcc <4 x float> @test_vmulQ_lanef32(<4 x float> %arg0_float32x4_t, <2 x float> %arg1_float32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmulQ_lanef32:
> +; CHECK: vmul.f32 q0, q0, d2[1]
> +  %0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x float>> [#uses=1]
> +  %1 = fmul <4 x float> %0, %arg0_float32x4_t     ; <<4 x float>> [#uses=1]
> +  ret <4 x float> %1
> +}
> +
> +define arm_aapcs_vfpcc <8 x i16> @test_vmulQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmulQ_lanes16:
> +; CHECK: vmul.i16 q0, q0, d2[1]
> +  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
> +  %1 = mul <8 x i16> %0, %arg0_int16x8_t          ; <<8 x i16>> [#uses=1]
> +  ret <8 x i16> %1
> +}
> +
> +define arm_aapcs_vfpcc <4 x i32> @test_vmulQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmulQ_lanes32:
> +; CHECK: vmul.i32 q0, q0, d2[1]
> +  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
> +  %1 = mul <4 x i32> %0, %arg0_int32x4_t          ; <<4 x i32>> [#uses=1]
> +  ret <4 x i32> %1
> +}
>
> Added: llvm/trunk/test/CodeGen/ARM/vmull_lane.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vmull_lane.ll?rev=81220&view=auto
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/vmull_lane.ll (added)
> +++ llvm/trunk/test/CodeGen/ARM/vmull_lane.ll Tue Sep  8 10:22:32 2009
> @@ -0,0 +1,47 @@
> +; RUN: llc -mattr=+neon < %s | FileCheck %s
> +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
> +target triple = "thumbv7-elf"
> +
> +define arm_aapcs_vfpcc <4 x i32> @test_vmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmull_lanes16
> +; CHECK: vmull.s16 q0, d0, d1[1]
> +  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
> +  %1 = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
> +  ret <4 x i32> %1
> +}
> +
> +declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmull_lanes32
> +; CHECK: vmull.s32 q0, d0, d1[1]
> +  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
> +  %1 = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
> +  ret <2 x i64> %1
> +}
> +
> +declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmull_laneu16
> +; CHECK: vmull.u16 q0, d0, d1[1]
> +  %0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
> +  %1 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %arg0_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
> +  ret <4 x i32> %1
> +}
> +
> +declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vmull_laneu32
> +; CHECK: vmull.u32 q0, d0, d1[1]
> +  %0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
> +  %1 = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %arg0_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
> +  ret <2 x i64> %1
> +}
> +
> +declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
>
> Added: llvm/trunk/test/CodeGen/ARM/vqRdmulh_lane.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vqRdmulh_lane.ll?rev=81220&view=auto
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/vqRdmulh_lane.ll (added)
> +++ llvm/trunk/test/CodeGen/ARM/vqRdmulh_lane.ll Tue Sep  8 10:22:32 2009
> @@ -0,0 +1,47 @@
> +; RUN: llc -mattr=+neon < %s | FileCheck %s
> +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
> +target triple = "thumbv7-elf"
> +
> +define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vqRdmulhQ_lanes16
> +; CHECK: vqrdmulh.s16 q0, q0, d2[1]
> +  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
> +  %1 = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
> +  ret <8 x i16> %1
> +}
> +
> +declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vqRdmulhQ_lanes32
> +; CHECK: vqrdmulh.s32 q0, q0, d2[1]
> +  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
> +  %1 = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
> +  ret <4 x i32> %1
> +}
> +
> +declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vqRdmulh_lanes16
> +; CHECK: vqrdmulh.s16 d0, d0, d1[1]
> +  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
> +  %1 = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
> +  ret <4 x i16> %1
> +}
> +
> +declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vqRdmulh_lanes32
> +; CHECK: vqrdmulh.s32 d0, d0, d1[1]
> +  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
> +  %1 = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
> +  ret <2 x i32> %1
> +}
> +
> +declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
>
> Added: llvm/trunk/test/CodeGen/ARM/vqdmlal_lanes.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vqdmlal_lanes.ll?rev=81220&view=auto
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/vqdmlal_lanes.ll (added)
> +++ llvm/trunk/test/CodeGen/ARM/vqdmlal_lanes.ll Tue Sep  8 10:22:32 2009
> @@ -0,0 +1,25 @@
> +; RUN: llc -mattr=+neon < %s | FileCheck %s
> +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
> +target triple = "thumbv7-elf"
> +
> +define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vqdmlal_lanes16
> +; CHECK: vqdmlal.s16 q0, d2, d3[1]
> +  %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
> +  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
> +  ret <4 x i32> %1
> +}
> +
> +declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vqdmlal_lanes32
> +; CHECK: vqdmlal.s32 q0, d2, d3[1]
> +  %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
> +  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
> +  ret <2 x i64> %1
> +}
> +
> +declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
>
> Added: llvm/trunk/test/CodeGen/ARM/vqdmlsl_lanes.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vqdmlsl_lanes.ll?rev=81220&view=auto
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/vqdmlsl_lanes.ll (added)
> +++ llvm/trunk/test/CodeGen/ARM/vqdmlsl_lanes.ll Tue Sep  8 10:22:32 2009
> @@ -0,0 +1,25 @@
> +; RUN: llc -mattr=+neon < %s | FileCheck %s
> +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
> +target triple = "thumbv7-elf"
> +
> +define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vqdmlsl_lanes16
> +; CHECK: vqdmlsl.s16 q0, d2, d3[1]
> +  %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
> +  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
> +  ret <4 x i32> %1
> +}
> +
> +declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vqdmlsl_lanes32
> +; CHECK: vqdmlsl.s32 q0, d2, d3[1]
> +  %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
> +  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
> +  ret <2 x i64> %1
> +}
> +
> +declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
>
> Added: llvm/trunk/test/CodeGen/ARM/vqdmulh_lane.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vqdmulh_lane.ll?rev=81220&view=auto
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/vqdmulh_lane.ll (added)
> +++ llvm/trunk/test/CodeGen/ARM/vqdmulh_lane.ll Tue Sep  8 10:22:32 2009
> @@ -0,0 +1,47 @@
> +; RUN: llc -mattr=+neon < %s | FileCheck %s
> +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
> +target triple = "thumbv7-elf"
> +
> +define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vqdmulhQ_lanes16
> +; CHECK: vqdmulh.s16 q0, q0, d2[1]
> +  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
> +  %1 = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
> +  ret <8 x i16> %1
> +}
> +
> +declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vqdmulhQ_lanes32
> +; CHECK: vqdmulh.s32 q0, q0, d2[1]
> +  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
> +  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
> +  ret <4 x i32> %1
> +}
> +
> +declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vqdmulh_lanes16
> +; CHECK: vqdmulh.s16 d0, d0, d1[1]
> +  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
> +  %1 = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
> +  ret <4 x i16> %1
> +}
> +
> +declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vqdmulh_lanes32
> +; CHECK: vqdmulh.s32 d0, d0, d1[1]
> +  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
> +  %1 = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
> +  ret <2 x i32> %1
> +}
> +
> +declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
>
> Added: llvm/trunk/test/CodeGen/ARM/vqdmull_lane.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vqdmull_lane.ll?rev=81220&view=auto
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/vqdmull_lane.ll (added)
> +++ llvm/trunk/test/CodeGen/ARM/vqdmull_lane.ll Tue Sep  8 10:22:32 2009
> @@ -0,0 +1,25 @@
> +; RUN: llc -mattr=+neon < %s | FileCheck %s
> +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
> +target triple = "thumbv7-elf"
> +
> +define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
> +entry:
> +; CHECK: test_vqdmull_lanes16
> +; CHECK: vqdmull.s16 q0, d0, d1[1]
> +  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
> +  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
> +  ret <4 x i32> %1
> +}
> +
> +declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
> +
> +define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
> +entry:
> +; CHECK: test_vqdmull_lanes32
> +; CHECK: vqdmull.s32 q0, d0, d1[1]
> +  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
> +  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
> +  ret <2 x i64> %1
> +}
> +
> +declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>