[llvm] r198743 - [AArch64 NEON] Fix generating incorrect value type of NEON_VDUPLANE
Kevin Qin
Kevin.Qin at arm.com
Wed Jan 8 00:06:15 PST 2014
Author: kevinqin
Date: Wed Jan 8 02:06:14 2014
New Revision: 198743
URL: http://llvm.org/viewvc/llvm-project?rev=198743&view=rev
Log:
[AArch64 NEON] Fix generating incorrect value type of NEON_VDUPLANE
when lowering build_vector if the result value type mismatches the operand
value type.
Modified:
llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/trunk/test/CodeGen/AArch64/neon-copy.ll
Modified: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp?rev=198743&r1=198742&r2=198743&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp Wed Jan 8 02:06:14 2014
@@ -4107,14 +4107,60 @@ AArch64TargetLowering::LowerBUILD_VECTOR
// just use DUPLANE. We can only do this if the lane being extracted
// is at a constant index, as the DUP from lane instructions only have
// constant-index forms.
+ //
+ // If there is a TRUNCATE between EXTRACT_VECTOR_ELT and DUP, we can
+ // remove TRUNCATE for DUPLANE by updating the source vector to an
+ // appropriate vector type and lane index.
+ //
// FIXME: for now v1i8, v1i16 and v1i32 are legal vector types; if they
// are no longer legal, there is no need to check that the type size in
// bits is at least 64.
- if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- isa<ConstantSDNode>(Value->getOperand(1)) &&
- Value->getOperand(0).getValueType().getSizeInBits() >= 64) {
- N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT,
- Value->getOperand(0), Value->getOperand(1));
+ SDValue V = Value;
+ if (Value->getOpcode() == ISD::TRUNCATE)
+ V = Value->getOperand(0);
+ if (V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(V->getOperand(1)) &&
+ V->getOperand(0).getValueType().getSizeInBits() >= 64) {
+
+ // If the element size of the source vector is larger than the DUPLANE
+ // element size, we can do the transformation by
+ // 1) bitcasting the source register to a smaller-element vector
+ // 2) multiplying the lane index by SrcEltSize/ResEltSize
+ // For example, we can lower
+ // "v8i16 vdup_lane(v4i32, 1)"
+ // to be
+ // "v8i16 vdup_lane(v8i16 bitcast(v4i32), 2)".
+ SDValue SrcVec = V->getOperand(0);
+ unsigned SrcEltSize =
+ SrcVec.getValueType().getVectorElementType().getSizeInBits();
+ unsigned ResEltSize = VT.getVectorElementType().getSizeInBits();
+ if (SrcEltSize > ResEltSize) {
+ assert((SrcEltSize % ResEltSize == 0) && "Invalid element size");
+ SDValue BitCast;
+ unsigned SrcSize = SrcVec.getValueType().getSizeInBits();
+ unsigned ResSize = VT.getSizeInBits();
+
+ if (SrcSize > ResSize) {
+ assert((SrcSize % ResSize == 0) && "Invalid vector size");
+ EVT CastVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ SrcSize / ResEltSize);
+ BitCast = DAG.getNode(ISD::BITCAST, DL, CastVT, SrcVec);
+ } else {
+ assert((SrcSize == ResSize) && "Invalid vector size of source vec");
+ BitCast = DAG.getNode(ISD::BITCAST, DL, VT, SrcVec);
+ }
+
+ unsigned LaneIdx = V->getConstantOperandVal(1);
+ SDValue Lane =
+ DAG.getConstant((SrcEltSize / ResEltSize) * LaneIdx, MVT::i64);
+ N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, BitCast, Lane);
+ } else {
+ assert((SrcEltSize == ResEltSize) &&
+ "Invalid element size of source vec");
+ N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, V->getOperand(0),
+ V->getOperand(1));
+ }
} else
N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
Modified: llvm/trunk/test/CodeGen/AArch64/neon-copy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/neon-copy.ll?rev=198743&r1=198742&r2=198743&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/neon-copy.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/neon-copy.ll Wed Jan 8 02:06:14 2014
@@ -726,3 +726,125 @@ define <8 x i8> @getl(<16 x i8> %x) #0 {
%vecinit14 = insertelement <8 x i8> %vecinit12, i8 %vecext13, i32 7
ret <8 x i8> %vecinit14
}
+
+define <4 x i16> @test_dup_v2i32_v4i16(<2 x i32> %a) {
+; CHECK-LABEL: test_dup_v2i32_v4i16:
+; CHECK: dup v0.4h, v0.h[2]
+entry:
+ %x = extractelement <2 x i32> %a, i32 1
+ %vget_lane = trunc i32 %x to i16
+ %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ ret <4 x i16> %vecinit3.i
+}
+
+define <8 x i16> @test_dup_v4i32_v8i16(<4 x i32> %a) {
+; CHECK-LABEL: test_dup_v4i32_v8i16:
+; CHECK: dup v0.8h, v0.h[6]
+entry:
+ %x = extractelement <4 x i32> %a, i32 3
+ %vget_lane = trunc i32 %x to i16
+ %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
+ %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
+ %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
+ %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
+ ret <8 x i16> %vecinit7.i
+}
+
+define <4 x i16> @test_dup_v1i64_v4i16(<1 x i64> %a) {
+; CHECK-LABEL: test_dup_v1i64_v4i16:
+; CHECK: dup v0.4h, v0.h[0]
+entry:
+ %x = extractelement <1 x i64> %a, i32 0
+ %vget_lane = trunc i64 %x to i16
+ %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ ret <4 x i16> %vecinit3.i
+}
+
+define <2 x i32> @test_dup_v1i64_v2i32(<1 x i64> %a) {
+; CHECK-LABEL: test_dup_v1i64_v2i32:
+; CHECK: dup v0.2s, v0.s[0]
+entry:
+ %x = extractelement <1 x i64> %a, i32 0
+ %vget_lane = trunc i64 %x to i32
+ %vecinit.i = insertelement <2 x i32> undef, i32 %vget_lane, i32 0
+ %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %vget_lane, i32 1
+ ret <2 x i32> %vecinit1.i
+}
+
+define <8 x i16> @test_dup_v2i64_v8i16(<2 x i64> %a) {
+; CHECK-LABEL: test_dup_v2i64_v8i16:
+; CHECK: dup v0.8h, v0.h[4]
+entry:
+ %x = extractelement <2 x i64> %a, i32 1
+ %vget_lane = trunc i64 %x to i16
+ %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
+ %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
+ %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
+ %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
+ ret <8 x i16> %vecinit7.i
+}
+
+define <4 x i32> @test_dup_v2i64_v4i32(<2 x i64> %a) {
+; CHECK-LABEL: test_dup_v2i64_v4i32:
+; CHECK: dup v0.4s, v0.s[2]
+entry:
+ %x = extractelement <2 x i64> %a, i32 1
+ %vget_lane = trunc i64 %x to i32
+ %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3
+ ret <4 x i32> %vecinit3.i
+}
+
+define <4 x i16> @test_dup_v4i32_v4i16(<4 x i32> %a) {
+; CHECK-LABEL: test_dup_v4i32_v4i16:
+; CHECK: dup v0.4h, v0.h[2]
+entry:
+ %x = extractelement <4 x i32> %a, i32 1
+ %vget_lane = trunc i32 %x to i16
+ %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ ret <4 x i16> %vecinit3.i
+}
+
+define <4 x i16> @test_dup_v2i64_v4i16(<2 x i64> %a) {
+; CHECK-LABEL: test_dup_v2i64_v4i16:
+; CHECK: dup v0.4h, v0.h[0]
+entry:
+ %x = extractelement <2 x i64> %a, i32 0
+ %vget_lane = trunc i64 %x to i16
+ %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ ret <4 x i16> %vecinit3.i
+}
+
+define <2 x i32> @test_dup_v2i64_v2i32(<2 x i64> %a) {
+; CHECK-LABEL: test_dup_v2i64_v2i32:
+; CHECK: dup v0.2s, v0.s[0]
+entry:
+ %x = extractelement <2 x i64> %a, i32 0
+ %vget_lane = trunc i64 %x to i32
+ %vecinit.i = insertelement <2 x i32> undef, i32 %vget_lane, i32 0
+ %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %vget_lane, i32 1
+ ret <2 x i32> %vecinit1.i
+}
+
More information about the llvm-commits
mailing list