[llvm] r207291 - Optimization for certain shufflevector by using insertps.
Filipe Cabecinhas
me at filcab.net
Fri Apr 25 16:51:18 PDT 2014
Author: filcab
Date: Fri Apr 25 18:51:17 2014
New Revision: 207291
URL: http://llvm.org/viewvc/llvm-project?rev=207291&view=rev
Log:
Optimization for certain shufflevector by using insertps.
Summary:
If we're doing a v4f32/v4i32 shuffle on x86 with SSE4.1, we can lower
certain shufflevectors to an insertps instruction:
When most of the shufflevector result's elements come from one vector (and
keep their index), and one element comes from another vector or a memory
operand.
Added tests for insertps optimizations on shufflevector.
Added support and tests for v4i32 vector optimization.
Reviewers: nadav
Subscribers: llvm-commits
Differential Revision: http://reviews.llvm.org/D3475
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/sse41.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=207291&r1=207290&r2=207291&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Apr 25 18:51:17 2014
@@ -3931,6 +3931,29 @@ static bool isMOVLHPSMask(ArrayRef<int>
return true;
}
+/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to INSERTPS,
+/// i.e. exactly three elements come from one input vector and keep their
+/// index, and the remaining element comes from the *other* input vector (at
+/// any source index).
+///
+/// Mask entries 0-3 select from the first input and 4-7 from the second;
+/// negative entries are undef. Masks containing an undef entry, or whose
+/// fourth element comes from the same vector as the other three (for example
+/// <0, 1, 2, 0>), are rejected: getINSERTPS needs exactly one element that
+/// comes from the inserted-from vector in order to find the destination lane.
+static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
+  // TODO: Deal with AVX's VINSERTPS
+  if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
+    return false;
+
+  unsigned InPlaceV1 = 0; // Elements taken from V1 at their own index.
+  unsigned InPlaceV2 = 0; // Elements taken from V2 at their own index.
+  unsigned TotalV1 = 0;   // All elements taken from V1.
+  unsigned TotalV2 = 0;   // All elements taken from V2.
+  for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
+    const int M = Mask[i];
+    if (M < 0)
+      // Undef element: it does not identify a source vector, so the lowering
+      // could not tell which lane to insert into.
+      return false;
+
+    if (M < 4) {
+      ++TotalV1;
+      if (M == i)
+        ++InPlaceV1;
+    } else {
+      ++TotalV2;
+      if (M == i + 4)
+        ++InPlaceV2;
+    }
+  }
+
+  // We need 3 in-place elements from one vector, and exactly one element
+  // from the other one.
+  return (InPlaceV1 == 3 && TotalV2 == 1) || (InPlaceV2 == 3 && TotalV1 == 1);
+}
+
//
// Some special combinations that can be optimized.
//
@@ -7263,6 +7286,84 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl,
getShuffleSHUFImmediate(SVOp), DAG);
}
+// getINSERTPS - Lower a v4f32/v4i32 shuffle in which three result elements
+// come from one input (keeping their index) and one element comes from the
+// other input. Returns either an X86ISD::INSERTPS node or, when an i32 is
+// taken straight from a foldable load, an ISD::INSERT_VECTOR_ELT node.
+//
+// It is only safe to call this function if isINSERTPSMask is true for
+// this shufflevector mask.
+static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
+                           SelectionDAG &DAG) {
+  // Generate an insertps instruction when inserting an f32 from memory onto a
+  // v4f32 or when copying a member from one v4f32 to another.
+  // We also use it for transferring i32 from one register to another,
+  // since it simply copies the same bits.
+  // If we're transferring an i32 from memory to a specific element in a
+  // register, we output a generic DAG that will match the PINSRD
+  // instruction.
+  // TODO: Optimize for AVX cases too (VINSERTPS)
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  auto Mask = SVOp->getMask();
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "unsupported vector type for insertps/pinsrd");
+
+  // Undef (negative) mask entries belong to neither input vector, so they
+  // must not be counted as V1 elements.
+  auto IsFromV1 = [](const int &i) { return i >= 0 && i < 4; };
+  auto IsFromV2 = [](const int &i) { return i >= 4; };
+
+  // "From" is the vector contributing the single inserted element, "To" is
+  // the vector contributing the other three. DestIndex is the result lane
+  // that receives the inserted element.
+  SDValue From;
+  SDValue To;
+  unsigned DestIndex;
+  if (std::count_if(Mask.begin(), Mask.end(), IsFromV1) == 1) {
+    From = V1;
+    To = V2;
+    DestIndex =
+        std::find_if(Mask.begin(), Mask.end(), IsFromV1) - Mask.begin();
+  } else {
+    assert(std::count_if(Mask.begin(), Mask.end(), IsFromV2) == 1 &&
+           "expected exactly one element from one of the shuffle inputs");
+    From = V2;
+    To = V1;
+    DestIndex =
+        std::find_if(Mask.begin(), Mask.end(), IsFromV2) - Mask.begin();
+  }
+  assert(DestIndex < 4 && "no insertable element found in shuffle mask");
+
+  // Lane of From that is being inserted (mask entries 4-7 address V2).
+  unsigned SrcIndex = Mask[DestIndex] % 4;
+
+  if (MayFoldLoad(From)) {
+    // Trivial case, when From comes from a load and is only used by the
+    // shuffle. Make it use insertps from the vector that we need from that
+    // load.
+    // The scalar we need lives at the *source* lane of the loaded vector, so
+    // the narrowed load is offset by SrcIndex elements (not DestIndex).
+    unsigned Offset = SrcIndex * EVT.getStoreSize();
+    SDValue Addr = From.getOperand(1);
+    SDValue NewAddr =
+        DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+                    DAG.getConstant(Offset, Addr.getSimpleValueType()));
+
+    LoadSDNode *Load = cast<LoadSDNode>(From);
+    // Keep the memory operand in sync with the address actually accessed.
+    SDValue NewLoad =
+        DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                    DAG.getMachineFunction().getMachineMemOperand(
+                        Load->getMemOperand(), Offset, EVT.getStoreSize()));
+
+    if (EVT == MVT::f32) {
+      // Create this as a scalar to vector to match the instruction pattern.
+      // The scalar ends up in lane 0, so only the destination lane is
+      // encoded in the immediate.
+      SDValue LoadScalarToVector =
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
+      SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
+      return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
+                         InsertpsMask);
+    } else { // EVT == MVT::i32
+      // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
+      // instruction, to match the PINSRD instruction, which loads an i32 to a
+      // certain vector element.
+      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
+                         DAG.getConstant(DestIndex, MVT::i32));
+    }
+  }
+
+  // Vector-element-to-vector: destination lane goes in bits 5:4 and source
+  // lane in bits 7:6 of the immediate.
+  SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
+}
+
// Reduce a vector shuffle to zext.
static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
@@ -7674,6 +7775,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(S
if (BlendOp.getNode())
return BlendOp;
+ if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
+ return getINSERTPS(SVOp, dl, DAG);
+
unsigned Imm8;
if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
Modified: llvm/trunk/test/CodeGen/X86/sse41.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse41.ll?rev=207291&r1=207290&r2=207291&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse41.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse41.ll Fri Apr 25 18:51:17 2014
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
@g16 = external global i16
@@ -249,3 +249,74 @@ entry:
; X64: ret
}
+; Result lanes 0-2 are taken in place from %a; lane 3 is lane 0 of the vector
+; loaded from %pb. The expected immediate 48 is the destination lane 3
+; shifted left by 4, and no shufps may be emitted.
+define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+entry:
+ %0 = load <4 x float>* %pb, align 16
+ %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_1:
+; CHECK-NOT: shufps
+; CHECK: insertps $48,
+; CHECK: ret
+}
+
+; Register-to-register case: result lane 2 is lane 1 of %b, all other lanes
+; are kept from %a. The expected immediate 96 is (source lane 1 << 6) |
+; (destination lane 2 << 4); no mov or shufps may precede the insertps.
+define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
+entry:
+ %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+ ret <4 x float> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_2:
+; CHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps $96,
+; CHECK: ret
+}
+
+; For loading an i32 from memory into an xmm register we use pinsrd
+; instead of insertps
+; Result lanes 0-2 are kept from %a; lane 3 is lane 0 of the vector loaded
+; from %pb, so the expected pinsrd immediate is the destination lane 3.
+define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
+entry:
+ %0 = load <4 x i32>* %pb, align 16
+ %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i32> %vecinit6
+; CHECK-LABEL: pinsrd_from_shufflevector_i32:
+; CHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: pinsrd $3,
+; CHECK: ret
+}
+
+; Integer register-to-register case: result lane 1 is lane 3 of %b, all other
+; lanes are kept from %a. insertps is expected even for i32 elements; the
+; immediate 208 is (source lane 3 << 6) | (destination lane 1 << 4).
+define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
+entry:
+ %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+ ret <4 x i32> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_i32_2:
+; CHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps $208,
+; CHECK: ret
+}
+
+; A scalar float is loaded, placed in lane 0 of an otherwise undef vector and
+; then shuffled into lane 1 of %a. The expected immediate 16 is the
+; destination lane 1 shifted left by 4 (source lane 0); no mov or shufps may
+; be emitted before the insertps.
+define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
+; CHECK-LABEL: insertps_from_load_ins_elt_undef:
+; CHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps $16,
+; CHECK: ret
+ %1 = load float* %b, align 4
+ %2 = insertelement <4 x float> undef, float %1, i32 0
+ %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+ ret <4 x float> %result
+}
+
+; Integer variant of the scalar-load case: an i32 is loaded, placed in lane 0
+; of an otherwise undef vector and shuffled into lane 2 of %a. The expected
+; immediate 32 is the destination lane 2 shifted left by 4 (source lane 0).
+; The "no mov" check is deliberately disabled below (see the TODO) because a
+; mov is still emitted for this case.
+define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
+; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
+; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
+;; aCHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps $32,
+; CHECK: ret
+ %1 = load i32* %b, align 4
+ %2 = insertelement <4 x i32> undef, i32 %1, i32 0
+ %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+ ret <4 x i32> %result
+}
More information about the llvm-commits
mailing list