[llvm] r232850 - [X86] Prefer blendps over insertps codegen for one special case
Sanjay Patel
spatel at rotateright.com
Fri Mar 20 14:19:52 PDT 2015
Author: spatel
Date: Fri Mar 20 16:19:52 2015
New Revision: 232850
URL: http://llvm.org/viewvc/llvm-project?rev=232850&view=rev
Log:
[X86] Prefer blendps over insertps codegen for one special case
With this patch, for this one exact case, we'll generate:
blendps %xmm0, %xmm1, $1
instead of:
insertps %xmm0, %xmm1, $0
If there's a memory operand available for load folding and we're
optimizing for size, we'll still generate the insertps.
The detailed performance data motivation for this may be found in D7866;
in summary, blendps has 2-3x throughput vs. insertps on widely used chips.
Differential Revision: http://reviews.llvm.org/D8332
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/sse41.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=232850&r1=232849&r2=232850&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Mar 20 16:19:52 2015
@@ -10550,16 +10550,29 @@ SDValue X86TargetLowering::LowerINSERT_V
}
if (EltVT == MVT::f32) {
- // Bits [7:6] of the constant are the source select. This will always be
- // zero here. The DAG Combiner may combine an extract_elt index into
- // these
- // bits. For example (insert (extract, 3), 2) could be matched by
- // putting
- // the '3' into bits [7:6] of X86ISD::INSERTPS.
- // Bits [5:4] of the constant are the destination select. This is the
- // value of the incoming immediate.
- // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+ // Bits [7:6] of the constant are the source select. This will always be
+ // zero here. The DAG Combiner may combine an extract_elt index into
+ // these bits. For example (insert (extract, 3), 2) could be matched by
+ // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
+ // Bits [5:4] of the constant are the destination select. This is the
+ // value of the incoming immediate.
+ // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
+
+ const Function *F = DAG.getMachineFunction().getFunction();
+ bool MinSize = F->hasFnAttribute(Attribute::MinSize);
+ if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+ // If this is an insertion of 32-bits into the low 32-bits of
+ // a vector, we prefer to generate a blend with immediate rather
+ // than an insertps. Blends are simpler operations in hardware and so
+ // will always have equal or better performance than insertps.
+ // But if optimizing for size and there's a load folding opportunity,
+ // generate insertps because blendps does not have a 32-bit memory
+ // operand form.
+ N2 = DAG.getIntPtrConstant(1);
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
+ }
N2 = DAG.getIntPtrConstant(IdxVal << 4);
// Create this as a scalar to vector..
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
Modified: llvm/trunk/test/CodeGen/X86/sse41.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse41.ll?rev=232850&r1=232849&r2=232850&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse41.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse41.ll Fri Mar 20 16:19:52 2015
@@ -199,28 +199,51 @@ define <4 x float> @insertps_1(<4 x floa
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
-define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
-; X32-LABEL: insertps_2:
+; When optimizing for speed, prefer blendps over insertps even if it means we have to
+; generate a separate movss to load the scalar operand.
+define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
+; X32-LABEL: blendps_not_insertps_1:
+; X32: ## BB#0:
+; X32-NEXT: movss {{.*#+}} xmm1
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: blendps_not_insertps_1:
+; X64: ## BB#0:
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
+ ret <4 x float> %tmp1
+}
+
+; When optimizing for size, generate an insertps if there's a load fold opportunity.
+; The difference between i386 and x86-64 ABIs for the float operand means we should
+; generate an insertps for X32 but not for X64!
+define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
+; X32-LABEL: insertps_or_blendps:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT: retl
;
-; X64-LABEL: insertps_2:
+; X64-LABEL: insertps_or_blendps:
; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
%tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
ret <4 x float> %tmp1
}
-define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
-; X32-LABEL: insertps_3:
+
+; An insert into the low 32-bits of a vector from the low 32-bits of another vector
+; is always just a blendps because blendps is never more expensive than insertps.
+define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
+; X32-LABEL: blendps_not_insertps_2:
; X32: ## BB#0:
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
-; X64-LABEL: insertps_3:
+; X64-LABEL: blendps_not_insertps_2:
; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
%tmp2 = extractelement <4 x float> %t2, i32 0
%tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
More information about the llvm-commits
mailing list