[PATCH] D48936: [X86][SSE] Prefer BLEND(SHL(v, c1), SHL(v, c2)) over MUL(v, c3)

Simon Pilgrim via Phabricator via llvm-commits <llvm-commits at lists.llvm.org>
Wed Jul 4 06:40:46 PDT 2018


RKSimon created this revision.
RKSimon added reviewers: craig.topper, spatel, andreadb, lebedev.ri.

Now that https://reviews.llvm.org/rL336250 has (hopefully) landed, I'd like to prefer 2 immediate shifts + a shuffle blend over performing a multiply. Despite the increase in instructions, this is quicker (especially for targets with slow v4i32 multiplies) and avoids loads and constant pool usage. It does, however, increase register pressure. Code size goes up a little, but by less than what we save in constant pool data.
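
To illustrate the kind of case this affects, here is a sketch (the IR function and its name are hypothetical; the "before" line is the pmulld that gets replaced and the "after" sequence is taken from the updated SSE41 CHECK lines in vec_shift6.ll below):

  ; Non-uniform constant left shift: lanes 0,1 shift by 0, lanes 2,3 shift by 1
  ; (equivalent to a multiply by <1,1,2,2>).
  define <4 x i32> @shl_nonuniform(<4 x i32> %a) {
    %r = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
    ret <4 x i32> %r
  }

  ; Before this patch (SSE41):
  ;   pmulld {{.*}}(%rip), %xmm0           # multiplier <1,1,2,2> loaded from the constant pool
  ;
  ; After this patch (SSE41):
  ;   movdqa  %xmm0, %xmm1
  ;   pslld   $1, %xmm1
  ;   pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
  ;   movdqa  %xmm1, %xmm0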


Repository:
  rL LLVM

https://reviews.llvm.org/D48936

Files:
  lib/Target/X86/X86ISelLowering.cpp
  test/CodeGen/X86/combine-shl.ll
  test/CodeGen/X86/vec_shift6.ll
  test/CodeGen/X86/widen_arith-4.ll


Index: test/CodeGen/X86/widen_arith-4.ll
===================================================================
--- test/CodeGen/X86/widen_arith-4.ll
+++ test/CodeGen/X86/widen_arith-4.ll
@@ -14,19 +14,21 @@
 ; CHECK-NEXT:    movw $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <2,4,2,2,2,u,u,u>
 ; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_2: # %forbody
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movslq -{{[0-9]+}}(%rsp), %rax
 ; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; CHECK-NEXT:    shlq $4, %rax
 ; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT:    movdqa (%rdx,%rax), %xmm2
-; CHECK-NEXT:    psubw %xmm0, %xmm2
-; CHECK-NEXT:    pmullw %xmm1, %xmm2
-; CHECK-NEXT:    pextrw $4, %xmm2, 8(%rcx,%rax)
+; CHECK-NEXT:    movdqa (%rdx,%rax), %xmm1
+; CHECK-NEXT:    psubw %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-NEXT:    psllw $2, %xmm2
+; CHECK-NEXT:    psllw $1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
+; CHECK-NEXT:    pextrw $4, %xmm1, 8(%rcx,%rax)
 ; CHECK-NEXT:    movq %xmm2, (%rcx,%rax)
 ; CHECK-NEXT:    incl -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:  .LBB0_1: # %forcond
Index: test/CodeGen/X86/vec_shift6.ll
===================================================================
--- test/CodeGen/X86/vec_shift6.ll
+++ test/CodeGen/X86/vec_shift6.ll
@@ -71,7 +71,10 @@
 define <4 x i32> @test4(<4 x i32> %a) {
 ; SSE-LABEL: test4:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    pslld $1, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: test4:
Index: test/CodeGen/X86/combine-shl.ll
===================================================================
--- test/CodeGen/X86/combine-shl.ll
+++ test/CodeGen/X86/combine-shl.ll
@@ -212,8 +212,14 @@
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SSE-NEXT:    pmovsxwd %xmm1, %xmm1
 ; SSE-NEXT:    pmovsxwd %xmm0, %xmm0
-; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
-; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pslld $30, %xmm2
+; SSE-NEXT:    pslld $31, %xmm0
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    pslld $28, %xmm2
+; SSE-NEXT:    pslld $29, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_shl_ext_shl1:
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -23434,12 +23434,6 @@
     return R;
   }
 
-  // If possible, lower this packed shift into a vector multiply instead of
-  // expanding it into a sequence of scalar shifts.
-  if (Op.getOpcode() == ISD::SHL)
-    if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
-      return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
-
   // If possible, lower this shift as a sequence of two shifts by
   // constant plus a BLENDing shuffle instead of scalarizing it.
   // Example:
@@ -23485,6 +23479,12 @@
     }
   }
 
+  // If possible, lower this packed shift into a vector multiply instead of
+  // expanding it into a sequence of scalar shifts.
+  if (Op.getOpcode() == ISD::SHL)
+    if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
+      return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
+
   // v4i32 Non Uniform Shifts.
   // If the shift amount is constant we can shift each lane using the SSE2
   // immediate shifts, else we need to zero-extend each lane to the lower i64



