[PATCH] D35320: [X86][SSE] Add support for extending bool vectors bitcasted from scalars.

Elena Demikhovsky via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 18 05:38:12 PDT 2017


delena added a comment.

In https://reviews.llvm.org/D35320#811326, @RKSimon wrote:

> This required a modification to LowerVSETCC to reverse a DAG combine: (X & Y) != 0 --> (X & Y) == Y iff Y is a power of 2. This avoids an inversion and the creation of 0/-1 bit vectors.


Can you do this in a separate commit?
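For reference, the combine being reversed rests on a simple identity: when Y has exactly one bit set, (X & Y) can only be 0 or Y, so the predicates (X & Y) != 0 and (X & Y) == Y agree. A standalone sanity check (illustrative only, not part of the patch):

  #include <cassert>
  #include <cstdint>

  int main() {
    // Exhaustively check the identity for every single-bit Y
    // over a range of X values.
    for (uint32_t X = 0; X < 1024; ++X)
      for (int Shift = 0; Shift < 32; ++Shift) {
        uint32_t Y = 1u << Shift; // Y is a power of two.
        assert(((X & Y) != 0) == ((X & Y) == Y));
      }
    return 0;
  }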



================
Comment at: lib/Target/X86/X86ISelLowering.cpp:34419
+  for (int i = 0; i != NumElts; ++i) {
+    int BitIdx = (i % EltSizeInBits);
+    APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
----------------
EltSizeInBits should be equal to NumElts.
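If that invariant holds, the `i % EltSizeInBits` is a no-op for i < NumElts, so the loop could be simplified along these lines (a sketch only; the rest of the loop body is elided, and getOneBitSet is the existing APInt helper for a single-bit mask):

  for (int i = 0; i != NumElts; ++i) {
    // i < NumElts == EltSizeInBits, so no modulo is needed.
    APInt Bit = APInt::getOneBitSet(EltSizeInBits, i);
    ...
  }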


================
Comment at: test/CodeGen/X86/bitcast-int-to-vector-bool.ll:237
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:  .Lcfi0:
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
-; AVX2-NEXT:  .Lcfi1:
-; AVX2-NEXT:    .cfi_offset %rbp, -16
-; AVX2-NEXT:    movq %rsp, %rbp
-; AVX2-NEXT:  .Lcfi2:
-; AVX2-NEXT:    .cfi_def_cfa_register %rbp
-; AVX2-NEXT:    andq $-32, %rsp
-; AVX2-NEXT:    subq $32, %rsp
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $17, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    shrl $16, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm0
-; AVX2-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $18, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $19, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $20, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $21, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $22, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $23, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $24, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $25, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $26, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $27, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $28, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $29, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $30, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $31, %eax
-; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm1
-; AVX2-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $2, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $3, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $4, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $5, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $6, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $7, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $8, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $9, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $10, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $11, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $12, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $13, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $14, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    shrl $15, %edi
-; AVX2-NEXT:    andl $1, %edi
-; AVX2-NEXT:    vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX2-NEXT:    vmovd %edi, %xmm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
----------------
I can give a better sequence here:
vmovd   %edi, %xmm0
shrl    $16, %edi
vmovd   %edi, %xmm1
vinserti128     $1, %xmm1, %ymm0, %ymm0
vpshufb .LCPI0_0(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17]

It comes from the following IR:
define <32 x i8> @foo(i32 %a) {
  %b = lshr i32 %a, 16
  %vec = insertelement <8 x i32> undef, i32 %a, i32 0
  %vec1 = insertelement <8 x i32> %vec, i32 %b, i32 4
  %nvec = bitcast <8 x i32> %vec1 to <32 x i8>
  %res = shufflevector <32 x i8> %nvec, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, 
                                                                    i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, 
                                                                    i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, 
                                                                    i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  ret <32 x i8> %res
}
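
For what it's worth, this IR can be fed to llc to check the resulting sequence, e.g. (assuming an AVX2-capable target; exact register allocation and scheduling may differ):

  llc -mtriple=x86_64-unknown-unknown -mattr=+avx2 -o - foo.ll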

But I agree that this optimization should not be part of this patch.


Repository:
  rL LLVM

https://reviews.llvm.org/D35320




