[PATCH] D35320: [X86][SSE] Add support for extending bool vectors bitcasted from scalars.

Elena Demikhovsky via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 18 05:38:12 PDT 2017


delena added a comment.

In https://reviews.llvm.org/D35320#811326, @RKSimon wrote:

> This required a modification to LowerVSETCC to reverse a DAG combine: (X & Y) != 0 --> (X & Y) == Y iff Y is a power of 2. This avoids an inversion and the creation of 0/-1 bit vectors.


Can you do this in a separate commit?
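For reference, the combine being reversed rests on a simple identity: when Y has exactly one bit set, (X & Y) can only be 0 or Y, so the predicates (X & Y) != 0 and (X & Y) == Y agree. A standalone sanity check (illustrative only, not part of the patch):

  #include <cassert>
  #include <cstdint>

  int main() {
    // Exhaustively check the identity for every single-bit Y
    // over a range of X values.
    for (uint32_t X = 0; X < 1024; ++X)
      for (int Shift = 0; Shift < 32; ++Shift) {
        uint32_t Y = 1u << Shift; // Y is a power of two.
        assert(((X & Y) != 0) == ((X & Y) == Y));
      }
    return 0;
  }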



================
Comment at: lib/Target/X86/X86ISelLowering.cpp:34419
+  for (int i = 0; i != NumElts; ++i) {
+    int BitIdx = (i % EltSizeInBits);
+    APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
----------------
EltSizeInBits should be equal to NumElts.
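If that invariant holds, the `i % EltSizeInBits` is a no-op for i < NumElts, so the loop could be simplified along these lines (a sketch only; the rest of the loop body is elided, and getOneBitSet is the existing APInt helper for a single-bit mask):

  for (int i = 0; i != NumElts; ++i) {
    // i < NumElts == EltSizeInBits, so no modulo is needed.
    APInt Bit = APInt::getOneBitSet(EltSizeInBits, i);
    ...
  }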


================
Comment at: test/CodeGen/X86/bitcast-int-to-vector-bool.ll:237
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:  .Lcfi0:
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
-; AVX2-NEXT:  .Lcfi1:
-; AVX2-NEXT:    .cfi_offset %rbp, -16
-; AVX2-NEXT:    movq %rsp, %rbp
-; AVX2-NEXT:  .Lcfi2:
-; AVX2-NEXT:    .cfi_def_cfa_register %rbp
-; AVX2-NEXT:    andq $-32, %rsp
-; AVX2-NEXT:    subq $32, %rsp
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $17, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    shrl $16, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm0
-; AVX2-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $18, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $19, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $20, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $21, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $22, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $23, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $24, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $25, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $26, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $27, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $28, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $29, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $30, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $31, %eax
-; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm1
-; AVX2-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $2, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $3, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $4, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $5, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $6, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $7, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $8, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $9, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $10, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $11, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $12, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $13, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrl $14, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    shrl $15, %edi
-; AVX2-NEXT:    andl $1, %edi
-; AVX2-NEXT:    vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX2-NEXT:    vmovd %edi, %xmm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
----------------
I can give a better sequence here:
vmovd   %edi, %xmm0
shrl    $16, %edi
vmovd   %edi, %xmm1
vinserti128     $1, %xmm1, %ymm0, %ymm0
vpshufb .LCPI0_0(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17]

It comes from the following IR:
define <32 x i8> @foo(i32 %a) {
  %b = lshr i32 %a, 16
  %vec = insertelement <8 x i32> undef, i32 %a, i32 0
  %vec1 = insertelement <8 x i32> %vec, i32 %b, i32 4
  %nvec = bitcast <8 x i32> %vec1 to <32 x i8>
  %res = shufflevector <32 x i8> %nvec, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, 
                                                                    i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, 
                                                                    i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, 
                                                                    i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  ret <32 x i8> %res
}
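
For what it's worth, this IR can be fed to llc to check the resulting sequence, e.g. (assuming an AVX2-capable target; exact register allocation and scheduling may differ):

  llc -mtriple=x86_64-unknown-unknown -mattr=+avx2 -o - foo.ll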

But I agree that this optimization should not be part of this patch.


Repository:
  rL LLVM

https://reviews.llvm.org/D35320




