[PATCH] D35320: [X86][SSE] Add support for extending bool vectors bitcasted from scalars.
Elena Demikhovsky via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 18 05:38:12 PDT 2017
delena added a comment.
In https://reviews.llvm.org/D35320#811326, @RKSimon wrote:
> This required a modification to LowerVSETCC to reverse a DAG combine: (X & Y) != 0 --> (X & Y) == Y iff Y is a power of 2. This avoids an inversion and the creation of 0/-1 bit vectors.
Can you do this in a separate commit?
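For reference, a minimal standalone sketch (plain C++, not the actual LowerVSETCC change) of why that rewrite is sound: when Y is a power of 2 it has exactly one set bit, so X & Y is either 0 or Y, and the two predicates always agree:

#include <cassert>
#include <cstdint>

bool viaNotEqualZero(uint32_t X, uint32_t Y) { return (X & Y) != 0; }
bool viaEqualY(uint32_t X, uint32_t Y) { return (X & Y) == Y; }

int main() {
  // Exhaustively check all single-bit Y against a range of X values.
  for (uint32_t X = 0; X < 256; ++X)
    for (unsigned B = 0; B < 8; ++B) {
      uint32_t Y = 1u << B; // power of 2: exactly one bit set
      assert(viaNotEqualZero(X, Y) == viaEqualY(X, Y));
    }
  return 0;
}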
================
Comment at: lib/Target/X86/X86ISelLowering.cpp:34419
+ for (int i = 0; i != NumElts; ++i) {
+ int BitIdx = (i % EltSizeInBits);
+ APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
----------------
EltSizeInBits should be equal to NumElts.
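To illustrate (a standalone sketch with a hypothetical buildBitMasks helper, assuming NumElts == EltSizeInBits as noted): the loop builds one single-bit mask per element, with bit i set for element i, so the modulo is a no-op when the two sizes match:

#include "llvm/ADT/APInt.h"
#include <vector>

// Hypothetical helper mirroring the loop above, not the patch code itself.
std::vector<llvm::APInt> buildBitMasks(unsigned NumElts) {
  unsigned EltSizeInBits = NumElts; // the point above: these should match
  std::vector<llvm::APInt> Masks;
  for (unsigned i = 0; i != NumElts; ++i) {
    unsigned BitIdx = i % EltSizeInBits; // == i when the sizes are equal
    // getBitsSet(width, lo, hi) sets bits [lo, hi): a one-bit mask at BitIdx.
    Masks.push_back(llvm::APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1));
  }
  return Masks;
}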
================
Comment at: test/CodeGen/X86/bitcast-int-to-vector-bool.ll:237
; AVX2: # BB#0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .Lcfi0:
-; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: .Lcfi1:
-; AVX2-NEXT: .cfi_offset %rbp, -16
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: .Lcfi2:
-; AVX2-NEXT: .cfi_def_cfa_register %rbp
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $32, %rsp
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $17, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movl %edi, %ecx
-; AVX2-NEXT: shrl $16, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $18, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $19, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $20, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $21, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $22, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $23, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $24, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $25, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $26, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $27, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $28, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $29, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $30, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $31, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movl %edi, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm1
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $2, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $4, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $5, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $6, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $7, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $8, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $9, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $10, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $11, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $12, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $13, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $14, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX2-NEXT: shrl $15, %edi
-; AVX2-NEXT: andl $1, %edi
-; AVX2-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
----------------
I can give a better sequence here:
vmovd %edi, %xmm0
shrl $16, %edi
vmovd %edi, %xmm1
vinserti128 $1, %xmm1, %ymm0, %ymm0
vpshufb .LCPI0_0(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17]
It comes from:
define <32 x i8> @foo(i32 %a) {
  %b = lshr i32 %a, 16
  %vec = insertelement <8 x i32> undef, i32 %a, i32 0
  %vec1 = insertelement <8 x i32> %vec, i32 %b, i32 4
  %nvec = bitcast <8 x i32> %vec1 to <32 x i8>
  %res = shufflevector <32 x i8> %nvec, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
                                                                     i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
                                                                     i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16,
                                                                     i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  ret <32 x i8> %res
}
But I agree that this optimization should not be part of this patch.
Repository:
rL LLVM
https://reviews.llvm.org/D35320