[llvm] 41f4cd6 - [X86] Don't scalarize gather/scatters with non-power of 2 element counts. Widen instead.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 15 23:23:48 PDT 2020
Author: Craig Topper
Date: 2020-09-15T23:22:53-07:00
New Revision: 41f4cd60d54d94e8dac4bbd8d9961dc8ad4a64fc
URL: https://github.com/llvm/llvm-project/commit/41f4cd60d54d94e8dac4bbd8d9961dc8ad4a64fc
DIFF: https://github.com/llvm/llvm-project/commit/41f4cd60d54d94e8dac4bbd8d9961dc8ad4a64fc.diff
LOG: [X86] Don't scalarize gather/scatters with non-power of 2 element counts. Widen instead.
We can pad the mask with zeros in order to widen. We already do
this for power-of-2 types that are smaller than a legal type.
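For intuition, here is a minimal IR sketch of the widening idea (the
function name is hypothetical and the real widening happens during type
legalization in SelectionDAG, not as an IR rewrite): a <3 x i32> gather
is treated as a <4 x i32> gather whose extra mask lane is a constant
false, so the padding element is never accessed.

define <3 x i32> @gather_v3i32_widened(<3 x i32*> %ptrs, <3 x i1> %mask, <3 x i32> %passthru) {
  ; Pad the mask with a zero (false) lane: the padding element is
  ; inactive, so no memory is touched for it.
  %mask4 = shufflevector <3 x i1> %mask, <3 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Widen the pointers and passthru; lane 3 may be undef because its
  ; mask bit is known false.
  %ptrs4 = shufflevector <3 x i32*> %ptrs, <3 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %pt4 = shufflevector <3 x i32> %passthru, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %wide = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs4, i32 4, <4 x i1> %mask4, <4 x i32> %pt4)
  ; Drop the padding lane to recover the original <3 x i32> result.
  %res = shufflevector <4 x i32> %wide, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %res
}

declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)

Padding with false lanes is cheap here because AVX-512 gathers and
scatters already take a mask register, so the widened form maps to a
single vpgatherqd/vpgatherdd instead of a branch per lane, as the
updated CHECK lines in the test diff below show.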
Added:
Modified:
llvm/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/test/CodeGen/X86/masked_gather_scatter.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 03f8be094c25..8ce9749dc2d6 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4283,7 +4283,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
// scalarize it.
if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
unsigned NumElts = DataVTy->getNumElements();
- if (NumElts == 1 || !isPowerOf2_32(NumElts))
+ if (NumElts == 1)
return false;
}
Type *ScalarTy = DataTy->getScalarType();
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 6f2298c967e9..948928099d38 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1629,182 +1629,122 @@ define <16 x float> @test29(float* %base, <16 x i32> %ind) {
ret <16 x float>%res
}
-; Check non-power-of-2 case. It should be scalarized.
declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
; KNL_64-LABEL: test30:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: andb $1, %dil
-; KNL_64-NEXT: andb $1, %sil
-; KNL_64-NEXT: addb %sil, %sil
-; KNL_64-NEXT: orb %dil, %sil
-; KNL_64-NEXT: andb $1, %dl
-; KNL_64-NEXT: shlb $2, %dl
-; KNL_64-NEXT: orb %sil, %dl
+; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; KNL_64-NEXT: movw $-3, %ax
+; KNL_64-NEXT: kmovw %eax, %k0
+; KNL_64-NEXT: andl $1, %edi
+; KNL_64-NEXT: kmovw %edi, %k1
+; KNL_64-NEXT: kandw %k0, %k1, %k0
+; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: kshiftlw $15, %k1, %k1
+; KNL_64-NEXT: kshiftrw $14, %k1, %k1
+; KNL_64-NEXT: korw %k1, %k0, %k0
+; KNL_64-NEXT: movw $-5, %ax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: kandw %k1, %k0, %k0
+; KNL_64-NEXT: kmovw %edx, %k1
+; KNL_64-NEXT: kshiftlw $15, %k1, %k1
+; KNL_64-NEXT: kshiftrw $13, %k1, %k1
+; KNL_64-NEXT: korw %k1, %k0, %k0
+; KNL_64-NEXT: kshiftlw $12, %k0, %k0
+; KNL_64-NEXT: kshiftrw $12, %k0, %k1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; KNL_64-NEXT: testb $1, %dl
-; KNL_64-NEXT: jne .LBB31_1
-; KNL_64-NEXT: # %bb.2: # %else
-; KNL_64-NEXT: testb $2, %dl
-; KNL_64-NEXT: jne .LBB31_3
-; KNL_64-NEXT: .LBB31_4: # %else2
-; KNL_64-NEXT: testb $4, %dl
-; KNL_64-NEXT: jne .LBB31_5
-; KNL_64-NEXT: .LBB31_6: # %else5
-; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_64-NEXT: vzeroupper
-; KNL_64-NEXT: retq
-; KNL_64-NEXT: .LBB31_1: # %cond.load
-; KNL_64-NEXT: vmovq %xmm0, %rax
-; KNL_64-NEXT: vpinsrd $0, (%rax), %xmm2, %xmm2
-; KNL_64-NEXT: testb $2, %dl
-; KNL_64-NEXT: je .LBB31_4
-; KNL_64-NEXT: .LBB31_3: # %cond.load1
-; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
-; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
-; KNL_64-NEXT: testb $4, %dl
-; KNL_64-NEXT: je .LBB31_6
-; KNL_64-NEXT: .LBB31_5: # %cond.load4
-; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL_64-NEXT: vmovq %xmm0, %rax
-; KNL_64-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2
+; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1}
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test30:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: pushl %eax
-; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
+; KNL_32-NEXT: movw $-3, %ax
+; KNL_32-NEXT: kmovw %eax, %k0
; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
-; KNL_32-NEXT: andb $1, %al
-; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %cl
-; KNL_32-NEXT: andb $1, %cl
-; KNL_32-NEXT: addb %cl, %cl
-; KNL_32-NEXT: orb %al, %cl
+; KNL_32-NEXT: andl $1, %eax
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: kandw %k0, %k1, %k0
; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
-; KNL_32-NEXT: andb $1, %al
-; KNL_32-NEXT: shlb $2, %al
-; KNL_32-NEXT: orb %cl, %al
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: kshiftlw $15, %k1, %k1
+; KNL_32-NEXT: kshiftrw $14, %k1, %k1
+; KNL_32-NEXT: korw %k1, %k0, %k0
+; KNL_32-NEXT: movw $-5, %ax
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: kandw %k1, %k0, %k0
+; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: kshiftlw $15, %k1, %k1
+; KNL_32-NEXT: kshiftrw $13, %k1, %k1
+; KNL_32-NEXT: korw %k1, %k0, %k0
+; KNL_32-NEXT: kshiftlw $12, %k0, %k0
+; KNL_32-NEXT: kshiftrw $12, %k0, %k1
; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; KNL_32-NEXT: testb $1, %al
-; KNL_32-NEXT: jne .LBB31_1
-; KNL_32-NEXT: # %bb.2: # %else
-; KNL_32-NEXT: testb $2, %al
-; KNL_32-NEXT: jne .LBB31_3
-; KNL_32-NEXT: .LBB31_4: # %else2
-; KNL_32-NEXT: testb $4, %al
-; KNL_32-NEXT: je .LBB31_6
-; KNL_32-NEXT: .LBB31_5: # %cond.load4
-; KNL_32-NEXT: vpextrd $2, %xmm0, %eax
-; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2
-; KNL_32-NEXT: .LBB31_6: # %else5
+; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_32-NEXT: popl %eax
-; KNL_32-NEXT: .cfi_def_cfa_offset 4
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
-; KNL_32-NEXT: .LBB31_1: # %cond.load
-; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: vmovd %xmm0, %ecx
-; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2
-; KNL_32-NEXT: testb $2, %al
-; KNL_32-NEXT: je .LBB31_4
-; KNL_32-NEXT: .LBB31_3: # %cond.load1
-; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
-; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm2, %xmm2
-; KNL_32-NEXT: testb $4, %al
-; KNL_32-NEXT: jne .LBB31_5
-; KNL_32-NEXT: jmp .LBB31_6
;
; SKX-LABEL: test30:
; SKX: # %bb.0:
-; SKX-NEXT: andb $1, %dil
-; SKX-NEXT: andb $1, %sil
-; SKX-NEXT: addb %sil, %sil
-; SKX-NEXT: orb %dil, %sil
-; SKX-NEXT: andb $1, %dl
-; SKX-NEXT: shlb $2, %dl
-; SKX-NEXT: orb %sil, %dl
+; SKX-NEXT: movb $-3, %al
+; SKX-NEXT: kmovw %eax, %k0
+; SKX-NEXT: kmovw %edi, %k1
+; SKX-NEXT: kshiftlb $7, %k1, %k1
+; SKX-NEXT: kshiftrb $7, %k1, %k1
+; SKX-NEXT: kandw %k0, %k1, %k0
+; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kshiftlb $7, %k1, %k1
+; SKX-NEXT: kshiftrb $6, %k1, %k1
+; SKX-NEXT: korw %k1, %k0, %k0
+; SKX-NEXT: movb $-5, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: kandw %k1, %k0, %k0
+; SKX-NEXT: kmovw %edx, %k1
+; SKX-NEXT: kshiftlb $7, %k1, %k1
+; SKX-NEXT: kshiftrb $5, %k1, %k1
+; SKX-NEXT: korw %k1, %k0, %k1
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; SKX-NEXT: testb $1, %dl
-; SKX-NEXT: jne .LBB31_1
-; SKX-NEXT: # %bb.2: # %else
-; SKX-NEXT: testb $2, %dl
-; SKX-NEXT: jne .LBB31_3
-; SKX-NEXT: .LBB31_4: # %else2
-; SKX-NEXT: testb $4, %dl
-; SKX-NEXT: jne .LBB31_5
-; SKX-NEXT: .LBB31_6: # %else5
-; SKX-NEXT: vmovdqa %xmm2, %xmm0
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
-; SKX-NEXT: .LBB31_1: # %cond.load
-; SKX-NEXT: vmovq %xmm0, %rax
-; SKX-NEXT: vpinsrd $0, (%rax), %xmm2, %xmm2
-; SKX-NEXT: testb $2, %dl
-; SKX-NEXT: je .LBB31_4
-; SKX-NEXT: .LBB31_3: # %cond.load1
-; SKX-NEXT: vpextrq $1, %xmm0, %rax
-; SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
-; SKX-NEXT: testb $4, %dl
-; SKX-NEXT: je .LBB31_6
-; SKX-NEXT: .LBB31_5: # %cond.load4
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vmovq %xmm0, %rax
-; SKX-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2
+; SKX-NEXT: vpgatherqd (,%ymm0), %xmm2 {%k1}
; SKX-NEXT: vmovdqa %xmm2, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test30:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: pushl %eax
-; SKX_32-NEXT: .cfi_def_cfa_offset 8
+; SKX_32-NEXT: movb $-3, %al
+; SKX_32-NEXT: kmovw %eax, %k0
+; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
+; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kshiftlb $7, %k1, %k1
+; SKX_32-NEXT: kshiftrb $7, %k1, %k1
+; SKX_32-NEXT: kandw %k0, %k1, %k0
; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
-; SKX_32-NEXT: andb $1, %al
-; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %cl
-; SKX_32-NEXT: andb $1, %cl
-; SKX_32-NEXT: addb %cl, %cl
-; SKX_32-NEXT: orb %al, %cl
+; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kshiftlb $7, %k1, %k1
+; SKX_32-NEXT: kshiftrb $6, %k1, %k1
+; SKX_32-NEXT: korw %k1, %k0, %k0
+; SKX_32-NEXT: movb $-5, %al
+; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kandw %k1, %k0, %k0
; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
-; SKX_32-NEXT: andb $1, %al
-; SKX_32-NEXT: shlb $2, %al
-; SKX_32-NEXT: orb %cl, %al
+; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kshiftlb $7, %k1, %k1
+; SKX_32-NEXT: kshiftrb $5, %k1, %k1
+; SKX_32-NEXT: korw %k1, %k0, %k1
; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; SKX_32-NEXT: testb $1, %al
-; SKX_32-NEXT: jne .LBB31_1
-; SKX_32-NEXT: # %bb.2: # %else
-; SKX_32-NEXT: testb $2, %al
-; SKX_32-NEXT: jne .LBB31_3
-; SKX_32-NEXT: .LBB31_4: # %else2
-; SKX_32-NEXT: testb $4, %al
-; SKX_32-NEXT: je .LBB31_6
-; SKX_32-NEXT: .LBB31_5: # %cond.load4
-; SKX_32-NEXT: vpextrd $2, %xmm0, %eax
-; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2
-; SKX_32-NEXT: .LBB31_6: # %else5
+; SKX_32-NEXT: vpgatherdd (,%xmm0), %xmm2 {%k1}
; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
-; SKX_32-NEXT: popl %eax
-; SKX_32-NEXT: .cfi_def_cfa_offset 4
; SKX_32-NEXT: retl
-; SKX_32-NEXT: .LBB31_1: # %cond.load
-; SKX_32-NEXT: .cfi_def_cfa_offset 8
-; SKX_32-NEXT: vmovd %xmm0, %ecx
-; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2
-; SKX_32-NEXT: testb $2, %al
-; SKX_32-NEXT: je .LBB31_4
-; SKX_32-NEXT: .LBB31_3: # %cond.load1
-; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx
-; SKX_32-NEXT: vpinsrd $1, (%ecx), %xmm2, %xmm2
-; SKX_32-NEXT: testb $4, %al
-; SKX_32-NEXT: jne .LBB31_5
-; SKX_32-NEXT: jmp .LBB31_6
%sext_ind = sext <3 x i32> %ind to <3 x i64>
%gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
@@ -1817,174 +1757,113 @@ declare void @llvm.masked.scatter.v3i32.v3p0i32(<3 x i32>, <3 x i32*>, i32, <3 x
define void @test30b(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
; KNL_64-LABEL: test30b:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: andb $1, %dil
-; KNL_64-NEXT: andb $1, %sil
-; KNL_64-NEXT: addb %sil, %sil
-; KNL_64-NEXT: orb %dil, %sil
-; KNL_64-NEXT: andb $1, %dl
-; KNL_64-NEXT: shlb $2, %dl
-; KNL_64-NEXT: orb %sil, %dl
+; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; KNL_64-NEXT: movw $-3, %ax
+; KNL_64-NEXT: kmovw %eax, %k0
+; KNL_64-NEXT: andl $1, %edi
+; KNL_64-NEXT: kmovw %edi, %k1
+; KNL_64-NEXT: kandw %k0, %k1, %k0
+; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: kshiftlw $15, %k1, %k1
+; KNL_64-NEXT: kshiftrw $14, %k1, %k1
+; KNL_64-NEXT: korw %k1, %k0, %k0
+; KNL_64-NEXT: movw $-5, %ax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: kandw %k1, %k0, %k0
+; KNL_64-NEXT: kmovw %edx, %k1
+; KNL_64-NEXT: kshiftlw $15, %k1, %k1
+; KNL_64-NEXT: kshiftrw $13, %k1, %k1
+; KNL_64-NEXT: korw %k1, %k0, %k0
+; KNL_64-NEXT: kshiftlw $12, %k0, %k0
+; KNL_64-NEXT: kshiftrw $12, %k0, %k1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; KNL_64-NEXT: testb $1, %dl
-; KNL_64-NEXT: jne .LBB32_1
-; KNL_64-NEXT: # %bb.2: # %else
-; KNL_64-NEXT: testb $2, %dl
-; KNL_64-NEXT: jne .LBB32_3
-; KNL_64-NEXT: .LBB32_4: # %else2
-; KNL_64-NEXT: testb $4, %dl
-; KNL_64-NEXT: jne .LBB32_5
-; KNL_64-NEXT: .LBB32_6: # %else4
-; KNL_64-NEXT: vzeroupper
-; KNL_64-NEXT: retq
-; KNL_64-NEXT: .LBB32_1: # %cond.store
-; KNL_64-NEXT: vmovq %xmm0, %rax
-; KNL_64-NEXT: vmovss %xmm2, (%rax)
-; KNL_64-NEXT: testb $2, %dl
-; KNL_64-NEXT: je .LBB32_4
-; KNL_64-NEXT: .LBB32_3: # %cond.store1
-; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
-; KNL_64-NEXT: vextractps $1, %xmm2, (%rax)
-; KNL_64-NEXT: testb $4, %dl
-; KNL_64-NEXT: je .LBB32_6
-; KNL_64-NEXT: .LBB32_5: # %cond.store3
-; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL_64-NEXT: vmovq %xmm0, %rax
-; KNL_64-NEXT: vextractps $2, %xmm2, (%rax)
+; KNL_64-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1}
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test30b:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: pushl %eax
-; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
+; KNL_32-NEXT: movw $-3, %ax
+; KNL_32-NEXT: kmovw %eax, %k0
; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
-; KNL_32-NEXT: andb $1, %al
-; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %cl
-; KNL_32-NEXT: andb $1, %cl
-; KNL_32-NEXT: addb %cl, %cl
-; KNL_32-NEXT: orb %al, %cl
+; KNL_32-NEXT: andl $1, %eax
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: kandw %k0, %k1, %k0
; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
-; KNL_32-NEXT: andb $1, %al
-; KNL_32-NEXT: shlb $2, %al
-; KNL_32-NEXT: orb %cl, %al
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: kshiftlw $15, %k1, %k1
+; KNL_32-NEXT: kshiftrw $14, %k1, %k1
+; KNL_32-NEXT: korw %k1, %k0, %k0
+; KNL_32-NEXT: movw $-5, %ax
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: kandw %k1, %k0, %k0
+; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: kshiftlw $15, %k1, %k1
+; KNL_32-NEXT: kshiftrw $13, %k1, %k1
+; KNL_32-NEXT: korw %k1, %k0, %k0
+; KNL_32-NEXT: kshiftlw $12, %k0, %k0
+; KNL_32-NEXT: kshiftrw $12, %k0, %k1
; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; KNL_32-NEXT: testb $1, %al
-; KNL_32-NEXT: jne .LBB32_1
-; KNL_32-NEXT: # %bb.2: # %else
-; KNL_32-NEXT: testb $2, %al
-; KNL_32-NEXT: jne .LBB32_3
-; KNL_32-NEXT: .LBB32_4: # %else2
-; KNL_32-NEXT: testb $4, %al
-; KNL_32-NEXT: jne .LBB32_5
-; KNL_32-NEXT: .LBB32_6: # %else4
-; KNL_32-NEXT: popl %eax
-; KNL_32-NEXT: .cfi_def_cfa_offset 4
-; KNL_32-NEXT: retl
-; KNL_32-NEXT: .LBB32_1: # %cond.store
-; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: vmovd %xmm0, %ecx
-; KNL_32-NEXT: vmovss %xmm2, (%ecx)
-; KNL_32-NEXT: testb $2, %al
-; KNL_32-NEXT: je .LBB32_4
-; KNL_32-NEXT: .LBB32_3: # %cond.store1
-; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
-; KNL_32-NEXT: vextractps $1, %xmm2, (%ecx)
-; KNL_32-NEXT: testb $4, %al
-; KNL_32-NEXT: je .LBB32_6
-; KNL_32-NEXT: .LBB32_5: # %cond.store3
-; KNL_32-NEXT: vpextrd $2, %xmm0, %eax
-; KNL_32-NEXT: vextractps $2, %xmm2, (%eax)
-; KNL_32-NEXT: popl %eax
-; KNL_32-NEXT: .cfi_def_cfa_offset 4
+; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test30b:
; SKX: # %bb.0:
-; SKX-NEXT: andb $1, %dil
-; SKX-NEXT: andb $1, %sil
-; SKX-NEXT: addb %sil, %sil
-; SKX-NEXT: orb %dil, %sil
-; SKX-NEXT: andb $1, %dl
-; SKX-NEXT: shlb $2, %dl
-; SKX-NEXT: orb %sil, %dl
+; SKX-NEXT: movb $-3, %al
+; SKX-NEXT: kmovw %eax, %k0
+; SKX-NEXT: kmovw %edi, %k1
+; SKX-NEXT: kshiftlb $7, %k1, %k1
+; SKX-NEXT: kshiftrb $7, %k1, %k1
+; SKX-NEXT: kandw %k0, %k1, %k0
+; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kshiftlb $7, %k1, %k1
+; SKX-NEXT: kshiftrb $6, %k1, %k1
+; SKX-NEXT: korw %k1, %k0, %k0
+; SKX-NEXT: movb $-5, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: kandw %k1, %k0, %k0
+; SKX-NEXT: kmovw %edx, %k1
+; SKX-NEXT: kshiftlb $7, %k1, %k1
+; SKX-NEXT: kshiftrb $5, %k1, %k1
+; SKX-NEXT: korw %k1, %k0, %k1
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; SKX-NEXT: testb $1, %dl
-; SKX-NEXT: jne .LBB32_1
-; SKX-NEXT: # %bb.2: # %else
-; SKX-NEXT: testb $2, %dl
-; SKX-NEXT: jne .LBB32_3
-; SKX-NEXT: .LBB32_4: # %else2
-; SKX-NEXT: testb $4, %dl
-; SKX-NEXT: jne .LBB32_5
-; SKX-NEXT: .LBB32_6: # %else4
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
-; SKX-NEXT: .LBB32_1: # %cond.store
-; SKX-NEXT: vmovq %xmm0, %rax
-; SKX-NEXT: vmovss %xmm2, (%rax)
-; SKX-NEXT: testb $2, %dl
-; SKX-NEXT: je .LBB32_4
-; SKX-NEXT: .LBB32_3: # %cond.store1
-; SKX-NEXT: vpextrq $1, %xmm0, %rax
-; SKX-NEXT: vextractps $1, %xmm2, (%rax)
-; SKX-NEXT: testb $4, %dl
-; SKX-NEXT: je .LBB32_6
-; SKX-NEXT: .LBB32_5: # %cond.store3
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vmovq %xmm0, %rax
-; SKX-NEXT: vextractps $2, %xmm2, (%rax)
+; SKX-NEXT: vpscatterqd %xmm2, (,%ymm0) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test30b:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: pushl %eax
-; SKX_32-NEXT: .cfi_def_cfa_offset 8
+; SKX_32-NEXT: movb $-3, %al
+; SKX_32-NEXT: kmovw %eax, %k0
+; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
+; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kshiftlb $7, %k1, %k1
+; SKX_32-NEXT: kshiftrb $7, %k1, %k1
+; SKX_32-NEXT: kandw %k0, %k1, %k0
; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
-; SKX_32-NEXT: andb $1, %al
-; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %cl
-; SKX_32-NEXT: andb $1, %cl
-; SKX_32-NEXT: addb %cl, %cl
-; SKX_32-NEXT: orb %al, %cl
+; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kshiftlb $7, %k1, %k1
+; SKX_32-NEXT: kshiftrb $6, %k1, %k1
+; SKX_32-NEXT: korw %k1, %k0, %k0
+; SKX_32-NEXT: movb $-5, %al
+; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kandw %k1, %k0, %k0
; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
-; SKX_32-NEXT: andb $1, %al
-; SKX_32-NEXT: shlb $2, %al
-; SKX_32-NEXT: orb %cl, %al
+; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kshiftlb $7, %k1, %k1
+; SKX_32-NEXT: kshiftrb $5, %k1, %k1
+; SKX_32-NEXT: korw %k1, %k0, %k1
; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; SKX_32-NEXT: testb $1, %al
-; SKX_32-NEXT: jne .LBB32_1
-; SKX_32-NEXT: # %bb.2: # %else
-; SKX_32-NEXT: testb $2, %al
-; SKX_32-NEXT: jne .LBB32_3
-; SKX_32-NEXT: .LBB32_4: # %else2
-; SKX_32-NEXT: testb $4, %al
-; SKX_32-NEXT: jne .LBB32_5
-; SKX_32-NEXT: .LBB32_6: # %else4
-; SKX_32-NEXT: popl %eax
-; SKX_32-NEXT: .cfi_def_cfa_offset 4
-; SKX_32-NEXT: retl
-; SKX_32-NEXT: .LBB32_1: # %cond.store
-; SKX_32-NEXT: .cfi_def_cfa_offset 8
-; SKX_32-NEXT: vmovd %xmm0, %ecx
-; SKX_32-NEXT: vmovss %xmm2, (%ecx)
-; SKX_32-NEXT: testb $2, %al
-; SKX_32-NEXT: je .LBB32_4
-; SKX_32-NEXT: .LBB32_3: # %cond.store1
-; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx
-; SKX_32-NEXT: vextractps $1, %xmm2, (%ecx)
-; SKX_32-NEXT: testb $4, %al
-; SKX_32-NEXT: je .LBB32_6
-; SKX_32-NEXT: .LBB32_5: # %cond.store3
-; SKX_32-NEXT: vpextrd $2, %xmm0, %eax
-; SKX_32-NEXT: vextractps $2, %xmm2, (%eax)
-; SKX_32-NEXT: popl %eax
-; SKX_32-NEXT: .cfi_def_cfa_offset 4
+; SKX_32-NEXT: vpscatterdd %xmm2, (,%xmm0) {%k1}
; SKX_32-NEXT: retl
%sext_ind = sext <3 x i32> %ind to <3 x i64>
%gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind