[PATCH] D105390: [X86] Lower insertions into upper half of an 256-bit vector as broadcast+blend (PR50971)
Roman Lebedev via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 6 15:09:43 PDT 2021
lebedev.ri added inline comments.
================
Comment at: llvm/test/CodeGen/X86/avx512-insert-extract.ll:653
define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
-; CHECK-LABEL: insert_v16i16:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; KNL-LABEL: insert_v16i16:
+; KNL: ## %bb.0:
----------------
lebedev.ri wrote:
> craig.topper wrote:
> > Is this really better? I assume this what we get for an AVX2 target too. Not just KNL?
> Multi-insert case does seem questionable, yes. We could improve this via:
> ```
> define <16 x i16> @src(<16 x i16> %x, i16 %y, i16* %ptr) {
> %val = load i16, i16* %ptr
> %r1 = insertelement <16 x i16> %x, i16 %val, i32 1
> %r2 = insertelement <16 x i16> %r1, i16 %y, i32 9
> ret <16 x i16> %r2
> }
> define <16 x i16> @tgt(<16 x i16> %x, i16 %y, i16* %ptr) {
> %val = load i16, i16* %ptr
> %r1 = insertelement <16 x i16> undef, i16 %val, i32 1
> %r2 = insertelement <16 x i16> %r1, i16 %y, i32 9
> %r3 = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %x, <16 x i16> %r2
> ret <16 x i16> %r3
> }
> ```
> then we get
> ```
> .text
> .file "test.ll"
> .globl src # -- Begin function src
> .p2align 4, 0x90
> .type src, at function
> src: # @src
> .cfi_startproc
> # %bb.0:
> vpbroadcastw (%rsi), %xmm1
> vpblendw $2, %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
> vmovd %edi, %xmm2
> vpbroadcastw %xmm2, %ymm2
> vpblendw $2, %ymm2, %ymm0, %ymm0 # ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15]
> vpblendd $240, %ymm0, %ymm1, %ymm0 # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
> retq
> .Lfunc_end0:
> .size src, .Lfunc_end0-src
> .cfi_endproc
> # -- End function
> .globl tgt # -- Begin function tgt
> .p2align 4, 0x90
> .type tgt, at function
> tgt: # @tgt
> .cfi_startproc
> # %bb.0:
> vpbroadcastw (%rsi), %xmm1
> vmovd %edi, %xmm2
> vpslld $16, %xmm2, %xmm2
> vinserti128 $1, %xmm2, %ymm1, %ymm1
> vpblendw $2, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
> retq
> .Lfunc_end1:
> .size tgt, .Lfunc_end1-tgt
> .cfi_endproc
> # -- End function
> .section ".note.GNU-stack","", at progbits
> ```
... something like D105514, but clearly that is also not as straight-forward.
Thoughts?
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D105390/new/
https://reviews.llvm.org/D105390
More information about the llvm-commits
mailing list