[llvm] 352df10 - [X86][AVX] matchShuffleAsBlend - use isElementEquivalent to help match broadcast/repeated elements
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 22 07:26:53 PDT 2021
Author: Simon Pilgrim
Date: 2021-08-22T15:26:17+01:00
New Revision: 352df10a238f7a2abbfa55a2acd5a43fcff80410
URL: https://github.com/llvm/llvm-project/commit/352df10a238f7a2abbfa55a2acd5a43fcff80410
DIFF: https://github.com/llvm/llvm-project/commit/352df10a238f7a2abbfa55a2acd5a43fcff80410.diff
LOG: [X86][AVX] matchShuffleAsBlend - use isElementEquivalent to help match broadcast/repeated elements
Extend matchShuffleAsBlend so that it not only matches known in-place elements for BLEND shuffles, but also uses isElementEquivalent to determine whether the element referenced by the shuffle mask has the same value as the in-place element.
This allows us to replace a number of insertps instructions with more general blendps instructions (which offer better opportunities for commutation, concatenation, etc.).
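As an illustration (a minimal sketch with a hypothetical function name, modelled on the insertps_from_broadcast tests changed below, not copied from them): the second shuffle operand is a splat, so an output lane that references lane 0 of the splat is equivalent to referencing its own in-place lane, which is exactly what the isElementEquivalent check now recognises.

define <4 x float> @blend_splat_sketch(<4 x float> %a, float %s) {
  %v0 = insertelement <4 x float> undef, float %s, i32 0
  %splat = shufflevector <4 x float> %v0, <4 x float> undef, <4 x i32> zeroinitializer
  ; Output lane 3 uses mask index 4 (lane 0 of %splat). Because %splat is a
  ; broadcast, that element is equivalent to mask index 7 (lane 3 of %splat),
  ; so the whole shuffle is a pure per-lane blend of %a and %splat.
  %r = shufflevector <4 x float> %a, <4 x float> %splat, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %r
}

With the mask element rewritten in-place to i + Size, the shuffle above can lower to vblendps $8 (bit 3 of the blend immediate) instead of vinsertps $48, as seen in the test diffs below.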
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx.ll
llvm/test/CodeGen/X86/horizontal-sum.ll
llvm/test/CodeGen/X86/sse41.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 89508b04388f..53753be52946 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12248,10 +12248,15 @@ static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
- if (M == i)
+ if (M == i ||
+ (0 <= M && M < Size && IsElementEquivalent(Size, V1, V1, M, i))) {
+ Mask[i] = i;
continue;
- if (M == i + Size) {
+ }
+ if (M == (i + Size) ||
+ (Size <= M && IsElementEquivalent(Size, V2, V2, M - Size, i))) {
BlendMask |= 1ull << i;
+ Mask[i] = i + Size;
continue;
}
if (Zeroable[i]) {
diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
index cc6f839e5d5c..7aa25855b96e 100644
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -138,18 +138,17 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float
ret <4 x float> %7
}
-;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
; X86-LABEL: insertps_from_broadcast_multiple_use:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4
-; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X86-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X86-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
+; X86-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
; X86-NEXT: vaddps %xmm2, %xmm1, %xmm1
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: retl
@@ -157,11 +156,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; X64-LABEL: insertps_from_broadcast_multiple_use:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4
-; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X64-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
+; X64-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
; X64-NEXT: vaddps %xmm2, %xmm1, %xmm1
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 99a6e0e2b553..5c16cfa387d5 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -39,7 +39,7 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
-; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[1]
+; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: retq
@@ -58,7 +58,7 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
-; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[1]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: retq
@@ -227,7 +227,7 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
+; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
@@ -248,7 +248,7 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
+; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
@@ -271,7 +271,7 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
@@ -292,7 +292,7 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index d15e157dc5d2..22f428b0604d 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -1661,15 +1661,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-AVX1-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X86-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X86-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X86-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X86-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]
+; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
+; X86-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[3]
+; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
+; X86-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[3]
; X86-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
; X86-AVX1-NEXT: retl ## encoding: [0xc3]
@@ -1679,16 +1679,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-AVX512-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X86-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X86-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
+; X86-AVX512-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[3]
+; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
+; X86-AVX512-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[3]
; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X86-AVX512-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X86-AVX512-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]
-; X86-AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
+; X86-AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
@@ -1712,15 +1712,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use:
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
-; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X64-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X64-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X64-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X64-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]
+; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
+; X64-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[3]
+; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
+; X64-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[3]
; X64-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
; X64-AVX1-NEXT: retq ## encoding: [0xc3]
@@ -1728,16 +1728,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
-; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X64-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X64-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
+; X64-AVX512-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[3]
+; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
+; X64-AVX512-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[3]
; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X64-AVX512-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X64-AVX512-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]
-; X64-AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
+; X64-AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%1 = getelementptr inbounds float, float* %fb, i64 %index