[llvm] [X86] splitAndLowerShuffle - split a v8f32 bitcast from v8i32 operands as 2 v4i32 shuffles (PR #143493)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 10 02:02:32 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
AVX1 performs v8i32 shuffles as a bitcast to v8f32, but if we split these back to v4f32 instead of peeking through the bitcasts, we can lose track of the original integer domain.
Fixes an issue I noticed while working on #142972, where we were using v4f32 blends instead of v8i16 blends, resulting in a lot of domain crossing.
It also helps avoid unnecessary use of VINSERTPS nodes, which can be tricky to commute or concatenate back into 256-bit vectors.
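For context when reading the test diffs below, here is a minimal sketch of the kind of shuffle involved; the function name and mask are illustrative only and are not taken from the patch or its tests:

```llvm
; Hypothetical reduced case: a cross-lane v8i32 interleave. AVX1 has no
; 256-bit integer shuffle instructions, so this is lowered as a bitcast
; v8f32 shuffle; when that shuffle is later split into 128-bit halves,
; peeking through the bitcasts lets each half be lowered as a v4i32
; (integer-domain) shuffle rather than falling back to v4f32 blends/inserts.
define <8 x i32> @interleave_lo_v8i32(<8 x i32> %a, <8 x i32> %b) {
  %s = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i32> %s
}
```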
---
Patch is 242.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/143493.diff
11 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+11)
- (modified) llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll (+10-10)
- (modified) llvm/test/CodeGen/X86/oddshuffles.ll (+12-12)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll (+159-159)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll (+656-834)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll (+671-725)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll (+14-14)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll (+25-24)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll (+36-36)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll (+19-19)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b34215b316128..a983de7019bd9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15101,6 +15101,17 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
assert(V2.getSimpleValueType() == VT && "Bad operand type!");
+ // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
+ if (VT == MVT::v8f32) {
+ SDValue BC1 = peekThroughBitcasts(V1);
+ SDValue BC2 = peekThroughBitcasts(V2);
+ if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
+ if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
+ DAG, SimpleOnly))
+ return DAG.getBitcast(VT, Split);
+ }
+ }
+
ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index f5802150d5353..dc723eb713c28 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -3467,9 +3467,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX-NEXT: vbroadcastss (%rdi), %xmm2
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3,4,5],xmm1[6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
@@ -3664,13 +3664,13 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
;
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT: vbroadcastss (%rdi), %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index edc8404993996..6b9a86343ea10 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1688,16 +1688,16 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX1-NEXT: vmovups (%rsi), %xmm2
; AVX1-NEXT: vmovups 16(%rsi), %xmm3
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,3],xmm1[3,3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm0[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1],xmm3[0,2]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vbroadcastsd (%rcx), %ymm2
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
@@ -1808,16 +1808,16 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; XOP-NEXT: vmovups (%rsi), %xmm3
; XOP-NEXT: vmovups 16(%rsi), %xmm4
; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm2[3,3]
-; XOP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
-; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2]
+; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,2,2]
+; XOP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3]
; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
-; XOP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm1[1]
-; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2]
-; XOP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
+; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1,2,2]
+; XOP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2],xmm4[3]
+; XOP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; XOP-NEXT: vbroadcastsd (%rcx), %ymm3
; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
index 7303f6124afcb..acf9bad81736d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
@@ -243,19 +243,19 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vmovaps (%rsi), %xmm1
; AVX-NEXT: vmovaps (%rdx), %xmm2
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1]
-; AVX-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[0],xmm4[3]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm2[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2],xmm4[3]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX-NEXT: vmovaps %xmm0, 32(%rcx)
-; AVX-NEXT: vmovaps %ymm3, (%rcx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm4, 16(%rcx)
+; AVX-NEXT: vmovaps %xmm3, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i32_stride3_vf4:
@@ -458,20 +458,20 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX-LABEL: store_i32_stride3_vf8:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rsi), %xmm0
-; AVX-NEXT: vmovaps 16(%rsi), %xmm1
-; AVX-NEXT: vmovaps (%rdi), %xmm2
-; AVX-NEXT: vmovaps 16(%rdi), %xmm3
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm0[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,1],xmm4[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,1]
+; AVX-NEXT: vmovaps (%rdi), %xmm0
+; AVX-NEXT: vmovaps 16(%rdi), %xmm1
+; AVX-NEXT: vmovaps (%rsi), %xmm2
+; AVX-NEXT: vmovaps 16(%rsi), %xmm3
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm2[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2],xmm4[3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX-NEXT: vbroadcastsd (%rdx), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3],xmm1[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm3[3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,3]
@@ -779,39 +779,39 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX-LABEL: store_i32_stride3_vf16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rsi), %xmm0
-; AVX-NEXT: vmovaps 16(%rsi), %xmm1
-; AVX-NEXT: vmovaps 32(%rsi), %xmm2
-; AVX-NEXT: vmovaps 48(%rsi), %xmm3
-; AVX-NEXT: vmovaps (%rdi), %xmm4
-; AVX-NEXT: vmovaps 16(%rdi), %xmm5
-; AVX-NEXT: vmovaps 32(%rdi), %xmm6
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm4[1],xmm0[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,1],xmm7[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,1]
+; AVX-NEXT: vmovaps (%rdi), %xmm0
+; AVX-NEXT: vmovaps 16(%rdi), %xmm1
+; AVX-NEXT: vmovaps 32(%rdi), %xmm2
+; AVX-NEXT: vmovaps (%rsi), %xmm3
+; AVX-NEXT: vmovaps 16(%rsi), %xmm4
+; AVX-NEXT: vmovaps 32(%rsi), %xmm5
+; AVX-NEXT: vmovaps 48(%rsi), %xmm6
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm3[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2],xmm7[3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX-NEXT: vbroadcastsd (%rdx), %ymm4
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
-; AVX-NEXT: vmovaps 48(%rdi), %xmm4
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm4[3,3],xmm3[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[0,2]
+; AVX-NEXT: vbroadcastsd (%rdx), %ymm3
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
+; AVX-NEXT: vmovaps 48(%rdi), %xmm3
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm3[3,3],xmm6[3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
-; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm2[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm2[1,1],xmm4[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[2,1]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX-NEXT: vbroadcastsd 32(%rdx), %ymm4
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[3,3],xmm1[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm5[0,2]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm6 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7]
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm2[2],xmm6[3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; AVX-NEXT: vbroadcastsd 32(%rdx), %ymm5
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm1[3,3],xmm4[3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7]
@@ -1375,74 +1375,74 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX-LABEL: store_i32_stride3_vf32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rsi), %xmm0
-; AVX-NEXT: vmovaps 16(%rsi), %xmm3
-; AVX-NEXT: vmovaps 32(%rsi), %xmm6
+; AVX-NEXT: vmovaps (%rdi), %xmm0
+; AVX-NEXT: vmovaps 16(%rdi), %xmm3
+; AVX-NEXT: vmovaps 32(%rdi), %xmm6
+; AVX-NEXT: vmovaps (%rsi), %xmm1
+; AVX-NEXT: vmovaps 16(%rsi), %xmm4
+; AVX-NEXT: vmovaps 32(%rsi), %xmm7
; AVX-NEXT: vmovaps 48(%rsi), %xmm5
-; AVX-NEXT: vmovaps (%rdi), %xmm1
-; AVX-NEXT: vmovaps 16(%rdi), %xmm4
-; AVX-NEXT: vmovaps 32(%rdi), %xmm7
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm2[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vbroadcastsd (%rdx), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT: vmovaps 80(%rsi), %xmm1
; AVX-NEXT: vmovaps 80(%rdi), %xmm2
; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm2[3,3],xmm1[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX-NEXT: vmovaps 64(%rsi), %xmm2
-; AVX-NEXT: vmovaps 64(%rdi), %xmm8
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm2[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm2[1,1],xmm9[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm8[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm8[2,1]
+; AVX-NEXT: vmovaps 64(%rdi), %xmm2
+; AVX-NEXT: vmovaps 64(%rsi), %xmm8
+; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm8[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2],xmm9[3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2
; AVX-NEXT: vbroadcastsd 64(%rdx), %ymm8
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3,4],ymm8[5],ymm2[6,7]
; AVX-NEXT: vmovaps 48(%rdi), %xmm8
; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm8[3,3],xmm5[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm8[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2],xmm5[3]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm8 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0,0,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5,6],ymm8[7]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm6[1,1],xmm8[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[2,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2],xmm8[3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6
; AVX-NEXT: vbroadcastsd 32(%rdx), %ymm7
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
; AVX-NEXT: vmovaps 112(%rsi), %xmm7
; AVX-NEXT: vmovaps 112(%rdi), %xmm8
; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm8[3,3],xmm7[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1],xmm8[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm8 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0,0,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7]
-; AVX-NEXT: vmovaps 96(%rsi), %xmm8
-; AVX-NEXT: vmovaps 96(%rdi), %xmm9
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm9[1],xmm8[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm8[1,1],xmm10[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,0],xmm9[2,1]
+; AVX-NEXT: vmovaps 96(%rdi), %xmm8
+; AVX-NEXT: vmovaps 96(%rsi), %xmm9
+; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2],xmm10[3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8
; AVX-NEXT: vbroadcastsd 96(%rdx), %ymm9
; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm4[3,3],xmm3[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm3[3,3],xmm4[3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm3
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
@@ -2526,52 +2526,52 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-LABEL: store_i32_stride3_vf64:
; AVX: # %bb.0:
; AVX-NEXT: subq $168, %rsp
-; AVX-NEXT: vmovaps (%rsi), %xmm0
-; AVX-NEXT: vmovaps 16(%rsi), %xmm1
-; AVX-NEXT: vmovaps 32(%rsi), %xmm2
-; AVX-NEXT: vmovaps 48(%rsi), %xmm3
-; AVX-NEXT: vmovaps (%rdi), %xmm4
-; AVX-NEXT: vmovaps 16(%rdi), %xmm5
-; AVX-NEXT: vmovaps 32(%rdi), %xmm6
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm4[1],xmm0[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,1],xmm7[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/143493