[llvm] [VectorCombine] Fold chain of (scalar load)->ext->ext to load->ext. (PR #141109)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 4 11:18:01 PDT 2025


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/141109

>From da7fbedd9b9d5ad2bfa45f0c0c46f1e345e5a867 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 20 May 2025 12:40:11 +0100
Subject: [PATCH 1/4] [VectorCombine] Fold chain of (scalar load)->ext->ext to
 load->ext.

Add a new combine that folds a chain of (scalar load)->ext->ext (with
shuffles/casts/inserts in between) to a single vector load and wide
extend.

This makes the IR simpler to analyze and to process, while the backend
can still decide to break them up. Code like that comes from code
written with vector intrinsics. Some examples of real-world use are in
https://github.com/ARM-software/astc-encoder/.
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 51 +++++++++++++++
 .../AArch64/combine-shuffle-ext.ll            | 64 +++++--------------
 2 files changed, 67 insertions(+), 48 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 52cb1dbb33b86..57ca05016b811 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -128,6 +128,7 @@ class VectorCombine {
   bool foldShuffleOfShuffles(Instruction &I);
   bool foldShuffleOfIntrinsics(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
+  bool foldShuffleExtExtracts(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
@@ -2791,6 +2792,55 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
   return true;
 }
 
+bool VectorCombine::foldShuffleExtExtracts(Instruction &I) {
+  // Try to fold vector zero- and sign-extends split across multiple operations
+  // into a single extend, removing redundant inserts and shuffles.
+
+  // Check if we have an extended shuffle that selects the first vector, which
+  // itself is another extend fed by a load.
+  Instruction *L;
+  if (!match(
+          &I,
+          m_OneUse(m_Shuffle(
+              m_OneUse(m_ZExtOrSExt(m_OneUse(m_BitCast(m_OneUse(m_InsertElt(
+                  m_Value(), m_OneUse(m_Instruction(L)), m_SpecificInt(0))))))),
+              m_Value()))) ||
+      !cast<ShuffleVectorInst>(&I)->isIdentityWithExtract() ||
+      !isa<LoadInst>(L))
+    return false;
+  auto *InnerExt = cast<Instruction>(I.getOperand(0));
+  auto *OuterExt = dyn_cast<Instruction>(*I.user_begin());
+  if (!isa<SExtInst, ZExtInst>(OuterExt))
+    return false;
+
+  // If the inner extend is a sign extend and the outer one isn't (i.e. a
+  // zero-extend), don't fold. If the first one is zero-extend, it doesn't
+  // matter if the second one is a sign- or zero-extend.
+  if (isa<SExtInst>(InnerExt) && !isa<SExtInst>(OuterExt))
+    return false;
+
+  // Don't try to convert the load if it has an odd size.
+  if (!DL->typeSizeEqualsStoreSize(L->getType()))
+    return false;
+  auto *DstTy = cast<FixedVectorType>(OuterExt->getType());
+  auto *SrcTy =
+      FixedVectorType::get(InnerExt->getOperand(0)->getType()->getScalarType(),
+                           DstTy->getNumElements());
+  if (DL->getTypeStoreSize(SrcTy) != DL->getTypeStoreSize(L->getType()))
+    return false;
+
+  // Convert to a vector load feeding a single wide extend.
+  Builder.SetInsertPoint(*L->getInsertionPointAfterDef());
+  auto *NewLoad = cast<LoadInst>(
+      Builder.CreateLoad(SrcTy, L->getOperand(0), L->getName() + ".vec"));
+  auto *NewExt = isa<ZExtInst>(InnerExt) ? Builder.CreateZExt(NewLoad, DstTy)
+                                         : Builder.CreateSExt(NewLoad, DstTy);
+  OuterExt->replaceAllUsesWith(NewExt);
+  replaceValue(*OuterExt, *NewExt);
+  Worklist.pushValue(NewLoad);
+  return true;
+}
+
 /// Given a commutative reduction, the order of the input lanes does not alter
 /// the results. We can use this to remove certain shuffles feeding the
 /// reduction, removing the need to shuffle at all.
@@ -3565,6 +3615,7 @@ bool VectorCombine::run() {
         break;
       case Instruction::ShuffleVector:
         MadeChange |= widenSubvectorLoad(I);
+        MadeChange |= foldShuffleExtExtracts(I);
         break;
       default:
         break;
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll b/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
index 6341c8945247d..2d6d80ad57fb5 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
@@ -11,12 +11,8 @@ define <4 x i32> @load_i32_zext_to_v4i32(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
-; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
-; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[L_VEC]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -33,12 +29,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_both_nneg(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32_both_nneg(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
-; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
-; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[L_VEC]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -121,13 +113,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_clobber_after_load(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32_clobber_after_load(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
+; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[L_VEC]] to <4 x i32>
 ; CHECK-NEXT:    call void @use.i32(i32 0)
-; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
-; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -287,12 +275,8 @@ define <8 x i32> @load_i64_zext_to_v8i32(ptr %di) {
 ; CHECK-LABEL: define <8 x i32> @load_i64_zext_to_v8i32(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L:%.*]] = load i64, ptr [[DI]], align 8
-; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[L]], i64 0
-; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8>
-; CHECK-NEXT:    [[EXT_1:%.*]] = zext <16 x i8> [[VEC_BC]] to <16 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i16> [[EXT_1]], <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[OUTER_EXT:%.*]] = zext nneg <8 x i16> [[VEC_SHUFFLE]] to <8 x i32>
+; CHECK-NEXT:    [[L_VEC:%.*]] = load <8 x i8>, ptr [[DI]], align 8
+; CHECK-NEXT:    [[OUTER_EXT:%.*]] = zext <8 x i8> [[L_VEC]] to <8 x i32>
 ; CHECK-NEXT:    ret <8 x i32> [[OUTER_EXT]]
 ;
 entry:
@@ -309,12 +293,8 @@ define <3 x i32> @load_i24_zext_to_v3i32(ptr %di) {
 ; CHECK-LABEL: define <3 x i32> @load_i24_zext_to_v3i32(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L:%.*]] = load i24, ptr [[DI]], align 4
-; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i24> <i24 poison, i24 0>, i24 [[L]], i64 0
-; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8>
-; CHECK-NEXT:    [[EXT_1:%.*]] = zext <6 x i8> [[VEC_BC]] to <6 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i16> [[EXT_1]], <6 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <3 x i16> [[VEC_SHUFFLE]] to <3 x i32>
+; CHECK-NEXT:    [[L_VEC:%.*]] = load <3 x i8>, ptr [[DI]], align 4
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <3 x i8> [[L_VEC]] to <3 x i32>
 ; CHECK-NEXT:    ret <3 x i32> [[EXT_2]]
 ;
 entry:
@@ -419,12 +399,8 @@ define <4 x i32> @load_i32_sext_to_v4i32(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_sext_to_v4i32(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
-; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
-; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[E_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
+; CHECK-NEXT:    [[EXT_2:%.*]] = sext <4 x i8> [[L_VEC]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -441,12 +417,8 @@ define <8 x i32> @load_i64_sext_to_v8i32(ptr %di) {
 ; CHECK-LABEL: define <8 x i32> @load_i64_sext_to_v8i32(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L:%.*]] = load i64, ptr [[DI]], align 8
-; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[L]], i64 0
-; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8>
-; CHECK-NEXT:    [[EXT_1:%.*]] = sext <16 x i8> [[VEC_BC]] to <16 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i16> [[EXT_1]], <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[OUTER_EXT:%.*]] = sext <8 x i16> [[VEC_SHUFFLE]] to <8 x i32>
+; CHECK-NEXT:    [[L_VEC:%.*]] = load <8 x i8>, ptr [[DI]], align 8
+; CHECK-NEXT:    [[OUTER_EXT:%.*]] = sext <8 x i8> [[L_VEC]] to <8 x i32>
 ; CHECK-NEXT:    ret <8 x i32> [[OUTER_EXT]]
 ;
 entry:
@@ -463,12 +435,8 @@ define <3 x i32> @load_i24_sext_to_v3i32(ptr %di) {
 ; CHECK-LABEL: define <3 x i32> @load_i24_sext_to_v3i32(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L:%.*]] = load i24, ptr [[DI]], align 4
-; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i24> <i24 poison, i24 0>, i24 [[L]], i64 0
-; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8>
-; CHECK-NEXT:    [[EXT_1:%.*]] = sext <6 x i8> [[VEC_BC]] to <6 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i16> [[EXT_1]], <6 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[EXT_2:%.*]] = sext <3 x i16> [[VEC_SHUFFLE]] to <3 x i32>
+; CHECK-NEXT:    [[L_VEC:%.*]] = load <3 x i8>, ptr [[DI]], align 4
+; CHECK-NEXT:    [[EXT_2:%.*]] = sext <3 x i8> [[L_VEC]] to <3 x i32>
 ; CHECK-NEXT:    ret <3 x i32> [[EXT_2]]
 ;
 entry:

>From 7c9999ad7916e1fd98af16a62d6a04806edf52ce Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 23 May 2025 14:03:11 +0100
Subject: [PATCH 2/4] !fixup address comments, thanks

---
 .../Transforms/Vectorize/VectorCombine.cpp    |  6 ++---
 .../AArch64/combine-shuffle-ext.ll            | 26 +++++--------------
 2 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 57ca05016b811..b5840e751cfb5 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2809,7 +2809,7 @@ bool VectorCombine::foldShuffleExtExtracts(Instruction &I) {
       !isa<LoadInst>(L))
     return false;
   auto *InnerExt = cast<Instruction>(I.getOperand(0));
-  auto *OuterExt = dyn_cast<Instruction>(*I.user_begin());
+  auto *OuterExt = cast<Instruction>(*I.user_begin());
   if (!isa<SExtInst, ZExtInst>(OuterExt))
     return false;
 
@@ -2833,8 +2833,8 @@ bool VectorCombine::foldShuffleExtExtracts(Instruction &I) {
   Builder.SetInsertPoint(*L->getInsertionPointAfterDef());
   auto *NewLoad = cast<LoadInst>(
       Builder.CreateLoad(SrcTy, L->getOperand(0), L->getName() + ".vec"));
-  auto *NewExt = isa<ZExtInst>(InnerExt) ? Builder.CreateZExt(NewLoad, DstTy)
-                                         : Builder.CreateSExt(NewLoad, DstTy);
+  auto *NewExt = isa<ZExtInst>(InnerExt) ? Builder.CreateZExt(NewLoad, DstTy, "vec.ext", InnerExt->hasNonNeg())
+                                         : Builder.CreateSExt(NewLoad, DstTy, "vec.ext");
   OuterExt->replaceAllUsesWith(NewExt);
   replaceValue(*OuterExt, *NewExt);
   Worklist.pushValue(NewLoad);
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll b/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
index 2d6d80ad57fb5..55a38d8a5307c 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
@@ -30,7 +30,7 @@ define <4 x i32> @load_i32_zext_to_v4i32_both_nneg(ptr %di) {
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[L_VEC]] to <4 x i32>
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i8> [[L_VEC]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -47,12 +47,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
-; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
-; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i8> [[L_VEC]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -69,12 +65,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_outer_nneg(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32_outer_nneg(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
-; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
-; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[L_VEC]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -91,12 +83,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg_outer_sext(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg_outer_sext(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
-; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
-; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i8> [[L_VEC]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:

>From 7030a9f097581ea8b7d8298eea9a9daf75a09f2c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 23 May 2025 14:17:18 +0100
Subject: [PATCH 3/4] !fixup fix formatting

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index b5840e751cfb5..18f2077df1f57 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2833,8 +2833,10 @@ bool VectorCombine::foldShuffleExtExtracts(Instruction &I) {
   Builder.SetInsertPoint(*L->getInsertionPointAfterDef());
   auto *NewLoad = cast<LoadInst>(
       Builder.CreateLoad(SrcTy, L->getOperand(0), L->getName() + ".vec"));
-  auto *NewExt = isa<ZExtInst>(InnerExt) ? Builder.CreateZExt(NewLoad, DstTy, "vec.ext", InnerExt->hasNonNeg())
-                                         : Builder.CreateSExt(NewLoad, DstTy, "vec.ext");
+  auto *NewExt =
+      isa<ZExtInst>(InnerExt)
+          ? Builder.CreateZExt(NewLoad, DstTy, "vec.ext", InnerExt->hasNonNeg())
+          : Builder.CreateSExt(NewLoad, DstTy, "vec.ext");
   OuterExt->replaceAllUsesWith(NewExt);
   replaceValue(*OuterExt, *NewExt);
   Worklist.pushValue(NewLoad);

>From 7df373e04c243162ddcaea3653a6569887aa8c7b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 4 Jun 2025 18:54:31 +0100
Subject: [PATCH 4/4] !fixup just fold shuffle/ext

---
 .../Transforms/Vectorize/VectorCombine.cpp    |  59 +++++-----
 .../AArch64/combine-shuffle-ext.ll            | 102 +++++++++++-------
 2 files changed, 96 insertions(+), 65 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 18f2077df1f57..1356ff9e56adb 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -128,7 +128,7 @@ class VectorCombine {
   bool foldShuffleOfShuffles(Instruction &I);
   bool foldShuffleOfIntrinsics(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
-  bool foldShuffleExtExtracts(Instruction &I);
+  bool foldShuffleExt(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
@@ -2792,21 +2792,17 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
   return true;
 }
 
-bool VectorCombine::foldShuffleExtExtracts(Instruction &I) {
+bool VectorCombine::foldShuffleExt(Instruction &I) {
   // Try to fold vector zero- and sign-extends split across multiple operations
-  // into a single extend, removing redundant inserts and shuffles.
-
-  // Check if we have an extended shuffle that selects the first vector, which
-  // itself is another extend fed by a load.
-  Instruction *L;
-  if (!match(
-          &I,
-          m_OneUse(m_Shuffle(
-              m_OneUse(m_ZExtOrSExt(m_OneUse(m_BitCast(m_OneUse(m_InsertElt(
-                  m_Value(), m_OneUse(m_Instruction(L)), m_SpecificInt(0))))))),
-              m_Value()))) ||
-      !cast<ShuffleVectorInst>(&I)->isIdentityWithExtract() ||
-      !isa<LoadInst>(L))
+  // into a single extend.
+
+  // Check if we have ZEXT/SEXT (SHUFFLE (ZEXT/SEXT %src), _, identity-mask),
+  // with an identity mask extracting the first sub-vector.
+  Value *Src;
+  ArrayRef<int> Mask;
+  if (!match(&I, m_OneUse(m_Shuffle(m_OneUse(m_ZExtOrSExt(m_Value(Src))),
+                                    m_Value(), m_Mask(Mask)))) ||
+      !cast<ShuffleVectorInst>(&I)->isIdentityWithExtract())
     return false;
   auto *InnerExt = cast<Instruction>(I.getOperand(0));
   auto *OuterExt = cast<Instruction>(*I.user_begin());
@@ -2819,27 +2815,34 @@ bool VectorCombine::foldShuffleExtExtracts(Instruction &I) {
   if (isa<SExtInst>(InnerExt) && !isa<SExtInst>(OuterExt))
     return false;
 
-  // Don't try to convert the load if it has an odd size.
-  if (!DL->typeSizeEqualsStoreSize(L->getType()))
-    return false;
   auto *DstTy = cast<FixedVectorType>(OuterExt->getType());
   auto *SrcTy =
       FixedVectorType::get(InnerExt->getOperand(0)->getType()->getScalarType(),
                            DstTy->getNumElements());
-  if (DL->getTypeStoreSize(SrcTy) != DL->getTypeStoreSize(L->getType()))
-    return false;
 
-  // Convert to a vector load feeding a single wide extend.
-  Builder.SetInsertPoint(*L->getInsertionPointAfterDef());
-  auto *NewLoad = cast<LoadInst>(
-      Builder.CreateLoad(SrcTy, L->getOperand(0), L->getName() + ".vec"));
+  // Only fold if one wide extend of the narrowed source is no more expensive
+  // than the inner (pre-shuffle) extend plus the outer (post-shuffle) extend.
+  InstructionCost OriginalCost =
+      TTI.getCastInstrCost(InnerExt->getOpcode(), InnerExt->getType(),
+                           Src->getType(), TTI::CastContextHint::None) +
+      TTI.getCastInstrCost(OuterExt->getOpcode(), DstTy, I.getType(),
+                           TTI::CastContextHint::None);
+  InstructionCost NewCost = TTI.getCastInstrCost(
+      InnerExt->getOpcode(), DstTy, SrcTy, TTI::CastContextHint::None);
+  if (NewCost > OriginalCost)
+    return false;
+
+  // Convert to a shuffle of the input feeding a single wide extend.
+  Builder.SetInsertPoint(*OuterExt->getInsertionPointAfterDef());
+  auto *NewIns =
+      Builder.CreateShuffleVector(Src, PoisonValue::get(Src->getType()), Mask);
   auto *NewExt =
       isa<ZExtInst>(InnerExt)
-          ? Builder.CreateZExt(NewLoad, DstTy, "vec.ext", InnerExt->hasNonNeg())
-          : Builder.CreateSExt(NewLoad, DstTy, "vec.ext");
+          ? Builder.CreateZExt(NewIns, DstTy, "vec.ext", InnerExt->hasNonNeg())
+          : Builder.CreateSExt(NewIns, DstTy, "vec.ext");
   OuterExt->replaceAllUsesWith(NewExt);
   replaceValue(*OuterExt, *NewExt);
-  Worklist.pushValue(NewLoad);
+  Worklist.pushValue(NewExt);
   return true;
 }
 
@@ -3617,7 +3620,7 @@ bool VectorCombine::run() {
         break;
       case Instruction::ShuffleVector:
         MadeChange |= widenSubvectorLoad(I);
-        MadeChange |= foldShuffleExtExtracts(I);
+        MadeChange |= foldShuffleExt(I);
         break;
       default:
         break;
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll b/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
index 55a38d8a5307c..9ac3655f3e59d 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
@@ -11,8 +11,11 @@ define <4 x i32> @load_i32_zext_to_v4i32(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[L_VEC]] to <4 x i32>
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
+; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
+; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -29,8 +32,11 @@ define <4 x i32> @load_i32_zext_to_v4i32_both_nneg(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32_both_nneg(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i8> [[L_VEC]] to <4 x i32>
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
+; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
+; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -47,8 +53,11 @@ define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i8> [[L_VEC]] to <4 x i32>
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
+; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
+; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -65,8 +74,11 @@ define <4 x i32> @load_i32_zext_to_v4i32_outer_nneg(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32_outer_nneg(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[L_VEC]] to <4 x i32>
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
+; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
+; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -83,8 +95,11 @@ define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg_outer_sext(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg_outer_sext(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i8> [[L_VEC]] to <4 x i32>
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
+; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
+; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -101,9 +116,12 @@ define <4 x i32> @load_i32_zext_to_v4i32_clobber_after_load(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32_clobber_after_load(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[L_VEC]] to <4 x i32>
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
 ; CHECK-NEXT:    call void @use.i32(i32 0)
+; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
+; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -146,9 +164,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_load_other_users(ptr %di) {
 ; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
 ; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
 ; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    call void @use.i32(i32 [[L]])
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
@@ -170,9 +187,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_ins_other_users(ptr %di) {
 ; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
 ; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
 ; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    call void @use.v2i32(<2 x i32> [[VEC_INS]])
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
@@ -194,9 +210,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_bc_other_users(ptr %di) {
 ; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
 ; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
 ; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    call void @use.v8i8(<8 x i8> [[VEC_BC]])
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
@@ -263,8 +278,11 @@ define <8 x i32> @load_i64_zext_to_v8i32(ptr %di) {
 ; CHECK-LABEL: define <8 x i32> @load_i64_zext_to_v8i32(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L_VEC:%.*]] = load <8 x i8>, ptr [[DI]], align 8
-; CHECK-NEXT:    [[OUTER_EXT:%.*]] = zext <8 x i8> [[L_VEC]] to <8 x i32>
+; CHECK-NEXT:    [[L:%.*]] = load i64, ptr [[DI]], align 8
+; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[L]], i64 0
+; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[VEC_BC]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[OUTER_EXT:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
 ; CHECK-NEXT:    ret <8 x i32> [[OUTER_EXT]]
 ;
 entry:
@@ -281,8 +299,11 @@ define <3 x i32> @load_i24_zext_to_v3i32(ptr %di) {
 ; CHECK-LABEL: define <3 x i32> @load_i24_zext_to_v3i32(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L_VEC:%.*]] = load <3 x i8>, ptr [[DI]], align 4
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext <3 x i8> [[L_VEC]] to <3 x i32>
+; CHECK-NEXT:    [[L:%.*]] = load i24, ptr [[DI]], align 4
+; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i24> <i24 poison, i24 0>, i24 [[L]], i64 0
+; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <6 x i8> [[VEC_BC]], <6 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <3 x i8> [[TMP0]] to <3 x i32>
 ; CHECK-NEXT:    ret <3 x i32> [[EXT_2]]
 ;
 entry:
@@ -302,9 +323,8 @@ define <4 x i32> @load_i32_insert_idx_1_sext(ptr %di) {
 ; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
 ; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[L]], i64 1
 ; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[EXT_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -387,8 +407,11 @@ define <4 x i32> @load_i32_sext_to_v4i32(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_sext_to_v4i32(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
-; CHECK-NEXT:    [[EXT_2:%.*]] = sext <4 x i8> [[L_VEC]] to <4 x i32>
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
+; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
+; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXT_2:%.*]] = sext <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
@@ -405,8 +428,11 @@ define <8 x i32> @load_i64_sext_to_v8i32(ptr %di) {
 ; CHECK-LABEL: define <8 x i32> @load_i64_sext_to_v8i32(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L_VEC:%.*]] = load <8 x i8>, ptr [[DI]], align 8
-; CHECK-NEXT:    [[OUTER_EXT:%.*]] = sext <8 x i8> [[L_VEC]] to <8 x i32>
+; CHECK-NEXT:    [[L:%.*]] = load i64, ptr [[DI]], align 8
+; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[L]], i64 0
+; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[VEC_BC]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[OUTER_EXT:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i32>
 ; CHECK-NEXT:    ret <8 x i32> [[OUTER_EXT]]
 ;
 entry:
@@ -423,8 +449,11 @@ define <3 x i32> @load_i24_sext_to_v3i32(ptr %di) {
 ; CHECK-LABEL: define <3 x i32> @load_i24_sext_to_v3i32(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L_VEC:%.*]] = load <3 x i8>, ptr [[DI]], align 4
-; CHECK-NEXT:    [[EXT_2:%.*]] = sext <3 x i8> [[L_VEC]] to <3 x i32>
+; CHECK-NEXT:    [[L:%.*]] = load i24, ptr [[DI]], align 4
+; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i24> <i24 poison, i24 0>, i24 [[L]], i64 0
+; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <6 x i8> [[VEC_BC]], <6 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[EXT_2:%.*]] = sext <3 x i8> [[TMP0]] to <3 x i32>
 ; CHECK-NEXT:    ret <3 x i32> [[EXT_2]]
 ;
 entry:
@@ -444,9 +473,8 @@ define <4 x i32> @load_i32_insert_idx_1(ptr %di) {
 ; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
 ; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[L]], i64 1
 ; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[EXT_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXT_2:%.*]] = sext <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:



More information about the llvm-commits mailing list