[llvm] [VectorCombine] Add foldShuffleOfIntrinsics. (PR #106502)

Tue Sep 10 02:28:58 PDT 2024

https://github.com/HanKuanChen updated https://github.com/llvm/llvm-project/pull/106502

>From d8f1ab5bb1382d186a1778fe15bd141f970ebac3 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Wed, 28 Aug 2024 23:53:01 -0700
Subject: [PATCH 1/7] [VectorCombine] Pre-commit test.

---
 .../VectorCombine/foldShuffleOfIntrinsics.ll  | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll

diff --git a/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll b/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll
new file mode 100644
index 00000000000000..ba45f24929c71e
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=vector-combine -S %s | FileCheck %s
+
+define <8 x i32> @test1(<4 x i32> %0, <4 x i32> %1) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
+;
+entry:
+  %2 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+  %3 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+  %4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %4
+}
+
+define <8 x i32> @test2(<4 x i32> %0, <4 x i32> %1) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
+;
+entry:
+  %2 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 true)
+  %3 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+  %4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %4
+}
+
+declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)

>From 874963226704580f69773745b4356c6227f652d9 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Wed, 28 Aug 2024 23:53:32 -0700
Subject: [PATCH 2/7] [VectorCombine] Add foldShuffleOfIntrinsics.

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 94 +++++++++++++++++++
 .../VectorCombine/foldShuffleOfIntrinsics.ll  |  7 +-
 2 files changed, 97 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 99bd383ab0dead..9ad7276ffb9707 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -115,6 +115,7 @@ class VectorCombine {
   bool foldShuffleOfBinops(Instruction &I);
   bool foldShuffleOfCastops(Instruction &I);
   bool foldShuffleOfShuffles(Instruction &I);
+  bool foldShuffleOfIntrinsics(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
@@ -1673,6 +1674,98 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
   return true;
 }
 
+/// Try to convert
+/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
+bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
+  Value *V0, *V1;
+  ArrayRef<int> OldMask;
+  if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
+                           m_Mask(OldMask))))
+    return false;
+
+  auto *II0 = dyn_cast<IntrinsicInst>(V0);
+  auto *II1 = dyn_cast<IntrinsicInst>(V1);
+  if (!II0 || !II1)
+    return false;
+
+  Intrinsic::ID IID = II0->getIntrinsicID();
+  if (IID != II1->getIntrinsicID())
+    return false;
+
+  auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
+  auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
+  if (!ShuffleDstTy || !II0Ty)
+    return false;
+
+  switch (IID) {
+  case Intrinsic::abs: {
+    if (cast<Constant>(II0->getArgOperand(1))->isOneValue() !=
+        cast<Constant>(II1->getArgOperand(1))->isOneValue())
+      return false;
+    break;
+  }
+  default:
+    return false;
+  }
+
+  SmallVector<Value *> Args0;
+  SmallVector<Value *> Args1;
+  for (unsigned I = 0; I != II0->arg_size(); ++I) {
+    Args0.push_back(II0->getArgOperand(I));
+    Args1.push_back(II1->getArgOperand(I));
+  }
+  IntrinsicCostAttributes Attr0(IID, II0Ty, Args0);
+  IntrinsicCostAttributes Attr1(IID, II1->getType(), Args1);
+  InstructionCost OldCost =
+      TTI.getIntrinsicInstrCost(Attr0, TTI::TCK_RecipThroughput) +
+      TTI.getIntrinsicInstrCost(Attr1, TTI::TCK_RecipThroughput) +
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, II0Ty, OldMask,
+                         TTI::TCK_RecipThroughput, 0, nullptr, {II0, II1}, &I);
+
+  InstructionCost NewCost;
+  switch (IID) {
+  case Intrinsic::abs: {
+    IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy,
+                                    {ShuffleDstTy, Builder.getInt1Ty()});
+    NewCost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, II0Ty,
+                                 OldMask, TTI::TCK_RecipThroughput) +
+              TTI.getIntrinsicInstrCost(NewAttr, TTI::TCK_RecipThroughput);
+    break;
+  }
+  default:
+    llvm_unreachable("Unexpected intrinsic");
+  }
+
+  LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
+                    << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
+                    << "\n");
+
+  if (NewCost > OldCost)
+    return false;
+
+  Value *NewIntrinsic;
+  switch (IID) {
+  case Intrinsic::abs: {
+    Value *Shuf = Builder.CreateShuffleVector(Args0[0], Args1[0], OldMask);
+    NewIntrinsic = Builder.CreateIntrinsic(
+        ShuffleDstTy, IID, {Shuf, cast<Constant>(II0->getArgOperand(1))});
+    Worklist.pushValue(Shuf);
+    break;
+  }
+  default:
+    llvm_unreachable("Unexpected intrinsic");
+  }
+
+  // Intersect flags from the old intrinsics.
+  if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
+    NewInst->copyIRFlags(II0);
+    NewInst->andIRFlags(II1);
+  }
+
+  replaceValue(I, *NewIntrinsic);
+  return true;
+}
+
 using InstLane = std::pair<Use *, int>;
 
 static InstLane lookThroughShuffles(Use *U, int Lane) {
@@ -2554,6 +2647,7 @@ bool VectorCombine::run() {
         MadeChange |= foldShuffleOfBinops(I);
         MadeChange |= foldShuffleOfCastops(I);
         MadeChange |= foldShuffleOfShuffles(I);
+        MadeChange |= foldShuffleOfIntrinsics(I);
         MadeChange |= foldSelectShuffle(I);
         MadeChange |= foldShuffleToIdentity(I);
         break;
diff --git a/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll b/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll
index ba45f24929c71e..32a11a7d813b28 100644
--- a/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll
+++ b/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll
@@ -4,10 +4,9 @@
 define <8 x i32> @test1(<4 x i32> %0, <4 x i32> %1) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 false)
-; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1:%.*]], i1 false)
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false)
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 entry:
   %2 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)

>From da5fb67949f4dbfda52732955b0cc24fb7f344b7 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Fri, 30 Aug 2024 00:38:44 -0700
Subject: [PATCH 3/7] [VectorCombine] Pre-commit test.

---
 .../VectorCombine/foldShuffleOfIntrinsics.ll  | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll b/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll
index 32a11a7d813b28..747caa6689ae78 100644
--- a/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll
+++ b/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll
@@ -30,4 +30,53 @@ entry:
   ret <8 x i32> %4
 }
 
+define <8 x i32> @test3(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP6]]
+;
+entry:
+  %4 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %1)
+  %5 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %2, <4 x i32> %3)
+  %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %6
+}
+
+define <8 x i1> @test4(<4 x float> %0, <4 x float> %1) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP0:%.*]], i32 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP1:%.*]], i32 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i1> [[TMP4]]
+;
+entry:
+  %2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)
+  %3 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %1, i32 0)
+  %4 = shufflevector <4 x i1> %2, <4 x i1> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i1> %4
+}
+
+define <8 x float> @test5(<4 x float> %0, i32 %1, <4 x float> %2, <4 x i32> %3) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP0:%.*]], i32 [[TMP1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.v4i32(<4 x float> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP6]]
+;
+entry:
+  %4 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %0, i32 %1)
+  %5 = call <4 x float> @llvm.powi.v4f32.v4i32(<4 x float> %2, <4 x i32> %3)
+  %6 = shufflevector <4 x float> %4, <4 x float> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %6
+}
+
 declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
+declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i1> @llvm.is.fpclass.v4f32(<4 x float>, i32)
+declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32)
+declare <4 x float> @llvm.powi.v4f32.v4i32(<4 x float>, <4 x i32>)

>From 813b95ca99cdfb42b5f93d7c7e35183eedd8f534 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Fri, 30 Aug 2024 00:56:44 -0700
Subject: [PATCH 4/7] [VectorCombine] Apply comment.

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 79 ++++++++-----------
 .../VectorCombine/foldShuffleOfIntrinsics.ll  | 13 ++-
 2 files changed, 41 insertions(+), 51 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 9ad7276ffb9707..4d64bef893c9d2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1697,44 +1697,36 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
   if (!ShuffleDstTy || !II0Ty)
     return false;
 
-  switch (IID) {
-  case Intrinsic::abs: {
-    if (cast<Constant>(II0->getArgOperand(1))->isOneValue() !=
-        cast<Constant>(II1->getArgOperand(1))->isOneValue())
-      return false;
-    break;
-  }
-  default:
+  if (!isTriviallyVectorizable(IID))
     return false;
-  }
 
-  SmallVector<Value *> Args0;
-  SmallVector<Value *> Args1;
-  for (unsigned I = 0; I != II0->arg_size(); ++I) {
-    Args0.push_back(II0->getArgOperand(I));
-    Args1.push_back(II1->getArgOperand(I));
-  }
-  IntrinsicCostAttributes Attr0(IID, II0Ty, Args0);
-  IntrinsicCostAttributes Attr1(IID, II1->getType(), Args1);
+  for (unsigned I = 0; I != II0->arg_size(); ++I)
+    if (isVectorIntrinsicWithScalarOpAtArg(IID, I) &&
+        II0->getArgOperand(I) != II1->getArgOperand(I))
+      return false;
+
   InstructionCost OldCost =
-      TTI.getIntrinsicInstrCost(Attr0, TTI::TCK_RecipThroughput) +
-      TTI.getIntrinsicInstrCost(Attr1, TTI::TCK_RecipThroughput) +
+      TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0),
+                                TTI::TCK_RecipThroughput) +
+      TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1),
+                                TTI::TCK_RecipThroughput) +
       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, II0Ty, OldMask,
                          TTI::TCK_RecipThroughput, 0, nullptr, {II0, II1}, &I);
 
-  InstructionCost NewCost;
-  switch (IID) {
-  case Intrinsic::abs: {
-    IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy,
-                                    {ShuffleDstTy, Builder.getInt1Ty()});
-    NewCost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, II0Ty,
-                                 OldMask, TTI::TCK_RecipThroughput) +
-              TTI.getIntrinsicInstrCost(NewAttr, TTI::TCK_RecipThroughput);
-    break;
-  }
-  default:
-    llvm_unreachable("Unexpected intrinsic");
-  }
+  SmallVector<Type *> NewArgsTy; // Operand types of the widened intrinsic.
+  InstructionCost NewCost = 0; // Per-operand shuffles plus one wide call.
+  for (unsigned I = 0; I != II0->arg_size(); ++I)
+    if (isVectorIntrinsicWithScalarOpAtArg(IID, I)) {
+      NewArgsTy.push_back(II0->getArgOperand(I)->getType());
+    } else {
+      auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
+      NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(),
+                                               ShuffleDstTy->getNumElements()));
+      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+                                    VecTy, OldMask, TTI::TCK_RecipThroughput);
+    }
+  IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
+  NewCost += TTI.getIntrinsicInstrCost(NewAttr, TTI::TCK_RecipThroughput);
 
   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
                     << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
@@ -1743,18 +1735,17 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
   if (NewCost > OldCost)
     return false;
 
-  Value *NewIntrinsic;
-  switch (IID) {
-  case Intrinsic::abs: {
-    Value *Shuf = Builder.CreateShuffleVector(Args0[0], Args1[0], OldMask);
-    NewIntrinsic = Builder.CreateIntrinsic(
-        ShuffleDstTy, IID, {Shuf, cast<Constant>(II0->getArgOperand(1))});
-    Worklist.pushValue(Shuf);
-    break;
-  }
-  default:
-    llvm_unreachable("Unexpected intrinsic");
-  }
+  SmallVector<Value *> NewArgs;
+  for (unsigned I = 0; I != II0->arg_size(); ++I)
+    if (isVectorIntrinsicWithScalarOpAtArg(IID, I)) {
+      NewArgs.push_back(II0->getArgOperand(I));
+    } else {
+      Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
+                                                II1->getArgOperand(I), OldMask);
+      NewArgs.push_back(Shuf);
+      Worklist.pushValue(Shuf);
+    }
+  Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
 
   // Intersect flags from the old intrinsics.
   if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
diff --git a/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll b/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll
index 747caa6689ae78..b9c8e795ad5e73 100644
--- a/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll
+++ b/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll
@@ -33,9 +33,9 @@ entry:
 define <8 x i32> @test3(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> [[TMP3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
 ; CHECK-NEXT:    ret <8 x i32> [[TMP6]]
 ;
 entry:
@@ -48,10 +48,9 @@ entry:
 define <8 x i1> @test4(<4 x float> %0, <4 x float> %1) {
 ; CHECK-LABEL: @test4(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP0:%.*]], i32 0)
-; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP1:%.*]], i32 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    ret <8 x i1> [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> [[TMP2]], i32 0)
+; CHECK-NEXT:    ret <8 x i1> [[TMP3]]
 ;
 entry:
   %2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)

>From 8eeb33732a2854bef39d1eabc31d87ed15e54c21 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Mon, 2 Sep 2024 01:38:03 -0700
Subject: [PATCH 5/7] [VectorCombine] Add triple.

---
 .../{ => RISCV}/foldShuffleOfIntrinsics.ll        | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)
 rename llvm/test/Transforms/VectorCombine/{ => RISCV}/foldShuffleOfIntrinsics.ll (80%)

diff --git a/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll b/llvm/test/Transforms/VectorCombine/RISCV/foldShuffleOfIntrinsics.ll
similarity index 80%
rename from llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll
rename to llvm/test/Transforms/VectorCombine/RISCV/foldShuffleOfIntrinsics.ll
index b9c8e795ad5e73..7ccc14cc0b125e 100644
--- a/llvm/test/Transforms/VectorCombine/foldShuffleOfIntrinsics.ll
+++ b/llvm/test/Transforms/VectorCombine/RISCV/foldShuffleOfIntrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=vector-combine -S %s | FileCheck %s
+; RUN: opt -mtriple=riscv64 -mattr=+v -passes=vector-combine -S %s | FileCheck %s
 
 define <8 x i32> @test1(<4 x i32> %0, <4 x i32> %1) {
 ; CHECK-LABEL: @test1(
@@ -33,9 +33,9 @@ entry:
 define <8 x i32> @test3(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> [[TMP3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP6]]
 ;
 entry:
@@ -48,9 +48,10 @@ entry:
 define <8 x i1> @test4(<4 x float> %0, <4 x float> %1) {
 ; CHECK-LABEL: @test4(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> [[TMP2]], i32 0)
-; CHECK-NEXT:    ret <8 x i1> [[TMP3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP0:%.*]], i32 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP1:%.*]], i32 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i1> [[TMP4]]
 ;
 entry:
   %2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)

>From e0b9bd1fbf01a0c248a9c18e2f8874fd996466b9 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Tue, 10 Sep 2024 02:07:32 -0700
Subject: [PATCH 6/7] [VectorCombine] Rename foldShuffleOfIntrinsics.ll to
 shuffle-of-intrinsics.ll.

---
 .../{foldShuffleOfIntrinsics.ll => shuffle-of-intrinsics.ll}      | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm/test/Transforms/VectorCombine/RISCV/{foldShuffleOfIntrinsics.ll => shuffle-of-intrinsics.ll} (100%)

diff --git a/llvm/test/Transforms/VectorCombine/RISCV/foldShuffleOfIntrinsics.ll b/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll
similarity index 100%
rename from llvm/test/Transforms/VectorCombine/RISCV/foldShuffleOfIntrinsics.ll
rename to llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll

>From 7acd424b29d54d898801c83026bed8325178323b Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Tue, 10 Sep 2024 02:09:06 -0700
Subject: [PATCH 7/7] [VectorCombine] Add X86/shuffle-of-intrinsics.ll.

---
 .../X86/shuffle-of-intrinsics.ll              | 123 ++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll

diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
new file mode 100644
index 00000000000000..c8816e436559e8
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes=vector-combine -S %s | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes=vector-combine -S %s | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes=vector-combine -S %s | FileCheck %s --check-prefixes=AVX,CHECK-V3
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v4 -passes=vector-combine -S %s | FileCheck %s --check-prefixes=AVX,CHECK-V4
+
+define <8 x i32> @test1(<4 x i32> %0, <4 x i32> %1) {
+; SSE-LABEL: @test1(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false)
+; SSE-NEXT:    ret <8 x i32> [[TMP3]]
+;
+; AVX-LABEL: @test1(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false)
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
+;
+entry:
+  %2 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+  %3 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+  %4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %4
+}
+
+define <8 x i32> @test2(<4 x i32> %0, <4 x i32> %1) {
+; SSE-LABEL: @test2(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 true)
+; SSE-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1:%.*]], i1 false)
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x i32> [[TMP4]]
+;
+; AVX-LABEL: @test2(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 true)
+; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1:%.*]], i1 false)
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    ret <8 x i32> [[TMP4]]
+;
+entry:
+  %2 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 true)
+  %3 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+  %4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %4
+}
+
+define <8 x i32> @test3(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
+; SSE-LABEL: @test3(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]])
+; SSE-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x i32> [[TMP6]]
+;
+; AVX-LABEL: @test3(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> [[TMP3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
+; AVX-NEXT:    ret <8 x i32> [[TMP6]]
+;
+entry:
+  %4 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %1)
+  %5 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %2, <4 x i32> %3)
+  %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %6
+}
+
+define <8 x i1> @test4(<4 x float> %0, <4 x float> %1) {
+; SSE-LABEL: @test4(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> [[TMP2]], i32 0)
+; SSE-NEXT:    ret <8 x i1> [[TMP3]]
+;
+; AVX-LABEL: @test4(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP0:%.*]], i32 0)
+; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP1:%.*]], i32 0)
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    ret <8 x i1> [[TMP4]]
+;
+entry:
+  %2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)
+  %3 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %1, i32 0)
+  %4 = shufflevector <4 x i1> %2, <4 x i1> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i1> %4
+}
+
+define <8 x float> @test5(<4 x float> %0, i32 %1, <4 x float> %2, <4 x i32> %3) {
+; SSE-LABEL: @test5(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP0:%.*]], i32 [[TMP1:%.*]])
+; SSE-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.v4i32(<4 x float> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x float> [[TMP6]]
+;
+; AVX-LABEL: @test5(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP0:%.*]], i32 [[TMP1:%.*]])
+; AVX-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.v4i32(<4 x float> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    ret <8 x float> [[TMP6]]
+;
+entry:
+  %4 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %0, i32 %1)
+  %5 = call <4 x float> @llvm.powi.v4f32.v4i32(<4 x float> %2, <4 x i32> %3)
+  %6 = shufflevector <4 x float> %4, <4 x float> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %6
+}
+
+declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
+declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i1> @llvm.is.fpclass.v4f32(<4 x float>, i32)
+declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32)
+declare <4 x float> @llvm.powi.v4f32.v4i32(<4 x float>, <4 x i32>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-V3: {{.*}}
+; CHECK-V4: {{.*}}
+; SSE2: {{.*}}
+; SSE4: {{.*}}