[llvm] refactored shuffle (PR #188194)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 24 01:14:04 PDT 2026
https://github.com/mahesh-attarde created https://github.com/llvm/llvm-project/pull/188194
None
>From 3bd9180a93ab979edd47fe885a8e54398687d5e5 Mon Sep 17 00:00:00 2001
From: mattarde <mattarde at intel.com>
Date: Tue, 24 Mar 2026 01:10:24 -0700
Subject: [PATCH] refactored shuffle
---
llvm/lib/Target/X86/CMakeLists.txt | 1 +
.../lib/Target/X86/GISel/X86LegalizerInfo.cpp | 238 +++++++++++
llvm/lib/Target/X86/GISel/X86LegalizerInfo.h | 3 +
llvm/lib/Target/X86/X86InstrGISel.td | 84 ++++
llvm/lib/Target/X86/X86ShuffleMatch.cpp | 400 ++++++++++++++++++
llvm/lib/Target/X86/X86ShuffleMatch.h | 64 +++
llvm/test/CodeGen/X86/gisel-shuffle-basic.ll | 145 +++++++
llvm/test/CodeGen/X86/isel-shuffle_1.ll | 67 +++
llvm/test/CodeGen/X86/shuffle-lower-part.ll | 91 ++++
9 files changed, 1093 insertions(+)
create mode 100644 llvm/lib/Target/X86/X86ShuffleMatch.cpp
create mode 100644 llvm/lib/Target/X86/X86ShuffleMatch.h
create mode 100644 llvm/test/CodeGen/X86/gisel-shuffle-basic.ll
create mode 100644 llvm/test/CodeGen/X86/isel-shuffle_1.ll
create mode 100644 llvm/test/CodeGen/X86/shuffle-lower-part.ll
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index c2dd34efb844d..56e4f6311d644 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -80,6 +80,7 @@ set(sources
X86ReturnThunks.cpp
X86SelectionDAGInfo.cpp
X86ShuffleDecodeConstantPool.cpp
+ X86ShuffleMatch.cpp
X86SpeculativeLoadHardening.cpp
X86SpeculativeExecutionSideEffectSuppression.cpp
X86Subtarget.cpp
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
index 41496937f61b1..87506032b93eb 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "X86LegalizerInfo.h"
+#include "X86ShuffleMatch.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
@@ -573,6 +574,15 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
{v8s64, v2s64},
{v8s64, v4s64}});
+  // G_SHUFFLE_VECTOR - Custom legalization to lower to X86-specific shuffles.
+  // One entry per (dst, src) vector type at each feature level:
+  // SSE1 covers 128-bit, AVX 256-bit, AVX512 512-bit vectors.
+  // (The original lists repeated the 32- and 64-bit element pairs twice;
+  // the duplicates are removed here.)
+  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
+      .customFor(HasSSE1,
+                 {{v16s8, v16s8}, {v8s16, v8s16}, {v4s32, v4s32}, {v2s64, v2s64}})
+      .customFor(HasAVX,
+                 {{v32s8, v32s8}, {v16s16, v16s16}, {v8s32, v8s32}, {v4s64, v4s64}})
+      .customFor(HasAVX512, {{v64s8, v64s8},
+                             {v32s16, v32s16},
+                             {v16s32, v16s32},
+                             {v8s64, v8s64}});
+
// todo: vectors and address spaces
getActionDefinitionsBuilder(G_SELECT)
.legalFor({{s16, s32}, {s32, s32}, {p0, s32}})
@@ -613,6 +623,8 @@ bool X86LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
return false;
case TargetOpcode::G_BUILD_VECTOR:
return legalizeBuildVector(MI, MRI, Helper);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return legalizeShuffleVector(MI, MRI, Helper);
case TargetOpcode::G_FPTOUI:
return legalizeFPTOUI(MI, MRI, Helper);
case TargetOpcode::G_UITOFP:
@@ -997,6 +1009,232 @@ bool X86LegalizerInfo::legalizeSETROUNDING(MachineInstr &MI,
return true;
}
+/// Custom-legalize G_SHUFFLE_VECTOR by matching the mask against a fixed
+/// sequence of x86 shuffle patterns (broadcast, SHUFP, UNPCKL/H, PSHUFD,
+/// VPERMILP, VPERMQ/VPERMPD, PSHUFB, BLENDV) and emitting the corresponding
+/// target-specific generic opcode.  Returns false when no pattern matches so
+/// that the generic lowering can take over.
+bool X86LegalizerInfo::legalizeShuffleVector(MachineInstr &MI,
+                                             MachineRegisterInfo &MRI,
+                                             LegalizerHelper &Helper) const {
+  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+  MachineFunction &MF = MIRBuilder.getMF();
+  const auto &ShuffleVec = cast<GShuffleVector>(MI);
+
+  Register Dst = ShuffleVec.getReg(0);
+  Register Src1 = ShuffleVec.getSrc1Reg();
+  Register Src2 = ShuffleVec.getSrc2Reg();
+  ArrayRef<int> Mask = ShuffleVec.getMask();
+
+  LLT DstTy = MRI.getType(Dst);
+  LLT SrcTy = MRI.getType(Src1);
+
+  unsigned NumElts = DstTy.getNumElements();
+  unsigned EltSize = DstTy.getScalarSizeInBits();
+  unsigned VecSize = DstTy.getSizeInBits();
+  unsigned NumSrcElts = SrcTy.getNumElements();
+
+  // NOTE(review): when SingleSource holds because Src1 == Src2 (rather than
+  // Src2 being undef), mask indices may still be >= NumSrcElts.  Several of
+  // the matchers and the PSHUFB/PSHUFD/VPERMQ mask builders below appear to
+  // assume indices < NumSrcElts in that case -- confirm.
+  bool Src2IsUndef = getOpcodeDef<GImplicitDef>(Src2, MRI) != nullptr;
+  bool SingleSource = Src2IsUndef || (Src1 == Src2);
+
+  // 1. Try BROADCAST pattern first (simplest)
+  if (X86::isBroadcastMask(Mask)) {
+    // Find the broadcast source index
+    int BroadcastIdx = -1;
+    for (int M : Mask) {
+      if (M >= 0) {
+        BroadcastIdx = M;
+        break;
+      }
+    }
+
+    if (BroadcastIdx >= 0 && Subtarget.hasAVX()) {
+      // Indices >= NumSrcElts refer to the second operand; rebase them.
+      Register SrcVec = ((unsigned)BroadcastIdx < NumSrcElts) ? Src1 : Src2;
+      unsigned Idx = ((unsigned)BroadcastIdx < NumSrcElts)
+                         ? BroadcastIdx
+                         : (BroadcastIdx - NumSrcElts);
+
+      // Extract the element
+      auto ExtractTy = LLT::scalar(EltSize);
+      auto Extract = MIRBuilder.buildExtractVectorElement(
+          ExtractTy, SrcVec, MIRBuilder.buildConstant(LLT::scalar(64), Idx));
+
+      // Broadcast it
+      MIRBuilder.buildInstr(X86::G_X86_VBROADCAST, {Dst}, {Extract});
+      MI.eraseFromParent();
+      return true;
+    }
+  }
+
+  // 2. Try SHUFPS/SHUFPD pattern (common for float shuffles)
+  if ((EltSize == 32 || EltSize == 64) && VecSize >= 128) {
+    unsigned Imm = 0;
+    bool Swap = false;
+
+    if (X86::matchShufpMask(Mask, NumElts, NumSrcElts, EltSize, SingleSource,
+                            Imm, Swap)) {
+      // matchShufpMask never reports Swap for a single-source mask, so the
+      // OpA/OpB selection below is consistent.
+      Register OpA = Swap ? Src2 : Src1;
+      Register OpB = SingleSource ? Src1 : (Swap ? Src1 : Src2);
+
+      MIRBuilder.buildInstr(X86::G_X86_SHUFP, {Dst}, {OpA, OpB}).addImm(Imm);
+      MI.eraseFromParent();
+      return true;
+    }
+  }
+
+  // 3. Try UNPCKL/UNPCKH patterns (interleaving)
+  // NOTE(review): unlike matchShufpMask, the unpack matchers are not told
+  // about SingleSource; if they ever returned Swap==true while Src2 is undef,
+  // OpA below would be the undef operand -- confirm that cannot happen.
+  if (VecSize >= 128) {
+    bool Swap = false;
+    if (X86::matchUnpackLowMask(Mask, NumElts, NumSrcElts, EltSize, Swap)) {
+      Register OpA = Swap ? Src2 : Src1;
+      Register OpB = SingleSource ? Src1 : (Swap ? Src1 : Src2);
+
+      MIRBuilder.buildInstr(X86::G_X86_UNPCKL, {Dst}, {OpA, OpB});
+      MI.eraseFromParent();
+      return true;
+    }
+
+    if (X86::matchUnpackHighMask(Mask, NumElts, NumSrcElts, EltSize, Swap)) {
+      Register OpA = Swap ? Src2 : Src1;
+      Register OpB = SingleSource ? Src1 : (Swap ? Src1 : Src2);
+
+      MIRBuilder.buildInstr(X86::G_X86_UNPCKH, {Dst}, {OpA, OpB});
+      MI.eraseFromParent();
+      return true;
+    }
+  }
+
+  // 4. Try PSHUFD pattern (32-bit element in-lane shuffles)
+  if (SingleSource && EltSize == 32 && Subtarget.hasSSE2()) {
+    unsigned Imm = 0;
+    if (X86::matchPshufdMask(Mask, NumElts, Imm)) {
+      MIRBuilder.buildInstr(X86::G_X86_PSHUFD, {Dst}, {Src1}).addImm(Imm);
+      MI.eraseFromParent();
+      return true;
+    }
+  }
+
+  // 5. Try VPERMILPS/VPERMILPD (AVX in-lane permutes)
+  if (SingleSource && Subtarget.hasAVX() && (EltSize == 32 || EltSize == 64)) {
+    int Imm = 0;
+    if (X86::matchVPermilMask(Mask, NumElts, EltSize, Imm)) {
+      if (Imm >= 0) {
+        // Immediate form
+        MIRBuilder.buildInstr(X86::G_X86_VPERMILPI, {Dst}, {Src1}).addImm(Imm);
+      } else {
+        // Variable form - need to build mask vector
+        // For now, skip variable form (can be added later)
+        return false;
+      }
+      MI.eraseFromParent();
+      return true;
+    }
+  }
+
+  // 6. Try VPERMQ/VPERMPD (AVX2 256-bit cross-lane)
+  if (SingleSource && Subtarget.hasAVX2() && VecSize == 256 && EltSize == 64) {
+    unsigned Imm = 0;
+    if (X86::matchVPermiMask(Mask, NumElts, Imm)) {
+      MIRBuilder.buildInstr(X86::G_X86_VPERMI, {Dst}, {Src1}).addImm(Imm);
+      MI.eraseFromParent();
+      return true;
+    }
+  }
+
+  // 7. Try PSHUFB (byte-level shuffle with SSSE3+)
+  if (SingleSource && Subtarget.hasSSSE3() && VecSize >= 128) {
+    if (X86::matchPshufbMask(Mask, NumElts, EltSize)) {
+      // Build the PSHUFB control mask as an IR constant vector and load it
+      // from the constant pool.
+      unsigned NumBytes = VecSize / 8;
+      unsigned BytesPerElt = EltSize / 8;
+      SmallVector<Constant *, 64> MaskConsts;
+
+      LLVMContext &Ctx = MF.getFunction().getContext();
+      IntegerType *I8Ty = IntegerType::get(Ctx, 8);
+
+      for (unsigned i = 0; i < NumBytes; ++i) {
+        unsigned EltIdx = i / BytesPerElt;
+        unsigned ByteInElt = i % BytesPerElt;
+
+        int M = Mask[EltIdx];
+        if (M < 0) {
+          // Undef - use 0x80 to zero the byte
+          MaskConsts.push_back(ConstantInt::get(I8Ty, 0x80));
+        } else {
+          // NOTE(review): M is used as an absolute index into Src1; if the
+          // single-source mask refers to the second (identical) operand,
+          // M >= NumSrcElts yields an out-of-range byte index -- verify.
+          unsigned SrcByteIdx = M * BytesPerElt + ByteInElt;
+          MaskConsts.push_back(ConstantInt::get(I8Ty, SrcByteIdx));
+        }
+      }
+
+      Constant *MaskCV = ConstantVector::get(MaskConsts);
+      const DataLayout &DL = MIRBuilder.getDataLayout();
+      unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
+      Align Alignment(DL.getABITypeAlign(MaskCV->getType()));
+
+      auto MaskAddr = MIRBuilder.buildConstantPool(
+          LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace)),
+          MF.getConstantPool()->getConstantPoolIndex(MaskCV, Alignment));
+
+      LLT MaskVecTy = LLT::fixed_vector(NumBytes, 8);
+      MachineMemOperand *MMO = MF.getMachineMemOperand(
+          MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
+          MaskVecTy, Alignment);
+
+      auto MaskReg = MIRBuilder.buildLoad(MaskVecTy, MaskAddr, *MMO);
+
+      // Bitcast source to byte vector
+      auto Src1Bytes = MIRBuilder.buildBitcast(MaskVecTy, Src1);
+
+      // Execute PSHUFB
+      auto Result = MIRBuilder.buildInstr(X86::G_X86_PSHUFB, {MaskVecTy},
+                                          {Src1Bytes, MaskReg});
+
+      // Bitcast result back
+      MIRBuilder.buildBitcast(Dst, Result);
+      MI.eraseFromParent();
+      return true;
+    }
+  }
+
+  // 8. Try BLENDV (SSE4.1+ variable blend)
+  if (Subtarget.hasSSE41() && !SingleSource && VecSize >= 128) {
+    if (X86::matchBlendMask(Mask, NumElts, NumSrcElts)) {
+      // Build blend mask: all-ones for src2, all-zeros for src1
+      // (BLENDV selects on the sign bit of each mask element).
+      SmallVector<Constant *, 64> MaskConsts;
+
+      LLVMContext &Ctx = MF.getFunction().getContext();
+      IntegerType *EltTy = IntegerType::get(Ctx, EltSize);
+
+      for (unsigned i = 0; i < NumElts; ++i) {
+        if (Mask[i] < 0 || Mask[i] == (int)i) {
+          // Select from src1 (sign bit = 0)
+          MaskConsts.push_back(Constant::getNullValue(EltTy));
+        } else {
+          // Select from src2 (sign bit = 1)
+          MaskConsts.push_back(Constant::getAllOnesValue(EltTy));
+        }
+      }
+
+      Constant *MaskCV = ConstantVector::get(MaskConsts);
+      const DataLayout &DL = MIRBuilder.getDataLayout();
+      unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
+      Align Alignment(DL.getABITypeAlign(MaskCV->getType()));
+
+      auto MaskAddr = MIRBuilder.buildConstantPool(
+          LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace)),
+          MF.getConstantPool()->getConstantPoolIndex(MaskCV, Alignment));
+
+      MachineMemOperand *MMO = MF.getMachineMemOperand(
+          MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
+          DstTy, Alignment);
+
+      auto MaskReg = MIRBuilder.buildLoad(DstTy, MaskAddr, *MMO);
+
+      MIRBuilder.buildInstr(X86::G_X86_BLENDV, {Dst}, {Src1, Src2, MaskReg});
+      MI.eraseFromParent();
+      return true;
+    }
+  }
+
+  // If no pattern matched, let it fall through to default lowering
+  return false;
+}
+
bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
return true;
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
index 09c727c8e8685..b422009c5a9b7 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
@@ -60,6 +60,9 @@ class X86LegalizerInfo : public LegalizerInfo {
bool legalizeSETROUNDING(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;
+
+ bool legalizeShuffleVector(MachineInstr &MI, MachineRegisterInfo &MRI,
+ LegalizerHelper &Helper) const;
};
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/X86/X86InstrGISel.td b/llvm/lib/Target/X86/X86InstrGISel.td
index b0c6bb6f61ad8..f5dd47bc250c9 100644
--- a/llvm/lib/Target/X86/X86InstrGISel.td
+++ b/llvm/lib/Target/X86/X86InstrGISel.td
@@ -41,7 +41,91 @@ def G_FLDCW16 : X86GenericInstruction {
let mayLoad = true;
}
+// X86-specific shuffle operations
+
+// PSHUFB - Packed Shuffle Bytes (SSSE3+)
+// Byte-level shuffle using control mask from second operand
+def G_X86_PSHUFB : X86GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src, type0:$mask);
+ let hasSideEffects = false;
+}
+
+// PSHUFD/PSHUFLW/PSHUFHW - Shuffle doublewords/low words/high words
+// In-lane shuffle with immediate control
+def G_X86_PSHUFD : X86GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src, untyped_imm_0:$imm);
+ let hasSideEffects = false;
+}
+
+// SHUFPS/SHUFPD - Shuffle packed single/double precision floats
+def G_X86_SHUFP : X86GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, type0:$src2, untyped_imm_0:$imm);
+ let hasSideEffects = false;
+}
+
+// UNPCKL/UNPCKH - Unpack low/high data
+def G_X86_UNPCKL : X86GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, type0:$src2);
+ let hasSideEffects = false;
+}
+
+def G_X86_UNPCKH : X86GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, type0:$src2);
+ let hasSideEffects = false;
+}
+
+// VBROADCAST - Broadcast a single element to all elements
+def G_X86_VBROADCAST : X86GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src);
+ let hasSideEffects = false;
+}
+
+// VPERMILPS/VPERMILPD - Permute in-lane with variable control (AVX+)
+def G_X86_VPERMILPV : X86GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src, type0:$mask);
+ let hasSideEffects = false;
+}
+
+// VPERMILPS/VPERMILPD - Permute in-lane with immediate control (AVX+)
+def G_X86_VPERMILPI : X86GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src, untyped_imm_0:$imm);
+ let hasSideEffects = false;
+}
+
+// VPERMQ/VPERMPD - Permute 256-bit lanes (AVX2+)
+def G_X86_VPERMI : X86GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src, untyped_imm_0:$imm);
+ let hasSideEffects = false;
+}
+
+// BLENDVPS/BLENDVPD/PBLENDVB - Variable blend using mask register (SSE4.1+)
+def G_X86_BLENDV : X86GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, type0:$src2, type0:$mask);
+ let hasSideEffects = false;
+}
+
def : GINodeEquiv<G_FILD, X86fild>;
def : GINodeEquiv<G_FIST, X86fp_to_mem>;
def : GINodeEquiv<G_FNSTCW16, X86fp_cwd_get16>;
def : GINodeEquiv<G_FLDCW16, X86fp_cwd_set16>;
+
+// X86 Shuffle Node Equivalences
+def : GINodeEquiv<G_X86_PSHUFB, X86pshufb>;
+def : GINodeEquiv<G_X86_SHUFP, X86Shufp>;
+def : GINodeEquiv<G_X86_UNPCKL, X86Unpckl>;
+def : GINodeEquiv<G_X86_UNPCKH, X86Unpckh>;
+def : GINodeEquiv<G_X86_VBROADCAST, X86VBroadcast>;
+def : GINodeEquiv<G_X86_VPERMILPV, X86VPermilpv>;
+def : GINodeEquiv<G_X86_VPERMILPI, X86VPermilpi>;
+def : GINodeEquiv<G_X86_VPERMI, X86VPermi>;
+def : GINodeEquiv<G_X86_BLENDV, X86Blendv>;
diff --git a/llvm/lib/Target/X86/X86ShuffleMatch.cpp b/llvm/lib/Target/X86/X86ShuffleMatch.cpp
new file mode 100644
index 0000000000000..0f93e6eb4d358
--- /dev/null
+++ b/llvm/lib/Target/X86/X86ShuffleMatch.cpp
@@ -0,0 +1,400 @@
+//===-- X86ShuffleMatch.cpp - X86 Shuffle Pattern Matching ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements shared shuffle pattern matching functions that can be
+// used by both SelectionDAG and GlobalISel lowering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleMatch.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace llvm;
+
+/// Compute the SHUFPS/SHUFPD immediate encoding for a per-lane shuffle mask.
+/// SHUFPS uses 2 bits per element, SHUFPD 1 bit; undef slots encode as the
+/// identity position within their half-lane.
+static unsigned getShufpImm(ArrayRef<int> LaneMask, unsigned EltsPerLane) {
+  const unsigned BitsPerIdx = (EltsPerLane == 4) ? 2 : 1;
+  const unsigned IdxMask = (1u << BitsPerIdx) - 1;
+  const unsigned HalfLane = EltsPerLane / 2;
+
+  unsigned Imm = 0;
+  for (unsigned I = 0; I != EltsPerLane; ++I) {
+    int M = LaneMask[I];
+    unsigned Val = (M < 0) ? (I < HalfLane ? I : I - HalfLane) : (unsigned)M;
+    Imm |= (Val & IdxMask) << (I * BitsPerIdx);
+  }
+  return Imm;
+}
+
+/// Match a SHUFPS/SHUFPD pattern: within each 128-bit lane, the low half of
+/// the result comes from one operand and the high half from the other, with
+/// the same in-lane selection (one immediate) repeated across all lanes.
+/// Tries both operand orders; on success fills Imm and reports via Swap
+/// whether the operands must be exchanged.
+bool X86::matchShufpMask(ArrayRef<int> Mask, unsigned NumElts,
+                         unsigned NumSrcElts, unsigned EltSize,
+                         bool SingleSource, unsigned &Imm, bool &Swap) {
+  unsigned EltsPerLane = 128 / EltSize; // 4 for 32-bit, 2 for 64-bit
+  unsigned NumLanes = NumElts / EltsPerLane;
+  unsigned HalfLane = EltsPerLane / 2;
+
+  // SHUFPS/SHUFPD only work for 32-bit or 64-bit elements
+  if (EltSize != 32 && EltSize != 64)
+    return false;
+
+  for (int Attempt = 0; Attempt < 2; ++Attempt) {
+    bool TrySwap = (Attempt == 1);
+    // A single-source shuffle never needs the swapped operand order.
+    if (TrySwap && SingleSource)
+      break;
+
+    bool Valid = true;
+    unsigned FirstLaneImm = 0;
+
+    for (unsigned Lane = 0; Lane < NumLanes && Valid; ++Lane) {
+      unsigned LaneStart = Lane * EltsPerLane;
+      SmallVector<int, 4> LaneMask(EltsPerLane);
+
+      for (unsigned i = 0; i < EltsPerLane && Valid; ++i) {
+        int M = Mask[LaneStart + i];
+
+        if (M < 0) {
+          // Undef slot: leave unconstrained; getShufpImm picks an identity.
+          LaneMask[i] = -1;
+          continue;
+        }
+
+        bool FromSrc2 = ((unsigned)M >= NumSrcElts);
+        unsigned SrcIdx = FromSrc2 ? (M - NumSrcElts) : M;
+
+        // Must reference the same lane in the source
+        if (SrcIdx / EltsPerLane != Lane) {
+          Valid = false;
+          break;
+        }
+        unsigned SrcLaneOff = SrcIdx % EltsPerLane;
+
+        if (i < HalfLane) {
+          // Low half: should come from the first operand
+          bool WantSrc2 = TrySwap;
+          if (!SingleSource && FromSrc2 != WantSrc2) {
+            Valid = false;
+            break;
+          }
+        } else {
+          // High half: should come from the second operand
+          bool WantSrc2 = !TrySwap;
+          if (!SingleSource && FromSrc2 != WantSrc2) {
+            Valid = false;
+            break;
+          }
+        }
+
+        LaneMask[i] = SrcLaneOff;
+      }
+
+      if (!Valid)
+        break;
+
+      // A single immediate drives every lane, so all lanes must agree.
+      unsigned CurImm = getShufpImm(LaneMask, EltsPerLane);
+      if (Lane == 0)
+        FirstLaneImm = CurImm;
+      else if (CurImm != FirstLaneImm)
+        Valid = false;
+    }
+
+    if (Valid) {
+      Imm = FirstLaneImm;
+      Swap = TrySwap;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool X86::isBroadcastMask(ArrayRef<int> Mask) {
+  // A broadcast replicates one source element: every defined mask entry must
+  // agree on a single index, and at least one entry must be defined.
+  int Idx = -1;
+  for (int M : Mask) {
+    if (M < 0)
+      continue; // Undef entries are unconstrained.
+    if (Idx >= 0 && M != Idx)
+      return false;
+    Idx = M;
+  }
+  return Idx >= 0;
+}
+
+bool X86::matchBlendMask(ArrayRef<int> Mask, unsigned NumElts,
+                         unsigned NumSrcElts) {
+  // A blend keeps every element in place, choosing position i from either
+  // src1[i] or src2[i]; undef entries are unconstrained.
+  for (unsigned I = 0; I != NumElts; ++I) {
+    int M = Mask[I];
+    if (M >= 0 && M != (int)I && M != (int)(I + NumSrcElts))
+      return false;
+  }
+  return true;
+}
+
+/// Match an UNPCKL pattern: within each 128-bit lane the result interleaves
+/// the low halves of the two sources, result[2k] = srcA[k] and
+/// result[2k+1] = srcB[k].  Both operand orders are tried; on success Swap
+/// reports whether the operands must be exchanged.
+/// (Fixes: removed the unused HalfLane local and merged the duplicated
+/// even/odd branches, whose expected index was identical.)
+bool X86::matchUnpackLowMask(ArrayRef<int> Mask, unsigned NumElts,
+                             unsigned NumSrcElts, unsigned EltSize,
+                             bool &Swap) {
+  unsigned EltsPerLane = 128 / EltSize;
+  unsigned NumLanes = NumElts / EltsPerLane;
+
+  for (int Attempt = 0; Attempt < 2; ++Attempt) {
+    bool TrySwap = (Attempt == 1);
+    bool Valid = true;
+
+    for (unsigned Lane = 0; Lane < NumLanes && Valid; ++Lane) {
+      unsigned LaneStart = Lane * EltsPerLane;
+
+      for (unsigned i = 0; i < EltsPerLane && Valid; ++i) {
+        int M = Mask[LaneStart + i];
+        if (M < 0)
+          continue; // Undef slots are unconstrained.
+
+        bool FromSrc2 = ((unsigned)M >= NumSrcElts);
+        unsigned SrcIdx = FromSrc2 ? (M - NumSrcElts) : M;
+
+        // Even result slots read the first operand, odd slots the second;
+        // both walk the low half of the same source lane.
+        bool ExpectSrc2 = ((i % 2) == 0) ? TrySwap : !TrySwap;
+        unsigned ExpectIdx = LaneStart + (i / 2);
+
+        if (FromSrc2 != ExpectSrc2 || SrcIdx != ExpectIdx) {
+          Valid = false;
+          break;
+        }
+      }
+    }
+
+    if (Valid) {
+      Swap = TrySwap;
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Match an UNPCKH pattern: within each 128-bit lane the result interleaves
+/// the high halves of the two sources, result[2k] = srcA[Half+k] and
+/// result[2k+1] = srcB[Half+k].  Both operand orders are tried; on success
+/// Swap reports whether the operands must be exchanged.
+bool X86::matchUnpackHighMask(ArrayRef<int> Mask, unsigned NumElts,
+                              unsigned NumSrcElts, unsigned EltSize,
+                              bool &Swap) {
+  unsigned EltsPerLane = 128 / EltSize;
+  unsigned NumLanes = NumElts / EltsPerLane;
+  unsigned HalfLane = EltsPerLane / 2;
+
+  for (bool TrySwap : {false, true}) {
+    bool Valid = true;
+
+    for (unsigned Lane = 0; Lane != NumLanes && Valid; ++Lane) {
+      unsigned LaneStart = Lane * EltsPerLane;
+
+      for (unsigned Pos = 0; Pos != EltsPerLane; ++Pos) {
+        int M = Mask[LaneStart + Pos];
+        if (M < 0)
+          continue; // Undef slots are unconstrained.
+
+        // Even result slots read the first operand, odd slots the second;
+        // both walk the high half of the same source lane.
+        bool ExpectSrc2 = ((Pos % 2) == 0) ? TrySwap : !TrySwap;
+        unsigned ExpectIdx = LaneStart + HalfLane + (Pos / 2);
+
+        bool FromSrc2 = ((unsigned)M >= NumSrcElts);
+        unsigned SrcIdx = FromSrc2 ? (M - NumSrcElts) : M;
+
+        if (FromSrc2 != ExpectSrc2 || SrcIdx != ExpectIdx) {
+          Valid = false;
+          break;
+        }
+      }
+    }
+
+    if (Valid) {
+      Swap = TrySwap;
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Match a PSHUFD pattern: each 128-bit lane is permuted at dword granularity
+/// by a single shared 8-bit immediate, so every lane must use the same
+/// in-lane selection.  Fills Imm on success.
+/// (Fixes: undef entries were rewritten to the absolute index `i`, which for
+/// any lane past the first falls outside that lane's range and spuriously
+/// rejected the mask; undef now defaults to the in-lane identity position.)
+bool X86::matchPshufdMask(ArrayRef<int> Mask, unsigned NumElts, unsigned &Imm) {
+  // PSHUFD works on 4 doublewords (128-bit lane)
+  if (NumElts != 4 && NumElts != 8 && NumElts != 16)
+    return false;
+
+  unsigned LaneSize = 4; // 4 x i32 per 128-bit lane
+  unsigned NumLanes = NumElts / LaneSize;
+
+  unsigned FirstLaneImm = 0;
+  for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
+    unsigned LaneStart = Lane * LaneSize;
+    unsigned SrcLaneBase = Lane * LaneSize;
+    unsigned LaneImm = 0;
+
+    for (unsigned i = 0; i < LaneSize; ++i) {
+      int M = Mask[LaneStart + i];
+      if (M < 0)
+        M = SrcLaneBase + i; // Treat undef as in-lane identity.
+
+      // Each element must come from its own 128-bit lane.
+      if ((unsigned)M < SrcLaneBase || (unsigned)M >= SrcLaneBase + LaneSize)
+        return false;
+
+      unsigned Idx = M - SrcLaneBase;
+      LaneImm |= (Idx << (i * 2));
+    }
+
+    // One immediate drives every lane, so all lanes must agree.
+    if (Lane == 0)
+      FirstLaneImm = LaneImm;
+    else if (LaneImm != FirstLaneImm)
+      return false;
+  }
+
+  Imm = FirstLaneImm;
+  return true;
+}
+
+/// Match a PSHUFB-able pattern: a byte-element shuffle in which no element
+/// moves across a 128-bit lane boundary (PSHUFB cannot cross lanes).
+bool X86::matchPshufbMask(ArrayRef<int> Mask, unsigned NumElts,
+                          unsigned EltSize) {
+  // PSHUFB works at byte granularity
+  if (EltSize != 8)
+    return false;
+
+  // Check that no shuffle crosses 128-bit lane boundaries
+  // NOTE(review): M is treated as an absolute index into the first source.
+  // If the caller's single-source mask refers to the second (identical)
+  // operand, M may be >= NumElts and the lane comparison below spuriously
+  // rejects the mask -- confirm against the caller's SingleSource handling.
+  for (unsigned i = 0; i < NumElts; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      continue; // undef is OK
+
+    unsigned DstLane = i / 16;
+    unsigned SrcLane = M / 16;
+    if (DstLane != SrcLane)
+      return false;
+  }
+
+  return true;
+}
+
+/// Match a VPERMILPS/VPERMILPD pattern: an in-lane permute of 32- or 64-bit
+/// elements.  Returns true with Imm >= 0 when every lane uses the same
+/// pattern (immediate form), or Imm == -1 when a per-lane variable control
+/// vector would be required.
+bool X86::matchVPermilMask(ArrayRef<int> Mask, unsigned NumElts,
+                           unsigned EltSize, int &Imm) {
+  // VPERMILPS works on 4 elements per lane, VPERMILPD on 2 elements per lane
+  if (EltSize != 32 && EltSize != 64)
+    return false;
+
+  unsigned EltsPerLane = 128 / EltSize;
+  unsigned NumLanes = NumElts / EltsPerLane;
+
+  // Check if this is an immediate form (same pattern in all lanes).
+  // Record lane 0's in-lane offsets (-1 for undef) as the reference pattern.
+  SmallVector<int, 4> FirstLaneMask;
+  for (unsigned i = 0; i < EltsPerLane; ++i) {
+    FirstLaneMask.push_back(Mask[i] >= 0 ? (Mask[i] % EltsPerLane) : -1);
+  }
+
+  bool IsImmediate = true;
+  for (unsigned Lane = 1; Lane < NumLanes; ++Lane) {
+    unsigned LaneStart = Lane * EltsPerLane;
+    for (unsigned i = 0; i < EltsPerLane; ++i) {
+      int M = Mask[LaneStart + i];
+      int ExpectedM = FirstLaneMask[i];
+
+      // Undef must line up with undef, defined with an equal in-lane offset;
+      // any mismatch forces the variable form.
+      if (M < 0 && ExpectedM < 0)
+        continue;
+      if (M >= 0 && ExpectedM >= 0) {
+        unsigned MLane = M % EltsPerLane;
+        if (MLane != (unsigned)ExpectedM) {
+          IsImmediate = false;
+          break;
+        }
+      } else {
+        IsImmediate = false;
+        break;
+      }
+    }
+    if (!IsImmediate)
+      break;
+  }
+
+  // Check all references stay in-lane
+  // NOTE(review): SrcLane is computed from M directly, which assumes mask
+  // indices reference the first operand (i.e. M < NumElts) -- confirm.
+  for (unsigned i = 0; i < NumElts; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+
+    unsigned DstLane = i / EltsPerLane;
+    unsigned SrcLane = M / EltsPerLane;
+    if (DstLane != SrcLane)
+      return false;
+  }
+
+  if (IsImmediate && EltsPerLane == 4) {
+    // Compute VPERMILPS immediate (2 bits per element)
+    unsigned ImmVal = 0;
+    for (unsigned i = 0; i < 4; ++i) {
+      int M = FirstLaneMask[i];
+      if (M < 0)
+        M = i;
+      ImmVal |= (M & 3) << (i * 2);
+    }
+    Imm = ImmVal;
+  } else if (IsImmediate && EltsPerLane == 2) {
+    // Compute VPERMILPD immediate (1 bit per element)
+    unsigned ImmVal = 0;
+    for (unsigned i = 0; i < 2; ++i) {
+      int M = FirstLaneMask[i];
+      if (M < 0)
+        M = i;
+      ImmVal |= (M & 1) << i;
+    }
+    Imm = ImmVal;
+  } else {
+    Imm = -1; // Variable mask form
+  }
+
+  return true;
+}
+
+/// Match a VPERMQ/VPERMPD pattern: a cross-lane permute of 4 x 64-bit
+/// elements in a 256-bit vector (AVX2), encoded as one 8-bit immediate with
+/// 2 bits per destination element.  Fills Imm on success.
+/// (Fixes: removed the dead `M < 0` half of the range check -- M has just
+/// been clamped to the non-negative identity index.)
+bool X86::matchVPermiMask(ArrayRef<int> Mask, unsigned NumElts, unsigned &Imm) {
+  if (NumElts != 4)
+    return false;
+
+  unsigned ImmVal = 0;
+  for (unsigned i = 0; i < 4; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      M = i; // Treat undef as identity
+
+    // Only indices into the single source operand are encodable.
+    if (M >= 4)
+      return false;
+
+    ImmVal |= (M & 3) << (i * 2);
+  }
+
+  Imm = ImmVal;
+  return true;
+}
diff --git a/llvm/lib/Target/X86/X86ShuffleMatch.h b/llvm/lib/Target/X86/X86ShuffleMatch.h
new file mode 100644
index 0000000000000..98881d68e9c96
--- /dev/null
+++ b/llvm/lib/Target/X86/X86ShuffleMatch.h
@@ -0,0 +1,64 @@
+//===-- X86ShuffleMatch.h - X86 Shuffle Pattern Matching --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines shared shuffle pattern matching functions that can be used
+// by both SelectionDAG and GlobalISel lowering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86SHUFFLEMATCH_H
+#define LLVM_LIB_TARGET_X86_X86SHUFFLEMATCH_H
+
+#include "llvm/ADT/ArrayRef.h"
+
+namespace llvm {
+namespace X86 {
+
+/// Check if a shuffle mask matches a SHUFPS/SHUFPD pattern.
+/// Returns true if the mask can be implemented with SHUFP, filling in Imm
+/// with the immediate value and Swap with whether operands should be swapped.
+bool matchShufpMask(ArrayRef<int> Mask, unsigned NumElts, unsigned NumSrcElts,
+ unsigned EltSize, bool SingleSource, unsigned &Imm,
+ bool &Swap);
+
+/// Check if a shuffle mask is a simple broadcast pattern.
+bool isBroadcastMask(ArrayRef<int> Mask);
+
+/// Check if a shuffle mask matches a blend pattern where each element comes
+/// from either src1[i] or src2[i].
+bool matchBlendMask(ArrayRef<int> Mask, unsigned NumElts, unsigned NumSrcElts);
+
+/// Check if a shuffle mask matches UNPCKL pattern.
+bool matchUnpackLowMask(ArrayRef<int> Mask, unsigned NumElts,
+ unsigned NumSrcElts, unsigned EltSize, bool &Swap);
+
+/// Check if a shuffle mask matches UNPCKH pattern.
+bool matchUnpackHighMask(ArrayRef<int> Mask, unsigned NumElts,
+ unsigned NumSrcElts, unsigned EltSize, bool &Swap);
+
+/// Check if a shuffle mask can be implemented with PSHUFD.
+/// Returns true and fills in Imm if successful.
+bool matchPshufdMask(ArrayRef<int> Mask, unsigned NumElts, unsigned &Imm);
+
+/// Check if a shuffle mask can be implemented with PSHUFB.
+/// Returns true if the mask is byte-aligned and doesn't cross 128-bit lanes.
+bool matchPshufbMask(ArrayRef<int> Mask, unsigned NumElts, unsigned EltSize);
+
+/// Check if shuffle mask can be implemented as VPERMILPS/VPERMILPD.
+/// Returns true and fills in Imm for immediate form, or returns true with
+/// Imm=-1 for variable mask form.
+bool matchVPermilMask(ArrayRef<int> Mask, unsigned NumElts, unsigned EltSize,
+ int &Imm);
+
+/// Check if shuffle mask can be implemented as VPERMQ/VPERMPD (AVX2 cross-lane).
+bool matchVPermiMask(ArrayRef<int> Mask, unsigned NumElts, unsigned &Imm);
+
+} // namespace X86
+} // namespace llvm
+
+#endif
diff --git a/llvm/test/CodeGen/X86/gisel-shuffle-basic.ll b/llvm/test/CodeGen/X86/gisel-shuffle-basic.ll
new file mode 100644
index 0000000000000..dbaf7c171e074
--- /dev/null
+++ b/llvm/test/CodeGen/X86/gisel-shuffle-basic.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=AVX2
+
+; Test basic shuffle patterns in GlobalISel
+
+; SHUFPS pattern
+define <4 x float> @test_shufps(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: test_shufps:
+; SSE2: # %bb.0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_shufps:
+; AVX: # %bb.0:
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: retq
+ %res = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x float> %res
+}
+
+; UNPCKL pattern
+define <4 x i32> @test_unpckl(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_unpckl:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_unpckl:
+; AVX: # %bb.0:
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: retq
+ %res = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i32> %res
+}
+
+; UNPCKH pattern
+define <4 x i32> @test_unpckh(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_unpckh:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_unpckh:
+; AVX: # %bb.0:
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: retq
+ %res = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i32> %res
+}
+
+; PSHUFD pattern
+define <4 x i32> @test_pshufd(<4 x i32> %a) {
+; SSE2-LABEL: test_pshufd:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_pshufd:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX-NEXT: retq
+ %res = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x i32> %res
+}
+
+; Broadcast pattern
+define <4 x i32> @test_broadcast(<4 x i32> %a) {
+; SSE2-LABEL: test_broadcast:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_broadcast:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX-NEXT: retq
+ %res = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %res
+}
+
+; BLENDV pattern
+define <4 x i32> @test_blendv(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_blendv:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_blendv:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pblendvb {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7,8,9,10,11,12,13,14,15]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_blendv:
+; AVX: # %bb.0:
+; AVX-NEXT: vpblendvb {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: retq
+ %res = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+; VPERMILPS pattern (AVX in-lane permute)
+define <4 x float> @test_vpermilps(<4 x float> %a) {
+; SSE2-LABEL: test_vpermilps:
+; SSE2: # %bb.0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_vpermilps:
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT: retq
+ %res = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+ ret <4 x float> %res
+}
+
+; VPERMQ pattern (AVX2 cross-lane)
+define <4 x i64> @test_vpermq(<4 x i64> %a) {
+; AVX-LABEL: test_vpermq:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_vpermq:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,3,2]
+; AVX2-NEXT: retq
+ %res = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 2, i32 0, i32 3, i32 2>
+ ret <4 x i64> %res
+}
+
+; 256-bit SHUFPS
+define <8 x float> @test_shufps_256(<8 x float> %a, <8 x float> %b) {
+; AVX-LABEL: test_shufps_256:
+; AVX: # %bb.0:
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1],ymm0[4,5],ymm1[4,5]
+; AVX-NEXT: retq
+ %res = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
+ ret <8 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/isel-shuffle_1.ll b/llvm/test/CodeGen/X86/isel-shuffle_1.ll
new file mode 100644
index 0000000000000..1ffdc33486dee
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-shuffle_1.ll
@@ -0,0 +1,67 @@
+; Function Attrs: nounwind
+define <2 x double> @broadcast_v2f64(<2 x double> %a, <2 x double> %b) #0 {
+ %s = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %s
+}
+
+; Function Attrs: nounwind
+; define <2 x double> @broadcast_v2f64_from_second(<2 x double> %a, <2 x double> %b) #0 {
+; %s = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
+; ret <2 x double> %s
+; }
+
+; ; Function Attrs: nounwind
+; define <4 x float> @broadcast_v4f32(<4 x float> %a, <4 x float> %b) #0 {
+; %s = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
+; ret <4 x float> %s
+; }
+
+; ; Function Attrs: nounwind
+; define <4 x float> @broadcast_v4f32_lane1(<4 x float> %a, <4 x float> %b) #0 {
+; %s = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; ret <4 x float> %s
+; }
+
+; ; Function Attrs: nounwind
+; define <4 x float> @broadcast_v4f32_from_second(<4 x float> %a, <4 x float> %b) #0 {
+; %s = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+; ret <4 x float> %s
+; }
+
+; ; Function Attrs: nounwind
+; define <8 x i16> @broadcast_v8i16(<8 x i16> %a, <8 x i16> %b) #0 {
+; %s = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> zeroinitializer
+; ret <8 x i16> %s
+; }
+
+; ; Function Attrs: nounwind
+; define <8 x i16> @broadcast_v8i16_lane3(<8 x i16> %a, <8 x i16> %b) #0 {
+; %s = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; ret <8 x i16> %s
+; }
+
+; ; Function Attrs: nounwind
+; define <16 x i8> @broadcast_v16i8(<16 x i8> %a, <16 x i8> %b) #0 {
+; %s = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
+; ret <16 x i8> %s
+; }
+
+; ; Function Attrs: nounwind
+; define <16 x i8> @broadcast_v16i8_lane7(<16 x i8> %a, <16 x i8> %b) #0 {
+; %s = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; ret <16 x i8> %s
+; }
+
+; ; Function Attrs: nounwind
+; define <2 x i64> @broadcast_v2i64(<2 x i64> %a, <2 x i64> %b) #0 {
+; %s = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> zeroinitializer
+; ret <2 x i64> %s
+; }
+
+; ; Function Attrs: nounwind
+; define <2 x i64> @broadcast_v2i64_lane1(<2 x i64> %a, <2 x i64> %b) #0 {
+; %s = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+; ret <2 x i64> %s
+; }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/shuffle-lower-part.ll b/llvm/test/CodeGen/X86/shuffle-lower-part.ll
new file mode 100644
index 0000000000000..41d3b3b328c0f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/shuffle-lower-part.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+
+; Test shufflevector operations that extract only the lower part of vectors
+
+define <2 x i32> @shuffle_v4i32_lower_half(<4 x i32> %a) {
+ %result = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+ ret <2 x i32> %result
+}
+
+define <4 x i32> @shuffle_v8i32_lower_half(<8 x i32> %a) {
+ %result = shufflevector <8 x i32> %a, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %result
+}
+
+define <2 x i64> @shuffle_v4i64_lower_half(<4 x i64> %a) {
+ %result = shufflevector <4 x i64> %a, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+ ret <2 x i64> %result
+}
+
+define <4 x i64> @shuffle_v8i64_lower_half(<8 x i64> %a) {
+ %result = shufflevector <8 x i64> %a, <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i64> %result
+}
+
+define <2 x float> @shuffle_v4f32_lower_half(<4 x float> %a) {
+ %result = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 1>
+ ret <2 x float> %result
+}
+
+define <4 x float> @shuffle_v8f32_lower_half(<8 x float> %a) {
+ %result = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %result
+}
+
+define <2 x double> @shuffle_v4f64_lower_half(<4 x double> %a) {
+ %result = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 1>
+ ret <2 x double> %result
+}
+
+define <4 x double> @shuffle_v8f64_lower_half(<8 x double> %a) {
+ %result = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %result
+}
+
+define <4 x i16> @shuffle_v8i16_lower_half(<8 x i16> %a) {
+ %result = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i16> %result
+}
+
+define <8 x i16> @shuffle_v16i16_lower_half(<16 x i16> %a) {
+ %result = shufflevector <16 x i16> %a, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %result
+}
+
+define <8 x i8> @shuffle_v16i8_lower_half(<16 x i8> %a) {
+ %result = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i8> %result
+}
+
+define <16 x i8> @shuffle_v32i8_lower_half(<32 x i8> %a) {
+ %result = shufflevector <32 x i8> %a, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %result
+}
+
+; Test extracting lower quarter (first 1/4 of elements)
+define <2 x i32> @shuffle_v8i32_lower_quarter(<8 x i32> %a) {
+ %result = shufflevector <8 x i32> %a, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
+ ret <2 x i32> %result
+}
+
+; Test with undef instead of poison
+define <2 x i32> @shuffle_v4i32_lower_half_undef(<4 x i32> %a) {
+ %result = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ ret <2 x i32> %result
+}
+
+; Test extracting just the first element as a single-element vector
+define <1 x i64> @shuffle_v2i64_first_element(<2 x i64> %a) {
+ %result = shufflevector <2 x i64> %a, <2 x i64> poison, <1 x i32> <i32 0>
+ ret <1 x i64> %result
+}
+
+; Test with 3 out of 4 elements from lower part
+define <3 x i32> @shuffle_v4i32_lower_three(<4 x i32> %a) {
+ %result = shufflevector <4 x i32> %a, <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+ ret <3 x i32> %result
+}
More information about the llvm-commits
mailing list