[llvm] [AMDGPU] Identify vector idiom to unlock SROA (PR #156791)

Yaxun Liu via llvm-commits llvm-commits at lists.llvm.org
Sun Sep 14 19:23:58 PDT 2025


https://github.com/yxsamliu updated https://github.com/llvm/llvm-project/pull/156791

>From c4a9bf261fed0f0a392b243241c5cc5d2adda72f Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Fri, 12 Sep 2025 10:47:40 -0400
Subject: [PATCH] [AMDGPU] Identify vector idiom to unlock SROA

HIP vector types often lower to aggregates and get copied with memcpy.
When the source or destination is chosen via a pointer select, SROA
cannot split the aggregate. This keeps data in stack slots and increases
scratch traffic. By rewriting these memcpy idioms, we enable SROA to
promote values, reducing stack usage and improving occupancy and
bandwidth on AMD GPUs.

For example:

%p = select i1 %cond, ptr %A, ptr %B

call void @llvm.memcpy.p0.p0.i32(ptr %dst, ptr %p, i32 16, i1 false)

When the source is a pointer select and conditions allow, the pass
replaces the memcpy with two aligned loads, a value-level select of the
loaded vectors, and one aligned store. If it is not safe to speculate
both loads, it splits control flow and emits a memcpy in each arm. When
the destination is a select, it always splits control flow to avoid
speculative stores. Vector element types are chosen based on size and
minimum proven alignment to minimize the number of operations.
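
For the example above, when speculation succeeds and 16-byte alignment can
be proven, the rewrite looks roughly like this (a sketch mirroring the
added lit test; the value names are illustrative):

%va = load <4 x i32>, ptr %A, align 16
%vb = load <4 x i32>, ptr %B, align 16
%v = select i1 %cond, <4 x i32> %va, <4 x i32> %vb
store <4 x i32> %v, ptr %dst, align 16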

The pass handles non-volatile, constant-length memcpy calls up to a small
size cap. Source and destination must be in the same address space;
volatile and cross-address-space copies are skipped. The pass runs early,
after inlining and before InferAddressSpaces and SROA.

The size cap is controlled by -amdgpu-vector-idiom-max-bytes (default
32), allowing tuning for different workloads.
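
The pass is registered with the new pass manager as "amdgpu-vector-idiom",
so it can also be exercised standalone, for example (the input file name
is a placeholder):

  opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-vector-idiom -S in.ll

as in the RUN line of the added lit test.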

With contributions from Ron Lieberman.

Fixes: SWDEV-550134
---
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   2 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   9 +
 llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp  | 487 ++++++++++++++++++
 llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.h    |  43 ++
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 .../amdgpu-vector-idiom-memcpy-select.ll      | 424 +++++++++++++++
 6 files changed, 966 insertions(+)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.h
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-vector-idiom-memcpy-select.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 9449e70930913..1e730218722b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -67,6 +67,8 @@ FUNCTION_PASS("amdgpu-simplifylib", AMDGPUSimplifyLibCallsPass())
 FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
               AMDGPUUnifyDivergentExitNodesPass())
 FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
+FUNCTION_PASS("amdgpu-vector-idiom",
+              AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32))
 FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
 #undef FUNCTION_PASS
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 92a587b5771b6..1249e25114e1f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -34,6 +34,7 @@
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
 #include "AMDGPUUnifyDivergentExitNodes.h"
+#include "AMDGPUVectorIdiom.h"
 #include "AMDGPUWaitSGPRHazards.h"
 #include "GCNDPPCombine.h"
 #include "GCNIterativeScheduler.h"
@@ -905,6 +906,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
             EnablePromoteKernelArguments)
           FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
 
+        // Run vector-idiom canonicalization early (after inlining) and before
+        // infer-AS / SROA to maximize scalarization opportunities.
+        // Specify 32 bytes since the largest HIP vector types are double4 or
+        // long4.
+        FPM.addPass(AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32));
+
         // Add infer address spaces pass to the opt pipeline after inlining
         // but before SROA to increase SROA opportunities.
         FPM.addPass(InferAddressSpacesPass());
@@ -953,6 +960,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         if (EnableLowerModuleLDS)
           PM.addPass(AMDGPULowerModuleLDSPass(*this));
         if (Level != OptimizationLevel::O0) {
+          PM.addPass(createModuleToFunctionPassAdaptor(
+              AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32)));
           // We only want to run this with O2 or higher since inliner and SROA
           // don't run in O1.
           if (Level != OptimizationLevel::O1) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp b/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp
new file mode 100644
index 0000000000000..b1194f8282b8e
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp
@@ -0,0 +1,487 @@
+//===- AMDGPUVectorIdiom.cpp ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// AMDGPU-specific vector idiom canonicalizations to unblock SROA and
+// subsequent scalarization/vectorization.
+//
+// Motivation:
+// - HIP vector types are often modeled as structs and copied with memcpy.
+//   Address-level selects on such copies block SROA. Converting to value-level
+//   operations or splitting the CFG enables SROA to break aggregates, which
+//   unlocks scalarization/vectorization on AMDGPU.
+//
+// Example pattern:
+//   %src = select i1 %c, ptr %A, ptr %B
+//   call void @llvm.memcpy(ptr %dst, ptr %src, i32 16, i1 false)
+//
+// Objectives:
+// - Canonicalize small memcpy patterns where source or destination is a select
+// of pointers.
+// - Prefer value-level selects (on loaded values) over address-level selects
+// when safe.
+// - When speculation is unsafe, split the CFG to isolate each arm.
+//
+// Assumptions:
+// - Only handles non-volatile memcpy with constant length N where 0 < N <=
+// MaxBytes (default 32).
+// - Source and destination must be in the same address space.
+// - Speculative loads are allowed only if a conservative alignment check
+// passes.
+// - No speculative stores are introduced.
+//
+// Transformations:
+// - Source-select memcpy: attempt speculative loads -> value select -> single
+// store.
+//   Fallback is CFG split with two memcpy calls.
+// - Destination-select memcpy: always CFG split to avoid speculative stores.
+//
+// Run this pass early, before SROA.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUVectorIdiom.h"
+#include "AMDGPU.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "amdgpu-vector-idiom"
+
+namespace {
+
+static cl::opt<bool>
+    AMDGPUVectorIdiomEnable("amdgpu-vector-idiom-enable",
+                            cl::desc("Enable pass AMDGPUVectorIdiom"),
+                            cl::init(true));
+
+// Selects an integer-vector type whose total size matches NBytes, using the
+// minimum proven alignment to decide the widest safe lane width.
+// Assumptions:
+// - Pointee types are opaque; the element choice is based solely on size and
+// alignment.
+// - Falls back to <N x i8> if wider lanes are not safe/aligned.
+static Type *getIntOrVecTypeForSize(uint64_t NBytes, LLVMContext &Ctx,
+                                    Align MinProvenAlign = Align(1)) {
+  auto CanUseI64 = [&]() { return MinProvenAlign >= Align(8); };
+  auto CanUseI32 = [&]() { return MinProvenAlign >= Align(4); };
+  auto CanUseI16 = [&]() { return MinProvenAlign >= Align(2); };
+
+  if (NBytes == 32 && CanUseI64())
+    return FixedVectorType::get(Type::getInt64Ty(Ctx), 4);
+
+  if ((NBytes % 4) == 0 && CanUseI32())
+    return FixedVectorType::get(Type::getInt32Ty(Ctx), NBytes / 4);
+
+  if ((NBytes % 2) == 0 && CanUseI16())
+    return FixedVectorType::get(Type::getInt16Ty(Ctx), NBytes / 2);
+
+  return FixedVectorType::get(Type::getInt8Ty(Ctx), NBytes);
+}
+
+static Align minAlign(Align A, Align B) { return A < B ? A : B; }
+
+// Checks if both pointer operands can be speculatively loaded for N bytes and
+// computes the minimum alignment to use.
+// Notes:
+// - Intentionally conservative: relies on isDereferenceablePointer and
+//   getOrEnforceKnownAlignment.
+// - AA/TLI are not used for deeper reasoning here.
+// Emits verbose LLVM_DEBUG logs explaining why speculation is disallowed.
+// Return false reasons include: either arm not dereferenceable or computed
+// known alignment < 1.
+static bool bothArmsSafeToSpeculateLoads(Value *A, Value *B, uint64_t Size,
+                                         Align &OutAlign, const DataLayout &DL,
+                                         AssumptionCache *AC,
+                                         const DominatorTree *DT,
+                                         Instruction *CtxI) {
+  APInt SizeAPInt(DL.getIndexTypeSizeInBits(A->getType()), Size);
+  if (!isDereferenceableAndAlignedPointer(B, Align(1), SizeAPInt, DL, CtxI, AC,
+                                          DT, nullptr)) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: false arm "
+                      << "(B) not dereferenceable for " << Size
+                      << " bytes at align(1)\n");
+    LLVM_DEBUG(dbgs() << "    false arm (B) value: " << *B << '\n');
+    return false;
+  }
+
+  Align AlignB =
+      llvm::getOrEnforceKnownAlignment(B, Align(1), DL, nullptr, AC, DT);
+
+  if (AlignB < Align(1)) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: known "
+                      << "alignment of false arm (B) < 1: " << AlignB.value()
+                      << '\n');
+    return false;
+  }
+
+  if (!isDereferenceableAndAlignedPointer(A, Align(1), SizeAPInt, DL, CtxI, AC,
+                                          DT, nullptr)) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: true arm "
+                      << "(A) not dereferenceable for " << Size
+                      << " bytes at align(1)\n");
+    LLVM_DEBUG(dbgs() << "    true arm (A) value: " << *A << '\n');
+    return false;
+  }
+
+  Align AlignA =
+      llvm::getOrEnforceKnownAlignment(A, Align(1), DL, nullptr, AC, DT);
+
+  if (AlignA < Align(1)) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: known "
+                      << "alignment of true arm (A) < 1: " << AlignA.value()
+                      << '\n');
+    return false;
+  }
+
+  OutAlign = minAlign(AlignA, AlignB);
+  LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Speculative loads allowed: "
+                    << "minAlign=" << OutAlign.value() << '\n');
+  return true;
+}
+
+struct AMDGPUVectorIdiomImpl {
+  const unsigned MaxBytes;
+  bool CFGChanged = false;
+
+  AMDGPUVectorIdiomImpl(unsigned MaxBytes) : MaxBytes(MaxBytes) {}
+
+  // Rewrites memcpy when the source is a select of pointers. Prefers a
+  // value-level select (two loads + select + one store) if speculative loads
+  // are safe. Otherwise, falls back to a guarded CFG split with two memcpy
+  // calls. Assumptions:
+  // - Non-volatile, constant length, within MaxBytes.
+  // - Source and destination in the same address space.
+  bool transformSelectMemcpySource(MemCpyInst &MT, SelectInst &Sel,
+                                   const DataLayout &DL,
+                                   const DominatorTree *DT,
+                                   AssumptionCache *AC) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Considering memcpy(select-src): "
+                      << MT << '\n');
+    IRBuilder<> B(&MT);
+    Value *Dst = MT.getRawDest();
+    Value *A = Sel.getTrueValue();
+    Value *Bv = Sel.getFalseValue();
+
+    ConstantInt *LenCI = cast<ConstantInt>(MT.getLength());
+    uint64_t N = LenCI->getLimitedValue();
+
+    if (Sel.isVolatile()) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not rewriting: Select marked "
+                        << "volatile (unexpected) in memcpy source\n");
+      return false;
+    }
+
+    // This is a null check - always use CFG split
+    Value *Cond = Sel.getCondition();
+    ICmpInst *ICmp = dyn_cast<ICmpInst>(Cond);
+    if (ICmp && ICmp->isEquality() &&
+        (isa<ConstantPointerNull>(ICmp->getOperand(0)) ||
+         isa<ConstantPointerNull>(ICmp->getOperand(1)))) {
+      splitCFGForMemcpy(MT, Sel.getCondition(), A, Bv, true);
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Null check pattern - "
+                           "using CFG split\n");
+      return true;
+    }
+
+    Align DstAlign = MaybeAlign(MT.getDestAlign()).valueOrOne();
+    Align AlignAB;
+    bool CanSpeculate = false;
+
+    const CallBase &CB = MT;
+    const unsigned SrcArgIdx = 1;
+    uint64_t DerefBytes = CB.getParamDereferenceableBytes(SrcArgIdx);
+    bool HasDerefOrNull =
+        CB.paramHasAttr(SrcArgIdx, Attribute::DereferenceableOrNull);
+    bool HasNonNull = CB.paramHasAttr(SrcArgIdx, Attribute::NonNull);
+    MaybeAlign SrcParamAlign = CB.getParamAlign(SrcArgIdx);
+    Align ProvenSrcAlign =
+        SrcParamAlign.value_or(MaybeAlign(MT.getSourceAlign()).valueOrOne());
+
+    if (DerefBytes > 0) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] memcpy source param attrs: "
+                        << "dereferenceable(" << DerefBytes << ")"
+                        << (HasDerefOrNull ? " (or null)" : "")
+                        << (HasNonNull ? ", nonnull" : "") << ", align "
+                        << ProvenSrcAlign.value() << '\n');
+      if (DerefBytes >= N && (!HasDerefOrNull || HasNonNull)) {
+        LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Using memcpy source operand "
+                          << "attributes at this use; accepting speculation\n");
+        CanSpeculate = true;
+        AlignAB = ProvenSrcAlign;
+      } else {
+        LLVM_DEBUG(
+            dbgs() << "[AMDGPUVectorIdiom] Source param attrs not strong "
+                   << "enough for speculation: need dereferenceable(" << N
+                   << ") and nonnull; got dereferenceable(" << DerefBytes << ")"
+                   << (HasDerefOrNull ? " (or null)" : "")
+                   << (HasNonNull ? ", nonnull" : "") << '\n');
+      }
+    } else {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] memcpy source param has no "
+                        << "dereferenceable bytes attribute; align "
+                        << ProvenSrcAlign.value() << '\n');
+    }
+    if (!CanSpeculate)
+      CanSpeculate =
+          bothArmsSafeToSpeculateLoads(A, Bv, N, AlignAB, DL, AC, DT, &MT);
+
+    if (CanSpeculate) {
+      Align MinAlign = std::min(AlignAB, DstAlign);
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewriting memcpy(select-src) "
+                        << "with value-level select; N=" << N
+                        << " minAlign=" << MinAlign.value() << '\n');
+
+      Type *Ty = getIntOrVecTypeForSize(N, B.getContext(), MinAlign);
+
+      LoadInst *LA = B.CreateAlignedLoad(Ty, A, MinAlign);
+      LoadInst *LB = B.CreateAlignedLoad(Ty, Bv, MinAlign);
+      Value *V = B.CreateSelect(Sel.getCondition(), LA, LB);
+
+      (void)B.CreateAlignedStore(V, Dst, DstAlign);
+
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewrote memcpy(select-src) to "
+                           "value-select loads/stores: "
+                        << MT << '\n');
+      MT.eraseFromParent();
+      return true;
+    }
+
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Falling back to CFG split for "
+                      << "memcpy(select-src); speculation unsafe\n");
+    splitCFGForMemcpy(MT, Sel.getCondition(), A, Bv, true);
+    LLVM_DEBUG(
+        dbgs()
+        << "[AMDGPUVectorIdiom] Rewrote memcpy(select-src) by CFG split\n");
+    return true;
+  }
+
+  // Rewrites memcpy when the destination is a select of pointers. To avoid
+  // speculative stores, always splits the CFG and emits a memcpy per branch.
+  // Assumptions mirror the source case.
+  bool transformSelectMemcpyDest(MemCpyInst &MT, SelectInst &Sel) {
+    Value *DA = Sel.getTrueValue();
+    Value *DB = Sel.getFalseValue();
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewriting memcpy(select-dst) via "
+                      << "CFG split to avoid speculative stores: " << MT
+                      << '\n');
+
+    splitCFGForMemcpy(MT, Sel.getCondition(), DA, DB, false);
+    LLVM_DEBUG(
+        dbgs()
+        << "[AMDGPUVectorIdiom] Rewrote memcpy(select-dst) by CFG split\n");
+    return true;
+  }
+
+  // Splits the CFG around a memcpy whose source or destination depends on a
+  // condition. Clones memcpy in then/else using TruePtr/FalsePtr and rejoins.
+  // Assumptions:
+  // - MT has constant length and is non-volatile.
+  // - TruePtr/FalsePtr are correct replacements for the selected operand.
+  void splitCFGForMemcpy(MemCpyInst &MT, Value *Cond, Value *TruePtr,
+                         Value *FalsePtr, bool IsSource) {
+    CFGChanged = true;
+
+    Function *F = MT.getFunction();
+    BasicBlock *Cur = MT.getParent();
+    BasicBlock *ThenBB = BasicBlock::Create(F->getContext(), "memcpy.then", F);
+    BasicBlock *ElseBB = BasicBlock::Create(F->getContext(), "memcpy.else", F);
+    BasicBlock *JoinBB =
+        Cur->splitBasicBlock(BasicBlock::iterator(&MT), "memcpy.join");
+
+    Cur->getTerminator()->eraseFromParent();
+    IRBuilder<> B(Cur);
+    B.CreateCondBr(Cond, ThenBB, ElseBB);
+
+    ConstantInt *LenCI = cast<ConstantInt>(MT.getLength());
+
+    IRBuilder<> BT(ThenBB);
+    if (IsSource) {
+      (void)BT.CreateMemCpy(MT.getRawDest(), MT.getDestAlign(), TruePtr,
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    } else {
+      (void)BT.CreateMemCpy(TruePtr, MT.getDestAlign(), MT.getRawSource(),
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    }
+    BT.CreateBr(JoinBB);
+
+    IRBuilder<> BE(ElseBB);
+    if (IsSource) {
+      (void)BE.CreateMemCpy(MT.getRawDest(), MT.getDestAlign(), FalsePtr,
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    } else {
+      (void)BE.CreateMemCpy(FalsePtr, MT.getDestAlign(), MT.getRawSource(),
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    }
+    BE.CreateBr(JoinBB);
+
+    MT.eraseFromParent();
+  }
+};
+
+} // end anonymous namespace
+
+AMDGPUVectorIdiomCombinePass::AMDGPUVectorIdiomCombinePass(unsigned MaxBytes)
+    : MaxBytes(MaxBytes) {}
+
+// Pass driver that locates small, constant-size, non-volatile memcpy calls
+// where source or destination is a select in the same address space. Applies
+// the source/destination transforms described above. Intended to run early to
+// maximize SROA and subsequent optimizations.
+PreservedAnalyses
+AMDGPUVectorIdiomCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+  auto &AC = FAM.getResult<AssumptionAnalysis>(F);
+
+  if (!AMDGPUVectorIdiomEnable)
+    return PreservedAnalyses::all();
+
+  SmallVector<MemCpyInst *, 8> Worklist;
+  for (Instruction &I : instructions(F)) {
+    if (auto *MC = dyn_cast<MemCpyInst>(&I))
+      Worklist.push_back(MC);
+  }
+
+  bool Changed = false;
+  AMDGPUVectorIdiomImpl Impl(MaxBytes);
+
+  for (MemCpyInst *MT : Worklist) {
+    Value *Dst = MT->getRawDest();
+    Value *Src = MT->getRawSource();
+    if (!isa<SelectInst>(Src) && !isa<SelectInst>(Dst))
+      continue;
+
+    LLVM_DEBUG({
+      Value *DstV = MT->getRawDest();
+      Value *SrcV = MT->getRawSource();
+      unsigned DstAS = cast<PointerType>(DstV->getType())->getAddressSpace();
+      unsigned SrcAS = cast<PointerType>(SrcV->getType())->getAddressSpace();
+      Value *LenV = MT->getLength();
+
+      auto dumpPtrForms = [&](StringRef Label, Value *V) {
+        dbgs() << "      " << Label << ": " << *V << '\n';
+
+        Value *StripCasts = V->stripPointerCasts();
+        if (StripCasts != V)
+          dbgs() << "        - stripCasts: " << *StripCasts << '\n';
+        else
+          dbgs() << "        - stripCasts: (no change)\n";
+
+        Value *Underlying = getUnderlyingObject(V);
+        if (Underlying != V)
+          dbgs() << "        - underlying: " << *Underlying << '\n';
+        else
+          dbgs() << "        - underlying: (no change)\n";
+      };
+
+      auto dumpSelect = [&](StringRef Which, Value *V) {
+        if (auto *SI = dyn_cast<SelectInst>(V)) {
+          dbgs() << "  - " << Which << " is Select: " << *SI << '\n';
+          dbgs() << "      cond: " << *SI->getCondition() << '\n';
+          Value *T = SI->getTrueValue();
+          Value *Fv = SI->getFalseValue();
+          dumpPtrForms("true", T);
+          dumpPtrForms("false", Fv);
+        }
+      };
+
+      dbgs() << "[AMDGPUVectorIdiom] Found memcpy: " << *MT << '\n'
+             << "  in function: " << F.getName() << '\n'
+             << "  - volatile=" << (MT->isVolatile() ? "true" : "false") << '\n'
+             << "  - sameAS=" << (DstAS == SrcAS ? "true" : "false")
+             << " (dstAS=" << DstAS << ", srcAS=" << SrcAS << ")\n"
+             << "  - constLen=" << (isa<ConstantInt>(LenV) ? "true" : "false");
+      if (auto *LCI = dyn_cast<ConstantInt>(LenV))
+        dbgs() << " (N=" << LCI->getLimitedValue() << ")";
+      dbgs() << '\n'
+             << "  - srcIsSelect=" << (isa<SelectInst>(SrcV) ? "true" : "false")
+             << '\n'
+             << "  - dstIsSelect=" << (isa<SelectInst>(DstV) ? "true" : "false")
+             << '\n';
+
+      dumpSelect("src", SrcV);
+      dumpSelect("dst", DstV);
+    });
+
+    if (MT->isVolatile()) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: memcpy is volatile\n");
+      continue;
+    }
+
+    ConstantInt *LenCI = dyn_cast<ConstantInt>(MT->getLength());
+    if (!LenCI) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: memcpy length is not a "
+                        << "constant integer\n");
+      continue;
+    }
+
+    uint64_t N = LenCI->getLimitedValue();
+    if (N == 0 || N > MaxBytes) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: memcpy size out of range "
+                        << "(N=" << N << ", MaxBytes=" << MaxBytes << ")\n");
+      continue;
+    }
+
+    unsigned DstAS = cast<PointerType>(Dst->getType())->getAddressSpace();
+    unsigned SrcAS = cast<PointerType>(Src->getType())->getAddressSpace();
+    if (DstAS != SrcAS) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: address space mismatch "
+                        << "(dstAS=" << DstAS << ", srcAS=" << SrcAS << ")\n");
+      continue;
+    }
+
+    if (auto *Sel = dyn_cast<SelectInst>(Src)) {
+      Changed |= Impl.transformSelectMemcpySource(*MT, *Sel, DL, &DT, &AC);
+      continue;
+    }
+    if (auto *Sel = dyn_cast<SelectInst>(Dst)) {
+      Changed |= Impl.transformSelectMemcpyDest(*MT, *Sel);
+      continue;
+    }
+
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: neither source nor "
+                      << "destination is a select of pointers\n");
+  }
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  // Be conservative: preserve only analyses we know remain valid.
+  PreservedAnalyses PA;
+  PA.preserve<AssumptionAnalysis>();
+  PA.preserve<TargetLibraryAnalysis>();
+  PA.preserve<TargetIRAnalysis>();
+
+  // If we didn't change the CFG, we can keep DT/LI/PostDT.
+  if (!Impl.CFGChanged) {
+    PA.preserve<DominatorTreeAnalysis>();
+    PA.preserve<LoopAnalysis>();
+    PA.preserve<PostDominatorTreeAnalysis>();
+  }
+
+  return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.h b/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.h
new file mode 100644
index 0000000000000..339a604a092a0
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.h
@@ -0,0 +1,43 @@
+//===- AMDGPUVectorIdiom.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU-specific vector idiom canonicalizations to unblock SROA and
+// subsequent scalarization/vectorization.
+//
+// This pass rewrites memcpy with select-fed operands into either:
+//  - a value-level select (two loads + select + store), when safe to
+//    speculatively load both arms, or
+//  - a conservative CFG split around the condition to isolate each arm.
+//
+// Run this pass early, before SROA.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUVECTORIDIOM_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUVECTORIDIOM_H
+
+#include "AMDGPU.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class AMDGPUVectorIdiomCombinePass
+    : public PassInfoMixin<AMDGPUVectorIdiomCombinePass> {
+  unsigned MaxBytes;
+
+public:
+  /// \p MaxBytes is max memcpy size (in bytes) to transform in
+  /// AMDGPUVectorIdiom
+  AMDGPUVectorIdiomCombinePass(unsigned MaxBytes);
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUVECTORIDIOM_H
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index aae56eef73edd..a90512d77c288 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -119,6 +119,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUTargetTransformInfo.cpp
   AMDGPUWaitSGPRHazards.cpp
   AMDGPUUnifyDivergentExitNodes.cpp
+  AMDGPUVectorIdiom.cpp
   R600MachineCFGStructurizer.cpp
   GCNCreateVOPD.cpp
   GCNDPPCombine.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-vector-idiom-memcpy-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-vector-idiom-memcpy-select.ll
new file mode 100644
index 0000000000000..0354dcc3c818d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-vector-idiom-memcpy-select.ll
@@ -0,0 +1,424 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -amdgpu-vector-idiom-enable -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-vector-idiom -S %s | FileCheck %s
+
+; This test verifies the AMDGPUVectorIdiomCombinePass transforms:
+; 1) memcpy with select-fed source into a value-level select between two loads,
+;    followed by one store (when it's safe to speculate both loads).
+; 2) memcpy with select-fed destination into a control-flow split with two memcpys.
+
+@G0 = addrspace(1) global [4 x i32] zeroinitializer, align 16
+@G1 = addrspace(1) global [4 x i32] zeroinitializer, align 16
+
+declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg)
+
+; -----------------------------------------------------------------------------
+; Source is a select. Expect value-level select of two <4 x i32> loads
+; and a single store, with no remaining memcpy.
+;
+define amdgpu_kernel void @value_select_src(ptr addrspace(1) %dst, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @value_select_src(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) @G0, i64 0, i64 0
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) @G1, i64 0, i64 0
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[A]], ptr addrspace(1) [[B]]
+; CHECK-NEXT:    [[LA:%.*]] = load <4 x i32>, ptr addrspace(1) [[A]], align 16
+; CHECK-NEXT:    [[LB:%.*]] = load <4 x i32>, ptr addrspace(1) [[B]], align 16
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND]], <4 x i32> [[LA]], <4 x i32> [[LB]]
+; CHECK-NEXT:    store <4 x i32> [[SEL]], ptr addrspace(1) [[DST]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  ; Pointers to two 16-byte aligned buffers in the same addrspace(1).
+  %pa = getelementptr inbounds [4 x i32], ptr addrspace(1) @G0, i64 0, i64 0
+  %pb = getelementptr inbounds [4 x i32], ptr addrspace(1) @G1, i64 0, i64 0
+  %src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
+
+  ; Provide explicit operand alignments so the pass can emit an aligned store.
+  call void @llvm.memcpy.p1.p1.i64(
+  ptr addrspace(1) align 16 %dst,
+  ptr addrspace(1) align 16 %src,
+  i64 16, i1 false)
+
+  ret void
+}
+
+; -----------------------------------------------------------------------------
+; Destination is a select. Expect CFG split with two memcpys guarded
+; by a branch (we do not speculate stores in this pass).
+;
+define amdgpu_kernel void @dest_select_cfg_split(ptr addrspace(1) %da, ptr addrspace(1) %db,
+; CHECK-LABEL: define amdgpu_kernel void @dest_select_cfg_split(
+; CHECK-SAME: ptr addrspace(1) [[DA:%.*]], ptr addrspace(1) [[DB:%.*]], ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) [[DA]], ptr addrspace(1) [[DB]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
+; CHECK:       [[MEMCPY_JOIN:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[MEMCPY_THEN]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DA]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+; CHECK:       [[MEMCPY_ELSE]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DB]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+;
+  ptr addrspace(1) %src, i1 %cond) {
+entry:
+  %dst = select i1 %cond, ptr addrspace(1) %da, ptr addrspace(1) %db
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+
+; -----------------------------------------------------------------------------
+; Source is a select, 4 x double (32 bytes).
+; Expect value-level select of two <4 x i64> loads and a single store, no memcpy.
+;
+@G2 = addrspace(1) global [4 x double] zeroinitializer, align 32
+@G3 = addrspace(1) global [4 x double] zeroinitializer, align 32
+define amdgpu_kernel void @value_select_src_4xd(ptr addrspace(1) %dst, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @value_select_src_4xd(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[PA:%.*]] = getelementptr inbounds [4 x double], ptr addrspace(1) @G2, i64 0, i64 0
+; CHECK-NEXT:    [[PB:%.*]] = getelementptr inbounds [4 x double], ptr addrspace(1) @G3, i64 0, i64 0
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PA]], ptr addrspace(1) [[PB]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr addrspace(1) [[PA]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(1) [[PB]], align 32
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND]], <4 x i64> [[TMP0]], <4 x i64> [[TMP1]]
+; CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr addrspace(1) [[DST]], align 32
+; CHECK-NEXT:    ret void
+;
+entry:
+  %pa = getelementptr inbounds [4 x double], ptr addrspace(1) @G2, i64 0, i64 0
+  %pb = getelementptr inbounds [4 x double], ptr addrspace(1) @G3, i64 0, i64 0
+  %src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
+
+  call void @llvm.memcpy.p1.p1.i64(
+  ptr addrspace(1) align 32 %dst,
+  ptr addrspace(1) align 32 %src,
+  i64 32, i1 false)
+
+  ret void
+}
+
+; -----------------------------------------------------------------------------
+; Source is a select, 3 x char (3 bytes).
+; Expect value-level select using <3 x i8> loads/stores, no memcpy.
+;
+@G4 = addrspace(1) global [3 x i8] zeroinitializer, align 1
+@G5 = addrspace(1) global [3 x i8] zeroinitializer, align 1
+define amdgpu_kernel void @value_select_src_3xc(ptr addrspace(1) %dst, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @value_select_src_3xc(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[PA:%.*]] = getelementptr inbounds [3 x i8], ptr addrspace(1) @G4, i64 0, i64 0
+; CHECK-NEXT:    [[PB:%.*]] = getelementptr inbounds [3 x i8], ptr addrspace(1) @G5, i64 0, i64 0
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PA]], ptr addrspace(1) [[PB]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i8>, ptr addrspace(1) [[PA]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, ptr addrspace(1) [[PB]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND]], <3 x i8> [[TMP0]], <3 x i8> [[TMP1]]
+; CHECK-NEXT:    store <3 x i8> [[TMP2]], ptr addrspace(1) [[DST]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %pa = getelementptr inbounds [3 x i8], ptr addrspace(1) @G4, i64 0, i64 0
+  %pb = getelementptr inbounds [3 x i8], ptr addrspace(1) @G5, i64 0, i64 0
+  %src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
+
+  call void @llvm.memcpy.p1.p1.i64(
+  ptr addrspace(1) align 1 %dst,
+  ptr addrspace(1) align 1 %src,
+  i64 3, i1 false)
+
+  ret void
+}
+
+; -----------------------------------------------------------------------------
+; Source is a select with constant expression GEP arms.
+; Expect value-level select: two loads + select + store, no memcpy.
+;
+@GEPA = addrspace(1) global [4 x i32] zeroinitializer, align 16
+@GEPB = addrspace(1) global [4 x i32] zeroinitializer, align 16
+define amdgpu_kernel void @value_select_src_constexpr_gep(ptr addrspace(1) %dst, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @value_select_src_constexpr_gep(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) @GEPA, ptr addrspace(1) @GEPB
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(1) @GEPA, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) @GEPB, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]]
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[DST]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  ; Constant expression GEPs to the base elements
+  %src = select i1 %cond,
+  ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GEPA, i64 0, i64 0),
+  ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GEPB, i64 0, i64 0)
+
+  call void @llvm.memcpy.p1.p1.i64(
+  ptr addrspace(1) align 16 %dst,
+  ptr addrspace(1) align 16 %src,
+  i64 16, i1 false)
+
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Destination is a select with constant expression GEP arms.
+; Expect CFG split with two memcpys.
+;
+define amdgpu_kernel void @dest_select_constexpr_gep(ptr addrspace(1) %src, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @dest_select_constexpr_gep(
+; CHECK-SAME: ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) @GEPA, ptr addrspace(1) @GEPB
+; CHECK-NEXT:    br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
+; CHECK:       [[MEMCPY_JOIN:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[MEMCPY_THEN]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) @GEPA, ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+; CHECK:       [[MEMCPY_ELSE]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) @GEPB, ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+;
+entry:
+  %dst = select i1 %cond,
+  ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GEPA, i64 0, i64 0),
+  ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GEPB, i64 0, i64 0)
+
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Source is a select where one arm is null.
+; Expect CFG split (no speculative loads).
+;
+@GN = addrspace(1) global [4 x i32] zeroinitializer, align 16
+define amdgpu_kernel void @src_select_null_arm(ptr addrspace(1) %dst, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @src_select_null_arm(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[NONNULL:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) @GN, i64 0, i64 0
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[NONNULL]], ptr addrspace(1) null
+; CHECK-NEXT:    br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
+; CHECK:       [[MEMCPY_JOIN:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[MEMCPY_THEN]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[NONNULL]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+; CHECK:       [[MEMCPY_ELSE]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) null, i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+;
+entry:
+  %nonnull = getelementptr inbounds [4 x i32], ptr addrspace(1) @GN, i64 0, i64 0
+  %src = select i1 %cond, ptr addrspace(1) %nonnull, ptr addrspace(1) null
+
+  call void @llvm.memcpy.p1.p1.i64(
+  ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Destination is a select where one arm is null.
+; Expect CFG split (no speculative stores).
+;
+define amdgpu_kernel void @dst_select_null_arm(ptr addrspace(1) %src, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @dst_select_null_arm(
+; CHECK-SAME: ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) null, ptr addrspace(1) @GN
+; CHECK-NEXT:    br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
+; CHECK:       [[MEMCPY_JOIN:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[MEMCPY_THEN]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) null, ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+; CHECK:       [[MEMCPY_ELSE]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) @GN, ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+;
+entry:
+  %dst = select i1 %cond, ptr addrspace(1) null,
+  ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GN, i64 0, i64 0)
+
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Source is a select where one arm is poison.
+; Expect CFG split (speculative use of poison is not allowed).
+;
+@GP = addrspace(1) global [4 x i32] zeroinitializer, align 16
+define amdgpu_kernel void @src_select_poison_arm(ptr addrspace(1) %dst, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @src_select_poison_arm(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[NONNULL:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) @GP, i64 0, i64 0
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[NONNULL]], ptr addrspace(1) poison
+; CHECK-NEXT:    br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
+; CHECK:       [[MEMCPY_JOIN:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[MEMCPY_THEN]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[NONNULL]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+; CHECK:       [[MEMCPY_ELSE]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) poison, i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+;
+entry:
+  %nonnull = getelementptr inbounds [4 x i32], ptr addrspace(1) @GP, i64 0, i64 0
+  %src = select i1 %cond, ptr addrspace(1) %nonnull, ptr addrspace(1) poison
+
+  call void @llvm.memcpy.p1.p1.i64(
+  ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Destination is a select where one arm is poison.
+; Expect CFG split.
+;
+define amdgpu_kernel void @dst_select_poison_arm(ptr addrspace(1) %src, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @dst_select_poison_arm(
+; CHECK-SAME: ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) poison, ptr addrspace(1) @GP
+; CHECK-NEXT:    br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
+; CHECK:       [[MEMCPY_JOIN:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[MEMCPY_THEN]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) poison, ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+; CHECK:       [[MEMCPY_ELSE]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) @GP, ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+;
+entry:
+  %dst = select i1 %cond, ptr addrspace(1) poison,
+  ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GP, i64 0, i64 0)
+
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Non-constant memcpy length: the pass should not transform.
+; Expect: memcpy remains as-is (no load/select/store, no CFG split).
+;
+define amdgpu_kernel void @memcpy_nonconst_length_src_select(ptr addrspace(1) %dst,
+; CHECK-LABEL: define amdgpu_kernel void @memcpy_nonconst_length_src_select(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[PA:%.*]], ptr addrspace(1) [[PB:%.*]], i1 [[COND:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PA]], ptr addrspace(1) [[PB]]
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 [[N]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  ptr addrspace(1) %pa,
+  ptr addrspace(1) %pb,
+  i1 %cond, i64 %n) {
+entry:
+  %src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst,
+  ptr addrspace(1) %src,
+  i64 %n, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Non-constant memcpy length with destination select: pass should not transform.
+; Expect: memcpy remains, no CFG split.
+;
+define amdgpu_kernel void @memcpy_nonconst_length_dst_select(ptr addrspace(1) %da,
+; CHECK-LABEL: define amdgpu_kernel void @memcpy_nonconst_length_dst_select(
+; CHECK-SAME: ptr addrspace(1) [[DA:%.*]], ptr addrspace(1) [[DB:%.*]], ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) [[DA]], ptr addrspace(1) [[DB]]
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 [[N]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  ptr addrspace(1) %db,
+  ptr addrspace(1) %src,
+  i1 %cond, i64 %n) {
+entry:
+  %dst = select i1 %cond, ptr addrspace(1) %da, ptr addrspace(1) %db
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst,
+  ptr addrspace(1) %src,
+  i64 %n, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; "Non-constant source" scenario: select arms are function args (not globals).
+; No strong dereferenceable/align attrs -> speculation should be unsafe,
+; so the pass should split CFG and materialize two memcpys.
+;
+define amdgpu_kernel void @memcpy_src_select_arg_arms_cfg_split(ptr addrspace(1) %dst,
+; CHECK-LABEL: define amdgpu_kernel void @memcpy_src_select_arg_arms_cfg_split(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[PA:%.*]], ptr addrspace(1) [[PB:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PA]], ptr addrspace(1) [[PB]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
+; CHECK:       [[MEMCPY_JOIN:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[MEMCPY_THEN]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[PA]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+; CHECK:       [[MEMCPY_ELSE]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[PB]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+;
+  ptr addrspace(1) %pa,
+  ptr addrspace(1) %pb,
+  i1 %cond) {
+entry:
+  %src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst,
+  ptr addrspace(1) %src,
+  i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; memmove should be ignored by the pass even with select-fed source/dest.
+; Expect: memmove remains as-is (no CFG split, no speculative transform).
+;
+declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture writeonly,
+  ptr addrspace(1) nocapture readonly,
+  i64, i1 immarg)
+
+define amdgpu_kernel void @memmove_ignored_src_select(ptr addrspace(1) %dst,
+; CHECK-LABEL: define amdgpu_kernel void @memmove_ignored_src_select(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[PA:%.*]], ptr addrspace(1) [[PB:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PA]], ptr addrspace(1) [[PB]]
+; CHECK-NEXT:    call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  ptr addrspace(1) %pa,
+  ptr addrspace(1) %pb,
+  i1 %cond) {
+entry:
+  %src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
+  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst,
+  ptr addrspace(1) %src,
+  i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; memset should be ignored by the pass, even if destination is a select.
+; Expect: memset remains as-is (no CFG split).
+;
+declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture writeonly,
+  i8, i64, i1 immarg)
+
+define amdgpu_kernel void @memset_ignored_dst_select(ptr addrspace(1) %da,
+; CHECK-LABEL: define amdgpu_kernel void @memset_ignored_dst_select(
+; CHECK-SAME: ptr addrspace(1) [[DA:%.*]], ptr addrspace(1) [[DB:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) [[DA]], ptr addrspace(1) [[DB]]
+; CHECK-NEXT:    call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST]], i8 0, i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  ptr addrspace(1) %db,
+  i1 %cond) {
+entry:
+  %dst = select i1 %cond, ptr addrspace(1) %da, ptr addrspace(1) %db
+  call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 0, i64 16, i1 false)
+  ret void
+}


