[llvm] [AMDGPU] Identify vector idiom to unlock SROA (PR #161200)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 29 07:02:18 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Yaxun (Sam) Liu (yxsamliu)
HIP vector types often lower to aggregates and get copied with memcpy. When the source or destination is chosen via a pointer select, SROA cannot split the aggregate. This keeps data in stack slots and increases scratch traffic. By rewriting these memcpy idioms, we enable SROA to promote values, reducing stack usage and improving occupancy and bandwidth on AMD GPUs.
For example:
%p = select i1 %cond, ptr %A, ptr %B
call void @llvm.memcpy.p0.p0.i32(ptr %dst, ptr %p, i32 16, i1 false)
When the source is a pointer select and conditions allow, the pass replaces the memcpy with two aligned loads, a value-level select of the loaded vector, and one aligned store. If it is not safe to speculate both loads, it splits control flow and emits a memcpy in each arm. When the destination is a select, it always splits control flow to avoid speculative stores. Vector element types are chosen based on size and minimum proven alignment to minimize the number of operations.
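For the 16-byte copy above, assuming both arms are provably dereferenceable and 4-byte aligned (value names are illustrative), the speculation-safe rewrite is roughly:

%va = load <4 x i32>, ptr %A, align 4
%vb = load <4 x i32>, ptr %B, align 4
%v = select i1 %cond, <4 x i32> %va, <4 x i32> %vb
store <4 x i32> %v, ptr %dst, align 4

When speculation is not safe (or the select feeds the destination), the fallback branches on the select condition and emits one memcpy per arm, roughly:

  br i1 %cond, label %memcpy.then, label %memcpy.else
memcpy.then:
  call void @llvm.memcpy.p0.p0.i32(ptr %dst, ptr %A, i32 16, i1 false)
  br label %memcpy.join
memcpy.else:
  call void @llvm.memcpy.p0.p0.i32(ptr %dst, ptr %B, i32 16, i1 false)
  br label %memcpy.join
memcpy.join:
  ; rest of the original block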
The pass handles non-volatile, constant-length memcpy up to a small size cap. Source and destination must be in the same address space. It runs early, after inlining and before InferAddressSpaces and SROA. Volatile and cross-address-space memcpys are skipped.
The size cap is controlled by -amdgpu-vector-idiom-max-bytes (default 32), allowing tuning for different workloads.
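For experimentation, the pass can be run in isolation with opt; a hypothetical invocation (assuming an amdgcn triple so the target registers the pass name, and the size-cap flag described above, with in.ll as a placeholder input):

opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-vector-idiom -amdgpu-vector-idiom-max-bytes=32 -S in.ll

The pass can also be disabled wholesale with -amdgpu-vector-idiom-enable=false.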
Fixes: SWDEV-550134
---
Patch is 44.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161200.diff
6 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def (+2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+9)
- (added) llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp (+519)
- (added) llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.h (+43)
- (modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (+1)
- (added) llvm/test/CodeGen/AMDGPU/amdgpu-vector-idiom-memcpy-select.ll (+391)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 9449e70930913..1e730218722b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -67,6 +67,8 @@ FUNCTION_PASS("amdgpu-simplifylib", AMDGPUSimplifyLibCallsPass())
FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
AMDGPUUnifyDivergentExitNodesPass())
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
+FUNCTION_PASS("amdgpu-vector-idiom",
+ AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32))
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
#undef FUNCTION_PASS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 92a587b5771b6..1249e25114e1f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -34,6 +34,7 @@
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUUnifyDivergentExitNodes.h"
+#include "AMDGPUVectorIdiom.h"
#include "AMDGPUWaitSGPRHazards.h"
#include "GCNDPPCombine.h"
#include "GCNIterativeScheduler.h"
@@ -905,6 +906,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
EnablePromoteKernelArguments)
FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
+ // Run vector-idiom canonicalization early (after inlining) and before
+ // infer-AS / SROA to maximize scalarization opportunities.
+ // Specify 32 bytes since the largest HIP vector types are double4 or
+ // long4.
+ FPM.addPass(AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32));
+
// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.
FPM.addPass(InferAddressSpacesPass());
@@ -953,6 +960,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
if (EnableLowerModuleLDS)
PM.addPass(AMDGPULowerModuleLDSPass(*this));
if (Level != OptimizationLevel::O0) {
+ PM.addPass(createModuleToFunctionPassAdaptor(
+ AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32)));
// We only want to run this with O2 or higher since inliner and SROA
// don't run in O1.
if (Level != OptimizationLevel::O1) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp b/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp
new file mode 100644
index 0000000000000..2703fff9f4e9a
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp
@@ -0,0 +1,519 @@
+//===- AMDGPUVectorIdiom.cpp ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// AMDGPU-specific vector idiom canonicalizations to unblock SROA and
+// subsequent scalarization/vectorization.
+//
+// Motivation:
+// - HIP vector types are often modeled as structs and copied with memcpy.
+// Address-level selects on such copies block SROA. Converting to value-level
+// operations or splitting the CFG enables SROA to break aggregates, which
+// unlocks scalarization/vectorization on AMDGPU.
+//
+// Example pattern:
+// %src = select i1 %c, ptr %A, ptr %B
+// call void @llvm.memcpy(ptr %dst, ptr %src, i32 16, i1 false)
+//
+// Objectives:
+// - Canonicalize small memcpy patterns where source or destination is a select
+// of pointers.
+// - Prefer value-level selects (on loaded values) over address-level selects
+// when safe.
+// - When speculation is unsafe, split the CFG to isolate each arm.
+//
+// Assumptions:
+// - Only handles non-volatile memcpy with constant length N where 0 < N <=
+// MaxBytes (default 32).
+// - Source and destination must be in the same address space.
+// - Speculative loads are allowed only if a conservative alignment check
+// passes.
+// - No speculative stores are introduced.
+//
+// Transformations:
+// - Source-select memcpy: attempt speculative loads -> value select -> single
+// store.
+// Fallback is CFG split with two memcpy calls.
+// - Destination-select memcpy: always CFG split to avoid speculative stores.
+//
+// Run this pass early, before SROA.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUVectorIdiom.h"
+#include "AMDGPU.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "amdgpu-vector-idiom"
+
+namespace {
+
+static cl::opt<bool>
+ AMDGPUVectorIdiomEnable("amdgpu-vector-idiom-enable",
+ cl::desc("Enable pass AMDGPUVectorIdiom"),
+ cl::init(true));
+
+// Selects an integer-vector type whose total size is NBytes, using the minimum
+// proven alignment to decide the widest safe element width.
+// Assumptions:
+// - Pointee types are opaque; the element choice is based solely on size and
+// alignment.
+// - Falls back to <N x i8> if wider lanes are not safe/aligned.
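+// Examples (derived from the rules below):
+//   NBytes=32, align >= 8 -> <4 x i64>;  NBytes=16, align >= 4 -> <4 x i32>;
+//   NBytes=6,  align >= 2 -> <3 x i16>;  otherwise <NBytes x i8>.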
+static Type *getIntOrVecTypeForSize(uint64_t NBytes, LLVMContext &Ctx,
+ Align MinProvenAlign = Align(1)) {
+ auto CanUseI64 = [&]() { return MinProvenAlign >= Align(8); };
+ auto CanUseI32 = [&]() { return MinProvenAlign >= Align(4); };
+ auto CanUseI16 = [&]() { return MinProvenAlign >= Align(2); };
+
+ if (NBytes == 32 && CanUseI64())
+ return FixedVectorType::get(Type::getInt64Ty(Ctx), 4);
+
+ if ((NBytes % 4) == 0 && CanUseI32())
+ return FixedVectorType::get(Type::getInt32Ty(Ctx), NBytes / 4);
+
+ if ((NBytes % 2) == 0 && CanUseI16())
+ return FixedVectorType::get(Type::getInt16Ty(Ctx), NBytes / 2);
+
+ return FixedVectorType::get(Type::getInt8Ty(Ctx), NBytes);
+}
+
+static Align minAlign(Align A, Align B) { return A < B ? A : B; }
+
+// Checks if the underlying object of a memcpy operand is an alloca.
+// This helps focus on scratch memory optimizations by filtering out
+// memcpy operations that don't involve stack-allocated memory.
+static bool hasAllocaUnderlyingObject(Value *V) {
+ Value *Underlying = getUnderlyingObject(V);
+ return isa<AllocaInst>(Underlying);
+}
+
+// Checks if both pointer operands can be speculatively loaded for N bytes and
+// computes the minimum alignment to use.
+// Notes:
+// - Intentionally conservative: relies on isDereferenceablePointer and
+// getOrEnforceKnownAlignment.
+// - AA/TLI are not used for deeper reasoning here.
+// Emits verbose LLVM_DEBUG logs explaining why speculation is disallowed.
+// Return false reasons include: either arm not dereferenceable or computed
+// known alignment < 1.
+static bool bothArmsSafeToSpeculateLoads(Value *A, Value *B, uint64_t Size,
+ Align &OutAlign, const DataLayout &DL,
+ AssumptionCache *AC,
+ const DominatorTree *DT,
+ Instruction *CtxI) {
+ APInt SizeAPInt(DL.getIndexTypeSizeInBits(A->getType()), Size);
+ if (!isDereferenceableAndAlignedPointer(B, Align(1), SizeAPInt, DL, CtxI, AC,
+ DT, nullptr)) {
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: false arm "
+ << "(B) not dereferenceable for " << Size
+ << " bytes at align(1)\n");
+ LLVM_DEBUG(dbgs() << " false arm (B) value: " << *B << '\n');
+ return false;
+ }
+
+ Align AlignB =
+ llvm::getOrEnforceKnownAlignment(B, Align(1), DL, nullptr, AC, DT);
+
+ if (AlignB < Align(1)) {
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: known "
+ << "alignment of false arm (B) < 1: " << AlignB.value()
+ << '\n');
+ return false;
+ }
+
+ if (!isDereferenceableAndAlignedPointer(A, Align(1), SizeAPInt, DL, CtxI, AC,
+ DT, nullptr)) {
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: true arm "
+ << "(A) not dereferenceable for " << Size
+ << " bytes at align(1)\n");
+ LLVM_DEBUG(dbgs() << " true arm (A) value: " << *A << '\n');
+ return false;
+ }
+
+ Align AlignA =
+ llvm::getOrEnforceKnownAlignment(A, Align(1), DL, nullptr, AC, DT);
+
+ if (AlignA < Align(1)) {
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: known "
+ << "alignment of true arm (A) < 1: " << AlignA.value()
+ << '\n');
+ return false;
+ }
+
+ OutAlign = minAlign(AlignA, AlignB);
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Speculative loads allowed: "
+ << "minAlign=" << OutAlign.value() << '\n');
+ return true;
+}
+
+struct AMDGPUVectorIdiomImpl {
+ const unsigned MaxBytes;
+ bool CFGChanged = false;
+
+ AMDGPUVectorIdiomImpl(unsigned MaxBytes) : MaxBytes(MaxBytes) {}
+
+ // Rewrites memcpy when the source is a select of pointers. Prefers a
+ // value-level select (two loads + select + one store) if speculative loads
+ // are safe. Otherwise, falls back to a guarded CFG split with two memcpy
+ // calls. Assumptions:
+ // - Non-volatile, constant length, within MaxBytes.
+ // - Source and destination in the same address space.
+ bool transformSelectMemcpySource(MemCpyInst &MT, SelectInst &Sel,
+ const DataLayout &DL,
+ const DominatorTree *DT,
+ AssumptionCache *AC) {
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Considering memcpy(select-src): "
+ << MT << '\n');
+ IRBuilder<> B(&MT);
+ Value *Dst = MT.getRawDest();
+ Value *A = Sel.getTrueValue();
+ Value *Bv = Sel.getFalseValue();
+
+ ConstantInt *LenCI = cast<ConstantInt>(MT.getLength());
+ uint64_t N = LenCI->getLimitedValue();
+
+ if (Sel.isVolatile()) {
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not rewriting: Select marked "
+ << "volatile (unexpected) in memcpy source\n");
+ return false;
+ }
+
+    // The select condition is a null-pointer check; always use the CFG split.
+ Value *Cond = Sel.getCondition();
+ ICmpInst *ICmp = dyn_cast<ICmpInst>(Cond);
+ if (ICmp && ICmp->isEquality() &&
+ (isa<ConstantPointerNull>(ICmp->getOperand(0)) ||
+ isa<ConstantPointerNull>(ICmp->getOperand(1)))) {
+ splitCFGForMemcpy(MT, Sel.getCondition(), A, Bv, true);
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Null check pattern - "
+ "using CFG split\n");
+ return true;
+ }
+
+ Align DstAlign = MaybeAlign(MT.getDestAlign()).valueOrOne();
+ Align AlignAB;
+ bool CanSpeculate = false;
+
+ const CallBase &CB = MT;
+ const unsigned SrcArgIdx = 1;
+ uint64_t DerefBytes = CB.getParamDereferenceableBytes(SrcArgIdx);
+ bool HasDerefOrNull =
+ CB.paramHasAttr(SrcArgIdx, Attribute::DereferenceableOrNull);
+ bool HasNonNull = CB.paramHasAttr(SrcArgIdx, Attribute::NonNull);
+ MaybeAlign SrcParamAlign = CB.getParamAlign(SrcArgIdx);
+ Align ProvenSrcAlign =
+ SrcParamAlign.value_or(MaybeAlign(MT.getSourceAlign()).valueOrOne());
+
+ if (DerefBytes > 0) {
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] memcpy source param attrs: "
+ << "dereferenceable(" << DerefBytes << ")"
+ << (HasDerefOrNull ? " (or null)" : "")
+ << (HasNonNull ? ", nonnull" : "") << ", align "
+ << ProvenSrcAlign.value() << '\n');
+ if (DerefBytes >= N && (!HasDerefOrNull || HasNonNull)) {
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Using memcpy source operand "
+ << "attributes at this use; accepting speculation\n");
+ CanSpeculate = true;
+ AlignAB = ProvenSrcAlign;
+ } else {
+ LLVM_DEBUG(
+ dbgs() << "[AMDGPUVectorIdiom] Source param attrs not strong "
+ << "enough for speculation: need dereferenceable(" << N
+ << ") and nonnull; got dereferenceable(" << DerefBytes << ")"
+ << (HasDerefOrNull ? " (or null)" : "")
+ << (HasNonNull ? ", nonnull" : "") << '\n');
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] memcpy source param has no "
+ << "dereferenceable bytes attribute; align "
+ << ProvenSrcAlign.value() << '\n');
+ }
+ if (!CanSpeculate)
+ CanSpeculate =
+ bothArmsSafeToSpeculateLoads(A, Bv, N, AlignAB, DL, AC, DT, &MT);
+
+ if (CanSpeculate) {
+ Align MinAlign = std::min(AlignAB, DstAlign);
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewriting memcpy(select-src) "
+ << "with value-level select; N=" << N
+ << " minAlign=" << MinAlign.value() << '\n');
+
+ Type *Ty = getIntOrVecTypeForSize(N, B.getContext(), MinAlign);
+
+ LoadInst *LA = B.CreateAlignedLoad(Ty, A, MinAlign);
+ LoadInst *LB = B.CreateAlignedLoad(Ty, Bv, MinAlign);
+ Value *V = B.CreateSelect(Sel.getCondition(), LA, LB);
+
+ (void)B.CreateAlignedStore(V, Dst, DstAlign);
+
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewrote memcpy(select-src) to "
+ "value-select loads/stores: "
+ << MT << '\n');
+ MT.eraseFromParent();
+ return true;
+ }
+
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Falling back to CFG split for "
+ << "memcpy(select-src); speculation unsafe\n");
+ splitCFGForMemcpy(MT, Sel.getCondition(), A, Bv, true);
+ LLVM_DEBUG(
+ dbgs()
+ << "[AMDGPUVectorIdiom] Rewrote memcpy(select-src) by CFG split\n");
+ return true;
+ }
+
+ // Rewrites memcpy when the destination is a select of pointers. To avoid
+ // speculative stores, always splits the CFG and emits a memcpy per branch.
+ // Assumptions mirror the source case.
+ bool transformSelectMemcpyDest(MemCpyInst &MT, SelectInst &Sel) {
+ Value *DA = Sel.getTrueValue();
+ Value *DB = Sel.getFalseValue();
+ LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewriting memcpy(select-dst) via "
+ << "CFG split to avoid speculative stores: " << MT
+ << '\n');
+
+ splitCFGForMemcpy(MT, Sel.getCondition(), DA, DB, false);
+ LLVM_DEBUG(
+ dbgs()
+ << "[AMDGPUVectorIdiom] Rewrote memcpy(select-dst) by CFG split\n");
+ return true;
+ }
+
+ // Splits the CFG around a memcpy whose source or destination depends on a
+ // condition. Clones memcpy in then/else using TruePtr/FalsePtr and rejoins.
+ // Assumptions:
+ // - MT has constant length and is non-volatile.
+ // - TruePtr/FalsePtr are correct replacements for the selected operand.
+ void splitCFGForMemcpy(MemCpyInst &MT, Value *Cond, Value *TruePtr,
+ Value *FalsePtr, bool IsSource) {
+ CFGChanged = true;
+
+ Function *F = MT.getFunction();
+ BasicBlock *Cur = MT.getParent();
+ BasicBlock *ThenBB = BasicBlock::Create(F->getContext(), "memcpy.then", F);
+ BasicBlock *ElseBB = BasicBlock::Create(F->getContext(), "memcpy.else", F);
+ BasicBlock *JoinBB =
+ Cur->splitBasicBlock(BasicBlock::iterator(&MT), "memcpy.join");
+
+ Cur->getTerminator()->eraseFromParent();
+ IRBuilder<> B(Cur);
+ B.CreateCondBr(Cond, ThenBB, ElseBB);
+
+ ConstantInt *LenCI = cast<ConstantInt>(MT.getLength());
+
+ IRBuilder<> BT(ThenBB);
+ if (IsSource) {
+ (void)BT.CreateMemCpy(MT.getRawDest(), MT.getDestAlign(), TruePtr,
+ MT.getSourceAlign(), LenCI, MT.isVolatile());
+ } else {
+ (void)BT.CreateMemCpy(TruePtr, MT.getDestAlign(), MT.getRawSource(),
+ MT.getSourceAlign(), LenCI, MT.isVolatile());
+ }
+ BT.CreateBr(JoinBB);
+
+ IRBuilder<> BE(ElseBB);
+ if (IsSource) {
+ (void)BE.CreateMemCpy(MT.getRawDest(), MT.getDestAlign(), FalsePtr,
+ MT.getSourceAlign(), LenCI, MT.isVolatile());
+ } else {
+ (void)BE.CreateMemCpy(FalsePtr, MT.getDestAlign(), MT.getRawSource(),
+ MT.getSourceAlign(), LenCI, MT.isVolatile());
+ }
+ BE.CreateBr(JoinBB);
+
+ MT.eraseFromParent();
+ }
+};
+
+} // end anonymous namespace
+
+AMDGPUVectorIdiomCombinePass::AMDGPUVectorIdiomCombinePass(unsigned MaxBytes)
+ : MaxBytes(MaxBytes) {}
+
+// Pass driver that locates small, constant-size, non-volatile memcpy calls
+// where source or destination is a select in the same address space. Applies
+// the source/destination transforms described above. Intended to run early to
+// maximize SROA and subsequent optimizations.
+PreservedAnalyses
+AMDGPUVectorIdiomCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = FAM.getResult<AssumptionAnalysis>(F);
+
+ if (!AMDGPUVectorIdiomEnable)
+ return PreservedAnalyses::all();
+
+ SmallVector<MemCpyInst *, 8> Worklist;
+ for (Instruction &I : instructions(F)) {
+ if (auto *MC = dyn_cast<MemCpyInst>(&I))
+ Worklist.push_back(MC);
+ }
+
+ bool Changed = false;
+ AMDGPUVectorIdiomImpl Impl(MaxBytes);
+
+ for (MemCpyInst *MT : Worklist) {
+ Value *Dst = MT->getRawDest();
+ Value *Src = MT->getRawSource();
+ if (!isa<SelectInst>(Src) && !isa<SelectInst>(Dst))
+ continue;
+
+ LLVM_DEBUG({
+ Value *DstV = MT->getRawDest();
+ Value *SrcV = MT->getRawSource();
+ unsigned DstAS = cast<PointerType>(DstV->getType())->getAddressSpace();
+ unsigned SrcAS = cast<PointerType>(SrcV->getType())->getAddressSpace();
+ Value *LenV = MT->getLength();
+
+ auto dumpPtrForms = [&](StringRef Label, Value *V) {
+ dbgs() << " " << Label << ": " << *V << '\n';
+
+ Value *StripCasts = V->stripPointerCasts();
+ if (StripCasts != V)
+ dbgs() << " - stripCasts: " << *StripCasts << '\n';
+ else
+ dbgs() << " - stripCasts: (no change)\n";
+
+ Value *Underlying = getUnderlyingObject(V);
+ if (Underlying != V)
+ dbgs() << " - underlying: " << *Underlying << '\n';
+ else
+ dbgs() << " - underlying: (no change)\n";
+ };
+
+ auto dumpSelect = [&](StringRef Which, Value *V) {
+ if (auto *SI = dyn_cast<SelectInst>(V)) {
+ dbgs() << " - " << Which << " is Select: " << *SI << '\n';
+ dbgs() << " cond: " << *SI->getCondition() << '\n';
+ Value *T = SI->getTrueValue();
+ Value *Fv = SI->getFalseValue();
+ dumpPtrForms("true", T);
+ dumpPtrForms("false", Fv);
+          dbgs() << "      trueIsAlloca="
+                 << (hasAllocaUnderlyingObject(T) ? "true" : "false") << '\n';
+          dbgs() << "      falseIsAlloca="
+                 << (hasAllocaUnderlyingObject(Fv) ? "true" : "false") << '\n';
+ }
+ };
+
+ dbgs() << "[AMDGPUVectorIdiom] Found memcpy: " << *MT << '\n'
+ << " in function: " << F.getName() << '\n'
+ << " - volatile=" << (MT->isVolatile() ? "true" : "false") << '\n'
+ << " - sameAS=" << (DstAS == SrcAS ? "true" : "false")
+ << " (dstAS=" << DstAS << ", srcAS=" << SrcAS << ")\n...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/161200
More information about the llvm-commits mailing list