[llvm] [AMDGPU] Identify vector idiom to unlock SROA (PR #156791)

Yaxun Liu via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 10 07:53:41 PDT 2025


================
@@ -0,0 +1,317 @@
+//===- AMDGPUVectorIdiom.cpp ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// AMDGPU-specific vector idiom canonicalizations to unblock SROA and
+// subsequent scalarization/vectorization.
+//
+// Motivation:
+// - HIP vector types are often modeled as structs and copied with memcpy.
+//   Address-level selects on such copies block SROA. Converting to value-level
+//   operations or splitting the CFG enables SROA to break aggregates, which
+//   unlocks scalarization/vectorization on AMDGPU.
+//
+// Example pattern:
+//   %src = select i1 %c, ptr %A, ptr %B
+//   call void @llvm.memcpy(ptr %dst, ptr %src, i32 16, i1 false)
+//
+// Objectives:
+// - Canonicalize small memcpy patterns where source or destination is a select
+//   of pointers.
+// - Prefer value-level selects (on loaded values) over address-level selects
+//   when safe.
+// - When speculation is unsafe, split the CFG to isolate each arm.
+//
+// Assumptions:
+// - Only handles non-volatile memcpy with constant length N where
+//   0 < N <= MaxBytes (default 32).
+// - Source and destination must be in the same address space.
+// - Speculative loads are allowed only if a conservative alignment check
+//   passes.
+// - No speculative stores are introduced.
+//
+// Transformations:
+// - Source-select memcpy: attempt speculative loads -> value select -> single
+//   store. Fallback is CFG split with two memcpy calls.
+// - Destination-select memcpy: always CFG split to avoid speculative stores.
+//
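+// Illustrative result for the source-select example above, assuming the
+// operands are 16 bytes and at least 4-byte aligned:
+//   %va = load <4 x i32>, ptr %A, align 4
+//   %vb = load <4 x i32>, ptr %B, align 4
+//   %v  = select i1 %c, <4 x i32> %va, <4 x i32> %vb
+//   store <4 x i32> %v, ptr %dst, align 4
+//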
+// Run this pass early, before SROA.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUVectorIdiom.h"
+#include "AMDGPU.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "amdgpu-vector-idiom"
+
+namespace {
+
+// Default to 32 bytes since the largest HIP vector types are double4 or long4.
+static cl::opt<unsigned> AMDGPUVectorIdiomMaxBytes(
+    "amdgpu-vector-idiom-max-bytes",
+    cl::desc("Max memcpy size (in bytes) to transform in AMDGPUVectorIdiom "
+             "(default 32)"),
+    cl::init(32));
+
+// Selects an integer-vector type spanning NBytes, using the minimum proven
+// alignment to decide the widest safe element width.
+// Assumptions:
+// - Pointee types are opaque; the element choice is based solely on size and
+//   alignment.
+// - Falls back to <N x i8> if wider lanes are not safe/aligned.
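+// Example: NBytes = 16 with MinProvenAlign = 4 yields <4 x i32>; with
+// MinProvenAlign = 1 it falls back to <16 x i8>.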
+static Type *getIntOrVecTypeForSize(uint64_t NBytes, LLVMContext &Ctx,
+                                    Align MinProvenAlign = Align(1)) {
+  auto CanUseI64 = [&]() { return MinProvenAlign >= Align(8); };
+  auto CanUseI32 = [&]() { return MinProvenAlign >= Align(4); };
+  auto CanUseI16 = [&]() { return MinProvenAlign >= Align(2); };
+
+  if (NBytes == 32 && CanUseI64())
+    return FixedVectorType::get(Type::getInt64Ty(Ctx), 4);
+
+  if ((NBytes % 4) == 0 && CanUseI32())
+    return FixedVectorType::get(Type::getInt32Ty(Ctx), NBytes / 4);
+
+  if ((NBytes % 2) == 0 && CanUseI16())
+    return FixedVectorType::get(Type::getInt16Ty(Ctx), NBytes / 2);
+
+  return FixedVectorType::get(Type::getInt8Ty(Ctx), NBytes);
+}
+
+static Align minAlign(Align A, Align B) { return A < B ? A : B; }
+
+// Checks if both pointer operands can be speculatively loaded for N bytes and
+// computes the minimum alignment to use.
+// Notes:
+// - Intentionally conservative: relies on getOrEnforceKnownAlignment.
+// - AA/TLI are not used for deeper reasoning here.
+static bool bothArmsSafeToSpeculateLoads(Value *A, Value *B, Align &OutAlign,
+                                         const DataLayout &DL,
+                                         AssumptionCache *AC,
+                                         const DominatorTree *DT) {
+  Align AlignA =
+      llvm::getOrEnforceKnownAlignment(A, Align(1), DL, nullptr, AC, DT);
+  Align AlignB =
+      llvm::getOrEnforceKnownAlignment(B, Align(1), DL, nullptr, AC, DT);
+
+  if (AlignA.value() < 1 || AlignB.value() < 1)
+    return false;
+
+  OutAlign = minAlign(AlignA, AlignB);
+  return true;
+}
+
+// With opaque pointers, ensure address spaces match and otherwise return Ptr.
+// Assumes the address space is the only property to validate for this cast.
+static Value *castPtrTo(Value *Ptr, unsigned ExpectedAS) {
+  auto *FromPTy = cast<PointerType>(Ptr->getType());
+  unsigned AS = FromPTy->getAddressSpace();
+  (void)ExpectedAS;
+  assert(AS == ExpectedAS && "Address space mismatch for castPtrTo");
+  return Ptr;
+}
+
+struct AMDGPUVectorIdiomImpl {
+  unsigned MaxBytes;
+
+  AMDGPUVectorIdiomImpl(unsigned MaxBytes) : MaxBytes(MaxBytes) {}
+
+  // Rewrites memcpy when the source is a select of pointers. Prefers a
+  // value-level select (two loads + select + one store) if speculative loads
+  // are safe. Otherwise, falls back to a guarded CFG split with two memcpy
+  // calls.
+  // Assumptions:
+  // - Non-volatile, constant length, within MaxBytes.
+  // - Source and destination in the same address space.
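+  // When speculation succeeds, e.g. a 16-byte copy with 4-byte-aligned
+  // operands becomes two <4 x i32> loads, one select, and one store.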
+  bool transformSelectMemcpySource(MemTransferInst &MT, SelectInst &Sel,
+                                   const DataLayout &DL,
+                                   const DominatorTree *DT,
+                                   AssumptionCache *AC) {
+    IRBuilder<> B(&MT);
+    Value *Dst = MT.getRawDest();
+    Value *A = Sel.getTrueValue();
+    Value *Bv = Sel.getFalseValue();
+    if (!A->getType()->isPointerTy() || !Bv->getType()->isPointerTy())
+      return false;
+
+    ConstantInt *LenCI = cast<ConstantInt>(MT.getLength());
+    uint64_t N = LenCI->getLimitedValue();
+
+    Align DstAlign = MaybeAlign(MT.getDestAlign()).valueOrOne();
+    Align AlignAB;
+    bool CanSpeculate =
+        bothArmsSafeToSpeculateLoads(A, Bv, AlignAB, DL, AC, DT);
+
+    unsigned AS = cast<PointerType>(A->getType())->getAddressSpace();
+    assert(AS == cast<PointerType>(Bv->getType())->getAddressSpace() &&
+           "Expected same AS");
+
+    if (CanSpeculate) {
+      Align MinAlign = std::min(AlignAB, DstAlign);
+      Type *Ty = getIntOrVecTypeForSize(N, B.getContext(), MinAlign);
+
+      Value *PA = castPtrTo(A, AS);
+      Value *PB = castPtrTo(Bv, AS);
+      LoadInst *LA = B.CreateAlignedLoad(Ty, PA, MinAlign);
+      LoadInst *LB = B.CreateAlignedLoad(Ty, PB, MinAlign);
+      Value *V = B.CreateSelect(Sel.getCondition(), LA, LB);
+
+      Value *PDst =
+          castPtrTo(Dst, cast<PointerType>(Dst->getType())->getAddressSpace());
+      (void)B.CreateAlignedStore(V, PDst, DstAlign);
+
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewrote memcpy(select-src) to "
+                           "value-select loads/stores: "
+                        << MT << "\n");
+      MT.eraseFromParent();
+      return true;
+    }
+
+    splitCFGForMemcpy(MT, Sel.getCondition(), A, Bv, true);
+    LLVM_DEBUG(
+        dbgs()
+        << "[AMDGPUVectorIdiom] Rewrote memcpy(select-src) by CFG split\n");
+    return true;
+  }
+
+  // Rewrites memcpy when the destination is a select of pointers. To avoid
+  // speculative stores, always splits the CFG and emits a memcpy per branch.
+  // Assumptions mirror the source case.
+  bool transformSelectMemcpyDest(MemTransferInst &MT, SelectInst &Sel) {
+    Value *DA = Sel.getTrueValue();
+    Value *DB = Sel.getFalseValue();
+    if (!DA->getType()->isPointerTy() || !DB->getType()->isPointerTy())
+      return false;
+
+    splitCFGForMemcpy(MT, Sel.getCondition(), DA, DB, false);
+    LLVM_DEBUG(
+        dbgs()
+        << "[AMDGPUVectorIdiom] Rewrote memcpy(select-dst) by CFG split\n");
+    return true;
+  }
+
+  // Splits the CFG around a memcpy whose source or destination depends on a
+  // condition. Clones memcpy in then/else using TruePtr/FalsePtr and rejoins.
+  // Assumptions:
+  // - MT has constant length and is non-volatile.
+  // - TruePtr/FalsePtr are correct replacements for the selected operand.
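+  // Resulting shape:
+  //   Cur --cond--> memcpy.then (memcpy with TruePtr)  --> memcpy.join
+  //       \-------> memcpy.else (memcpy with FalsePtr) --> memcpy.join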
+  void splitCFGForMemcpy(MemTransferInst &MT, Value *Cond, Value *TruePtr,
+                         Value *FalsePtr, bool IsSource) {
+    Function *F = MT.getFunction();
+    BasicBlock *Cur = MT.getParent();
+    BasicBlock *ThenBB = BasicBlock::Create(F->getContext(), "memcpy.then", F);
+    BasicBlock *ElseBB = BasicBlock::Create(F->getContext(), "memcpy.else", F);
+    BasicBlock *JoinBB =
+        Cur->splitBasicBlock(BasicBlock::iterator(&MT), "memcpy.join");
+
+    Cur->getTerminator()->eraseFromParent();
+    IRBuilder<> B(Cur);
+    B.CreateCondBr(Cond, ThenBB, ElseBB);
+
+    ConstantInt *LenCI = cast<ConstantInt>(MT.getLength());
+
+    IRBuilder<> BT(ThenBB);
+    if (IsSource) {
+      (void)BT.CreateMemCpy(MT.getRawDest(), MT.getDestAlign(), TruePtr,
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    } else {
+      (void)BT.CreateMemCpy(TruePtr, MT.getDestAlign(), MT.getRawSource(),
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    }
+    BT.CreateBr(JoinBB);
+
+    IRBuilder<> BE(ElseBB);
+    if (IsSource) {
+      (void)BE.CreateMemCpy(MT.getRawDest(), MT.getDestAlign(), FalsePtr,
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    } else {
+      (void)BE.CreateMemCpy(FalsePtr, MT.getDestAlign(), MT.getRawSource(),
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    }
+    BE.CreateBr(JoinBB);
+
+    MT.eraseFromParent();
+  }
+};
+
+} // end anonymous namespace
+
+AMDGPUVectorIdiomCombinePass::AMDGPUVectorIdiomCombinePass()
+    : MaxBytes(AMDGPUVectorIdiomMaxBytes) {}
+
+// Pass driver that locates small, constant-size, non-volatile memcpy calls
+// where source or destination is a select in the same address space. Applies
+// the source/destination transforms described above. Intended to run early to
+// maximize SROA and subsequent optimizations.
+PreservedAnalyses
+AMDGPUVectorIdiomCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+  auto &AC = FAM.getResult<AssumptionAnalysis>(F);
+
+  SmallVector<CallInst *, 8> Worklist;
+  for (Instruction &I : instructions(F)) {
----------------
yxsamliu wrote:

I tried using make_early_inc_range, but it causes an assertion failure because splitCFGForMemcpy calls splitBasicBlock, which moves instructions to a new block and invalidates the iterator. Since we're modifying the CFG structure, we need to collect the memcpy instructions first. I've optimized the worklist to filter more aggressively during collection.
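
A minimal sketch of the collect-then-transform shape (filtering details differ
slightly from the actual patch):

  SmallVector<CallInst *, 8> Worklist;
  for (Instruction &I : instructions(F))
    if (auto *MT = dyn_cast<MemTransferInst>(&I))
      if (!MT->isVolatile() && isa<ConstantInt>(MT->getLength()))
        Worklist.push_back(MT);
  // Transform after collection: splitCFGForMemcpy calls splitBasicBlock,
  // which would invalidate an in-flight instruction iterator.
  for (CallInst *CI : Worklist) {
    // match the select operand and apply the source/dest transforms ...
  }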

https://github.com/llvm/llvm-project/pull/156791

