[llvm] fd3a6b6 - [NVPTX] Improve modeling of inline PTX (#130675)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 25 13:46:19 PDT 2025
Author: Alex MacLean
Date: 2025-03-25T13:46:16-07:00
New Revision: fd3a6b6005aa591a53fc01e4ed130af369e0c366
URL: https://github.com/llvm/llvm-project/commit/fd3a6b6005aa591a53fc01e4ed130af369e0c366
DIFF: https://github.com/llvm/llvm-project/commit/fd3a6b6005aa591a53fc01e4ed130af369e0c366.diff
LOG: [NVPTX] Improve modeling of inline PTX (#130675)
Improve the modeling of the memory effects and instruction cost of
inline assembly.
- MemoryEffects: The CUDA spec states that inline assembly is not
assumed to have any side-effects or read or write to memory. An inline
assembly may be treated as NoModRef unless it is explicitly marked as
having side effects or has an explicit memory clobber.
https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#incorrect-optimization
> Normally any memory that is written to will be specified as an out
operand, but if there is a hidden read or write on user memory (for
example, indirect access of a memory location via an operand), or if you
want to stop any memory optimizations around the asm() statement
performed during generation of PTX, you can add a “memory” clobbers
specification after a 3rd colon.
- InstructionCost: This change implements a very rough string-parsing
system to count the number of instructions in an inline-asm string. There are
corner cases it will not handle well, but in general this is an
improvement over the current cost of the number of arguments plus one.
Added:
llvm/test/Analysis/CostModel/NVPTX/inline-asm.ll
llvm/test/Analysis/CostModel/NVPTX/lit.local.cfg
llvm/test/CodeGen/NVPTX/nvptx-aa-inline-asm.ll
Modified:
llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h
llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
index 509b01213cd9c..1f770893828e2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
@@ -13,6 +13,7 @@
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/CommandLine.h"
@@ -115,3 +116,29 @@ ModRefInfo NVPTXAAResult::getModRefInfoMask(const MemoryLocation &Loc,
return ModRefInfo::ModRef;
}
+
+MemoryEffects NVPTXAAResult::getMemoryEffects(const CallBase *Call,
+ AAQueryInfo &AAQI) {
+ // Inline asm without side effects, indirect operands or a "{memory}" clobber
+ // is taken not to access memory: none(). Every other call yields unknown().
+ if (const auto *IA = dyn_cast<InlineAsm>(Call->getCalledOperand())) {
+ // Volatile is translated as side-effects.
+ if (IA->hasSideEffects())
+ return MemoryEffects::unknown();
+
+ for (const InlineAsm::ConstraintInfo &Constraint : IA->ParseConstraints()) {
+ // Indirect constraints (e.g. =*m) are unsupported in inline PTX; stay conservative.
+ if (Constraint.isIndirect)
+ return MemoryEffects::unknown();
+
+ // Memory clobbers prevent optimization. Type is a plain enum, not a bitmask: use ==.
+ if (Constraint.Type == InlineAsm::ConstraintPrefix::isClobber &&
+ any_of(Constraint.Codes,
+ [](const auto &Code) { return Code == "{memory}"; }))
+ return MemoryEffects::unknown();
+ }
+ return MemoryEffects::none();
+ }
+
+ return MemoryEffects::unknown();
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h
index 2d204979eb6ce..cfbf5dee3ec50 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h
@@ -36,6 +36,12 @@ class NVPTXAAResult : public AAResultBase {
ModRefInfo getModRefInfoMask(const MemoryLocation &Loc, AAQueryInfo &AAQI,
bool IgnoreLocals);
+
+ MemoryEffects getMemoryEffects(const CallBase *Call, AAQueryInfo &AAQI); // Call-site query; inline asm is special-cased in the .cpp.
+
+ MemoryEffects getMemoryEffects(const Function *F) {
+ return MemoryEffects::unknown(); // No per-function refinement: always the conservative answer.
+ }
};
/// Analysis pass providing a never-invalidated alias analysis result.
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 51c679b8ad89c..a89ca3037c7ff 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -8,6 +8,7 @@
#include "NVPTXTargetTransformInfo.h"
#include "NVPTXUtilities.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -483,6 +484,35 @@ NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return std::nullopt;
}
+InstructionCost
+NVPTXTTIImpl::getInstructionCost(const User *U,
+ ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind) {
+ if (const auto *CI = dyn_cast<CallInst>(U))
+ if (const auto *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
+ // Inline asm is classified as a call in the IR, so without this
+ // implementation getCallCost() would price it as a call: the number of
+ // arguments plus one. A better estimate is the number of instructions
+ // embedded in the asm string: each ';'-separated statement that looks
+ // like an instruction is charged TCC_Basic.
+ auto &AsmStr = IA->getAsmString();
+ const unsigned InstCount =
+ count_if(split(AsmStr, ';'), [](StringRef AsmInst) {
+ // Drop surrounding whitespace, then any leading scope braces ('{', '}').
+ AsmInst = AsmInst.trim().ltrim("{} \t\n\v\f\r");
+ // Coarse heuristic: count a statement when it starts with a letter
+ // or a predicate guard ("@"), or contains ".pragma". Other
+ // directives (e.g. ".reg") start with '.' and are not counted.
+ return !AsmInst.empty() &&
+ (AsmInst[0] == '@' || isAlpha(AsmInst[0]) ||
+ AsmInst.find(".pragma") != StringRef::npos);
+ });
+ return InstCount * TargetTransformInfo::TCC_Basic;
+ }
+
+ return BaseT::getInstructionCost(U, Operands, CostKind); // Not inline asm: defer to the base implementation.
+}
+
InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 7f69d422e8b4b..6db36e958b28c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -94,6 +94,10 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
// calls are particularly expensive in NVPTX.
unsigned getInliningThresholdMultiplier() const { return 11; }
+ InstructionCost getInstructionCost(const User *U,
+ ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind); // Inline-asm calls are costed by instruction count; others defer to BaseT.
+
InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
diff --git a/llvm/test/Analysis/CostModel/NVPTX/inline-asm.ll b/llvm/test/Analysis/CostModel/NVPTX/inline-asm.ll
new file mode 100644
index 0000000000000..600e3b5d537c9
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/NVPTX/inline-asm.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output < %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define void @test1() {
+; CHECK-LABEL: 'test1'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call double asm "rsqrt.approx.ftz.f64 $0, $1;", "=d,d"(double 1.000000e+00)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call { i32, i32 } asm "{\0A\09mad.lo.cc.u32 $0, $2, $3, $4;\0A\09madc.hi.u32 $1, $2, $3, 0;\0A\09}", "=r,=r,r,r,r"(i32 2, i32 3, i32 3)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09vote.ballot.b32 \09$0, %p1; \0A\09}", "=r,r"(i32 0)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09@%p1 exit; \0A\09}", "=r,r"(i32 0)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void asm sideeffect ".pragma \22nounroll\22;\0A\09", "~{memory}"()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %1 = call double asm "rsqrt.approx.ftz.f64 $0, $1;", "=d,d"(double 1.0)
+ %2 = call { i32, i32 } asm "{\0A\09mad.lo.cc.u32 $0, $2, $3, $4;\0A\09madc.hi.u32 $1, $2, $3, 0;\0A\09}", "=r,=r,r,r,r"(i32 2, i32 3, i32 3)
+ %3 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09vote.ballot.b32 \09$0, %p1; \0A\09}", "=r,r"(i32 0)
+ %4 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09@%p1 exit; \0A\09}", "=r,r"(i32 0)
+ call void asm sideeffect ".pragma \22nounroll\22;\0A\09", "~{memory}"()
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/NVPTX/lit.local.cfg b/llvm/test/Analysis/CostModel/NVPTX/lit.local.cfg
new file mode 100644
index 0000000000000..0d37b86e1c8e6
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/NVPTX/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "NVPTX" in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/CodeGen/NVPTX/nvptx-aa-inline-asm.ll b/llvm/test/CodeGen/NVPTX/nvptx-aa-inline-asm.ll
new file mode 100644
index 0000000000000..f1e3a93ca9d84
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/nvptx-aa-inline-asm.ll
@@ -0,0 +1,47 @@
+; RUN: opt -passes=aa-eval -aa-pipeline=nvptx-aa,basic-aa -print-all-alias-modref-info < %s -disable-output 2>&1 \
+; RUN: | FileCheck %s --check-prefixes CHECK-ALIAS
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+;;CHECK-ALIAS-LABEL: Function: test_sideeffect
+;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> call
+define void @test_sideeffect(ptr %out) {
+entry:
+ %0 = addrspacecast ptr %out to ptr addrspace(1)
+ call void asm sideeffect "membar.gl;", ""()
+ store i32 5, ptr addrspace(1) %0, align 4
+ ret void
+}
+
+;;CHECK-ALIAS-LABEL: Function: test_indirect
+;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> %1 = call
+define i32 @test_indirect(ptr %out) {
+entry:
+ %0 = addrspacecast ptr %out to ptr addrspace(1)
+ store i32 0, ptr addrspace(1) %0, align 4
+ %1 = call i32 asm "ld.global.u32 $0, [$1];", "=r,*m"(ptr addrspace(1) elementtype(i32) %0)
+ store i32 0, ptr addrspace(1) %0, align 4
+ ret i32 %1
+}
+
+;;CHECK-ALIAS-LABEL: Function: test_memory
+;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> %1 = call
+define i32 @test_memory(ptr %out) {
+entry:
+ %0 = addrspacecast ptr %out to ptr addrspace(1)
+ store i32 0, ptr addrspace(1) %0, align 4
+ %1 = call i32 asm "ld.global.u32 $0, [$1];", "=r,l,~{memory}"(ptr addrspace(1) %0)
+ store i32 0, ptr addrspace(1) %0, align 4
+ ret i32 %1
+}
+
+;;CHECK-ALIAS-LABEL: Function: test_no_sideeffect
+;;CHECK-ALIAS: NoModRef: Ptr: i32* %0 <-> %1 = call
+define void @test_no_sideeffect(ptr %in, ptr %out) {
+entry:
+ %0 = addrspacecast ptr %out to ptr addrspace(1)
+ %1 = call i32 asm "cvt.u32.u64 $0, $1;", "=r,l"(ptr %in)
+ store i32 %1, ptr addrspace(1) %0, align 4
+ ret void
+}
More information about the llvm-commits
mailing list