[llvm] [NVPTX] Improve modeling of inline PTX (PR #130675)

via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 10 14:45:58 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-analysis

Author: Alex MacLean (AlexMaclean)

<details>
<summary>Changes</summary>

Improve the modeling of the memory effects and instruction cost of inline assembly. 

 - MemoryEffects: The CUDA spec states that inline assembly is not assumed to have any side effects or to read or write memory. An inline assembly may be treated as NoModRef unless it is explicitly marked as having side effects or has an explicit memory clobber. https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#incorrect-optimization

   > Normally any memory that is written to will be specified as an out operand, but if there is a hidden read or write on user memory (for example, indirect access of a memory location via an operand), or if you want to stop any memory optimizations around the asm() statement performed during generation of PTX, you can add a “memory” clobbers specification after a 3rd colon.

- InstructionCost: This change implements a very rough string-parsing system to count the number of instructions in an inline asm. There are corner cases it will not handle well, but in general this is an improvement over the current cost of the number of arguments plus one. 



---
Full diff: https://github.com/llvm/llvm-project/pull/130675.diff


7 Files Affected:

- (modified) llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp (+28) 
- (modified) llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h (+6) 
- (modified) llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp (+29) 
- (modified) llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h (+4) 
- (added) llvm/test/Analysis/CostModel/NVPTX/inline-asm.ll (+21) 
- (added) llvm/test/Analysis/CostModel/NVPTX/lit.local.cfg (+2) 
- (added) llvm/test/CodeGen/NVPTX/nvptx-aa-inline-asm.ll (+47) 


``````````diff
diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
index 509b01213cd9c..0cc2132143af1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
@@ -13,6 +13,7 @@
 #include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "NVPTX.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Support/CommandLine.h"
 
@@ -115,3 +116,30 @@ ModRefInfo NVPTXAAResult::getModRefInfoMask(const MemoryLocation &Loc,
 
   return ModRefInfo::ModRef;
 }
+
+MemoryEffects NVPTXAAResult::getMemoryEffects(const CallBase *Call,
+                                              AAQueryInfo &AAQI) {
+  // Inline assembly with no side-effect or memory clobbers should not
+  // indirectly access memory in the PTX specification.
+  if (const auto *IA = dyn_cast<InlineAsm>(Call->getCalledOperand())) {
+    // Volatile is translated as side-effects.
+    if (IA->hasSideEffects())
+      return MemoryEffects::unknown();
+
+    for (const InlineAsm::ConstraintInfo &Constraint : IA->ParseConstraints()) {
+      // Indirect constraints (e.g. =*m) are unsupported in inline PTX.
+      if (Constraint.isIndirect)
+        return MemoryEffects::unknown();
+
+      // Memory clobbers prevent optimization.
+      if (!(Constraint.Type & InlineAsm::ConstraintPrefix::isClobber))
+        continue;
+      for (const std::string &Code : Constraint.Codes)
+        if (Code == "{memory}")
+          return MemoryEffects::unknown();
+    }
+    return MemoryEffects::none();
+  }
+
+  return MemoryEffects::unknown();
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h
index 2d204979eb6ce..cfbf5dee3ec50 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h
@@ -36,6 +36,12 @@ class NVPTXAAResult : public AAResultBase {
 
   ModRefInfo getModRefInfoMask(const MemoryLocation &Loc, AAQueryInfo &AAQI,
                                bool IgnoreLocals);
+
+  MemoryEffects getMemoryEffects(const CallBase *Call, AAQueryInfo &AAQI);
+
+  MemoryEffects getMemoryEffects(const Function *F) {
+    return MemoryEffects::unknown();
+  }
 };
 
 /// Analysis pass providing a never-invalidated alias analysis result.
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 51c679b8ad89c..4d12d15b1d80b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "NVPTXTargetTransformInfo.h"
 #include "NVPTXUtilities.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -483,6 +484,34 @@ NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   return std::nullopt;
 }
 
+InstructionCost
+NVPTXTTIImpl::getInstructionCost(const User *U,
+                                 ArrayRef<const Value *> Operands,
+                                 TTI::TargetCostKind CostKind) {
+  if (const auto *CI = dyn_cast<CallInst>(U))
+    if (const auto *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
+      // Without this implementation getCallCost() would return the number
+      // of arguments+1 as the cost. Because the cost-model assumes it is a call
+      // since it is classified as a call in the IR. A better cost model would
+      // be to return the number of asm instructions embedded in the asm
+      // string.
+      auto &AsmStr = IA->getAsmString();
+      SmallVector<StringRef, 4> AsmPieces;
+      SplitString(AsmStr, AsmPieces, ";\n");
+
+      const unsigned InstCount = count_if(AsmPieces, [](StringRef AsmInst) {
+        AsmInst = AsmInst.trim();
+        // This is pretty course but does a reasonably good job of identifying
+        // things that look like instructions, possibly with a predicate ("@").
+        return !AsmInst.empty() && (AsmInst[0] == '@' || isAlpha(AsmInst[0]) ||
+                                    AsmInst.find(".pragma") != StringRef::npos);
+      });
+      return InstCount * TargetTransformInfo::TCC_Basic;
+    }
+
+  return BaseT::getInstructionCost(U, Operands, CostKind);
+}
+
 InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 7f69d422e8b4b..6db36e958b28c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -94,6 +94,10 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
   // calls are particularly expensive in NVPTX.
   unsigned getInliningThresholdMultiplier() const { return 11; }
 
+  InstructionCost getInstructionCost(const User *U,
+                                     ArrayRef<const Value *> Operands,
+                                     TTI::TargetCostKind CostKind);
+
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
       TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
diff --git a/llvm/test/Analysis/CostModel/NVPTX/inline-asm.ll b/llvm/test/Analysis/CostModel/NVPTX/inline-asm.ll
new file mode 100644
index 0000000000000..600e3b5d537c9
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/NVPTX/inline-asm.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output < %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define void @test1() {
+; CHECK-LABEL: 'test1'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call double asm "rsqrt.approx.ftz.f64 $0, $1;", "=d,d"(double 1.000000e+00)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = call { i32, i32 } asm "{\0A\09mad.lo.cc.u32 $0, $2, $3, $4;\0A\09madc.hi.u32 $1, $2, $3, 0;\0A\09}", "=r,=r,r,r,r"(i32 2, i32 3, i32 3)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %3 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09vote.ballot.b32 \09$0, %p1; \0A\09}", "=r,r"(i32 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09@%p1 exit; \0A\09}", "=r,r"(i32 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void asm sideeffect ".pragma \22nounroll\22;\0A\09", "~{memory}"()
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %1 = call double asm "rsqrt.approx.ftz.f64 $0, $1;", "=d,d"(double 1.0)
+  %2 = call { i32, i32 } asm "{\0A\09mad.lo.cc.u32   $0, $2, $3, $4;\0A\09madc.hi.u32     $1, $2, $3,  0;\0A\09}", "=r,=r,r,r,r"(i32 2, i32 3, i32 3)
+  %3 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09vote.ballot.b32 \09$0, %p1; \0A\09}", "=r,r"(i32 0)
+  %4 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09@%p1 exit; \0A\09}", "=r,r"(i32 0)
+  call void asm sideeffect ".pragma \22nounroll\22;\0A\09", "~{memory}"()
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/NVPTX/lit.local.cfg b/llvm/test/Analysis/CostModel/NVPTX/lit.local.cfg
new file mode 100644
index 0000000000000..0d37b86e1c8e6
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/NVPTX/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "NVPTX" in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/CodeGen/NVPTX/nvptx-aa-inline-asm.ll b/llvm/test/CodeGen/NVPTX/nvptx-aa-inline-asm.ll
new file mode 100644
index 0000000000000..b03fae365f264
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/nvptx-aa-inline-asm.ll
@@ -0,0 +1,47 @@
+; RUN: opt -passes=aa-eval -aa-pipeline=nvptx-aa -print-all-alias-modref-info < %s -disable-output 2>&1 \
+; RUN:   | FileCheck %s --check-prefixes CHECK-ALIAS
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+;;CHECK-ALIAS-LABEL: Function: test_sideeffect
+;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> call
+define void @test_sideeffect(ptr %out) {
+entry:
+  %0 = addrspacecast ptr %out to ptr addrspace(1)
+  call void asm sideeffect "membar.gl;", ""()
+  store i32 5, ptr addrspace(1) %0, align 4
+  ret void
+}
+
+;;CHECK-ALIAS-LABEL: Function: test_indirect
+;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> %1 = call
+define i32 @test_indirect(ptr %out) {
+entry:
+  %0 = addrspacecast ptr %out to ptr addrspace(1)
+  store i32 0, ptr addrspace(1) %0, align 4
+  %1 = call i32 asm "ld.global.u32 $0, [$1];", "=r,*m"(ptr addrspace(1) elementtype(i32) %0)
+  store i32 0, ptr addrspace(1) %0, align 4
+  ret i32 %1
+}
+
+;;CHECK-ALIAS-LABEL: Function: test_memory
+;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> %1 = call
+define i32 @test_memory(ptr %out) {
+entry:
+  %0 = addrspacecast ptr %out to ptr addrspace(1)
+  store i32 0, ptr addrspace(1) %0, align 4
+  %1 = call i32 asm "ld.global.u32 $0, [$1];", "=r,l,~{memory}"(ptr addrspace(1) %0)
+  store i32 0, ptr addrspace(1) %0, align 4
+  ret i32 %1
+}
+
+;;CHECK-ALIAS-LABEL: Function: test_no_sideeffect
+;;CHECK-ALIAS: NoModRef: Ptr: i32* %0 <-> %1 = call
+define void @test_no_sideeffect(ptr %in, ptr %out) {
+entry:
+  %0 = addrspacecast ptr %out to ptr addrspace(1)
+  %1 = call i32 asm "cvt.u32.u64 $0, $1;", "=r,l"(ptr %in)
+  store i32 %1, ptr addrspace(1) %0, align 4
+  ret void
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/130675


More information about the llvm-commits mailing list