[llvm] Add Dead Block Elimination to NVVMReflect (PR #144171)

Mon Jun 30 00:37:01 PDT 2025

https://github.com/YonahGoldberg updated https://github.com/llvm/llvm-project/pull/144171

>From e9c4b6b2a9f4b1a6627181554c9d2ce5578764a7 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Fri, 13 Jun 2025 22:20:23 +0000
Subject: [PATCH 01/10] updates

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp         | 113 +++++++++++++++---
 .../test/CodeGen/NVPTX/nvvm-reflect-opaque.ll |   4 +-
 llvm/test/CodeGen/NVPTX/nvvm-reflect.ll       |   4 +-
 3 files changed, 100 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 208bab52284a3..1c17852503660 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -39,6 +39,8 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
 #define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
 #define NVVM_REFLECT_OCL_FUNCTION "__nvvm_reflect_ocl"
 // Argument of reflect call to retrive arch number
@@ -59,7 +61,10 @@ class NVVMReflect {
   StringMap<unsigned> ReflectMap;
   bool handleReflectFunction(Module &M, StringRef ReflectName);
   void populateReflectMap(Module &M);
-  void foldReflectCall(CallInst *Call, Constant *NewValue);
+  void replaceReflectCalls(
+      SmallVector<std::pair<CallInst *, Constant *>, 8> &ReflectReplacements,
+      const DataLayout &DL);
+  SetVector<BasicBlock *> findTransitivelyDeadBlocks(BasicBlock *DeadBB);
 
 public:
   // __CUDA_FTZ is assigned in `runOnModule` by checking nvvm-reflect-ftz module
@@ -138,6 +143,8 @@ bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
   assert(F->getReturnType()->isIntegerTy() &&
          "_reflect's return type should be integer");
 
+  SmallVector<std::pair<CallInst *, Constant *>, 8> ReflectReplacements;
+
   const bool Changed = !F->use_empty();
   for (User *U : make_early_inc_range(F->users())) {
     // Reflect function calls look like:
@@ -178,38 +185,110 @@ bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
                       << "(" << ReflectArg << ") with value " << ReflectVal
                       << "\n");
     auto *NewValue = ConstantInt::get(Call->getType(), ReflectVal);
-    foldReflectCall(Call, NewValue);
-    Call->eraseFromParent();
+    ReflectReplacements.push_back({Call, NewValue});
   }
 
-  // Remove the __nvvm_reflect function from the module
+  replaceReflectCalls(ReflectReplacements, M.getDataLayout());
   F->eraseFromParent();
   return Changed;
 }
 
-void NVVMReflect::foldReflectCall(CallInst *Call, Constant *NewValue) {
+/// Find all blocks that become dead transitively from an initial dead block.
+/// Returns the set containing the original dead block plus every block reached
+/// from it through successors that have exactly one predecessor.
+SetVector<BasicBlock *>
+NVVMReflect::findTransitivelyDeadBlocks(BasicBlock *DeadBB) {
+  SmallVector<BasicBlock *, 8> Worklist({DeadBB});
+  SetVector<BasicBlock *> DeadBlocks;
+  while (!Worklist.empty()) {
+    auto *BB = Worklist.pop_back_val();
+    DeadBlocks.insert(BB);
+
+    for (BasicBlock *Succ : successors(BB))
+      if (pred_size(Succ) == 1 && DeadBlocks.insert(Succ))
+        Worklist.push_back(Succ);
+  }
+  return DeadBlocks;
+}
+
+/// Replace calls to __nvvm_reflect with corresponding constant values. Then
+/// clean up through constant folding and propagation and dead block
+/// elimination.
+///
+/// The purpose of this cleanup is not optimization because that could be
+/// handled by later passes
+/// (i.e. SCCP, SimplifyCFG, etc.), but for correctness. Reflect calls are most
+/// commonly used to query the arch number and select a valid instruction for
+/// the arch. Therefore, you need to eliminate blocks that become dead because
+/// they may contain invalid instructions for the arch. The purpose of the
+/// cleanup is to do the minimal amount of work to leave the code in a valid
+/// state.
+void NVVMReflect::replaceReflectCalls(
+    SmallVector<std::pair<CallInst *, Constant *>, 8> &ReflectReplacements,
+    const DataLayout &DL) {
   SmallVector<Instruction *, 8> Worklist;
-  // Replace an instruction with a constant and add all users of the instruction
-  // to the worklist
+  SetVector<BasicBlock *> DeadBlocks;
+
+  // Replace an instruction with a constant and add all users to the worklist,
+  // then delete the instruction
   auto ReplaceInstructionWithConst = [&](Instruction *I, Constant *C) {
     for (auto *U : I->users())
       if (auto *UI = dyn_cast<Instruction>(U))
         Worklist.push_back(UI);
     I->replaceAllUsesWith(C);
+    I->eraseFromParent();
   };
 
-  ReplaceInstructionWithConst(Call, NewValue);
+  for (auto &[Call, NewValue] : ReflectReplacements)
+    ReplaceInstructionWithConst(Call, NewValue);
 
-  auto &DL = Call->getModule()->getDataLayout();
-  while (!Worklist.empty()) {
-    auto *I = Worklist.pop_back_val();
-    if (auto *C = ConstantFoldInstruction(I, DL)) {
-      ReplaceInstructionWithConst(I, C);
-      if (isInstructionTriviallyDead(I))
-        I->eraseFromParent();
-    } else if (I->isTerminator()) {
-      ConstantFoldTerminator(I->getParent());
+  // Alternate between constant folding/propagation and dead block elimination.
+  // Terminator folding may create new dead blocks. When those dead blocks are
+  // deleted, their live successors may have PHIs that can be simplified, which
+  // may yield more work for folding/propagation.
+  while (true) {
+    // Iterate folding and propagating constants until the worklist is empty.
+    while (!Worklist.empty()) {
+      auto *I = Worklist.pop_back_val();
+      if (auto *C = ConstantFoldInstruction(I, DL)) {
+        ReplaceInstructionWithConst(I, C);
+      } else if (I->isTerminator()) {
+        BasicBlock *BB = I->getParent();
+        SmallVector<BasicBlock *, 8> Succs(successors(BB));
+        // Some blocks may become dead if the terminator is folded because
+        // a conditional branch is turned into a direct branch.
+        if (ConstantFoldTerminator(BB)) {
+          for (BasicBlock *Succ : Succs) {
+            if (pred_empty(Succ) &&
+                Succ != &Succ->getParent()->getEntryBlock()) {
+              SetVector<BasicBlock *> TransitivelyDead =
+                  findTransitivelyDeadBlocks(Succ);
+              DeadBlocks.insert(TransitivelyDead.begin(),
+                                TransitivelyDead.end());
+            }
+          }
+        }
+      }
     }
+    // No more constants to fold and no more dead blocks
+    // to create more work. We're done.
+    if (DeadBlocks.empty())
+      break;
+    // PHI nodes of live successors of dead blocks get eliminated when the dead
+    // blocks are eliminated. Their users can now be simplified further, so add
+    // them to the worklist.
+    for (BasicBlock *DeadBB : DeadBlocks)
+      for (BasicBlock *Succ : successors(DeadBB))
+        if (!DeadBlocks.contains(Succ))
+          for (PHINode &PHI : Succ->phis())
+            for (auto *U : PHI.users())
+              if (auto *UI = dyn_cast<Instruction>(U))
+                Worklist.push_back(UI);
+    // Delete all dead blocks in order
+    for (BasicBlock *DeadBB : DeadBlocks)
+      DeleteDeadBlock(DeadBB);
+
+    DeadBlocks.clear();
   }
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
index 19c74df303702..7bb1af707001a 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
@@ -3,12 +3,12 @@
 
 ; RUN: cat %s > %t.noftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
-; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
+; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
 
 ; RUN: cat %s > %t.ftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
-; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
+; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
 
 @str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
index 244b44fea9b83..581dbf353c1ff 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
@@ -3,12 +3,12 @@
 
 ; RUN: cat %s > %t.noftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
-; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
+; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
 
 ; RUN: cat %s > %t.ftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
-; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
+; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
 
 @str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"

>From d5e1cfc8bfaaf736493743d17af464a743c34cb2 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Fri, 13 Jun 2025 22:20:31 +0000
Subject: [PATCH 02/10] format

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 1c17852503660..5b24864ab586f 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -19,6 +19,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "NVPTX.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/ConstantFolding.h"
@@ -39,8 +41,6 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SetVector.h"
 #define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
 #define NVVM_REFLECT_OCL_FUNCTION "__nvvm_reflect_ocl"
 // Argument of reflect call to retrive arch number

>From 2c951c83fd46a0d046dd72fa397de678d0c834bd Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Fri, 13 Jun 2025 22:23:10 +0000
Subject: [PATCH 03/10] format

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 5b24864ab586f..b0f69598972ce 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -259,12 +259,9 @@ void NVVMReflect::replaceReflectCalls(
         // a conditional branch is turned into a direct branch.
         if (ConstantFoldTerminator(BB)) {
           for (BasicBlock *Succ : Succs) {
-            if (pred_empty(Succ) &&
-                Succ != &Succ->getParent()->getEntryBlock()) {
-              SetVector<BasicBlock *> TransitivelyDead =
-                  findTransitivelyDeadBlocks(Succ);
-              DeadBlocks.insert(TransitivelyDead.begin(),
-                                TransitivelyDead.end());
+            if (pred_empty(Succ) && Succ != &Succ->getParent()->getEntryBlock()) {
+              SetVector<BasicBlock *> TransitivelyDead = findTransitivelyDeadBlocks(Succ);
+              DeadBlocks.insert(TransitivelyDead.begin(), TransitivelyDead.end());
             }
           }
         }
@@ -284,7 +281,7 @@ void NVVMReflect::replaceReflectCalls(
             for (auto *U : PHI.users())
               if (auto *UI = dyn_cast<Instruction>(U))
                 Worklist.push_back(UI);
-    // Delete all dead blocks in order
+    // Delete all dead blocks
     for (BasicBlock *DeadBB : DeadBlocks)
       DeleteDeadBlock(DeadBB);
 

>From 8fbef2a34339f66e7aaeab113372edde945c1fb1 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Fri, 13 Jun 2025 22:23:15 +0000
Subject: [PATCH 04/10] format

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index b0f69598972ce..74c3efd18ad89 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -259,9 +259,12 @@ void NVVMReflect::replaceReflectCalls(
         // a conditional branch is turned into a direct branch.
         if (ConstantFoldTerminator(BB)) {
           for (BasicBlock *Succ : Succs) {
-            if (pred_empty(Succ) && Succ != &Succ->getParent()->getEntryBlock()) {
-              SetVector<BasicBlock *> TransitivelyDead = findTransitivelyDeadBlocks(Succ);
-              DeadBlocks.insert(TransitivelyDead.begin(), TransitivelyDead.end());
+            if (pred_empty(Succ) &&
+                Succ != &Succ->getParent()->getEntryBlock()) {
+              SetVector<BasicBlock *> TransitivelyDead =
+                  findTransitivelyDeadBlocks(Succ);
+              DeadBlocks.insert(TransitivelyDead.begin(),
+                                TransitivelyDead.end());
             }
           }
         }

>From f9eedaadebddf6cc60cf3d6a5424e30bfdda7fb5 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Fri, 13 Jun 2025 22:30:54 +0000
Subject: [PATCH 05/10] cleanup

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 74c3efd18ad89..fd9225838b243 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -19,7 +19,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "NVPTX.h"
-#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"

>From 4ba9826586f37cc6f3e0d5eb3261b5132e69c003 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Fri, 13 Jun 2025 22:33:19 +0000
Subject: [PATCH 06/10] added back isInstructionTriviallyDead check

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index fd9225838b243..2585ff45bde4c 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -235,7 +235,8 @@ void NVVMReflect::replaceReflectCalls(
       if (auto *UI = dyn_cast<Instruction>(U))
         Worklist.push_back(UI);
     I->replaceAllUsesWith(C);
-    I->eraseFromParent();
+    if (isInstructionTriviallyDead(I))
+      I->eraseFromParent();
   };
 
   for (auto &[Call, NewValue] : ReflectReplacements)

>From 32c5c408951acee7e101e95ae5f3d93a79921c4f Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 30 Jun 2025 07:15:47 +0000
Subject: [PATCH 07/10] NVVMReflectDCE option

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 2585ff45bde4c..c76eaf7e8d1f3 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -88,7 +88,7 @@ ModulePass *llvm::createNVVMReflectPass(unsigned SmVersion) {
 }
 
 static cl::opt<bool>
-    NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden,
+    NVVMReflectEnabled("nvvm-reflect-enable", cl::init(false), cl::Hidden,
                        cl::desc("NVVM reflection, enabled by default"));
 
 char NVVMReflectLegacyPass::ID = 0;
@@ -105,6 +105,10 @@ static cl::list<std::string> ReflectList(
     cl::desc("A key=value pair. Replace __nvvm_reflect(name) with value."),
     cl::ValueRequired);
 
+static cl::opt<bool> NVVMReflectDCE("nvvm-reflect-dce", cl::init(false),
+                                    cl::Hidden,
+                                    cl::desc("Delete dead blocks introduced by reflect call elimination"));
+
 // Set the ReflectMap with, first, the value of __CUDA_FTZ from module metadata,
 // and then the key/value pairs from the command line.
 void NVVMReflect::populateReflectMap(Module &M) {
@@ -241,8 +245,9 @@ void NVVMReflect::replaceReflectCalls(
 
   for (auto &[Call, NewValue] : ReflectReplacements)
     ReplaceInstructionWithConst(Call, NewValue);
-
-  // Alternate between constant folding/propagation and dead block elimination.
+  
+  // Constant fold reflect results. If NVVMReflectDCE is enabled, we will
+  // alternate between constant folding/propagation and dead block elimination.
   // Terminator folding may create new dead blocks. When those dead blocks are
   // deleted, their live successors may have PHIs that can be simplified, which
   // may yield more work for folding/propagation.
@@ -256,11 +261,12 @@ void NVVMReflect::replaceReflectCalls(
         BasicBlock *BB = I->getParent();
         SmallVector<BasicBlock *, 8> Succs(successors(BB));
         // Some blocks may become dead if the terminator is folded because
-        // a conditional branch is turned into a direct branch.
+        // a conditional branch is turned into a direct branch. Add those dead blocks
+        // to the dead blocks set if NVVMReflectDCE is enabled.
         if (ConstantFoldTerminator(BB)) {
           for (BasicBlock *Succ : Succs) {
             if (pred_empty(Succ) &&
-                Succ != &Succ->getParent()->getEntryBlock()) {
+                Succ != &Succ->getParent()->getEntryBlock() && NVVMReflectDCE) {
               SetVector<BasicBlock *> TransitivelyDead =
                   findTransitivelyDeadBlocks(Succ);
               DeadBlocks.insert(TransitivelyDead.begin(),

>From d97ad29d87e680c8604a87f7f672a91c1876cc1a Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 30 Jun 2025 07:17:54 +0000
Subject: [PATCH 08/10] use reflect dce instead of simplifycfg

---
 llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll | 4 ++--
 llvm/test/CodeGen/NVPTX/nvvm-reflect.ll        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
index 7bb1af707001a..553b2c107d86a 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
@@ -3,12 +3,12 @@
 
 ; RUN: cat %s > %t.noftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
-; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' \
+; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' -nvvm-reflect-dce \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
 
 ; RUN: cat %s > %t.ftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
-; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' \
+; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' -nvvm-reflect-dce \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
 
 @str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
index 581dbf353c1ff..86cdc3f489c2e 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
@@ -3,12 +3,12 @@
 
 ; RUN: cat %s > %t.noftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
-; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' \
+; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' -nvvm-reflect-dce \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
 
 ; RUN: cat %s > %t.ftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
-; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' \
+; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' -nvvm-reflect-dce \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
 
 @str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"

>From ed7cfc10e0c71499c34faef818d5d9bda1763591 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 30 Jun 2025 07:18:29 +0000
Subject: [PATCH 09/10] format

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index c76eaf7e8d1f3..093c55a9fb027 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -105,9 +105,9 @@ static cl::list<std::string> ReflectList(
     cl::desc("A key=value pair. Replace __nvvm_reflect(name) with value."),
     cl::ValueRequired);
 
-static cl::opt<bool> NVVMReflectDCE("nvvm-reflect-dce", cl::init(false),
-                                    cl::Hidden,
-                                    cl::desc("Delete dead blocks introduced by reflect call elimination"));
+static cl::opt<bool> NVVMReflectDCE(
+    "nvvm-reflect-dce", cl::init(false), cl::Hidden,
+    cl::desc("Delete dead blocks introduced by reflect call elimination"));
 
 // Set the ReflectMap with, first, the value of __CUDA_FTZ from module metadata,
 // and then the key/value pairs from the command line.
@@ -245,7 +245,7 @@ void NVVMReflect::replaceReflectCalls(
 
   for (auto &[Call, NewValue] : ReflectReplacements)
     ReplaceInstructionWithConst(Call, NewValue);
-  
+
   // Constant fold reflect results. If NVVMReflectDCE is enabled, we will
   // alternate between constant folding/propagation and dead block elimination.
   // Terminator folding may create new dead blocks. When those dead blocks are
@@ -261,8 +261,8 @@ void NVVMReflect::replaceReflectCalls(
         BasicBlock *BB = I->getParent();
         SmallVector<BasicBlock *, 8> Succs(successors(BB));
         // Some blocks may become dead if the terminator is folded because
-        // a conditional branch is turned into a direct branch. Add those dead blocks
-        // to the dead blocks set if NVVMReflectDCE is enabled.
+        // a conditional branch is turned into a direct branch. Add those dead
+        // blocks to the dead blocks set if NVVMReflectDCE is enabled.
         if (ConstantFoldTerminator(BB)) {
           for (BasicBlock *Succ : Succs) {
             if (pred_empty(Succ) &&

>From c9b04d7c714493d25b4258b04099bd280447675e Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 30 Jun 2025 07:36:22 +0000
Subject: [PATCH 10/10] bugs

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 36 +++++++++++----------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 093c55a9fb027..6d21706570bbe 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -88,14 +88,9 @@ ModulePass *llvm::createNVVMReflectPass(unsigned SmVersion) {
 }
 
 static cl::opt<bool>
-    NVVMReflectEnabled("nvvm-reflect-enable", cl::init(false), cl::Hidden,
+    NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden,
                        cl::desc("NVVM reflection, enabled by default"));
 
-char NVVMReflectLegacyPass::ID = 0;
-INITIALIZE_PASS(NVVMReflectLegacyPass, "nvvm-reflect",
-                "Replace occurrences of __nvvm_reflect() calls with 0/1", false,
-                false)
-
 // Allow users to specify additional key/value pairs to reflect. These key/value
 // pairs are the last to be added to the ReflectMap, and therefore will take
 // precedence over initial values (i.e. __CUDA_FTZ from module medadata and
@@ -109,6 +104,11 @@ static cl::opt<bool> NVVMReflectDCE(
     "nvvm-reflect-dce", cl::init(false), cl::Hidden,
     cl::desc("Delete dead blocks introduced by reflect call elimination"));
 
+char NVVMReflectLegacyPass::ID = 0;
+INITIALIZE_PASS(NVVMReflectLegacyPass, "nvvm-reflect",
+                "Replace occurrences of __nvvm_reflect() calls with 0/1", false,
+                false)
+
 // Set the ReflectMap with, first, the value of __CUDA_FTZ from module metadata,
 // and then the key/value pairs from the command line.
 void NVVMReflect::populateReflectMap(Module &M) {
@@ -188,6 +188,8 @@ bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
                       << "(" << ReflectArg << ") with value " << ReflectVal
                       << "\n");
     auto *NewValue = ConstantInt::get(Call->getType(), ReflectVal);
+    dbgs() << "NewValue: " << *NewValue << "\n";
+    dbgs() << "Call: " << *Call << "\n";
     ReflectReplacements.push_back({Call, NewValue});
   }
 
@@ -216,35 +218,25 @@ NVVMReflect::findTransitivelyDeadBlocks(BasicBlock *DeadBB) {
 
 /// Replace calls to __nvvm_reflect with corresponding constant values. Then
 /// clean up through constant folding and propagation and dead block
-/// elimination.
-///
-/// The purpose of this cleanup is not optimization because that could be
-/// handled by later passes
-/// (i.e. SCCP, SimplifyCFG, etc.), but for correctness. Reflect calls are most
-/// commonly used to query the arch number and select a valid instruction for
-/// the arch. Therefore, you need to eliminate blocks that become dead because
-/// they may contain invalid instructions for the arch. The purpose of the
-/// cleanup is to do the minimal amount of work to leave the code in a valid
-/// state.
+/// elimination, if NVVMReflectDCE is enabled.
 void NVVMReflect::replaceReflectCalls(
     SmallVector<std::pair<CallInst *, Constant *>, 8> &ReflectReplacements,
     const DataLayout &DL) {
   SmallVector<Instruction *, 8> Worklist;
   SetVector<BasicBlock *> DeadBlocks;
 
-  // Replace an instruction with a constant and add all users to the worklist,
-  // then delete the instruction
+  // Replace an instruction with a constant and add all users to the worklist
   auto ReplaceInstructionWithConst = [&](Instruction *I, Constant *C) {
     for (auto *U : I->users())
       if (auto *UI = dyn_cast<Instruction>(U))
         Worklist.push_back(UI);
     I->replaceAllUsesWith(C);
-    if (isInstructionTriviallyDead(I))
-      I->eraseFromParent();
   };
 
-  for (auto &[Call, NewValue] : ReflectReplacements)
+  for (auto &[Call, NewValue] : ReflectReplacements) {
     ReplaceInstructionWithConst(Call, NewValue);
+    Call->eraseFromParent();
+  }
 
   // Constant fold reflect results. If NVVMReflectDCE is enabled, we will
   // alternate between constant folding/propagation and dead block elimination.
@@ -257,6 +249,8 @@ void NVVMReflect::replaceReflectCalls(
       auto *I = Worklist.pop_back_val();
       if (auto *C = ConstantFoldInstruction(I, DL)) {
         ReplaceInstructionWithConst(I, C);
+        if (isInstructionTriviallyDead(I))
+          I->eraseFromParent();
       } else if (I->isTerminator()) {
         BasicBlock *BB = I->getParent();
         SmallVector<BasicBlock *, 8> Succs(successors(BB));