[clang] [compiler-rt] [lld] [llvm] [NVPTX] Improve NVVMReflect Efficiency (PR #134416)

Yonah Goldberg via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 10 15:18:11 PDT 2025


https://github.com/YonahGoldberg updated https://github.com/llvm/llvm-project/pull/134416

>From 21ec83872e1846e1f2453d1dafd8651a76d20494 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Fri, 4 Apr 2025 16:51:40 +0000
Subject: [PATCH 01/38] making nvvm reflect more efficient

---
 llvm/lib/Target/NVPTX/NVPTX.h                |  10 +-
 llvm/lib/Target/NVPTX/NVPTXPassRegistry.def  |   2 +-
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp |   5 +-
 llvm/lib/Target/NVPTX/NVVMReflect.cpp        | 155 ++++++++++++-------
 4 files changed, 112 insertions(+), 60 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index ff983e52179af..f98ace3a0d189 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -43,7 +43,7 @@ ModulePass *createNVPTXAssignValidGlobalNamesPass();
 ModulePass *createGenericToNVVMLegacyPass();
 ModulePass *createNVPTXCtorDtorLoweringLegacyPass();
 FunctionPass *createNVVMIntrRangePass();
-FunctionPass *createNVVMReflectPass(unsigned int SmVersion);
+ModulePass *createNVVMReflectPass(unsigned int SmVersion);
 MachineFunctionPass *createNVPTXPrologEpilogPass();
 MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
 FunctionPass *createNVPTXImageOptimizerPass();
@@ -78,12 +78,12 @@ struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
 };
 
 struct NVVMReflectPass : PassInfoMixin<NVVMReflectPass> {
-  NVVMReflectPass();
-  NVVMReflectPass(unsigned SmVersion) : SmVersion(SmVersion) {}
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+  NVVMReflectPass() : NVVMReflectPass(0) {}
+  NVVMReflectPass(unsigned SmVersion);
+  PreservedAnalyses run(Module &F, ModuleAnalysisManager &AM);
 
 private:
-  unsigned SmVersion;
+  StringMap<int> VarMap;
 };
 
 struct GenericToNVVMPass : PassInfoMixin<GenericToNVVMPass> {
diff --git a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
index 34c79b8f77bae..1c813c2c51f70 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
+++ b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
@@ -18,6 +18,7 @@
 #endif
 MODULE_PASS("generic-to-nvvm", GenericToNVVMPass())
 MODULE_PASS("nvptx-lower-ctor-dtor", NVPTXCtorDtorLoweringPass())
+MODULE_PASS("nvvm-reflect", NVVMReflectPass())
 #undef MODULE_PASS
 
 #ifndef FUNCTION_ANALYSIS
@@ -36,7 +37,6 @@ FUNCTION_ALIAS_ANALYSIS("nvptx-aa", NVPTXAA())
 #define FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
 FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
-FUNCTION_PASS("nvvm-reflect", NVVMReflectPass())
 FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
 FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this));
 #undef FUNCTION_PASS
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 8a25256ea1e4a..d4376ef87b1f1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -240,11 +240,12 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 
   PB.registerPipelineStartEPCallback(
       [this](ModulePassManager &PM, OptimizationLevel Level) {
-        FunctionPassManager FPM;
         // We do not want to fold out calls to nvvm.reflect early if the user
         // has not provided a target architecture just yet.
         if (Subtarget.hasTargetName())
-          FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
+          PM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
+        
+        FunctionPassManager FPM;
         // Note: NVVMIntrRangePass was causing numerical discrepancies at one
         // point, if issues crop up, consider disabling.
         FPM.addPass(NVVMIntrRangePass());
diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 2809ec2303f99..96e23a6699666 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-//===----------------------------------------------------------------------===//
+
 //
 // This pass replaces occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect
 // with an integer.
@@ -25,7 +25,6 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
@@ -39,27 +38,43 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/StripGCRelocates.h"
 #include <algorithm>
 #define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
 #define NVVM_REFLECT_OCL_FUNCTION "__nvvm_reflect_ocl"
 
 using namespace llvm;
 
-#define DEBUG_TYPE "nvptx-reflect"
+#define DEBUG_TYPE "nvvm-reflect"
 
 namespace {
-class NVVMReflect : public FunctionPass {
+class NVVMReflect : public ModulePass {
+private:
+  StringMap<int> VarMap;
+  /// Process a reflect function by finding all its uses and replacing them with
+  /// appropriate constant values. For __CUDA_FTZ, uses the module flag value.
+  /// For __CUDA_ARCH, uses SmVersion * 10. For all other strings, uses 0.
+  bool handleReflectFunction(Function *F);
+  void setVarMap(Module &M);
+
 public:
   static char ID;
-  unsigned int SmVersion;
   NVVMReflect() : NVVMReflect(0) {}
-  explicit NVVMReflect(unsigned int Sm) : FunctionPass(ID), SmVersion(Sm) {}
-
-  bool runOnFunction(Function &) override;
+  // __CUDA_FTZ is assigned in `runOnModule` by checking nvvm-reflect-ftz module
+  // metadata.
+  explicit NVVMReflect(unsigned int Sm) : ModulePass(ID) {
+    VarMap["__CUDA_ARCH"] = Sm * 10;
+    initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+  }
+  // This mapping should include __CUDA_FTZ and __CUDA_ARCH values.
+  explicit NVVMReflect(const StringMap<int> &Mapping) : ModulePass(ID), VarMap(Mapping) {
+    initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnModule(Module &M) override;
 };
 } // namespace
 
-FunctionPass *llvm::createNVVMReflectPass(unsigned int SmVersion) {
+ModulePass *llvm::createNVVMReflectPass(unsigned int SmVersion) {
   return new NVVMReflect(SmVersion);
 }
 
@@ -72,27 +87,51 @@ INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
                 "Replace occurrences of __nvvm_reflect() calls with 0/1", false,
                 false)
 
-static bool runNVVMReflect(Function &F, unsigned SmVersion) {
-  if (!NVVMReflectEnabled)
-    return false;
+static cl::list<std::string>
+    ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"), cl::Hidden,
+                cl::desc("A list of string=num assignments"),
+                cl::ValueRequired);
 
-  if (F.getName() == NVVM_REFLECT_FUNCTION ||
-      F.getName() == NVVM_REFLECT_OCL_FUNCTION) {
-    assert(F.isDeclaration() && "_reflect function should not have a body");
-    assert(F.getReturnType()->isIntegerTy() &&
-           "_reflect's return type should be integer");
-    return false;
+/// The command line can look as follows :
+/// -nvvm-reflect-list a=1,b=2 -nvvm-reflect-list c=3,d=0 -nvvm-reflect-list e=2
+/// The strings "a=1,b=2", "c=3,d=0", "e=2" are available in the
+/// ReflectList vector. First, each of ReflectList[i] is 'split'
+/// using "," as the delimiter. Then each of this part is split
+/// using "=" as the delimiter.
+void NVVMReflect::setVarMap(Module &M) {
+  if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
+          M.getModuleFlag("nvvm-reflect-ftz")))
+    VarMap["__CUDA_FTZ"] = Flag->getSExtValue();
+
+  for (unsigned I = 0, E = ReflectList.size(); I != E; ++I) {
+    LLVM_DEBUG(dbgs() << "Option : " << ReflectList[I] << "\n");
+    SmallVector<StringRef, 4> NameValList;
+    StringRef(ReflectList[I]).split(NameValList, ",");
+    for (unsigned J = 0, EJ = NameValList.size(); J != EJ; ++J) {
+      SmallVector<StringRef, 2> NameValPair;
+      NameValList[J].split(NameValPair, "=");
+      assert(NameValPair.size() == 2 && "name=val expected");
+      StringRef ValStr = NameValPair[1].trim();
+      int Val;
+      if (ValStr.getAsInteger(10, Val))
+        report_fatal_error("integer value expected");
+      VarMap[NameValPair[0]] = Val;
+    }
   }
+}
+
+bool NVVMReflect::handleReflectFunction(Function *F) {
+  // Validate _reflect function
+  assert(F->isDeclaration() && "_reflect function should not have a body");
+  assert(F->getReturnType()->isIntegerTy() &&
+         "_reflect's return type should be integer");
 
   SmallVector<Instruction *, 4> ToRemove;
   SmallVector<Instruction *, 4> ToSimplify;
 
-  // Go through the calls in this function.  Each call to __nvvm_reflect or
-  // llvm.nvvm.reflect should be a CallInst with a ConstantArray argument.
-  // First validate that. If the c-string corresponding to the ConstantArray can
-  // be found successfully, see if it can be found in VarMap. If so, replace the
-  // uses of CallInst with the value found in VarMap. If not, replace the use
-  // with value 0.
+  // Go through the uses of the reflect function. Each use should be a CallInst
+  // with a ConstantArray argument. Replace the uses with the appropriate
+  // constant values.
 
   // The IR for __nvvm_reflect calls differs between CUDA versions.
   //
@@ -113,15 +152,10 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) {
   //
   // In this case, we get a Constant with a GlobalVariable operand and we need
   // to dig deeper to find its initializer with the string we'll use for lookup.
-  for (Instruction &I : instructions(F)) {
-    CallInst *Call = dyn_cast<CallInst>(&I);
-    if (!Call)
-      continue;
-    Function *Callee = Call->getCalledFunction();
-    if (!Callee || (Callee->getName() != NVVM_REFLECT_FUNCTION &&
-                    Callee->getName() != NVVM_REFLECT_OCL_FUNCTION &&
-                    Callee->getIntrinsicID() != Intrinsic::nvvm_reflect))
-      continue;
+
+  for (User *U : F->users()) {
+    assert(isa<CallInst>(U) && "Only a call instruction can use _reflect");
+    CallInst *Call = cast<CallInst>(U);
 
     // FIXME: Improve error handling here and elsewhere in this pass.
     assert(Call->getNumOperands() == 2 &&
@@ -156,20 +190,15 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) {
            "Format of _reflect function not recognized");
 
     StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString();
+    // Remove the null terminator from the string
     ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
     LLVM_DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
 
     int ReflectVal = 0; // The default value is 0
-    if (ReflectArg == "__CUDA_FTZ") {
-      // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag.  Our
-      // choice here must be kept in sync with AutoUpgrade, which uses the same
-      // technique to detect whether ftz is enabled.
-      if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
-              F.getParent()->getModuleFlag("nvvm-reflect-ftz")))
-        ReflectVal = Flag->getSExtValue();
-    } else if (ReflectArg == "__CUDA_ARCH") {
-      ReflectVal = SmVersion * 10;
+    if (VarMap.contains(ReflectArg)) {
+      ReflectVal = VarMap[ReflectArg];
     }
+    LLVM_DEBUG(dbgs() << "ReflectVal: " << ReflectVal << "\n");
 
     // If the immediate user is a simple comparison we want to simplify it.
     for (User *U : Call->users())
@@ -185,7 +214,7 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) {
   // until we find a terminator that we can then remove.
   while (!ToSimplify.empty()) {
     Instruction *I = ToSimplify.pop_back_val();
-    if (Constant *C = ConstantFoldInstruction(I, F.getDataLayout())) {
+    if (Constant *C = ConstantFoldInstruction(I, F->getDataLayout())) {
       for (User *U : I->users())
         if (Instruction *I = dyn_cast<Instruction>(U))
           ToSimplify.push_back(I);
@@ -202,23 +231,45 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) {
   // Removing via isInstructionTriviallyDead may add duplicates to the ToRemove
   // array. Filter out the duplicates before starting to erase from parent.
   std::sort(ToRemove.begin(), ToRemove.end());
-  auto NewLastIter = llvm::unique(ToRemove);
+  auto *NewLastIter = llvm::unique(ToRemove);
   ToRemove.erase(NewLastIter, ToRemove.end());
 
   for (Instruction *I : ToRemove)
     I->eraseFromParent();
 
+  // Remove the __nvvm_reflect function from the module
+  F->eraseFromParent();
   return ToRemove.size() > 0;
 }
 
-bool NVVMReflect::runOnFunction(Function &F) {
-  return runNVVMReflect(F, SmVersion);
-}
+bool NVVMReflect::runOnModule(Module &M) {
+  if (!NVVMReflectEnabled)
+    return false;
+
+  setVarMap(M);
 
-NVVMReflectPass::NVVMReflectPass() : NVVMReflectPass(0) {}
+  bool Changed = false;
+  // Names of reflect functions to find and replace
+  SmallVector<std::string, 3> ReflectNames = {
+      NVVM_REFLECT_FUNCTION, NVVM_REFLECT_OCL_FUNCTION,
+      Intrinsic::getName(Intrinsic::nvvm_reflect).str()};
+
+  // Process all reflect functions
+  for (const std::string &Name : ReflectNames) {
+    Function *ReflectFunction = M.getFunction(Name);
+    if (ReflectFunction) {
+      Changed |= handleReflectFunction(ReflectFunction);
+    }
+  }
+
+  return Changed;
+}
 
-PreservedAnalyses NVVMReflectPass::run(Function &F,
-                                       FunctionAnalysisManager &AM) {
-  return runNVVMReflect(F, SmVersion) ? PreservedAnalyses::none()
-                                      : PreservedAnalyses::all();
+// Implementations for the pass that works with the new pass manager.
+NVVMReflectPass::NVVMReflectPass(unsigned SmVersion) {
+  VarMap["__CUDA_ARCH"] = SmVersion * 10;
 }
+PreservedAnalyses NVVMReflectPass::run(Module &M, ModuleAnalysisManager &AM) {
+  return NVVMReflect(VarMap).runOnModule(M) ? PreservedAnalyses::none()
+                                            : PreservedAnalyses::all();
+}
\ No newline at end of file

>From cc497638e698ad324ffbe484cd6171f3de54561d Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Fri, 4 Apr 2025 16:59:38 +0000
Subject: [PATCH 02/38] cleanup

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 96e23a6699666..85a714b1d5aec 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-
+//===----------------------------------------------------------------------===//
 //
 // This pass replaces occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect
 // with an integer.

>From db16866c2e8afc4fa30715321b052c66dd895f10 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Fri, 4 Apr 2025 17:17:14 +0000
Subject: [PATCH 03/38] newline

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 85a714b1d5aec..f0f7f2baf0206 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -272,4 +272,4 @@ NVVMReflectPass::NVVMReflectPass(unsigned SmVersion) {
 PreservedAnalyses NVVMReflectPass::run(Module &M, ModuleAnalysisManager &AM) {
   return NVVMReflect(VarMap).runOnModule(M) ? PreservedAnalyses::none()
                                             : PreservedAnalyses::all();
-}
\ No newline at end of file
+}

>From f167f2c6def6858cf2904520c0f3ba4536b56bc7 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 7 Apr 2025 20:05:06 +0000
Subject: [PATCH 04/38] reflect improvements

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp         | 183 ++++++++++--------
 .../CodeGen/NVPTX/nvvm-reflect-options.ll     |  26 +++
 2 files changed, 124 insertions(+), 85 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index f0f7f2baf0206..35546d05ab5be 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -4,6 +4,12 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
+// NVIDIA_COPYRIGHT_BEGIN
+//
+// Copyright (c) 2023-2025, NVIDIA CORPORATION.  All rights reserved.
+//
+// NVIDIA_COPYRIGHT_END
+//
 //===----------------------------------------------------------------------===//
 //
 // This pass replaces occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect
@@ -38,8 +44,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/StripGCRelocates.h"
-#include <algorithm>
+#include "llvm/ADT/StringExtras.h"
 #define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
 #define NVVM_REFLECT_OCL_FUNCTION "__nvvm_reflect_ocl"
 
@@ -54,16 +59,15 @@ class NVVMReflect : public ModulePass {
   /// Process a reflect function by finding all its uses and replacing them with
   /// appropriate constant values. For __CUDA_FTZ, uses the module flag value.
   /// For __CUDA_ARCH, uses SmVersion * 10. For all other strings, uses 0.
-  bool handleReflectFunction(Function *F);
+  void handleReflectFunction(Function *F);
   void setVarMap(Module &M);
-
+  void foldReflectCall(CallInst *Call, Constant *NewValue);
 public:
   static char ID;
   NVVMReflect() : NVVMReflect(0) {}
   // __CUDA_FTZ is assigned in `runOnModule` by checking nvvm-reflect-ftz module
   // metadata.
-  explicit NVVMReflect(unsigned int Sm) : ModulePass(ID) {
-    VarMap["__CUDA_ARCH"] = Sm * 10;
+  explicit NVVMReflect(unsigned SmVersion) : ModulePass(ID), VarMap({{"__CUDA_ARCH", SmVersion * 10}}) {
     initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
   }
   // This mapping will contain should include __CUDA_FTZ and __CUDA_ARCH values.
@@ -87,51 +91,58 @@ INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
                 "Replace occurrences of __nvvm_reflect() calls with 0/1", false,
                 false)
 
+// Allow users to specify additional key/value pairs to reflect. These key/value pairs
+// are the last to be added to the VarMap, and therefore will take precedence over initial
+// values (i.e. __CUDA_FTZ from module metadata and __CUDA_ARCH from SmVersion).
 static cl::list<std::string>
-    ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"), cl::Hidden,
-                cl::desc("A list of string=num assignments"),
-                cl::ValueRequired);
-
-/// The command line can look as follows :
-/// -nvvm-reflect-list a=1,b=2 -nvvm-reflect-list c=3,d=0 -R e=2
-/// The strings "a=1,b=2", "c=3,d=0", "e=2" are available in the
-/// ReflectList vector. First, each of ReflectList[i] is 'split'
-/// using "," as the delimiter. Then each of this part is split
-/// using "=" as the delimiter.
+ReflectList("nvvm-reflect-add", cl::value_desc("name=<int>"), cl::Hidden,
+            cl::desc("list of comma-separated key=value pairs"),
+            cl::ValueRequired);
+
+// Set the VarMap with, first, the value of __CUDA_FTZ from module metadata, and then
+// the key/value pairs from the command line.
 void NVVMReflect::setVarMap(Module &M) {
+  LLVM_DEBUG(dbgs() << "Reflect list values:\n");
+  for (StringRef Option : ReflectList) {
+    LLVM_DEBUG(dbgs() << "  " << Option << "\n");
+  }
   if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
-          M.getModuleFlag("nvvm-reflect-ftz")))
+      M.getModuleFlag("nvvm-reflect-ftz")))
     VarMap["__CUDA_FTZ"] = Flag->getSExtValue();
 
-  for (unsigned I = 0, E = ReflectList.size(); I != E; ++I) {
-    LLVM_DEBUG(dbgs() << "Option : " << ReflectList[I] << "\n");
-    SmallVector<StringRef, 4> NameValList;
-    StringRef(ReflectList[I]).split(NameValList, ",");
-    for (unsigned J = 0, EJ = NameValList.size(); J != EJ; ++J) {
-      SmallVector<StringRef, 2> NameValPair;
-      NameValList[J].split(NameValPair, "=");
-      assert(NameValPair.size() == 2 && "name=val expected");
-      StringRef ValStr = NameValPair[1].trim();
+  /// The command line can look as follows :
+  /// -nvvm-reflect-add a=1,b=2 -nvvm-reflect-add c=3,d=0 -nvvm-reflect-add e=2
+  /// The strings "a=1,b=2", "c=3,d=0", "e=2" are available in the
+  /// ReflectList vector. First, each of ReflectList[i] is 'split'
+  /// using "," as the delimiter. Then each of this part is split
+  /// using "=" as the delimiter.
+  for (StringRef Option : ReflectList) {
+    LLVM_DEBUG(dbgs() << "ReflectOption : " << Option << "\n");
+    while (!Option.empty()) {
+      std::pair<StringRef, StringRef> Split = Option.split(',');
+      StringRef NameVal = Split.first;
+      Option = Split.second;
+
+      auto NameValPair = NameVal.split('=');
+      assert(!NameValPair.first.empty() && !NameValPair.second.empty() && 
+             "name=val expected");
+      
       int Val;
-      if (ValStr.getAsInteger(10, Val))
+      if (!to_integer(NameValPair.second.trim(), Val, 10))
         report_fatal_error("integer value expected");
-      VarMap[NameValPair[0]] = Val;
+      VarMap[NameValPair.first] = Val;
     }
   }
 }
 
-bool NVVMReflect::handleReflectFunction(Function *F) {
+void NVVMReflect::handleReflectFunction(Function *F) {
   // Validate _reflect function
   assert(F->isDeclaration() && "_reflect function should not have a body");
-  assert(F->getReturnType()->isIntegerTy() &&
-         "_reflect's return type should be integer");
+  assert(F->getReturnType()->isIntegerTy() && "_reflect's return type should be integer");
 
-  SmallVector<Instruction *, 4> ToRemove;
-  SmallVector<Instruction *, 4> ToSimplify;
 
   // Go through the uses of the reflect function. Each use should be a CallInst
-  // with a ConstantArray argument. Replace the uses with the appropriate
-  // constant values.
+  // with a ConstantArray argument. Replace the uses with the appropriate constant values.
 
   // The IR for __nvvm_reflect calls differs between CUDA versions.
   //
@@ -153,7 +164,7 @@ bool NVVMReflect::handleReflectFunction(Function *F) {
   // In this case, we get a Constant with a GlobalVariable operand and we need
   // to dig deeper to find its initializer with the string we'll use for lookup.
 
-  for (User *U : F->users()) {
+  for (User *U : make_early_inc_range(F->users())) {
     assert(isa<CallInst>(U) && "Only a call instruction can use _reflect");
     CallInst *Call = cast<CallInst>(U);
 
@@ -165,21 +176,23 @@ bool NVVMReflect::handleReflectFunction(Function *F) {
     // conversion of the string.
     const Value *Str = Call->getArgOperand(0);
     if (const CallInst *ConvCall = dyn_cast<CallInst>(Str)) {
-      // FIXME: Add assertions about ConvCall.
+      // Verify this is the constant-to-generic intrinsic
+      Function *Callee = ConvCall->getCalledFunction();
+      assert(Callee && Callee->isIntrinsic() && 
+             Callee->getName().starts_with("llvm.nvvm.ptr.constant.to.gen") &&
+             "Expected llvm.nvvm.ptr.constant.to.gen intrinsic");
+      assert(ConvCall->getNumOperands() == 2 && "Expected one argument for ptr conversion");
       Str = ConvCall->getArgOperand(0);
     }
     // Pre opaque pointers we have a constant expression wrapping the constant
-    // string.
     Str = Str->stripPointerCasts();
-    assert(isa<Constant>(Str) &&
-           "Format of __nvvm_reflect function not recognized");
+    assert(isa<Constant>(Str) && "Format of __nvvm_reflect function not recognized");
 
     const Value *Operand = cast<Constant>(Str)->getOperand(0);
     if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Operand)) {
       // For CUDA-7.0 style __nvvm_reflect calls, we need to find the operand's
       // initializer.
-      assert(GV->hasInitializer() &&
-             "Format of _reflect function not recognized");
+      assert(GV->hasInitializer() && "Format of _reflect function not recognized");
       const Constant *Initializer = GV->getInitializer();
       Operand = Initializer;
     }
@@ -192,54 +205,48 @@ bool NVVMReflect::handleReflectFunction(Function *F) {
     StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString();
     // Remove the null terminator from the string
     ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
-    LLVM_DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
 
     int ReflectVal = 0; // The default value is 0
     if (VarMap.contains(ReflectArg)) {
       ReflectVal = VarMap[ReflectArg];
     }
-    LLVM_DEBUG(dbgs() << "ReflectVal: " << ReflectVal << "\n");
+    LLVM_DEBUG(dbgs() << "Replacing call of reflect function " << F->getName() << "(" << ReflectArg << ") with value " << ReflectVal << "\n");
+    Constant *NewValue = ConstantInt::get(Call->getType(), ReflectVal);
+    foldReflectCall(Call, NewValue);
+    Call->eraseFromParent();
+  }
 
-    // If the immediate user is a simple comparison we want to simplify it.
-    for (User *U : Call->users())
-      if (Instruction *I = dyn_cast<Instruction>(U))
-        ToSimplify.push_back(I);
+  // Remove the __nvvm_reflect function from the module
+  F->eraseFromParent();
+}
 
-    Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
-    ToRemove.push_back(Call);
+void NVVMReflect::foldReflectCall(CallInst *Call, Constant *NewValue) {
+  // Initialize worklist with all users of the call
+  SmallVector<Instruction*, 8> Worklist;
+  for (User *U : Call->users()) {
+    if (Instruction *I = dyn_cast<Instruction>(U)) {
+      Worklist.push_back(I);
+    }
   }
 
-  // The code guarded by __nvvm_reflect may be invalid for the target machine.
-  // Traverse the use-def chain, continually simplifying constant expressions
-  // until we find a terminator that we can then remove.
-  while (!ToSimplify.empty()) {
-    Instruction *I = ToSimplify.pop_back_val();
-    if (Constant *C = ConstantFoldInstruction(I, F->getDataLayout())) {
-      for (User *U : I->users())
-        if (Instruction *I = dyn_cast<Instruction>(U))
-          ToSimplify.push_back(I);
+  Call->replaceAllUsesWith(NewValue);
 
-      I->replaceAllUsesWith(C);
-      if (isInstructionTriviallyDead(I)) {
-        ToRemove.push_back(I);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    if (Constant *C = ConstantFoldInstruction(I, Call->getModule()->getDataLayout())) {
+      // Add all users of this instruction to the worklist, replace it with the constant
+      // then delete it if it's dead
+      for (User *U : I->users()) {
+        if (Instruction *UI = dyn_cast<Instruction>(U))
+          Worklist.push_back(UI);
       }
+      I->replaceAllUsesWith(C);
+      if (isInstructionTriviallyDead(I))
+        I->eraseFromParent();
     } else if (I->isTerminator()) {
       ConstantFoldTerminator(I->getParent());
     }
   }
-
-  // Removing via isInstructionTriviallyDead may add duplicates to the ToRemove
-  // array. Filter out the duplicates before starting to erase from parent.
-  std::sort(ToRemove.begin(), ToRemove.end());
-  auto *NewLastIter = llvm::unique(ToRemove);
-  ToRemove.erase(NewLastIter, ToRemove.end());
-
-  for (Instruction *I : ToRemove)
-    I->eraseFromParent();
-
-  // Remove the __nvvm_reflect function from the module
-  F->eraseFromParent();
-  return ToRemove.size() > 0;
 }
 
 bool NVVMReflect::runOnModule(Module &M) {
@@ -250,15 +257,19 @@ bool NVVMReflect::runOnModule(Module &M) {
 
   bool Changed = false;
   // Names of reflect function to find and replace
-  SmallVector<std::string, 3> ReflectNames = {
-      NVVM_REFLECT_FUNCTION, NVVM_REFLECT_OCL_FUNCTION,
-      Intrinsic::getName(Intrinsic::nvvm_reflect).str()};
+  SmallVector<StringRef, 5> ReflectNames = {
+      NVVM_REFLECT_FUNCTION,
+      NVVM_REFLECT_OCL_FUNCTION,
+      Intrinsic::getName(Intrinsic::nvvm_reflect),
+  };
 
   // Process all reflect functions
-  for (const std::string &Name : ReflectNames) {
-    Function *ReflectFunction = M.getFunction(Name);
-    if (ReflectFunction) {
-      Changed |= handleReflectFunction(ReflectFunction);
+  for (StringRef Name : ReflectNames) {
+    if (Function *ReflectFunction = M.getFunction(Name)) {
+      // If the reflect function is called, we need to replace the call
+      // with the appropriate constant, modifying the IR.
+      Changed |= ReflectFunction->getNumUses() > 0;
+      handleReflectFunction(ReflectFunction);
     }
   }
 
@@ -269,7 +280,9 @@ bool NVVMReflect::runOnModule(Module &M) {
 NVVMReflectPass::NVVMReflectPass(unsigned SmVersion) {
   VarMap["__CUDA_ARCH"] = SmVersion * 10;
 }
-PreservedAnalyses NVVMReflectPass::run(Module &M, ModuleAnalysisManager &AM) {
+
+PreservedAnalyses NVVMReflectPass::run(Module &M,
+                                    ModuleAnalysisManager &AM) {
   return NVVMReflect(VarMap).runOnModule(M) ? PreservedAnalyses::none()
-                                            : PreservedAnalyses::all();
-}
+                                   : PreservedAnalyses::all();
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
new file mode 100644
index 0000000000000..54087897d65b5
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
@@ -0,0 +1,26 @@
+; Verify that when passing in command-line options to NVVMReflect, that reflect calls are replaced with
+; the appropriate command line values.
+
+declare i32 @__nvvm_reflect(ptr)
+ at ftz = private unnamed_addr addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
+ at arch = private unnamed_addr addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00"
+
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add=__CUDA_FTZ=1,__CUDA_ARCH=350 %s -S | FileCheck %s --check-prefix=CHECK-FTZ1-ARCH350
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add=__CUDA_FTZ=0 -nvvm-reflect-add=__CUDA_ARCH=520 %s -S | FileCheck %s --check-prefix=CHECK-FTZ0-ARCH520
+
+; Verify that if we have module metadata that sets __CUDA_FTZ=1, that gets overridden by the command line arguments
+
+; RUN: cat %s > %t.options
+; RUN: echo '!llvm.module.flags = !{!0}' >> %t.options
+; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.options
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add=__CUDA_FTZ=0,__CUDA_ARCH=520 %t.options -S | FileCheck %s --check-prefix=CHECK-FTZ0-ARCH520
+
+define i32 @options() {
+  %1 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @ftz to ptr))
+  %2 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @arch to ptr))
+  %3 = add i32 %1, %2
+  ret i32 %3
+}
+
+; CHECK-FTZ1-ARCH350: ret i32 351
+; CHECK-FTZ0-ARCH520: ret i32 520
\ No newline at end of file

>From 78232a8ffc4fe28f921170ddcb040601f2cc858b Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 7 Apr 2025 20:07:23 +0000
Subject: [PATCH 05/38] comment move

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 35546d05ab5be..859b63c54bebd 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -56,9 +56,6 @@ namespace {
 class NVVMReflect : public ModulePass {
 private:
   StringMap<int> VarMap;
-  /// Process a reflect function by finding all its uses and replacing them with
-  /// appropriate constant values. For __CUDA_FTZ, uses the module flag value.
-  /// For __CUDA_ARCH, uses SmVersion * 10. For all other strings, uses 0.
   void handleReflectFunction(Function *F);
   void setVarMap(Module &M);
   void foldReflectCall(CallInst *Call, Constant *NewValue);
@@ -135,6 +132,9 @@ void NVVMReflect::setVarMap(Module &M) {
   }
 }
 
+/// Process a reflect function by finding all its uses and replacing them with
+/// appropriate constant values. For __CUDA_FTZ, uses the module flag value.
+/// For __CUDA_ARCH, uses SmVersion * 10. For all other strings, uses 0.
 void NVVMReflect::handleReflectFunction(Function *F) {
   // Validate _reflect function
   assert(F->isDeclaration() && "_reflect function should not have a body");

>From 7385210ce4dd110f293d5fd74b3980c694e123eb Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 7 Apr 2025 20:11:13 +0000
Subject: [PATCH 06/38] auto

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 859b63c54bebd..b39131de4b7bf 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -116,7 +116,7 @@ void NVVMReflect::setVarMap(Module &M) {
   for (StringRef Option : ReflectList) {
     LLVM_DEBUG(dbgs() << "ReflectOption : " << Option << "\n");
     while (!Option.empty()) {
-      std::pair<StringRef, StringRef> Split = Option.split(',');
+      auto Split = Option.split(',');
       StringRef NameVal = Split.first;
       Option = Split.second;
 

>From 453c62981c54f9a5fbd46324cebb17c90963ab07 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 7 Apr 2025 20:17:49 +0000
Subject: [PATCH 07/38] remove nvidia copyright

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index b39131de4b7bf..0915456c3915b 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -4,12 +4,6 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// NVIDIA_COPYRIGHT_BEGIN
-//
-// Copyright (c) 2023-2025, NVIDIA CORPORATION.  All rights reserved.
-//
-// NVIDIA_COPYRIGHT_END
-//
 //===----------------------------------------------------------------------===//
 //
 // This pass replaces occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect

>From 8e95772bac5cfcbd8da612d09c2f71b17722fa58 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 7 Apr 2025 20:22:45 +0000
Subject: [PATCH 08/38] improve error messages

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 34 ++++++++++++++++-----------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 0915456c3915b..2533c1cbee25d 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -159,12 +159,12 @@ void NVVMReflect::handleReflectFunction(Function *F) {
   // to dig deeper to find its initializer with the string we'll use for lookup.
 
   for (User *U : make_early_inc_range(F->users())) {
-    assert(isa<CallInst>(U) && "Only a call instruction can use _reflect");
+    if (!isa<CallInst>(U))
+      report_fatal_error("__nvvm_reflect can only be used in a call instruction");
     CallInst *Call = cast<CallInst>(U);
 
-    // FIXME: Improve error handling here and elsewhere in this pass.
-    assert(Call->getNumOperands() == 2 &&
-           "Wrong number of operands to __nvvm_reflect function");
+    if (Call->getNumOperands() != 2)
+      report_fatal_error("__nvvm_reflect requires exactly one argument");
 
     // In cuda 6.5 and earlier, we will have an extra constant-to-generic
     // conversion of the string.
@@ -172,34 +172,40 @@ void NVVMReflect::handleReflectFunction(Function *F) {
     if (const CallInst *ConvCall = dyn_cast<CallInst>(Str)) {
       // Verify this is the constant-to-generic intrinsic
       Function *Callee = ConvCall->getCalledFunction();
-      assert(Callee && Callee->isIntrinsic() && 
-             Callee->getName().starts_with("llvm.nvvm.ptr.constant.to.gen") &&
-             "Expected llvm.nvvm.ptr.constant.to.gen intrinsic");
-      assert(ConvCall->getNumOperands() == 2 && "Expected one argument for ptr conversion");
+      if (!Callee || !Callee->isIntrinsic() || 
+          !Callee->getName().starts_with("llvm.nvvm.ptr.constant.to.gen"))
+        report_fatal_error("Expected llvm.nvvm.ptr.constant.to.gen intrinsic");
+      if (ConvCall->getNumOperands() != 2)
+        report_fatal_error("Expected one argument for ptr conversion");
       Str = ConvCall->getArgOperand(0);
     }
     // Pre opaque pointers we have a constant expression wrapping the constant
     Str = Str->stripPointerCasts();
-    assert(isa<Constant>(Str) && "Format of __nvvm_reflect function not recognized");
+    if (!isa<Constant>(Str))
+      report_fatal_error("__nvvm_reflect argument must be a constant string");
 
     const Value *Operand = cast<Constant>(Str)->getOperand(0);
     if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Operand)) {
       // For CUDA-7.0 style __nvvm_reflect calls, we need to find the operand's
       // initializer.
-      assert(GV->hasInitializer() && "Format of _reflect function not recognized");
+      if (!GV->hasInitializer())
+        report_fatal_error("__nvvm_reflect string must have an initializer");
       const Constant *Initializer = GV->getInitializer();
       Operand = Initializer;
     }
 
-    assert(isa<ConstantDataSequential>(Operand) &&
-           "Format of _reflect function not recognized");
-    assert(cast<ConstantDataSequential>(Operand)->isCString() &&
-           "Format of _reflect function not recognized");
+    if (!isa<ConstantDataSequential>(Operand))
+      report_fatal_error("__nvvm_reflect argument must be a string constant");
+    if (!cast<ConstantDataSequential>(Operand)->isCString())
+      report_fatal_error("__nvvm_reflect argument must be a null-terminated string");
 
     StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString();
     // Remove the null terminator from the string
     ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
 
+    if (ReflectArg.empty())
+      report_fatal_error("__nvvm_reflect argument cannot be empty");
+
     int ReflectVal = 0; // The default value is 0
     if (VarMap.contains(ReflectArg)) {
       ReflectVal = VarMap[ReflectArg];

>From d9a0ec6733c8f442c638087b4b1099ef4317433a Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 7 Apr 2025 21:10:36 +0000
Subject: [PATCH 09/38] fix command line options

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp         | 32 +++++++------------
 .../CodeGen/NVPTX/nvvm-reflect-options.ll     |  6 ++--
 2 files changed, 14 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 2533c1cbee25d..2cfd74303a821 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -86,7 +86,8 @@ INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
 // are the last to be added to the VarMap, and therefore will take precedence over initial
// values (i.e. __CUDA_FTZ from module metadata and __CUDA_ARCH from SmVersion).
 static cl::list<std::string>
-ReflectList("nvvm-reflect-add", cl::value_desc("name=<int>"), cl::Hidden,
+ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"), cl::Hidden,
+            cl::CommaSeparated,
             cl::desc("list of comma-separated key=value pairs"),
             cl::ValueRequired);
 
@@ -101,28 +102,17 @@ void NVVMReflect::setVarMap(Module &M) {
       M.getModuleFlag("nvvm-reflect-ftz")))
     VarMap["__CUDA_FTZ"] = Flag->getSExtValue();
 
-  /// The command line can look as follows :
-  /// -nvvm-reflect-add a=1,b=2 -nvvm-reflect-add c=3,d=0 -nvvm-reflect-add e=2
-  /// The strings "a=1,b=2", "c=3,d=0", "e=2" are available in the
-  /// ReflectList vector. First, each of ReflectList[i] is 'split'
-  /// using "," as the delimiter. Then each of this part is split
-  /// using "=" as the delimiter.
   for (StringRef Option : ReflectList) {
     LLVM_DEBUG(dbgs() << "ReflectOption : " << Option << "\n");
-    while (!Option.empty()) {
-      auto Split = Option.split(',');
-      StringRef NameVal = Split.first;
-      Option = Split.second;
-
-      auto NameValPair = NameVal.split('=');
-      assert(!NameValPair.first.empty() && !NameValPair.second.empty() && 
-             "name=val expected");
-      
-      int Val;
-      if (!to_integer(NameValPair.second.trim(), Val, 10))
-        report_fatal_error("integer value expected");
-      VarMap[NameValPair.first] = Val;
-    }
+    auto [Name, Val] = Option.split('=');
+    if (Name.empty())
+      report_fatal_error("Empty name in nvvm-reflect-list option '" + Option + "'");
+    if (Val.empty()) 
+      report_fatal_error("Missing value in nvvm-reflect-list option '" + Option + "'");
+    int ValInt;
+    if (!to_integer(Val.trim(), ValInt, 10))
+      report_fatal_error("integer value expected in nvvm-reflect-list option '" + Option + "'");
+    VarMap[Name] = ValInt;
   }
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
index 54087897d65b5..bd4fb3eb537d0 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
@@ -5,15 +5,15 @@ declare i32 @__nvvm_reflect(ptr)
 @ftz = private unnamed_addr addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
 @arch = private unnamed_addr addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00"
 
-; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add=__CUDA_FTZ=1,__CUDA_ARCH=350 %s -S | FileCheck %s --check-prefix=CHECK-FTZ1-ARCH350
-; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add=__CUDA_FTZ=0 -nvvm-reflect-add=__CUDA_ARCH=520 %s -S | FileCheck %s --check-prefix=CHECK-FTZ0-ARCH520
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-list=__CUDA_FTZ=1,__CUDA_ARCH=350 %s -S | FileCheck %s --check-prefix=CHECK-FTZ1-ARCH350
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-list=__CUDA_FTZ=0 -nvvm-reflect-list=__CUDA_ARCH=520 %s -S | FileCheck %s --check-prefix=CHECK-FTZ0-ARCH520
 
 ; Verify that if we have module metadata that sets __CUDA_FTZ=1, that gets overridden by the command line arguments
 
 ; RUN: cat %s > %t.options
 ; RUN: echo '!llvm.module.flags = !{!0}' >> %t.options
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.options
-; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add=__CUDA_FTZ=0,__CUDA_ARCH=520 %t.options -S | FileCheck %s --check-prefix=CHECK-FTZ0-ARCH520
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-list=__CUDA_FTZ=0,__CUDA_ARCH=520 %t.options -S | FileCheck %s --check-prefix=CHECK-FTZ0-ARCH520
 
 define i32 @options() {
   %1 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @ftz to ptr))

>From f5444f388f3de1869ed4681260265829485c366c Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 7 Apr 2025 21:57:26 +0000
Subject: [PATCH 10/38] fix command line options

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp           | 9 ++-------
 llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll | 6 +++---
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 2cfd74303a821..b1e01a1e3fe71 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -86,18 +86,13 @@ INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
 // are the last to be added to the VarMap, and therefore will take precedence over initial
// values (i.e. __CUDA_FTZ from module metadata and __CUDA_ARCH from SmVersion).
 static cl::list<std::string>
-ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"), cl::Hidden,
-            cl::CommaSeparated,
-            cl::desc("list of comma-separated key=value pairs"),
+ReflectList("nvvm-reflect-add", cl::value_desc("name=<int>"), cl::Hidden,
+            cl::desc("A key=value pair. Replace __nvvm_reflect(name) with value."),
             cl::ValueRequired);
 
 // Set the VarMap with, first, the value of __CUDA_FTZ from module metadata, and then
 // the key/value pairs from the command line.
 void NVVMReflect::setVarMap(Module &M) {
-  LLVM_DEBUG(dbgs() << "Reflect list values:\n");
-  for (StringRef Option : ReflectList) {
-    LLVM_DEBUG(dbgs() << "  " << Option << "\n");
-  }
   if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
       M.getModuleFlag("nvvm-reflect-ftz")))
     VarMap["__CUDA_FTZ"] = Flag->getSExtValue();
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
index bd4fb3eb537d0..fae48e554383b 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
@@ -5,15 +5,15 @@ declare i32 @__nvvm_reflect(ptr)
 @ftz = private unnamed_addr addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
 @arch = private unnamed_addr addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00"
 
-; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-list=__CUDA_FTZ=1,__CUDA_ARCH=350 %s -S | FileCheck %s --check-prefix=CHECK-FTZ1-ARCH350
-; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-list=__CUDA_FTZ=0 -nvvm-reflect-list=__CUDA_ARCH=520 %s -S | FileCheck %s --check-prefix=CHECK-FTZ0-ARCH520
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add __CUDA_FTZ=1 -nvvm-reflect-add __CUDA_ARCH=350 %s -S | FileCheck %s --check-prefix=CHECK-FTZ1-ARCH350
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 %s -S | FileCheck %s --check-prefix=CHECK-FTZ0-ARCH520
 
 ; Verify that if we have module metadata that sets __CUDA_FTZ=1, that gets overridden by the command line arguments
 
 ; RUN: cat %s > %t.options
 ; RUN: echo '!llvm.module.flags = !{!0}' >> %t.options
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.options
-; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-list=__CUDA_FTZ=0,__CUDA_ARCH=520 %t.options -S | FileCheck %s --check-prefix=CHECK-FTZ0-ARCH520
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 %t.options -S | FileCheck %s --check-prefix=CHECK-FTZ0-ARCH520
 
 define i32 @options() {
   %1 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @ftz to ptr))

>From 6fd4fa930ff2e29e79a333d9d5106a1bcf2d394e Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 8 Apr 2025 22:05:03 +0000
Subject: [PATCH 11/38] final reflect cleanup

---
 llvm/lib/Target/NVPTX/NVPTX.h                |   6 +-
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp |  23 ++-
 llvm/lib/Target/NVPTX/NVVMReflect.cpp        | 194 +++++++------------
 3 files changed, 99 insertions(+), 124 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index f98ace3a0d189..93b31f2a4c01d 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -78,12 +78,12 @@ struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
 };
 
 struct NVVMReflectPass : PassInfoMixin<NVVMReflectPass> {
-  NVVMReflectPass() : NVVMReflectPass(0) {}
-  NVVMReflectPass(unsigned SmVersion);
+  NVVMReflectPass() : SmVersion(0) {}
+  NVVMReflectPass(unsigned SmVersion) : SmVersion(SmVersion) {}
   PreservedAnalyses run(Module &F, ModuleAnalysisManager &AM);
 
 private:
-  StringMap<int> VarMap;
+  unsigned SmVersion;
 };
 
 struct GenericToNVVMPass : PassInfoMixin<GenericToNVVMPass> {
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index d4376ef87b1f1..8bc94846337cf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -87,6 +87,27 @@ static cl::opt<bool> EarlyByValArgsCopy(
     cl::desc("Create a copy of byval function arguments early."),
     cl::init(false), cl::Hidden);
 
+namespace llvm {
+
+void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
+void initializeNVPTXAllocaHoistingPass(PassRegistry &);
+void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
+void initializeNVPTXAtomicLowerPass(PassRegistry &);
+void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
+void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
+void initializeNVPTXLowerAllocaPass(PassRegistry &);
+void initializeNVPTXLowerUnreachablePass(PassRegistry &);
+void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
+void initializeNVPTXLowerArgsLegacyPassPass(PassRegistry &);
+void initializeNVPTXProxyRegErasurePass(PassRegistry &);
+void initializeNVPTXForwardParamsPassPass(PassRegistry &);
+void initializeNVVMIntrRangePass(PassRegistry &);
+void initializeNVVMReflectLegacyPassPass(PassRegistry &);
+void initializeNVPTXAAWrapperPassPass(PassRegistry &);
+void initializeNVPTXExternalAAWrapperPass(PassRegistry &);
+
+} // end namespace llvm
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
   // Register the target.
   RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
@@ -95,7 +116,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   // FIXME: This pass is really intended to be invoked during IR optimization,
   // but it's very NVPTX-specific.
-  initializeNVVMReflectPass(PR);
+  initializeNVVMReflectLegacyPassPass(PR);
   initializeNVVMIntrRangePass(PR);
   initializeGenericToNVVMLegacyPassPass(PR);
   initializeNVPTXAllocaHoistingPass(PR);
diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index b1e01a1e3fe71..f0c80a9f8bd3d 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -41,44 +41,60 @@
 #include "llvm/ADT/StringExtras.h"
 #define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
 #define NVVM_REFLECT_OCL_FUNCTION "__nvvm_reflect_ocl"
+// Argument of reflect call to retrieve arch number
+#define CUDA_ARCH_NAME "__CUDA_ARCH"
+// Argument of reflect call to retrieve ftz mode
+#define CUDA_FTZ_NAME "__CUDA_FTZ"
+// Name of module metadata where ftz mode is stored
+#define CUDA_FTZ_MODULE_NAME "nvvm-reflect-ftz"
 
 using namespace llvm;
 
 #define DEBUG_TYPE "nvvm-reflect"
 
+namespace llvm {
+void initializeNVVMReflectLegacyPassPass(PassRegistry &);
+}
+
 namespace {
-class NVVMReflect : public ModulePass {
+class NVVMReflect {
 private:
-  StringMap<int> VarMap;
-  void handleReflectFunction(Function *F);
-  void setVarMap(Module &M);
+  // Map from reflect function call arguments to the value to replace the call with.
+  // Should include __CUDA_FTZ and __CUDA_ARCH values.
+  StringMap<int> ReflectMap;
+  bool handleReflectFunction(Module &M, StringRef ReflectName);
+  void populateReflectMap(Module &M);
   void foldReflectCall(CallInst *Call, Constant *NewValue);
 public:
-  static char ID;
-  NVVMReflect() : NVVMReflect(0) {}
   // __CUDA_FTZ is assigned in `runOnModule` by checking nvvm-reflect-ftz module
   // metadata.
-  explicit NVVMReflect(unsigned SmVersion) : ModulePass(ID), VarMap({{"__CUDA_ARCH", SmVersion * 10}}) {
-    initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
-  }
-  // This mapping will contain should include __CUDA_FTZ and __CUDA_ARCH values.
-  explicit NVVMReflect(const StringMap<int> &Mapping) : ModulePass(ID), VarMap(Mapping) {
-    initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+  explicit NVVMReflect(unsigned SmVersion) : ReflectMap({{CUDA_ARCH_NAME, SmVersion * 10}}) {}
+  bool runOnModule(Module &M);
+};
+} // namespace
+
+class NVVMReflectLegacyPass : public ModulePass {
+private:
+  NVVMReflect Impl;
+public:
+  static char ID;
+    NVVMReflectLegacyPass(unsigned SmVersion) : ModulePass(ID), Impl(SmVersion) {
+    initializeNVVMReflectLegacyPassPass(*PassRegistry::getPassRegistry());
   }
   bool runOnModule(Module &M) override;
 };
-} // namespace
 
 ModulePass *llvm::createNVVMReflectPass(unsigned int SmVersion) {
-  return new NVVMReflect(SmVersion);
+  LLVM_DEBUG(dbgs() << "Creating NVVMReflectPass with SM version " << SmVersion << "\n");
+  return new NVVMReflectLegacyPass(SmVersion);
 }
 
 static cl::opt<bool>
     NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden,
                        cl::desc("NVVM reflection, enabled by default"));
 
-char NVVMReflect::ID = 0;
-INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
+char NVVMReflectLegacyPass::ID = 0;
+INITIALIZE_PASS(NVVMReflectLegacyPass, "nvvm-reflect",
                 "Replace occurrences of __nvvm_reflect() calls with 0/1", false,
                 false)
 
@@ -92,10 +108,10 @@ ReflectList("nvvm-reflect-add", cl::value_desc("name=<int>"), cl::Hidden,
 
 // Set the VarMap with, first, the value of __CUDA_FTZ from module metadata, and then
 // the key/value pairs from the command line.
-void NVVMReflect::setVarMap(Module &M) {
+void NVVMReflect::populateReflectMap(Module &M) {
   if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
-      M.getModuleFlag("nvvm-reflect-ftz")))
-    VarMap["__CUDA_FTZ"] = Flag->getSExtValue();
+      M.getModuleFlag(CUDA_FTZ_MODULE_NAME)))
+    ReflectMap[CUDA_FTZ_NAME] = Flag->getSExtValue();
 
   for (StringRef Option : ReflectList) {
     LLVM_DEBUG(dbgs() << "ReflectOption : " << Option << "\n");
@@ -107,94 +123,52 @@ void NVVMReflect::setVarMap(Module &M) {
     int ValInt;
     if (!to_integer(Val.trim(), ValInt, 10))
       report_fatal_error("integer value expected in nvvm-reflect-list option '" + Option + "'");
-    VarMap[Name] = ValInt;
+    ReflectMap[Name] = ValInt;
   }
 }
 
-/// Process a reflect function by finding all its uses and replacing them with
+/// Process a reflect function by finding all its calls and replacing them with
 /// appropriate constant values. For __CUDA_FTZ, uses the module flag value.
 /// For __CUDA_ARCH, uses SmVersion * 10. For all other strings, uses 0.
-void NVVMReflect::handleReflectFunction(Function *F) {
-  // Validate _reflect function
+bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
+  Function *F = M.getFunction(ReflectName);
+  if (!F)
+    return false;
   assert(F->isDeclaration() && "_reflect function should not have a body");
   assert(F->getReturnType()->isIntegerTy() && "_reflect's return type should be integer");
 
-
-  // Go through the uses of the reflect function. Each use should be a CallInst
-  // with a ConstantArray argument. Replace the uses with the appropriate constant values.
-
-  // The IR for __nvvm_reflect calls differs between CUDA versions.
-  //
-  // CUDA 6.5 and earlier uses this sequence:
-  //    %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8
-  //        (i8 addrspace(4)* getelementptr inbounds
-  //           ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0))
-  //    %reflect = tail call i32 @__nvvm_reflect(i8* %ptr)
-  //
-  // The value returned by Sym->getOperand(0) is a Constant with a
-  // ConstantDataSequential operand which can be converted to string and used
-  // for lookup.
-  //
-  // CUDA 7.0 does it slightly differently:
-  //   %reflect = call i32 @__nvvm_reflect(i8* addrspacecast
-  //        (i8 addrspace(1)* getelementptr inbounds
-  //           ([8 x i8], [8 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*))
-  //
-  // In this case, we get a Constant with a GlobalVariable operand and we need
-  // to dig deeper to find its initializer with the string we'll use for lookup.
-
+  bool Changed = F->getNumUses() > 0;
   for (User *U : make_early_inc_range(F->users())) {
+    // Reflect function calls look like:
+    // @arch = private unnamed_addr addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00"
+    // call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @arch to ptr))
+    // We need to extract the string argument from the call (i.e. "__CUDA_ARCH")
     if (!isa<CallInst>(U))
       report_fatal_error("__nvvm_reflect can only be used in a call instruction");
     CallInst *Call = cast<CallInst>(U);
-
     if (Call->getNumOperands() != 2)
       report_fatal_error("__nvvm_reflect requires exactly one argument");
 
-    // In cuda 6.5 and earlier, we will have an extra constant-to-generic
-    // conversion of the string.
-    const Value *Str = Call->getArgOperand(0);
-    if (const CallInst *ConvCall = dyn_cast<CallInst>(Str)) {
-      // Verify this is the constant-to-generic intrinsic
-      Function *Callee = ConvCall->getCalledFunction();
-      if (!Callee || !Callee->isIntrinsic() || 
-          !Callee->getName().starts_with("llvm.nvvm.ptr.constant.to.gen"))
-        report_fatal_error("Expected llvm.nvvm.ptr.constant.to.gen intrinsic");
-      if (ConvCall->getNumOperands() != 2)
-        report_fatal_error("Expected one argument for ptr conversion");
-      Str = ConvCall->getArgOperand(0);
-    }
-    // Pre opaque pointers we have a constant expression wrapping the constant
-    Str = Str->stripPointerCasts();
-    if (!isa<Constant>(Str))
+    const Value *GlobalStr = Call->getArgOperand(0)->stripPointerCasts();
+    if (!isa<Constant>(GlobalStr))
       report_fatal_error("__nvvm_reflect argument must be a constant string");
 
-    const Value *Operand = cast<Constant>(Str)->getOperand(0);
-    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Operand)) {
-      // For CUDA-7.0 style __nvvm_reflect calls, we need to find the operand's
-      // initializer.
-      if (!GV->hasInitializer())
-        report_fatal_error("__nvvm_reflect string must have an initializer");
-      const Constant *Initializer = GV->getInitializer();
-      Operand = Initializer;
-    }
-
-    if (!isa<ConstantDataSequential>(Operand))
+    const Value *ConstantStr = cast<Constant>(GlobalStr)->getOperand(0);
+    if (!isa<ConstantDataSequential>(ConstantStr))
       report_fatal_error("__nvvm_reflect argument must be a string constant");
-    if (!cast<ConstantDataSequential>(Operand)->isCString())
+    if (!cast<ConstantDataSequential>(ConstantStr)->isCString())
       report_fatal_error("__nvvm_reflect argument must be a null-terminated string");
 
-    StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString();
+    StringRef ReflectArg = cast<ConstantDataSequential>(ConstantStr)->getAsString();
     // Remove the null terminator from the string
     ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
-
     if (ReflectArg.empty())
       report_fatal_error("__nvvm_reflect argument cannot be empty");
-
+    // Now that we have extracted the string argument, we can look it up in the ReflectMap
     int ReflectVal = 0; // The default value is 0
-    if (VarMap.contains(ReflectArg)) {
-      ReflectVal = VarMap[ReflectArg];
-    }
+    if (ReflectMap.contains(ReflectArg))
+      ReflectVal = ReflectMap[ReflectArg];
+
     LLVM_DEBUG(dbgs() << "Replacing call of reflect function " << F->getName() << "(" << ReflectArg << ") with value " << ReflectVal << "\n");
     Constant *NewValue = ConstantInt::get(Call->getType(), ReflectVal);
     foldReflectCall(Call, NewValue);
@@ -203,29 +177,26 @@ void NVVMReflect::handleReflectFunction(Function *F) {
 
   // Remove the __nvvm_reflect function from the module
   F->eraseFromParent();
+  return Changed;
 }
 
 void NVVMReflect::foldReflectCall(CallInst *Call, Constant *NewValue) {
-  // Initialize worklist with all users of the call
   SmallVector<Instruction*, 8> Worklist;
-  for (User *U : Call->users()) {
-    if (Instruction *I = dyn_cast<Instruction>(U)) {
-      Worklist.push_back(I);
+  // Replace an instruction with a constant and add all users of the instruction to the worklist
+  auto ReplaceInstructionWithConst = [&](Instruction *I, Constant *C) {
+    for (User *U : I->users()) {
+      if (Instruction *UI = dyn_cast<Instruction>(U))
+        Worklist.push_back(UI);
     }
-  }
+    I->replaceAllUsesWith(C);
+  };
 
-  Call->replaceAllUsesWith(NewValue);
+  ReplaceInstructionWithConst(Call, NewValue);
 
   while (!Worklist.empty()) {
     Instruction *I = Worklist.pop_back_val();
     if (Constant *C = ConstantFoldInstruction(I, Call->getModule()->getDataLayout())) {
-      // Add all users of this instruction to the worklist, replace it with the constant
-      // then delete it if it's dead
-      for (User *U : I->users()) {
-        if (Instruction *UI = dyn_cast<Instruction>(U))
-          Worklist.push_back(UI);
-      }
-      I->replaceAllUsesWith(C);
+      ReplaceInstructionWithConst(I, C);
       if (isInstructionTriviallyDead(I))
         I->eraseFromParent();
     } else if (I->isTerminator()) {
@@ -237,37 +208,20 @@ void NVVMReflect::foldReflectCall(CallInst *Call, Constant *NewValue) {
 bool NVVMReflect::runOnModule(Module &M) {
   if (!NVVMReflectEnabled)
     return false;
-
-  setVarMap(M);
-
-  bool Changed = false;
-  // Names of reflect function to find and replace
-  SmallVector<StringRef, 5> ReflectNames = {
-      NVVM_REFLECT_FUNCTION,
-      NVVM_REFLECT_OCL_FUNCTION,
-      Intrinsic::getName(Intrinsic::nvvm_reflect),
-  };
-
-  // Process all reflect functions
-  for (StringRef Name : ReflectNames) {
-    if (Function *ReflectFunction = M.getFunction(Name)) {
-      // If the reflect functition is called, we need to replace the call
-      // with the appropriate constant, modifying the IR.
-      Changed |= ReflectFunction->getNumUses() > 0;
-      handleReflectFunction(ReflectFunction);
-    }
-  }
-
+  populateReflectMap(M);
+  bool Changed = true;
+  handleReflectFunction(M, NVVM_REFLECT_FUNCTION);
+  handleReflectFunction(M, NVVM_REFLECT_OCL_FUNCTION);
+  handleReflectFunction(M, Intrinsic::getName(Intrinsic::nvvm_reflect));
   return Changed;
 }
 
-// Implementations for the pass that works with the new pass manager.
-NVVMReflectPass::NVVMReflectPass(unsigned SmVersion) {
-  VarMap["__CUDA_ARCH"] = SmVersion * 10;
+bool NVVMReflectLegacyPass::runOnModule(Module &M) {
+  return Impl.runOnModule(M);
 }
 
 PreservedAnalyses NVVMReflectPass::run(Module &M,
                                     ModuleAnalysisManager &AM) {
-  return NVVMReflect(VarMap).runOnModule(M) ? PreservedAnalyses::none()
+  return NVVMReflect(SmVersion).runOnModule(M) ? PreservedAnalyses::none()
                                    : PreservedAnalyses::all();
 }
\ No newline at end of file

>From cbf8664c870159f55ce372ccf64417c0a411850d Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 8 Apr 2025 22:22:43 +0000
Subject: [PATCH 12/38] final reflect cleanup

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index f0c80a9f8bd3d..3c22798e6504e 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -73,16 +73,16 @@ class NVVMReflect {
 };
 } // namespace
 
+namespace {
 class NVVMReflectLegacyPass : public ModulePass {
 private:
   NVVMReflect Impl;
 public:
   static char ID;
-    NVVMReflectLegacyPass(unsigned SmVersion) : ModulePass(ID), Impl(SmVersion) {
-    initializeNVVMReflectLegacyPassPass(*PassRegistry::getPassRegistry());
-  }
+    NVVMReflectLegacyPass(unsigned SmVersion) : ModulePass(ID), Impl(SmVersion) {}
   bool runOnModule(Module &M) override;
 };
+} // namespace
 
 ModulePass *llvm::createNVVMReflectPass(unsigned int SmVersion) {
   LLVM_DEBUG(dbgs() << "Creating NVVMReflectPass with SM version " << SmVersion << "\n");

>From 790b4bbd79f0ee5a1af5cb44b84315f2ef7792de Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 8 Apr 2025 22:29:40 +0000
Subject: [PATCH 13/38] clang format

---
 llvm/lib/Target/NVPTX/NVPTX.h                | 35 ++------
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 23 +----
 llvm/lib/Target/NVPTX/NVVMReflect.cpp        | 92 ++++++++++++--------
 3 files changed, 63 insertions(+), 87 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 93b31f2a4c01d..be1f9c380b680 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -27,14 +27,7 @@ class NVPTXTargetMachine;
 class PassRegistry;
 
 namespace NVPTXCC {
-enum CondCodes {
-  EQ,
-  NE,
-  LT,
-  LE,
-  GT,
-  GE
-};
+enum CondCodes { EQ, NE, LT, LE, GT, GE };
 }
 
 FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
@@ -55,6 +48,7 @@ MachineFunctionPass *createNVPTXPeephole();
 MachineFunctionPass *createNVPTXProxyRegErasurePass();
 MachineFunctionPass *createNVPTXForwardParamsPass();
 
+void initializeNVVMReflectLegacyPassPass(PassRegistry &);
 void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
 void initializeNVPTXAllocaHoistingPass(PassRegistry &);
 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
@@ -104,10 +98,7 @@ struct NVPTXLowerArgsPass : PassInfoMixin<NVPTXLowerArgsPass> {
 };
 
 namespace NVPTX {
-enum DrvInterface {
-  NVCL,
-  CUDA
-};
+enum DrvInterface { NVCL, CUDA };
 
 // A field inside TSFlags needs a shift and a mask. The usage is
 // always as follows :
@@ -129,10 +120,7 @@ enum VecInstType {
   VecOther = 15
 };
 
-enum SimpleMove {
-  SimpleMoveMask = 0x10,
-  SimpleMoveShift = 4
-};
+enum SimpleMove { SimpleMoveMask = 0x10, SimpleMoveShift = 4 };
 enum LoadStore {
   isLoadMask = 0x20,
   isLoadShift = 5,
@@ -181,17 +169,8 @@ enum AddressSpace : AddressSpaceUnderlyingType {
 };
 
 namespace PTXLdStInstCode {
-enum FromType {
-  Unsigned = 0,
-  Signed,
-  Float,
-  Untyped
-};
-enum VecType {
-  Scalar = 1,
-  V2 = 2,
-  V4 = 4
-};
+enum FromType { Unsigned = 0, Signed, Float, Untyped };
+enum VecType { Scalar = 1, V2 = 2, V4 = 4 };
 } // namespace PTXLdStInstCode
 
 /// PTXCvtMode - Conversion code enumeration
@@ -254,7 +233,7 @@ enum PrmtMode {
   RC16,
 };
 }
-}
+} // namespace NVPTX
 void initializeNVPTXDAGToDAGISelLegacyPass(PassRegistry &);
 } // namespace llvm
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 8bc94846337cf..a4c3b43aec9f2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -87,27 +87,6 @@ static cl::opt<bool> EarlyByValArgsCopy(
     cl::desc("Create a copy of byval function arguments early."),
     cl::init(false), cl::Hidden);
 
-namespace llvm {
-
-void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
-void initializeNVPTXAllocaHoistingPass(PassRegistry &);
-void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
-void initializeNVPTXAtomicLowerPass(PassRegistry &);
-void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
-void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
-void initializeNVPTXLowerAllocaPass(PassRegistry &);
-void initializeNVPTXLowerUnreachablePass(PassRegistry &);
-void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
-void initializeNVPTXLowerArgsLegacyPassPass(PassRegistry &);
-void initializeNVPTXProxyRegErasurePass(PassRegistry &);
-void initializeNVPTXForwardParamsPassPass(PassRegistry &);
-void initializeNVVMIntrRangePass(PassRegistry &);
-void initializeNVVMReflectLegacyPassPass(PassRegistry &);
-void initializeNVPTXAAWrapperPassPass(PassRegistry &);
-void initializeNVPTXExternalAAWrapperPass(PassRegistry &);
-
-} // end namespace llvm
-
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
   // Register the target.
   RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
@@ -265,7 +244,7 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         // has not provided a target architecture just yet.
         if (Subtarget.hasTargetName())
           PM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
-        
+
         FunctionPassManager FPM;
         // Note: NVVMIntrRangePass was causing numerical discrepancies at one
         // point, if issues crop up, consider disabling.
diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 3c22798e6504e..3ea6695c3d5f1 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -20,6 +20,7 @@
 
 #include "NVPTX.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/CodeGen/CommandFlags.h"
 #include "llvm/IR/Constants.h"
@@ -38,7 +39,6 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/StringExtras.h"
 #define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
 #define NVVM_REFLECT_OCL_FUNCTION "__nvvm_reflect_ocl"
 // Argument of reflect call to retrive arch number
@@ -54,21 +54,23 @@ using namespace llvm;
 
 namespace llvm {
 void initializeNVVMReflectLegacyPassPass(PassRegistry &);
-}
+} // namespace llvm
 
 namespace {
 class NVVMReflect {
 private:
-  // Map from reflect function call arguments to the value to replace the call with.
-  // Should include __CUDA_FTZ and __CUDA_ARCH values.
+  // Map from reflect function call arguments to the value to replace the call
+  // with. Should include __CUDA_FTZ and __CUDA_ARCH values.
   StringMap<int> ReflectMap;
   bool handleReflectFunction(Module &M, StringRef ReflectName);
   void populateReflectMap(Module &M);
   void foldReflectCall(CallInst *Call, Constant *NewValue);
+
 public:
   // __CUDA_FTZ is assigned in `runOnModule` by checking nvvm-reflect-ftz module
   // metadata.
-  explicit NVVMReflect(unsigned SmVersion) : ReflectMap({{CUDA_ARCH_NAME, SmVersion * 10}}) {}
+  explicit NVVMReflect(unsigned SmVersion)
+      : ReflectMap({{CUDA_ARCH_NAME, SmVersion * 10}}) {}
   bool runOnModule(Module &M);
 };
 } // namespace
@@ -77,15 +79,17 @@ namespace {
 class NVVMReflectLegacyPass : public ModulePass {
 private:
   NVVMReflect Impl;
+
 public:
   static char ID;
-    NVVMReflectLegacyPass(unsigned SmVersion) : ModulePass(ID), Impl(SmVersion) {}
+  NVVMReflectLegacyPass(unsigned SmVersion) : ModulePass(ID), Impl(SmVersion) {}
   bool runOnModule(Module &M) override;
 };
 } // namespace
 
 ModulePass *llvm::createNVVMReflectPass(unsigned int SmVersion) {
-  LLVM_DEBUG(dbgs() << "Creating NVVMReflectPass with SM version " << SmVersion << "\n");
+  LLVM_DEBUG(dbgs() << "Creating NVVMReflectPass with SM version " << SmVersion
+                    << "\n");
   return new NVVMReflectLegacyPass(SmVersion);
 }
 
@@ -98,31 +102,36 @@ INITIALIZE_PASS(NVVMReflectLegacyPass, "nvvm-reflect",
                 "Replace occurrences of __nvvm_reflect() calls with 0/1", false,
                 false)
 
-// Allow users to specify additional key/value pairs to reflect. These key/value pairs
-// are the last to be added to the VarMap, and therefore will take precedence over initial
-// values (i.e. __CUDA_FTZ from module medadata and __CUDA_ARCH from SmVersion).
-static cl::list<std::string>
-ReflectList("nvvm-reflect-add", cl::value_desc("name=<int>"), cl::Hidden,
-            cl::desc("A key=value pair. Replace __nvvm_reflect(name) with value."),
-            cl::ValueRequired);
-
-// Set the VarMap with, first, the value of __CUDA_FTZ from module metadata, and then
-// the key/value pairs from the command line.
+// Allow users to specify additional key/value pairs to reflect. These key/value
+// pairs are the last to be added to the VarMap, and therefore will take
+// precedence over initial values (i.e. __CUDA_FTZ from module medadata and
+// __CUDA_ARCH from SmVersion).
+static cl::list<std::string> ReflectList(
+    "nvvm-reflect-add", cl::value_desc("name=<int>"), cl::Hidden,
+    cl::desc("A key=value pair. Replace __nvvm_reflect(name) with value."),
+    cl::ValueRequired);
+
+// Set the VarMap with, first, the value of __CUDA_FTZ from module metadata, and
+// then the key/value pairs from the command line.
 void NVVMReflect::populateReflectMap(Module &M) {
   if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
-      M.getModuleFlag(CUDA_FTZ_MODULE_NAME)))
+          M.getModuleFlag(CUDA_FTZ_MODULE_NAME)))
     ReflectMap[CUDA_FTZ_NAME] = Flag->getSExtValue();
 
   for (StringRef Option : ReflectList) {
     LLVM_DEBUG(dbgs() << "ReflectOption : " << Option << "\n");
     auto [Name, Val] = Option.split('=');
     if (Name.empty())
-      report_fatal_error("Empty name in nvvm-reflect-list option '" + Option + "'");
-    if (Val.empty()) 
-      report_fatal_error("Missing value in nvvm-reflect- option '" + Option + "'");
+      report_fatal_error("Empty name in nvvm-reflect-list option '" + Option +
+                         "'");
+    if (Val.empty())
+      report_fatal_error("Missing value in nvvm-reflect- option '" + Option +
+                         "'");
     int ValInt;
     if (!to_integer(Val.trim(), ValInt, 10))
-      report_fatal_error("integer value expected in nvvm-reflect-list option '" + Option + "'");
+      report_fatal_error(
+          "integer value expected in nvvm-reflect-list option '" + Option +
+          "'");
     ReflectMap[Name] = ValInt;
   }
 }
@@ -135,16 +144,19 @@ bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
   if (!F)
     return false;
   assert(F->isDeclaration() && "_reflect function should not have a body");
-  assert(F->getReturnType()->isIntegerTy() && "_reflect's return type should be integer");
+  assert(F->getReturnType()->isIntegerTy() &&
+         "_reflect's return type should be integer");
 
   bool Changed = F->getNumUses() > 0;
   for (User *U : make_early_inc_range(F->users())) {
     // Reflect function calls look like:
-    // @arch = private unnamed_addr addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00"
-    // call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @arch to ptr))
-    // We need to extract the string argument from the call (i.e. "__CUDA_ARCH")
+    // @arch = private unnamed_addr addrspace(1) constant [12 x i8]
+    // c"__CUDA_ARCH\00" call i32 @__nvvm_reflect(ptr addrspacecast (ptr
+    // addrspace(1) @arch to ptr)) We need to extract the string argument from
+    // the call (i.e. "__CUDA_ARCH")
     if (!isa<CallInst>(U))
-      report_fatal_error("__nvvm_reflect can only be used in a call instruction");
+      report_fatal_error(
+          "__nvvm_reflect can only be used in a call instruction");
     CallInst *Call = cast<CallInst>(U);
     if (Call->getNumOperands() != 2)
       report_fatal_error("__nvvm_reflect requires exactly one argument");
@@ -157,19 +169,24 @@ bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
     if (!isa<ConstantDataSequential>(ConstantStr))
       report_fatal_error("__nvvm_reflect argument must be a string constant");
     if (!cast<ConstantDataSequential>(ConstantStr)->isCString())
-      report_fatal_error("__nvvm_reflect argument must be a null-terminated string");
+      report_fatal_error(
+          "__nvvm_reflect argument must be a null-terminated string");
 
-    StringRef ReflectArg = cast<ConstantDataSequential>(ConstantStr)->getAsString();
+    StringRef ReflectArg =
+        cast<ConstantDataSequential>(ConstantStr)->getAsString();
     // Remove the null terminator from the string
     ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
     if (ReflectArg.empty())
       report_fatal_error("__nvvm_reflect argument cannot be empty");
-    // Now that we have extracted the string argument, we can look it up in the VarMap
+    // Now that we have extracted the string argument, we can look it up in the
+    // VarMap
     int ReflectVal = 0; // The default value is 0
     if (ReflectMap.contains(ReflectArg))
       ReflectVal = ReflectMap[ReflectArg];
 
-    LLVM_DEBUG(dbgs() << "Replacing call of reflect function " << F->getName() << "(" << ReflectArg << ") with value " << ReflectVal << "\n");
+    LLVM_DEBUG(dbgs() << "Replacing call of reflect function " << F->getName()
+                      << "(" << ReflectArg << ") with value " << ReflectVal
+                      << "\n");
     Constant *NewValue = ConstantInt::get(Call->getType(), ReflectVal);
     foldReflectCall(Call, NewValue);
     Call->eraseFromParent();
@@ -181,8 +198,9 @@ bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
 }
 
 void NVVMReflect::foldReflectCall(CallInst *Call, Constant *NewValue) {
-  SmallVector<Instruction*, 8> Worklist;
-  // Replace an instruction with a constant and add all users of the instruction to the worklist
+  SmallVector<Instruction *, 8> Worklist;
+  // Replace an instruction with a constant and add all users of the instruction
+  // to the worklist
   auto ReplaceInstructionWithConst = [&](Instruction *I, Constant *C) {
     for (User *U : I->users()) {
       if (Instruction *UI = dyn_cast<Instruction>(U))
@@ -195,7 +213,8 @@ void NVVMReflect::foldReflectCall(CallInst *Call, Constant *NewValue) {
 
   while (!Worklist.empty()) {
     Instruction *I = Worklist.pop_back_val();
-    if (Constant *C = ConstantFoldInstruction(I, Call->getModule()->getDataLayout())) {
+    if (Constant *C =
+            ConstantFoldInstruction(I, Call->getModule()->getDataLayout())) {
       ReplaceInstructionWithConst(I, C);
       if (isInstructionTriviallyDead(I))
         I->eraseFromParent();
@@ -220,8 +239,7 @@ bool NVVMReflectLegacyPass::runOnModule(Module &M) {
   return Impl.runOnModule(M);
 }
 
-PreservedAnalyses NVVMReflectPass::run(Module &M,
-                                    ModuleAnalysisManager &AM) {
+PreservedAnalyses NVVMReflectPass::run(Module &M, ModuleAnalysisManager &AM) {
   return NVVMReflect(SmVersion).runOnModule(M) ? PreservedAnalyses::none()
-                                   : PreservedAnalyses::all();
+                                               : PreservedAnalyses::all();
 }
\ No newline at end of file

>From a7a42ffaeeebaa1e11b96a3e23f46323a2e10b7d Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 8 Apr 2025 22:37:12 +0000
Subject: [PATCH 14/38] forgot to set Changed variable

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 3ea6695c3d5f1..e79db8083abcc 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -229,9 +229,9 @@ bool NVVMReflect::runOnModule(Module &M) {
     return false;
   populateReflectMap(M);
   bool Changed = true;
-  handleReflectFunction(M, NVVM_REFLECT_FUNCTION);
-  handleReflectFunction(M, NVVM_REFLECT_OCL_FUNCTION);
-  handleReflectFunction(M, Intrinsic::getName(Intrinsic::nvvm_reflect));
+  Changed |= handleReflectFunction(M, NVVM_REFLECT_FUNCTION);
+  Changed |= handleReflectFunction(M, NVVM_REFLECT_OCL_FUNCTION);
+  Changed |= handleReflectFunction(M, Intrinsic::getName(Intrinsic::nvvm_reflect));
   return Changed;
 }
 

>From b600155691f613762609a17c6d231fe9faa6766e Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 8 Apr 2025 22:39:20 +0000
Subject: [PATCH 15/38] typo in comment

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index e79db8083abcc..4b23285ea79e9 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -179,7 +179,7 @@ bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
     if (ReflectArg.empty())
       report_fatal_error("__nvvm_reflect argument cannot be empty");
     // Now that we have extracted the string argument, we can look it up in the
-    // VarMap
+    // ReflectMap
     int ReflectVal = 0; // The default value is 0
     if (ReflectMap.contains(ReflectArg))
       ReflectVal = ReflectMap[ReflectArg];

>From 3d8ce91180ea3c2566fef35fce2336a9d801fd89 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 8 Apr 2025 23:29:43 +0000
Subject: [PATCH 16/38] final cleanup

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 4b23285ea79e9..f9f16aecd9a1f 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -58,7 +58,6 @@ void initializeNVVMReflectLegacyPassPass(PassRegistry &);
 
 namespace {
 class NVVMReflect {
-private:
   // Map from reflect function call arguments to the value to replace the call
   // with. Should include __CUDA_FTZ and __CUDA_ARCH values.
   StringMap<int> ReflectMap;
@@ -73,11 +72,8 @@ class NVVMReflect {
       : ReflectMap({{CUDA_ARCH_NAME, SmVersion * 10}}) {}
   bool runOnModule(Module &M);
 };
-} // namespace
 
-namespace {
 class NVVMReflectLegacyPass : public ModulePass {
-private:
   NVVMReflect Impl;
 
 public:
@@ -122,15 +118,15 @@ void NVVMReflect::populateReflectMap(Module &M) {
     LLVM_DEBUG(dbgs() << "ReflectOption : " << Option << "\n");
     auto [Name, Val] = Option.split('=');
     if (Name.empty())
-      report_fatal_error("Empty name in nvvm-reflect-list option '" + Option +
+      report_fatal_error("Empty name in nvvm-reflect-add option '" + Option +
                          "'");
     if (Val.empty())
-      report_fatal_error("Missing value in nvvm-reflect- option '" + Option +
+      report_fatal_error("Missing value in nvvm-reflect-add option '" + Option +
                          "'");
     int ValInt;
     if (!to_integer(Val.trim(), ValInt, 10))
       report_fatal_error(
-          "integer value expected in nvvm-reflect-list option '" + Option +
+          "integer value expected in nvvm-reflect-add option '" + Option +
           "'");
     ReflectMap[Name] = ValInt;
   }
@@ -154,10 +150,10 @@ bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
     // c"__CUDA_ARCH\00" call i32 @__nvvm_reflect(ptr addrspacecast (ptr
     // addrspace(1) @arch to ptr)) We need to extract the string argument from
     // the call (i.e. "__CUDA_ARCH")
-    if (!isa<CallInst>(U))
+    CallInst *Call = dyn_cast<CallInst>(U);
+    if (!Call)
       report_fatal_error(
           "__nvvm_reflect can only be used in a call instruction");
-    CallInst *Call = cast<CallInst>(U);
     if (Call->getNumOperands() != 2)
       report_fatal_error("__nvvm_reflect requires exactly one argument");
 
@@ -173,9 +169,7 @@ bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
           "__nvvm_reflect argument must be a null-terminated string");
 
     StringRef ReflectArg =
-        cast<ConstantDataSequential>(ConstantStr)->getAsString();
-    // Remove the null terminator from the string
-    ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
+        cast<ConstantDataSequential>(ConstantStr)->getAsString().drop_back();
     if (ReflectArg.empty())
       report_fatal_error("__nvvm_reflect argument cannot be empty");
     // Now that we have extracted the string argument, we can look it up in the
@@ -202,19 +196,18 @@ void NVVMReflect::foldReflectCall(CallInst *Call, Constant *NewValue) {
   // Replace an instruction with a constant and add all users of the instruction
   // to the worklist
   auto ReplaceInstructionWithConst = [&](Instruction *I, Constant *C) {
-    for (User *U : I->users()) {
+    for (User *U : I->users())
       if (Instruction *UI = dyn_cast<Instruction>(U))
         Worklist.push_back(UI);
-    }
     I->replaceAllUsesWith(C);
   };
 
   ReplaceInstructionWithConst(Call, NewValue);
 
+  auto DL = Call->getModule()->getDataLayout();
   while (!Worklist.empty()) {
     Instruction *I = Worklist.pop_back_val();
-    if (Constant *C =
-            ConstantFoldInstruction(I, Call->getModule()->getDataLayout())) {
+    if (Constant *C = ConstantFoldInstruction(I, DL)) {
       ReplaceInstructionWithConst(I, C);
       if (isInstructionTriviallyDead(I))
         I->eraseFromParent();

>From 2dd7e1ab80a8447825ff1e41141ed7b9bd0f2cca Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 8 Apr 2025 23:32:52 +0000
Subject: [PATCH 17/38] formatting

---
 llvm/lib/Target/NVPTX/NVPTX.h         | 25 ++++++++++++++++++++-----
 llvm/lib/Target/NVPTX/NVVMReflect.cpp |  8 ++++----
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index be1f9c380b680..9ce04e3a0db0c 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -98,7 +98,10 @@ struct NVPTXLowerArgsPass : PassInfoMixin<NVPTXLowerArgsPass> {
 };
 
 namespace NVPTX {
-enum DrvInterface { NVCL, CUDA };
+enum DrvInterface {
+  NVCL,
+  CUDA
+};
 
 // A field inside TSFlags needs a shift and a mask. The usage is
 // always as follows :
@@ -120,7 +123,10 @@ enum VecInstType {
   VecOther = 15
 };
 
-enum SimpleMove { SimpleMoveMask = 0x10, SimpleMoveShift = 4 };
+enum SimpleMove {
+  SimpleMoveMask = 0x10,
+  SimpleMoveShift = 4
+};
 enum LoadStore {
   isLoadMask = 0x20,
   isLoadShift = 5,
@@ -169,8 +175,17 @@ enum AddressSpace : AddressSpaceUnderlyingType {
 };
 
 namespace PTXLdStInstCode {
-enum FromType { Unsigned = 0, Signed, Float, Untyped };
-enum VecType { Scalar = 1, V2 = 2, V4 = 4 };
+enum FromType {
+  Unsigned = 0,
+  Signed,
+  Float,
+  Untyped
+};
+enum VecType {
+  Scalar = 1,
+  V2 = 2,
+  V4 = 4
+};
 } // namespace PTXLdStInstCode
 
 /// PTXCvtMode - Conversion code enumeration
@@ -233,7 +248,7 @@ enum PrmtMode {
   RC16,
 };
 }
-} // namespace NVPTX
+}
 void initializeNVPTXDAGToDAGISelLegacyPass(PassRegistry &);
 } // namespace llvm
 
diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index f9f16aecd9a1f..19d6a55bd44b2 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -125,9 +125,8 @@ void NVVMReflect::populateReflectMap(Module &M) {
                          "'");
     int ValInt;
     if (!to_integer(Val.trim(), ValInt, 10))
-      report_fatal_error(
-          "integer value expected in nvvm-reflect-add option '" + Option +
-          "'");
+      report_fatal_error("integer value expected in nvvm-reflect-add option '" +
+                         Option + "'");
     ReflectMap[Name] = ValInt;
   }
 }
@@ -224,7 +223,8 @@ bool NVVMReflect::runOnModule(Module &M) {
   bool Changed = true;
   Changed |= handleReflectFunction(M, NVVM_REFLECT_FUNCTION);
   Changed |= handleReflectFunction(M, NVVM_REFLECT_OCL_FUNCTION);
-  Changed |= handleReflectFunction(M, Intrinsic::getName(Intrinsic::nvvm_reflect));
+  Changed |=
+      handleReflectFunction(M, Intrinsic::getName(Intrinsic::nvvm_reflect));
   return Changed;
 }
 

>From 5301f9ad9705a6641893848e3f4f9ab74bf92693 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 8 Apr 2025 23:33:34 +0000
Subject: [PATCH 18/38] formatting

---
 llvm/lib/Target/NVPTX/NVPTX.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 9ce04e3a0db0c..98e77ca80b8d5 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -27,7 +27,14 @@ class NVPTXTargetMachine;
 class PassRegistry;
 
 namespace NVPTXCC {
-enum CondCodes { EQ, NE, LT, LE, GT, GE };
+enum CondCodes {
+  EQ,
+  NE,
+  LT,
+  LE,
+  GT,
+  GE
+};
 }
 
 FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,

>From 0a89dc063f8d4ce3faa7960b8a373adabea9e3f4 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 8 Apr 2025 23:36:51 +0000
Subject: [PATCH 19/38] int -> unsigned

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 19d6a55bd44b2..5d98489a6ffca 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -60,7 +60,7 @@ namespace {
 class NVVMReflect {
   // Map from reflect function call arguments to the value to replace the call
   // with. Should include __CUDA_FTZ and __CUDA_ARCH values.
-  StringMap<int> ReflectMap;
+  StringMap<unsigned> ReflectMap;
   bool handleReflectFunction(Module &M, StringRef ReflectName);
   void populateReflectMap(Module &M);
   void foldReflectCall(CallInst *Call, Constant *NewValue);
@@ -83,7 +83,7 @@ class NVVMReflectLegacyPass : public ModulePass {
 };
 } // namespace
 
-ModulePass *llvm::createNVVMReflectPass(unsigned int SmVersion) {
+ModulePass *llvm::createNVVMReflectPass(unsigned SmVersion) {
   LLVM_DEBUG(dbgs() << "Creating NVVMReflectPass with SM version " << SmVersion
                     << "\n");
   return new NVVMReflectLegacyPass(SmVersion);
@@ -123,7 +123,7 @@ void NVVMReflect::populateReflectMap(Module &M) {
     if (Val.empty())
       report_fatal_error("Missing value in nvvm-reflect-add option '" + Option +
                          "'");
-    int ValInt;
+    unsigned ValInt;
     if (!to_integer(Val.trim(), ValInt, 10))
       report_fatal_error("integer value expected in nvvm-reflect-add option '" +
                          Option + "'");
@@ -173,7 +173,7 @@ bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
       report_fatal_error("__nvvm_reflect argument cannot be empty");
     // Now that we have extracted the string argument, we can look it up in the
     // ReflectMap
-    int ReflectVal = 0; // The default value is 0
+    unsigned ReflectVal = 0; // The default value is 0
     if (ReflectMap.contains(ReflectArg))
       ReflectVal = ReflectMap[ReflectArg];
 

>From 966fcd9603be2dbe9a28dde6262fda0d74cae601 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 9 Apr 2025 00:28:32 +0000
Subject: [PATCH 20/38] const correct probably

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 72 +++++++++++++--------------
 1 file changed, 34 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 5d98489a6ffca..87d996ebf50c0 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -61,14 +61,14 @@ class NVVMReflect {
   // Map from reflect function call arguments to the value to replace the call
   // with. Should include __CUDA_FTZ and __CUDA_ARCH values.
   StringMap<unsigned> ReflectMap;
-  bool handleReflectFunction(Module &M, StringRef ReflectName);
+  bool handleReflectFunction(Module &M, const StringRef ReflectName);
   void populateReflectMap(Module &M);
-  void foldReflectCall(CallInst *Call, Constant *NewValue);
+  void foldReflectCall(CallInst *const Call, Constant *const NewValue);
 
 public:
   // __CUDA_FTZ is assigned in `runOnModule` by checking nvvm-reflect-ftz module
   // metadata.
-  explicit NVVMReflect(unsigned SmVersion)
+  explicit NVVMReflect(const unsigned SmVersion)
       : ReflectMap({{CUDA_ARCH_NAME, SmVersion * 10}}) {}
   bool runOnModule(Module &M);
 };
@@ -78,12 +78,12 @@ class NVVMReflectLegacyPass : public ModulePass {
 
 public:
   static char ID;
-  NVVMReflectLegacyPass(unsigned SmVersion) : ModulePass(ID), Impl(SmVersion) {}
+  NVVMReflectLegacyPass(const unsigned SmVersion) : ModulePass(ID), Impl(SmVersion) {}
   bool runOnModule(Module &M) override;
 };
 } // namespace
 
-ModulePass *llvm::createNVVMReflectPass(unsigned SmVersion) {
+ModulePass *llvm::createNVVMReflectPass(const unsigned SmVersion) {
   LLVM_DEBUG(dbgs() << "Creating NVVMReflectPass with SM version " << SmVersion
                     << "\n");
   return new NVVMReflectLegacyPass(SmVersion);
@@ -110,23 +110,21 @@ static cl::list<std::string> ReflectList(
 // Set the VarMap with, first, the value of __CUDA_FTZ from module metadata, and
 // then the key/value pairs from the command line.
 void NVVMReflect::populateReflectMap(Module &M) {
-  if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
+  if (const auto *const Flag = mdconst::extract_or_null<ConstantInt>(
           M.getModuleFlag(CUDA_FTZ_MODULE_NAME)))
     ReflectMap[CUDA_FTZ_NAME] = Flag->getSExtValue();
 
-  for (StringRef Option : ReflectList) {
+  for (const auto &Option : ReflectList) {
     LLVM_DEBUG(dbgs() << "ReflectOption : " << Option << "\n");
-    auto [Name, Val] = Option.split('=');
+    StringRef OptionRef(Option);
+    auto [Name, Val] = OptionRef.split('=');
     if (Name.empty())
-      report_fatal_error("Empty name in nvvm-reflect-add option '" + Option +
-                         "'");
+      report_fatal_error(Twine("Empty name in nvvm-reflect-add option '") + Option + "'");
     if (Val.empty())
-      report_fatal_error("Missing value in nvvm-reflect-add option '" + Option +
-                         "'");
+      report_fatal_error(Twine("Missing value in nvvm-reflect-add option '") + Option + "'");
     unsigned ValInt;
     if (!to_integer(Val.trim(), ValInt, 10))
-      report_fatal_error("integer value expected in nvvm-reflect-add option '" +
-                         Option + "'");
+      report_fatal_error(Twine("integer value expected in nvvm-reflect-add option '") + Option + "'");
     ReflectMap[Name] = ValInt;
   }
 }
@@ -134,41 +132,39 @@ void NVVMReflect::populateReflectMap(Module &M) {
 /// Process a reflect function by finding all its calls and replacing them with
 /// appropriate constant values. For __CUDA_FTZ, uses the module flag value.
 /// For __CUDA_ARCH, uses SmVersion * 10. For all other strings, uses 0.
-bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
-  Function *F = M.getFunction(ReflectName);
+bool NVVMReflect::handleReflectFunction(Module &M, const StringRef ReflectName) {
+  Function *const F = M.getFunction(ReflectName);
   if (!F)
     return false;
   assert(F->isDeclaration() && "_reflect function should not have a body");
   assert(F->getReturnType()->isIntegerTy() &&
          "_reflect's return type should be integer");
 
-  bool Changed = F->getNumUses() > 0;
-  for (User *U : make_early_inc_range(F->users())) {
+  const bool Changed = F->getNumUses() > 0;
+  for (User *const U : make_early_inc_range(F->users())) {
     // Reflect function calls look like:
     // @arch = private unnamed_addr addrspace(1) constant [12 x i8]
     // c"__CUDA_ARCH\00" call i32 @__nvvm_reflect(ptr addrspacecast (ptr
     // addrspace(1) @arch to ptr)) We need to extract the string argument from
     // the call (i.e. "__CUDA_ARCH")
-    CallInst *Call = dyn_cast<CallInst>(U);
+    auto *const Call = dyn_cast<CallInst>(U);
     if (!Call)
-      report_fatal_error(
-          "__nvvm_reflect can only be used in a call instruction");
+      report_fatal_error("__nvvm_reflect can only be used in a call instruction");
     if (Call->getNumOperands() != 2)
       report_fatal_error("__nvvm_reflect requires exactly one argument");
 
-    const Value *GlobalStr = Call->getArgOperand(0)->stripPointerCasts();
-    if (!isa<Constant>(GlobalStr))
+    const auto *const GlobalStr =
+        dyn_cast<Constant>(Call->getArgOperand(0)->stripPointerCasts());
+    if (!GlobalStr)
       report_fatal_error("__nvvm_reflect argument must be a constant string");
 
-    const Value *ConstantStr = cast<Constant>(GlobalStr)->getOperand(0);
-    if (!isa<ConstantDataSequential>(ConstantStr))
+    const auto *const ConstantStr = dyn_cast<ConstantDataSequential>(GlobalStr);
+    if (!ConstantStr)
       report_fatal_error("__nvvm_reflect argument must be a string constant");
-    if (!cast<ConstantDataSequential>(ConstantStr)->isCString())
-      report_fatal_error(
-          "__nvvm_reflect argument must be a null-terminated string");
+    if (!ConstantStr->isCString())
+      report_fatal_error("__nvvm_reflect argument must be a null-terminated string");
 
-    StringRef ReflectArg =
-        cast<ConstantDataSequential>(ConstantStr)->getAsString().drop_back();
+    const StringRef ReflectArg = ConstantStr->getAsString().drop_back();
     if (ReflectArg.empty())
       report_fatal_error("__nvvm_reflect argument cannot be empty");
     // Now that we have extracted the string argument, we can look it up in the
@@ -180,7 +176,7 @@ bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
     LLVM_DEBUG(dbgs() << "Replacing call of reflect function " << F->getName()
                       << "(" << ReflectArg << ") with value " << ReflectVal
                       << "\n");
-    Constant *NewValue = ConstantInt::get(Call->getType(), ReflectVal);
+    auto *const NewValue = ConstantInt::get(Call->getType(), ReflectVal);
     foldReflectCall(Call, NewValue);
     Call->eraseFromParent();
   }
@@ -190,23 +186,23 @@ bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
   return Changed;
 }
 
-void NVVMReflect::foldReflectCall(CallInst *Call, Constant *NewValue) {
+void NVVMReflect::foldReflectCall(CallInst *const Call, Constant *const NewValue) {
   SmallVector<Instruction *, 8> Worklist;
   // Replace an instruction with a constant and add all users of the instruction
   // to the worklist
-  auto ReplaceInstructionWithConst = [&](Instruction *I, Constant *C) {
-    for (User *U : I->users())
-      if (Instruction *UI = dyn_cast<Instruction>(U))
+  auto ReplaceInstructionWithConst = [&](Instruction *const I, Constant *const C) {
+    for (auto *const U : I->users())
+      if (auto *const UI = dyn_cast<Instruction>(U))
         Worklist.push_back(UI);
     I->replaceAllUsesWith(C);
   };
 
   ReplaceInstructionWithConst(Call, NewValue);
 
-  auto DL = Call->getModule()->getDataLayout();
+  const auto &DL = Call->getModule()->getDataLayout();
   while (!Worklist.empty()) {
-    Instruction *I = Worklist.pop_back_val();
-    if (Constant *C = ConstantFoldInstruction(I, DL)) {
+    auto *const I = Worklist.pop_back_val();
+    if (auto *const C = ConstantFoldInstruction(I, DL)) {
       ReplaceInstructionWithConst(I, C);
       if (isInstructionTriviallyDead(I))
         I->eraseFromParent();

>From 2e3b147d7bd86c0ca64a38628afec3cc21f278e0 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 9 Apr 2025 00:29:19 +0000
Subject: [PATCH 21/38] format

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 28 ++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 87d996ebf50c0..b9c2f9161d8da 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -78,7 +78,8 @@ class NVVMReflectLegacyPass : public ModulePass {
 
 public:
   static char ID;
-  NVVMReflectLegacyPass(const unsigned SmVersion) : ModulePass(ID), Impl(SmVersion) {}
+  NVVMReflectLegacyPass(const unsigned SmVersion)
+      : ModulePass(ID), Impl(SmVersion) {}
   bool runOnModule(Module &M) override;
 };
 } // namespace
@@ -119,12 +120,16 @@ void NVVMReflect::populateReflectMap(Module &M) {
     StringRef OptionRef(Option);
     auto [Name, Val] = OptionRef.split('=');
     if (Name.empty())
-      report_fatal_error(Twine("Empty name in nvvm-reflect-add option '") + Option + "'");
+      report_fatal_error(Twine("Empty name in nvvm-reflect-add option '") +
+                         Option + "'");
     if (Val.empty())
-      report_fatal_error(Twine("Missing value in nvvm-reflect-add option '") + Option + "'");
+      report_fatal_error(Twine("Missing value in nvvm-reflect-add option '") +
+                         Option + "'");
     unsigned ValInt;
     if (!to_integer(Val.trim(), ValInt, 10))
-      report_fatal_error(Twine("integer value expected in nvvm-reflect-add option '") + Option + "'");
+      report_fatal_error(
+          Twine("integer value expected in nvvm-reflect-add option '") +
+          Option + "'");
     ReflectMap[Name] = ValInt;
   }
 }
@@ -132,7 +137,8 @@ void NVVMReflect::populateReflectMap(Module &M) {
 /// Process a reflect function by finding all its calls and replacing them with
 /// appropriate constant values. For __CUDA_FTZ, uses the module flag value.
 /// For __CUDA_ARCH, uses SmVersion * 10. For all other strings, uses 0.
-bool NVVMReflect::handleReflectFunction(Module &M, const StringRef ReflectName) {
+bool NVVMReflect::handleReflectFunction(Module &M,
+                                        const StringRef ReflectName) {
   Function *const F = M.getFunction(ReflectName);
   if (!F)
     return false;
@@ -149,7 +155,8 @@ bool NVVMReflect::handleReflectFunction(Module &M, const StringRef ReflectName)
     // the call (i.e. "__CUDA_ARCH")
     auto *const Call = dyn_cast<CallInst>(U);
     if (!Call)
-      report_fatal_error("__nvvm_reflect can only be used in a call instruction");
+      report_fatal_error(
+          "__nvvm_reflect can only be used in a call instruction");
     if (Call->getNumOperands() != 2)
       report_fatal_error("__nvvm_reflect requires exactly one argument");
 
@@ -162,7 +169,8 @@ bool NVVMReflect::handleReflectFunction(Module &M, const StringRef ReflectName)
     if (!ConstantStr)
       report_fatal_error("__nvvm_reflect argument must be a string constant");
     if (!ConstantStr->isCString())
-      report_fatal_error("__nvvm_reflect argument must be a null-terminated string");
+      report_fatal_error(
+          "__nvvm_reflect argument must be a null-terminated string");
 
     const StringRef ReflectArg = ConstantStr->getAsString().drop_back();
     if (ReflectArg.empty())
@@ -186,11 +194,13 @@ bool NVVMReflect::handleReflectFunction(Module &M, const StringRef ReflectName)
   return Changed;
 }
 
-void NVVMReflect::foldReflectCall(CallInst *const Call, Constant *const NewValue) {
+void NVVMReflect::foldReflectCall(CallInst *const Call,
+                                  Constant *const NewValue) {
   SmallVector<Instruction *, 8> Worklist;
   // Replace an instruction with a constant and add all users of the instruction
   // to the worklist
-  auto ReplaceInstructionWithConst = [&](Instruction *const I, Constant *const C) {
+  auto ReplaceInstructionWithConst = [&](Instruction *const I,
+                                         Constant *const C) {
     for (auto *const U : I->users())
       if (auto *const UI = dyn_cast<Instruction>(U))
         Worklist.push_back(UI);

>From 4e473ee7c0d36978ea2e2bc733203f9b806163b5 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 9 Apr 2025 00:31:53 +0000
Subject: [PATCH 22/38] forgot a getOperand call

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index b9c2f9161d8da..6dd683ed91f67 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -165,7 +165,7 @@ bool NVVMReflect::handleReflectFunction(Module &M,
     if (!GlobalStr)
       report_fatal_error("__nvvm_reflect argument must be a constant string");
 
-    const auto *const ConstantStr = dyn_cast<ConstantDataSequential>(GlobalStr);
+    const auto *const ConstantStr = dyn_cast<ConstantDataSequential>(GlobalStr->getOperand(0));
     if (!ConstantStr)
       report_fatal_error("__nvvm_reflect argument must be a string constant");
     if (!ConstantStr->isCString())

>From 35b23e4798ab5718fc67aac12f24d29b3dbe142f Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 9 Apr 2025 00:32:06 +0000
Subject: [PATCH 23/38] format

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 6dd683ed91f67..449e60ba856b9 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -165,7 +165,8 @@ bool NVVMReflect::handleReflectFunction(Module &M,
     if (!GlobalStr)
       report_fatal_error("__nvvm_reflect argument must be a constant string");
 
-    const auto *const ConstantStr = dyn_cast<ConstantDataSequential>(GlobalStr->getOperand(0));
+    const auto *const ConstantStr =
+        dyn_cast<ConstantDataSequential>(GlobalStr->getOperand(0));
     if (!ConstantStr)
       report_fatal_error("__nvvm_reflect argument must be a string constant");
     if (!ConstantStr->isCString())

>From e19929d531237cc5152b2d931dabd09b4a1eb2b2 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 9 Apr 2025 20:20:42 +0000
Subject: [PATCH 24/38] format

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 449e60ba856b9..5f2c042225d1a 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -85,8 +85,6 @@ class NVVMReflectLegacyPass : public ModulePass {
 } // namespace
 
 ModulePass *llvm::createNVVMReflectPass(const unsigned SmVersion) {
-  LLVM_DEBUG(dbgs() << "Creating NVVMReflectPass with SM version " << SmVersion
-                    << "\n");
   return new NVVMReflectLegacyPass(SmVersion);
 }
 

>From 66752e37496ef77c4671c14d75334a35cffdecde Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 9 Apr 2025 20:24:35 +0000
Subject: [PATCH 25/38] newline

---
 llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
index fae48e554383b..61e3800a42fbc 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
@@ -23,4 +23,4 @@ define i32 @options() {
 }
 
 ; CHECK-FTZ1-ARCH350: ret i32 351
-; CHECK-FTZ0-ARCH520: ret i32 520
\ No newline at end of file
+; CHECK-FTZ0-ARCH520: ret i32 520

>From fb106a306e073b1a17a0020d07f384bd3c309509 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 10 Apr 2025 16:48:13 +0000
Subject: [PATCH 26/38] removing const

---
 clang/test/CodeGen/builtins-nvptx.c   | 21 ++++++------
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 47 +++++++++++++--------------
 2 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index 71b29849618b6..e2c159aac903f 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -1,37 +1,37 @@
 // REQUIRES: nvptx-registered-target
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_70 -target-feature +ptx63 \
-// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:            -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX63_SM70 -check-prefix=LP64 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
-// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:            -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
-// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:            -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
-// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:            -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
-// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:            -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 -target-feature +ptx62 \
-// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:            -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
 // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -target-feature +ptx62 \
 // RUN:   -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
-// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:            -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
-// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:            -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \
-// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:            -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
 // ###  The last run to check with the highest SM and PTX version available
 // ###  to make sure target builtins are still accepted.
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \
-// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:            -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
 
 #define __device__ __attribute__((device))
@@ -61,6 +61,7 @@ __device__ bool reflect() {
 
   unsigned x = __nvvm_reflect("__CUDA_ARCH");
   return x >= 700;
+
 }
 
 __device__ int read_ntid() {
diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 5f2c042225d1a..ea15215d1a203 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -61,14 +61,14 @@ class NVVMReflect {
   // Map from reflect function call arguments to the value to replace the call
   // with. Should include __CUDA_FTZ and __CUDA_ARCH values.
   StringMap<unsigned> ReflectMap;
-  bool handleReflectFunction(Module &M, const StringRef ReflectName);
+  bool handleReflectFunction(Module &M, StringRef ReflectName);
   void populateReflectMap(Module &M);
-  void foldReflectCall(CallInst *const Call, Constant *const NewValue);
+  void foldReflectCall(CallInst *Call, Constant *NewValue);
 
 public:
   // __CUDA_FTZ is assigned in `runOnModule` by checking nvvm-reflect-ftz module
   // metadata.
-  explicit NVVMReflect(const unsigned SmVersion)
+  explicit NVVMReflect(unsigned SmVersion)
       : ReflectMap({{CUDA_ARCH_NAME, SmVersion * 10}}) {}
   bool runOnModule(Module &M);
 };
@@ -78,13 +78,13 @@ class NVVMReflectLegacyPass : public ModulePass {
 
 public:
   static char ID;
-  NVVMReflectLegacyPass(const unsigned SmVersion)
+  NVVMReflectLegacyPass(unsigned SmVersion)
       : ModulePass(ID), Impl(SmVersion) {}
   bool runOnModule(Module &M) override;
 };
 } // namespace
 
-ModulePass *llvm::createNVVMReflectPass(const unsigned SmVersion) {
+ModulePass *llvm::createNVVMReflectPass(unsigned SmVersion) {
   return new NVVMReflectLegacyPass(SmVersion);
 }
 
@@ -109,11 +109,11 @@ static cl::list<std::string> ReflectList(
 // Set the VarMap with, first, the value of __CUDA_FTZ from module metadata, and
 // then the key/value pairs from the command line.
 void NVVMReflect::populateReflectMap(Module &M) {
-  if (const auto *const Flag = mdconst::extract_or_null<ConstantInt>(
+  if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
           M.getModuleFlag(CUDA_FTZ_MODULE_NAME)))
     ReflectMap[CUDA_FTZ_NAME] = Flag->getSExtValue();
 
-  for (const auto &Option : ReflectList) {
+  for (auto &Option : ReflectList) {
     LLVM_DEBUG(dbgs() << "ReflectOption : " << Option << "\n");
     StringRef OptionRef(Option);
     auto [Name, Val] = OptionRef.split('=');
@@ -135,9 +135,8 @@ void NVVMReflect::populateReflectMap(Module &M) {
 /// Process a reflect function by finding all its calls and replacing them with
 /// appropriate constant values. For __CUDA_FTZ, uses the module flag value.
 /// For __CUDA_ARCH, uses SmVersion * 10. For all other strings, uses 0.
-bool NVVMReflect::handleReflectFunction(Module &M,
-                                        const StringRef ReflectName) {
-  Function *const F = M.getFunction(ReflectName);
+bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) {
+  Function *F = M.getFunction(ReflectName);
   if (!F)
     return false;
   assert(F->isDeclaration() && "_reflect function should not have a body");
@@ -145,25 +144,25 @@ bool NVVMReflect::handleReflectFunction(Module &M,
          "_reflect's return type should be integer");
 
   const bool Changed = F->getNumUses() > 0;
-  for (User *const U : make_early_inc_range(F->users())) {
+  for (User *U : make_early_inc_range(F->users())) {
     // Reflect function calls look like:
     // @arch = private unnamed_addr addrspace(1) constant [12 x i8]
     // c"__CUDA_ARCH\00" call i32 @__nvvm_reflect(ptr addrspacecast (ptr
     // addrspace(1) @arch to ptr)) We need to extract the string argument from
     // the call (i.e. "__CUDA_ARCH")
-    auto *const Call = dyn_cast<CallInst>(U);
+    auto *Call = dyn_cast<CallInst>(U);
     if (!Call)
       report_fatal_error(
           "__nvvm_reflect can only be used in a call instruction");
     if (Call->getNumOperands() != 2)
       report_fatal_error("__nvvm_reflect requires exactly one argument");
 
-    const auto *const GlobalStr =
+    auto *GlobalStr =
         dyn_cast<Constant>(Call->getArgOperand(0)->stripPointerCasts());
     if (!GlobalStr)
       report_fatal_error("__nvvm_reflect argument must be a constant string");
 
-    const auto *const ConstantStr =
+    auto *ConstantStr =
         dyn_cast<ConstantDataSequential>(GlobalStr->getOperand(0));
     if (!ConstantStr)
       report_fatal_error("__nvvm_reflect argument must be a string constant");
@@ -171,7 +170,7 @@ bool NVVMReflect::handleReflectFunction(Module &M,
       report_fatal_error(
           "__nvvm_reflect argument must be a null-terminated string");
 
-    const StringRef ReflectArg = ConstantStr->getAsString().drop_back();
+    StringRef ReflectArg = ConstantStr->getAsString().drop_back();
     if (ReflectArg.empty())
       report_fatal_error("__nvvm_reflect argument cannot be empty");
     // Now that we have extracted the string argument, we can look it up in the
@@ -183,7 +182,7 @@ bool NVVMReflect::handleReflectFunction(Module &M,
     LLVM_DEBUG(dbgs() << "Replacing call of reflect function " << F->getName()
                       << "(" << ReflectArg << ") with value " << ReflectVal
                       << "\n");
-    auto *const NewValue = ConstantInt::get(Call->getType(), ReflectVal);
+    auto *NewValue = ConstantInt::get(Call->getType(), ReflectVal);
     foldReflectCall(Call, NewValue);
     Call->eraseFromParent();
   }
@@ -193,25 +192,23 @@ bool NVVMReflect::handleReflectFunction(Module &M,
   return Changed;
 }
 
-void NVVMReflect::foldReflectCall(CallInst *const Call,
-                                  Constant *const NewValue) {
+void NVVMReflect::foldReflectCall(CallInst *Call, Constant *NewValue) {
   SmallVector<Instruction *, 8> Worklist;
   // Replace an instruction with a constant and add all users of the instruction
   // to the worklist
-  auto ReplaceInstructionWithConst = [&](Instruction *const I,
-                                         Constant *const C) {
-    for (auto *const U : I->users())
-      if (auto *const UI = dyn_cast<Instruction>(U))
+  auto ReplaceInstructionWithConst = [&](Instruction *I, Constant *C) {
+    for (auto *U : I->users())
+      if (auto *UI = dyn_cast<Instruction>(U))
         Worklist.push_back(UI);
     I->replaceAllUsesWith(C);
   };
 
   ReplaceInstructionWithConst(Call, NewValue);
 
-  const auto &DL = Call->getModule()->getDataLayout();
+  auto &DL = Call->getModule()->getDataLayout();
   while (!Worklist.empty()) {
-    auto *const I = Worklist.pop_back_val();
-    if (auto *const C = ConstantFoldInstruction(I, DL)) {
+    auto *I = Worklist.pop_back_val();
+    if (auto *C = ConstantFoldInstruction(I, DL)) {
       ReplaceInstructionWithConst(I, C);
       if (isInstructionTriviallyDead(I))
         I->eraseFromParent();

>From 7f890ab9c577ce1beb386e94f90259c8e278b776 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 10 Apr 2025 16:48:36 +0000
Subject: [PATCH 27/38] format

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index ea15215d1a203..e6da52eb6392d 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -78,8 +78,7 @@ class NVVMReflectLegacyPass : public ModulePass {
 
 public:
   static char ID;
-  NVVMReflectLegacyPass(unsigned SmVersion)
-      : ModulePass(ID), Impl(SmVersion) {}
+  NVVMReflectLegacyPass(unsigned SmVersion) : ModulePass(ID), Impl(SmVersion) {}
   bool runOnModule(Module &M) override;
 };
 } // namespace

>From 1e2db88681766778a56c62c70a6a7671e5e4ad6e Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 10 Apr 2025 20:53:10 +0000
Subject: [PATCH 28/38] updated reflect options test

---
 .../CodeGen/NVPTX/nvvm-reflect-options.ll     | 59 ++++++++++++++-----
 1 file changed, 43 insertions(+), 16 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
index 61e3800a42fbc..4b203b35d2fe4 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
@@ -1,26 +1,53 @@
-; Verify that when passing in command-line options to NVVMReflect, that reflect calls are replaced with
-; the appropriate command line values.
+; Test the NVVM reflect pass functionality: verifying that reflect calls are replaced with 
+; appropriate values based on command-line options. Verify that we can handle custom reflect arguments
+; that aren't __CUDA_ARCH or __CUDA_FTZ. If that argument is given a value on the command-line, the reflect call should be replaced with that value.
+; Otherwise, the reflect call should be replaced with 0.
+
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add __CUDA_FTZ=1 -nvvm-reflect-add __CUDA_ARCH=350 %s -S | FileCheck %s --check-prefix=CHECK-FTZ1 --check-prefix=CHECK-ARCH350 --check-prefix=CHECK-CUSTOM-ABSENT
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 %s -S | FileCheck %s --check-prefix=CHECK-FTZ0 --check-prefix=CHECK-ARCH520 --check-prefix=CHECK-CUSTOM-ABSENT
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 -nvvm-reflect-add __CUSTOM_VALUE=42 %s -S | FileCheck %s --check-prefix=CHECK-CUSTOM-PRESENT
+
+; To ensure that command line options override module options, create a copy of this test file with module options appended and rerun some tests.
+;
+; RUN: cat %s > %t.options
+; RUN: echo '!llvm.module.flags = !{!0}' >> %t.options
+; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.options
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 %t.options -S | FileCheck %s --check-prefix=CHECK-FTZ0 --check-prefix=CHECK-ARCH520
 
 declare i32 @__nvvm_reflect(ptr)
 @ftz = private unnamed_addr addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
 @arch = private unnamed_addr addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00"
+ at custom = private unnamed_addr addrspace(1) constant [15 x i8] c"__CUSTOM_VALUE\00"
 
-; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add __CUDA_FTZ=1 -nvvm-reflect-add __CUDA_ARCH=350 %s -S | FileCheck %s --check-prefix=CHECK-FTZ1-ARCH350
-; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 %s -S | FileCheck %s --check-prefix=CHECK-FTZ0-ARCH520
+; Test handling of __CUDA_FTZ reflect value
+define i32 @test_ftz() {
+  %1 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @ftz to ptr))
+  ret i32 %1
+}
 
-; Verify that if we have module metadata that sets __CUDA_FTZ=1, that gets overridden by the command line arguments
+; CHECK-FTZ1: define i32 @test_ftz()
+; CHECK-FTZ1: ret i32 1
+; CHECK-FTZ0: define i32 @test_ftz()
+; CHECK-FTZ0: ret i32 0
 
-; RUN: cat %s > %t.options
-; RUN: echo '!llvm.module.flags = !{!0}' >> %t.options
-; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.options
-; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 %t.options -S | FileCheck %s --check-prefix=CHECK-FTZ0-ARCH520
+; Test handling of __CUDA_ARCH reflect value
+define i32 @test_arch() {
+  %1 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @arch to ptr))
+  ret i32 %1
+}
 
-define i32 @options() {
-  %1 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @ftz to ptr))
-  %2 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @arch to ptr))
-  %3 = add i32 %1, %2
-  ret i32 %3
+; CHECK-ARCH350: define i32 @test_arch()
+; CHECK-ARCH350: ret i32 350
+; CHECK-ARCH520: define i32 @test_arch()
+; CHECK-ARCH520: ret i32 520
+
+; Test handling of a custom reflect value that's not built into the pass
+define i32 @test_custom() {
+  %1 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @custom to ptr))
+  ret i32 %1
 }
 
-; CHECK-FTZ1-ARCH350: ret i32 351
-; CHECK-FTZ0-ARCH520: ret i32 520
+; CHECK-CUSTOM-ABSENT: define i32 @test_custom()
+; CHECK-CUSTOM-ABSENT: ret i32 0
+; CHECK-CUSTOM-PRESENT: define i32 @test_custom()
+; CHECK-CUSTOM-PRESENT: ret i32 42

>From 972aa89a9817b02c6f4c5d01caf300634a06d514 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 10 Apr 2025 20:53:39 +0000
Subject: [PATCH 29/38] newline

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index e6da52eb6392d..9f867e7d4ea4a 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -236,4 +236,4 @@ bool NVVMReflectLegacyPass::runOnModule(Module &M) {
 PreservedAnalyses NVVMReflectPass::run(Module &M, ModuleAnalysisManager &AM) {
   return NVVMReflect(SmVersion).runOnModule(M) ? PreservedAnalyses::none()
                                                : PreservedAnalyses::all();
-}
\ No newline at end of file
+}

>From 3e0c8b329cca75fae1d874238378e5a28b2b3607 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor at nvidia.com>
Date: Thu, 10 Apr 2025 14:15:10 -0700
Subject: [PATCH 30/38] [CIR] Upstream support for cir.get_global (#135095)

This adds basic support for referencing global variables from within
functions via the cir.get_global operation.
---
 clang/include/clang/CIR/Dialect/IR/CIROps.td  | 31 ++++++
 clang/include/clang/CIR/MissingFeatures.h     |  5 +
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp          | 41 +++++++-
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        | 97 +++++++++++++++++++
 clang/lib/CIR/CodeGen/CIRGenModule.h          | 25 +++++
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp       | 35 +++++++
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 21 ++++
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h   | 10 ++
 clang/test/CIR/CodeGen/basic.c                | 25 +++++
 9 files changed, 288 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index c6b509fe50cbc..0d3c2065cd58c 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -1279,6 +1279,37 @@ def GlobalOp : CIR_Op<"global"> {
   let hasVerifier = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// GetGlobalOp
+//===----------------------------------------------------------------------===//
+
+def GetGlobalOp : CIR_Op<"get_global",
+    [Pure, DeclareOpInterfaceMethods<SymbolUserOpInterface>]> {
+  let summary = "Get the address of a global variable";
+  let description = [{
+    The `cir.get_global` operation retrieves the address pointing to a
+    named global variable. If the global variable is marked constant, writing
+    to the resulting address (such as through a `cir.store` operation) is
+    undefined. The resulting type must always be a `!cir.ptr<...>` type with the
+    same address space as the global variable.
+
+    Example:
+    ```mlir
+    %x = cir.get_global @gv : !cir.ptr<i32>
+    ```
+  }];
+
+  let arguments = (ins FlatSymbolRefAttr:$name);
+  let results = (outs Res<CIR_PointerType, "", []>:$addr);
+
+  let assemblyFormat = [{
+    $name `:` qualified(type($addr)) attr-dict
+  }];
+
+  // `GetGlobalOp` is fully verified by its traits.
+  let hasVerifier = 0;
+}
+
 //===----------------------------------------------------------------------===//
 // FuncOp
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 1b61c32cf4f38..bacb7879a5527 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -35,6 +35,7 @@ struct MissingFeatures {
   static bool opGlobalThreadLocal() { return false; }
   static bool opGlobalConstant() { return false; }
   static bool opGlobalAlignment() { return false; }
+  static bool opGlobalWeakRef() { return false; }
 
   static bool supportIFuncAttr() { return false; }
   static bool supportVisibility() { return false; }
@@ -136,6 +137,10 @@ struct MissingFeatures {
   static bool objCGC() { return false; }
   static bool weakRefReference() { return false; }
   static bool hip() { return false; }
+  static bool setObjCGCLValueClass() { return false; }
+  static bool mangledNames() { return false; }
+  static bool setDLLStorageClass() { return false; }
+  static bool openMP() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index bb5588c0ff525..4c20170d75131 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -183,6 +183,43 @@ void CIRGenFunction::emitStoreThroughLValue(RValue src, LValue dst,
   emitStoreOfScalar(src.getScalarVal(), dst, isInit);
 }
 
+static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e,
+                                      const VarDecl *vd) {
+  QualType T = e->getType();
+
+  // If it's thread_local, emit a call to its wrapper function instead.
+  assert(!cir::MissingFeatures::opGlobalThreadLocal());
+  if (vd->getTLSKind() == VarDecl::TLS_Dynamic)
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "emitGlobalVarDeclLValue: thread_local variable");
+
+  // Check if the variable is marked as declare target with link clause in
+  // device codegen.
+  if (cgf.getLangOpts().OpenMP)
+    cgf.cgm.errorNYI(e->getSourceRange(), "emitGlobalVarDeclLValue: OpenMP");
+
+  // Traditional LLVM codegen handles thread local separately, CIR handles
+  // as part of getAddrOfGlobalVar.
+  mlir::Value v = cgf.cgm.getAddrOfGlobalVar(vd);
+
+  assert(!cir::MissingFeatures::addressSpace());
+  mlir::Type realVarTy = cgf.convertTypeForMem(vd->getType());
+  cir::PointerType realPtrTy = cgf.getBuilder().getPointerTo(realVarTy);
+  if (realPtrTy != v.getType())
+    v = cgf.getBuilder().createBitcast(v.getLoc(), v, realPtrTy);
+
+  CharUnits alignment = cgf.getContext().getDeclAlign(vd);
+  Address addr(v, realVarTy, alignment);
+  LValue lv;
+  if (vd->getType()->isReferenceType())
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "emitGlobalVarDeclLValue: reference type");
+  else
+    lv = cgf.makeAddrLValue(addr, T, AlignmentSource::Decl);
+  assert(!cir::MissingFeatures::setObjCGCLValueClass());
+  return lv;
+}
+
 void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr,
                                        bool isVolatile, QualType ty,
                                        bool isInit, bool isNontemporal) {
@@ -288,7 +325,7 @@ LValue CIRGenFunction::emitDeclRefLValue(const DeclRefExpr *e) {
 
     // Check if this is a global variable
     if (vd->hasLinkage() || vd->isStaticDataMember())
-      cgm.errorNYI(vd->getSourceRange(), "emitDeclRefLValue: global variable");
+      return emitGlobalVarDeclLValue(*this, e, vd);
 
     Address addr = Address::invalid();
 
@@ -299,7 +336,7 @@ LValue CIRGenFunction::emitDeclRefLValue(const DeclRefExpr *e) {
     } else {
       // Otherwise, it might be static local we haven't emitted yet for some
       // reason; most likely, because it's in an outer function.
-      cgm.errorNYI(vd->getSourceRange(), "emitDeclRefLValue: static local");
+      cgm.errorNYI(e->getSourceRange(), "emitDeclRefLValue: static local");
     }
 
     return makeAddrLValue(addr, ty, AlignmentSource::Type);
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 2f90141b2d158..4b5acb36a9319 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -202,6 +202,102 @@ void CIRGenModule::emitGlobalFunctionDefinition(clang::GlobalDecl gd,
   curCGF = nullptr;
 }
 
+mlir::Operation *CIRGenModule::getGlobalValue(StringRef name) {
+  return mlir::SymbolTable::lookupSymbolIn(theModule, name);
+}
+
+/// If the specified mangled name is not in the module,
+/// create and return an mlir GlobalOp with the specified type (TODO(cir):
+/// address space).
+///
+/// TODO(cir):
+/// 1. If there is something in the module with the specified name, return
+/// it potentially bitcasted to the right type.
+///
+/// 2. If \p d is non-null, it specifies a decl that correspond to this.  This
+/// is used to set the attributes on the global when it is first created.
+///
+/// 3. If \p isForDefinition is true, it is guaranteed that an actual global
+/// with type \p ty will be returned, not conversion of a variable with the same
+/// mangled name but some other type.
+cir::GlobalOp
+CIRGenModule::getOrCreateCIRGlobal(StringRef mangledName, mlir::Type ty,
+                                   LangAS langAS, const VarDecl *d,
+                                   ForDefinition_t isForDefinition) {
+  // Lookup the entry, lazily creating it if necessary.
+  cir::GlobalOp entry;
+  if (mlir::Operation *v = getGlobalValue(mangledName)) {
+    if (!isa<cir::GlobalOp>(v))
+      errorNYI(d->getSourceRange(), "global with non-GlobalOp type");
+    entry = cast<cir::GlobalOp>(v);
+  }
+
+  if (entry) {
+    assert(!cir::MissingFeatures::addressSpace());
+    assert(!cir::MissingFeatures::opGlobalWeakRef());
+
+    assert(!cir::MissingFeatures::setDLLStorageClass());
+    assert(!cir::MissingFeatures::openMP());
+
+    if (entry.getSymType() == ty)
+      return entry;
+
+    // If there are two attempts to define the same mangled name, issue an
+    // error.
+    //
+    // TODO(cir): look at mlir::GlobalValue::isDeclaration for all aspects of
+    // recognizing the global as a declaration, for now only check if
+    // initializer is present.
+    if (isForDefinition && !entry.isDeclaration()) {
+      errorNYI(d->getSourceRange(), "global with conflicting type");
+    }
+
+    // Address space check removed because it is unnecessary because CIR records
+    // address space info in types.
+
+    // (If global is requested for a definition, we always need to create a new
+    // global, not just return a bitcast.)
+    if (!isForDefinition)
+      return entry;
+  }
+
+  errorNYI(d->getSourceRange(), "reference of undeclared global");
+}
+
+cir::GlobalOp
+CIRGenModule::getOrCreateCIRGlobal(const VarDecl *d, mlir::Type ty,
+                                   ForDefinition_t isForDefinition) {
+  assert(d->hasGlobalStorage() && "Not a global variable");
+  QualType astTy = d->getType();
+  if (!ty)
+    ty = getTypes().convertTypeForMem(astTy);
+
+  assert(!cir::MissingFeatures::mangledNames());
+  return getOrCreateCIRGlobal(d->getIdentifier()->getName(), ty,
+                              astTy.getAddressSpace(), d, isForDefinition);
+}
+
+/// Return the mlir::Value for the address of the given global variable. If
+/// \p ty is non-null and if the global doesn't exist, then it will be created
+/// with the specified type instead of whatever the normal requested type would
+/// be. If \p isForDefinition is true, it is guaranteed that an actual global
+/// with type \p ty will be returned, not conversion of a variable with the same
+/// mangled name but some other type.
+mlir::Value CIRGenModule::getAddrOfGlobalVar(const VarDecl *d, mlir::Type ty,
+                                             ForDefinition_t isForDefinition) {
+  assert(d->hasGlobalStorage() && "Not a global variable");
+  QualType astTy = d->getType();
+  if (!ty)
+    ty = getTypes().convertTypeForMem(astTy);
+
+  assert(!cir::MissingFeatures::opGlobalThreadLocal());
+
+  cir::GlobalOp g = getOrCreateCIRGlobal(d, ty, isForDefinition);
+  mlir::Type ptrTy = builder.getPointerTo(g.getSymType());
+  return builder.create<cir::GetGlobalOp>(getLoc(d->getSourceRange()), ptrTy,
+                                          g.getSymName());
+}
+
 void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *vd,
                                            bool isTentative) {
   const QualType astTy = vd->getType();
@@ -507,6 +603,7 @@ cir::FuncOp CIRGenModule::getAddrOfFunction(clang::GlobalDecl gd,
     funcType = convertType(fd->getType());
   }
 
+  assert(!cir::MissingFeatures::mangledNames());
   cir::FuncOp func = getOrCreateCIRFunction(
       cast<NamedDecl>(gd.getDecl())->getIdentifier()->getName(), funcType, gd,
       forVTable, dontDefer, /*isThunk=*/false, isForDefinition);
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 174094bafabad..764ad1d7592aa 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -91,6 +91,31 @@ class CIRGenModule : public CIRGenTypeCache {
   const clang::LangOptions &getLangOpts() const { return langOpts; }
   mlir::MLIRContext &getMLIRContext() { return *builder.getContext(); }
 
+  /// -------
+  /// Handling globals
+  /// -------
+
+  mlir::Operation *getGlobalValue(llvm::StringRef ref);
+
+  /// If the specified mangled name is not in the module, create and return an
+  /// mlir::GlobalOp value
+  cir::GlobalOp getOrCreateCIRGlobal(llvm::StringRef mangledName, mlir::Type ty,
+                                     LangAS langAS, const VarDecl *d,
+                                     ForDefinition_t isForDefinition);
+
+  cir::GlobalOp getOrCreateCIRGlobal(const VarDecl *d, mlir::Type ty,
+                                     ForDefinition_t isForDefinition);
+
+  /// Return the mlir::Value for the address of the given global variable.
+  /// If Ty is non-null and if the global doesn't exist, then it will be created
+  /// with the specified type instead of whatever the normal requested type
+  /// would be. If IsForDefinition is true, it is guaranteed that an actual
+  /// global with type Ty will be returned, not conversion of a variable with
+  /// the same mangled name but some other type.
+  mlir::Value
+  getAddrOfGlobalVar(const VarDecl *d, mlir::Type ty = {},
+                     ForDefinition_t isForDefinition = NotForDefinition);
+
   /// Helpers to convert the presumed location of Clang's SourceLocation to an
   /// MLIR Location.
   mlir::Location getLoc(clang::SourceLocation cLoc);
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index c798c14508f56..f3e5e572653da 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -848,6 +848,41 @@ parseGlobalOpTypeAndInitialValue(OpAsmParser &parser, TypeAttr &typeAttr,
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// GetGlobalOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+cir::GetGlobalOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
+  // Verify that the result type underlying pointer type matches the type of
+  // the referenced cir.global or cir.func op.
+  mlir::Operation *op =
+      symbolTable.lookupNearestSymbolFrom(*this, getNameAttr());
+  if (op == nullptr || !(isa<GlobalOp>(op) || isa<FuncOp>(op)))
+    return emitOpError("'")
+           << getName()
+           << "' does not reference a valid cir.global or cir.func";
+
+  mlir::Type symTy;
+  if (auto g = dyn_cast<GlobalOp>(op)) {
+    symTy = g.getSymType();
+    assert(!cir::MissingFeatures::addressSpace());
+    assert(!cir::MissingFeatures::opGlobalThreadLocal());
+  } else if (auto f = dyn_cast<FuncOp>(op)) {
+    symTy = f.getFunctionType();
+  } else {
+    llvm_unreachable("Unexpected operation for GetGlobalOp");
+  }
+
+  auto resultType = dyn_cast<PointerType>(getAddr().getType());
+  if (!resultType || symTy != resultType.getPointee())
+    return emitOpError("result type pointee type '")
+           << resultType.getPointee() << "' does not match type " << symTy
+           << " of the global @" << getName();
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // FuncOp
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 7ca36409c9cac..7159f89c93a53 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -828,6 +828,26 @@ mlir::LogicalResult CIRToLLVMFuncOpLowering::matchAndRewrite(
   return mlir::LogicalResult::success();
 }
 
+mlir::LogicalResult CIRToLLVMGetGlobalOpLowering::matchAndRewrite(
+    cir::GetGlobalOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  // FIXME(cir): Premature DCE to avoid lowering stuff we're not using.
+  // CIRGen should mitigate this and not emit the get_global.
+  if (op->getUses().empty()) {
+    rewriter.eraseOp(op);
+    return mlir::success();
+  }
+
+  mlir::Type type = getTypeConverter()->convertType(op.getType());
+  mlir::Operation *newop =
+      rewriter.create<mlir::LLVM::AddressOfOp>(op.getLoc(), type, op.getName());
+
+  assert(!cir::MissingFeatures::opGlobalThreadLocal());
+
+  rewriter.replaceOp(op, newop);
+  return mlir::success();
+}
+
 /// Replace CIR global with a region initialized LLVM global and update
 /// insertion point to the end of the initializer block.
 void CIRToLLVMGlobalOpLowering::setupRegionInitializedLLVMGlobalOp(
@@ -1418,6 +1438,7 @@ void ConvertCIRToLLVMPass::runOnOperation() {
                CIRToLLVMCmpOpLowering,
                CIRToLLVMConstantOpLowering,
                CIRToLLVMFuncOpLowering,
+               CIRToLLVMGetGlobalOpLowering,
                CIRToLLVMTrapOpLowering,
                CIRToLLVMUnaryOpLowering
       // clang-format on
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
index d53c4b31682bb..1de6c9c56b485 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -140,6 +140,16 @@ class CIRToLLVMFuncOpLowering : public mlir::OpConversionPattern<cir::FuncOp> {
                   mlir::ConversionPatternRewriter &) const override;
 };
 
+class CIRToLLVMGetGlobalOpLowering
+    : public mlir::OpConversionPattern<cir::GetGlobalOp> {
+public:
+  using mlir::OpConversionPattern<cir::GetGlobalOp>::OpConversionPattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(cir::GetGlobalOp op, OpAdaptor,
+                  mlir::ConversionPatternRewriter &) const override;
+};
+
 class CIRToLLVMGlobalOpLowering
     : public mlir::OpConversionPattern<cir::GlobalOp> {
   const mlir::DataLayout &dataLayout;
diff --git a/clang/test/CIR/CodeGen/basic.c b/clang/test/CIR/CodeGen/basic.c
index 7184e395ce386..cf687e44044c4 100644
--- a/clang/test/CIR/CodeGen/basic.c
+++ b/clang/test/CIR/CodeGen/basic.c
@@ -144,3 +144,28 @@ void f5(void) {
 // OGCG:   br label %[[LOOP:.*]]
 // OGCG: [[LOOP]]:
 // OGCG:   br label %[[LOOP]]
+
+int gv;
+int f6(void) {
+  return gv;
+}
+
+//      CIR: cir.func @f6() -> !s32i
+// CIR-NEXT:   %[[RV:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["__retval"] {alignment = 4 : i64}
+// CIR-NEXT:   %[[GV_PTR:.*]] = cir.get_global @gv : !cir.ptr<!s32i>
+// CIR-NEXT:   %[[GV:.*]] = cir.load %[[GV_PTR]] : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT:   cir.store %[[GV]], %[[RV]] : !s32i, !cir.ptr<!s32i>
+// CIR-NEXT:   %[[R:.*]] = cir.load %[[RV]] : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT:   cir.return %[[R]] : !s32i
+
+// LLVM:      define i32 @f6()
+// LLVM-NEXT:   %[[RV_PTR:.*]] = alloca i32, i64 1, align 4
+// LLVM-NEXT:   %[[GV:.*]] = load i32, ptr @gv, align 4
+// LLVM-NEXT:   store i32 %[[GV]], ptr %[[RV_PTR]], align 4
+// LLVM-NEXT:   %[[RV:.*]] = load i32, ptr %[[RV_PTR]], align 4
+// LLVM-NEXT:   ret i32 %[[RV]]
+
+// OGCG:      define{{.*}} i32 @f6()
+// OGCG-NEXT: entry:
+// OGCG-NEXT:   %[[GV:.*]] = load i32, ptr @gv, align 4
+// OGCG-NEXT:   ret i32 %[[GV]]

>From d401065142c00a24a253ad48c2bbe4e716e7a528 Mon Sep 17 00:00:00 2001
From: jeremyd2019 <github at jdrake.com>
Date: Thu, 10 Apr 2025 14:20:10 -0700
Subject: [PATCH 31/38] [LLD] [MinGW] Fall back to using default target if no
 -m flag given. (#134700)

On Cygwin at least, GCC is not passing any -m flag to the linker, so
fall back to the default target triple to determine if we need to apply
i386-specific behaviors.

Fixes #134558
---
 lld/MinGW/Driver.cpp | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/lld/MinGW/Driver.cpp b/lld/MinGW/Driver.cpp
index a77d86b443a6c..222ee3a93ef48 100644
--- a/lld/MinGW/Driver.cpp
+++ b/lld/MinGW/Driver.cpp
@@ -171,6 +171,14 @@ searchLibrary(StringRef name, ArrayRef<StringRef> searchPaths, bool bStatic) {
   return "";
 }
 
+static bool isI386Target(const opt::InputArgList &args,
+                         const Triple &defaultTarget) {
+  auto *a = args.getLastArg(OPT_m);
+  if (a)
+    return StringRef(a->getValue()) == "i386pe";
+  return defaultTarget.getArch() == Triple::x86;
+}
+
 namespace lld {
 namespace coff {
 bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
@@ -216,6 +224,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     return false;
   }
 
+  Triple defaultTarget(Triple::normalize(sys::getDefaultTargetTriple()));
+
   std::vector<std::string> linkArgs;
   auto add = [&](const Twine &s) { linkArgs.push_back(s.str()); };
 
@@ -224,7 +234,7 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
 
   if (auto *a = args.getLastArg(OPT_entry)) {
     StringRef s = a->getValue();
-    if (args.getLastArgValue(OPT_m) == "i386pe" && s.starts_with("_"))
+    if (isI386Target(args, defaultTarget) && s.starts_with("_"))
       add("-entry:" + s.substr(1));
     else if (!s.empty())
       add("-entry:" + s);
@@ -521,7 +531,7 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
   for (auto *a : args.filtered(OPT_Xlink))
     add(a->getValue());
 
-  if (args.getLastArgValue(OPT_m) == "i386pe")
+  if (isI386Target(args, defaultTarget))
     add("-alternatename:__image_base__=___ImageBase");
   else
     add("-alternatename:__image_base__=__ImageBase");

>From b16a10456d8c564df2034a1a324ced6857a79f86 Mon Sep 17 00:00:00 2001
From: mojyack <66899529+mojyack at users.noreply.github.com>
Date: Fri, 11 Apr 2025 06:23:26 +0900
Subject: [PATCH 32/38] [sanitizer_common] Fix build on ppc64+musl (#120036)

In powerpc64-unknown-linux-musl, signal.h does not include asm/ptrace.h,
which causes "member access into incomplete type 'struct pt_regs'"
errors. Include the header explicitly to fix this.

Also, in sanitizer_linux_libcdep.cpp there is a usage of TlsPreTcbSize
which is not defined on such a platform. Guard the branch with a macro.
---
 .../lib/sanitizer_common/sanitizer_linux.cpp        |  4 ++++
 .../sanitizer_common/sanitizer_linux_libcdep.cpp    | 13 +++++++------
 .../sanitizer_platform_limits_posix.cpp             |  2 +-
 .../sanitizer_stoptheworld_linux_libcdep.cpp        |  3 ++-
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
index 1fa75214f4205..19d414b6b3657 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
@@ -92,6 +92,10 @@ extern "C" SANITIZER_WEAK_ATTRIBUTE const char *strerrorname_np(int);
 #    include <sys/sysmacros.h>
 #  endif
 
+#  if SANITIZER_LINUX && defined(__powerpc64__)
+#    include <asm/ptrace.h>
+#  endif
+
 #  if SANITIZER_FREEBSD
 #    include <machine/atomic.h>
 #    include <sys/exec.h>
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
index 9cc9da3d88c40..16eb8dec63e92 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
@@ -611,21 +611,22 @@ static void GetTls(uptr *addr, uptr *size) {
   *addr = tp - RoundUpTo(*size, align);
   *size = tp - *addr + ThreadDescriptorSize();
 #      else
-  if (SANITIZER_GLIBC)
-    *size += 1664;
-  else if (SANITIZER_FREEBSD)
-    *size += 128;  // RTLD_STATIC_TLS_EXTRA
-#        if defined(__mips__) || defined(__powerpc64__) || SANITIZER_RISCV64
+#        if SANITIZER_GLIBC
+  *size += 1664;
+#        elif SANITIZER_FREEBSD
+  *size += 128;  // RTLD_STATIC_TLS_EXTRA
+#          if defined(__mips__) || defined(__powerpc64__) || SANITIZER_RISCV64
   const uptr pre_tcb_size = TlsPreTcbSize();
   *addr -= pre_tcb_size;
   *size += pre_tcb_size;
-#        else
+#          else
   // arm and aarch64 reserve two words at TP, so this underestimates the range.
   // However, this is sufficient for the purpose of finding the pointers to
   // thread-specific data keys.
   const uptr tcb_size = ThreadDescriptorSize();
   *addr -= tcb_size;
   *size += tcb_size;
+#          endif
 #        endif
 #      endif
 #    elif SANITIZER_NETBSD
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
index a5311d266b0c4..ec5f2edab6a64 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
@@ -96,7 +96,7 @@
 # include <sys/ptrace.h>
 #    if defined(__mips64) || defined(__aarch64__) || defined(__arm__) ||       \
         defined(__hexagon__) || defined(__loongarch__) || SANITIZER_RISCV64 || \
-        defined(__sparc__)
+        defined(__sparc__) || defined(__powerpc64__)
 #      include <asm/ptrace.h>
 #      ifdef __arm__
 typedef struct user_fpregs elf_fpregset_t;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp
index b05c67e354418..24929b8c4bd7c 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp
@@ -31,7 +31,8 @@
 #include <sys/types.h> // for pid_t
 #include <sys/uio.h> // for iovec
 #include <elf.h> // for NT_PRSTATUS
-#if (defined(__aarch64__) || SANITIZER_RISCV64 || SANITIZER_LOONGARCH64) && \
+#if (defined(__aarch64__) || defined(__powerpc64__) || \
+     SANITIZER_RISCV64 || SANITIZER_LOONGARCH64) &&    \
      !SANITIZER_ANDROID
 // GLIBC 2.20+ sys/user does not include asm/ptrace.h
 # include <asm/ptrace.h>

>From 1f7cca7e3d3c41eec981a8ddb96ef73576b45ce9 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 10 Apr 2025 22:30:40 +0100
Subject: [PATCH 33/38] [VPlan] Introduce VPInstructionWithType, use instead of
 VPScalarCast(NFC) (#129706)

There are some opcodes that currently require specialized recipes, due
to their result type not being implied by their operands, including
casts.

This leads to duplication from defining multiple full recipes.

This patch introduces a new VPInstructionWithType subclass that also
stores the result type. The general idea is to have opcodes needing to
specify a result type to use this general recipe. The current patch
replaces VPScalarCastRecipe with VPInstructionWithType; a similar patch
for VPWidenCastRecipe will follow soon.

There are a few proposed opcodes that should also benefit, without the
need of workarounds:
* https://github.com/llvm/llvm-project/pull/129508
* https://github.com/llvm/llvm-project/pull/119284

PR: https://github.com/llvm/llvm-project/pull/129706
---
 .../Vectorize/LoopVectorizationPlanner.h      |   6 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    |   3 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         | 102 +++++++++---------
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |   8 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  72 +++++++------
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp  |   6 +-
 llvm/lib/Transforms/Vectorize/VPlanUtils.h    |   3 +-
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   1 -
 .../Transforms/Vectorize/VPlanVerifier.cpp    |   4 +-
 .../RISCV/vplan-vp-call-intrinsics.ll         |  18 ++--
 .../RISCV/vplan-vp-cast-intrinsics.ll         |  20 ++--
 ...an-vp-intrinsics-fixed-order-recurrence.ll |   4 +-
 .../RISCV/vplan-vp-intrinsics-reduction.ll    |   4 +-
 .../RISCV/vplan-vp-intrinsics.ll              |   2 +-
 .../RISCV/vplan-vp-select-intrinsics.ll       |   2 +-
 .../interleave-and-scalarize-only.ll          |   4 +-
 16 files changed, 132 insertions(+), 127 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 12d615d9adbcc..f80379b980bec 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -249,10 +249,10 @@ class VPBuilder {
         new VPDerivedIVRecipe(Kind, FPBinOp, Start, Current, Step, Name));
   }
 
-  VPScalarCastRecipe *createScalarCast(Instruction::CastOps Opcode, VPValue *Op,
-                                       Type *ResultTy, DebugLoc DL) {
+  VPInstruction *createScalarCast(Instruction::CastOps Opcode, VPValue *Op,
+                                  Type *ResultTy, DebugLoc DL) {
     return tryInsertInstruction(
-        new VPScalarCastRecipe(Opcode, Op, ResultTy, DL));
+        new VPInstructionWithType(Opcode, Op, ResultTy, DL));
   }
 
   VPWidenCastRecipe *createWidenCast(Instruction::CastOps Opcode, VPValue *Op,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 141a4fd83c833..a4d546f698d5f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4462,7 +4462,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       switch (R.getVPDefID()) {
       case VPDef::VPDerivedIVSC:
       case VPDef::VPScalarIVStepsSC:
-      case VPDef::VPScalarCastSC:
       case VPDef::VPReplicateSC:
       case VPDef::VPInstructionSC:
       case VPDef::VPCanonicalIVPHISC:
@@ -10679,8 +10678,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
       assert(all_of(IV->users(),
                     [](const VPUser *U) {
                       return isa<VPScalarIVStepsRecipe>(U) ||
-                             isa<VPScalarCastRecipe>(U) ||
                              isa<VPDerivedIVRecipe>(U) ||
+                             cast<VPRecipeBase>(U)->isScalarCast() ||
                              cast<VPInstruction>(U)->getOpcode() ==
                                  Instruction::Add;
                     }) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 669e8f371d3d3..7cdcb24e9760b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -457,6 +457,9 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// Returns the debug location of the recipe.
   DebugLoc getDebugLoc() const { return DL; }
 
+  /// Return true if the recipe is a scalar cast.
+  bool isScalarCast() const;
+
 protected:
   /// Compute the cost of this recipe either using a recipe's specialized
   /// implementation or using the legacy cost model and the underlying
@@ -531,7 +534,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPWidenIntOrFpInductionSC:
     case VPRecipeBase::VPWidenPointerInductionSC:
     case VPRecipeBase::VPReductionPHISC:
-    case VPRecipeBase::VPScalarCastSC:
     case VPRecipeBase::VPPartialReductionSC:
       return true;
     case VPRecipeBase::VPBranchOnMaskSC:
@@ -1025,6 +1027,56 @@ class VPInstruction : public VPRecipeWithIRFlags,
   StringRef getName() const { return Name; }
 };
 
+/// A specialization of VPInstruction augmenting it with a dedicated result
+/// type, to be used when the opcode and operands of the VPInstruction don't
+/// directly determine the result type. Note that there is no separate VPDef ID
+/// for VPInstructionWithType; it shares the same ID as VPInstruction and is
+/// distinguished purely by the opcode.
+class VPInstructionWithType : public VPInstruction {
+  /// Scalar result type produced by the recipe.
+  Type *ResultTy;
+
+public:
+  VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands,
+                        Type *ResultTy, DebugLoc DL, const Twine &Name = "")
+      : VPInstruction(Opcode, Operands, DL, Name), ResultTy(ResultTy) {}
+
+  static inline bool classof(const VPRecipeBase *R) {
+    // VPInstructionWithType are VPInstructions with specific opcodes requiring
+    // type information.
+    return R->isScalarCast();
+  }
+
+  static inline bool classof(const VPUser *R) {
+    return isa<VPInstructionWithType>(cast<VPRecipeBase>(R));
+  }
+
+  VPInstruction *clone() override {
+    SmallVector<VPValue *, 2> Operands(operands());
+    auto *New = new VPInstructionWithType(
+        getOpcode(), Operands, getResultType(), getDebugLoc(), getName());
+    New->setUnderlyingValue(getUnderlyingValue());
+    return New;
+  }
+
+  void execute(VPTransformState &State) override;
+
+  /// Return the cost of this VPInstruction.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override {
+    // TODO: Compute accurate cost after retiring the legacy cost model.
+    return 0;
+  }
+
+  Type *getResultType() const { return ResultTy; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe to wrap on original IR instruction not to be modified during
 /// execution, except for PHIs. PHIs are modeled via the VPIRPhi subclass.
 /// Expect PHIs, VPIRInstructions cannot have any operands.
@@ -1211,54 +1263,6 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
   Type *getResultType() const { return ResultTy; }
 };
 
-/// VPScalarCastRecipe is a recipe to create scalar cast instructions.
-class VPScalarCastRecipe : public VPSingleDefRecipe {
-  Instruction::CastOps Opcode;
-
-  Type *ResultTy;
-
-  Value *generate(VPTransformState &State);
-
-public:
-  VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
-                     DebugLoc DL)
-      : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}, DL), Opcode(Opcode),
-        ResultTy(ResultTy) {}
-
-  ~VPScalarCastRecipe() override = default;
-
-  VPScalarCastRecipe *clone() override {
-    return new VPScalarCastRecipe(Opcode, getOperand(0), ResultTy,
-                                  getDebugLoc());
-  }
-
-  VP_CLASSOF_IMPL(VPDef::VPScalarCastSC)
-
-  void execute(VPTransformState &State) override;
-
-  /// Return the cost of this VPScalarCastRecipe.
-  InstructionCost computeCost(ElementCount VF,
-                              VPCostContext &Ctx) const override {
-    // TODO: Compute accurate cost after retiring the legacy cost model.
-    return 0;
-  }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void print(raw_ostream &O, const Twine &Indent,
-             VPSlotTracker &SlotTracker) const override;
-#endif
-
-  /// Returns the result type of the cast.
-  Type *getResultType() const { return ResultTy; }
-
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
-    // At the moment, only uniform codegen is implemented.
-    assert(is_contained(operands(), Op) &&
-           "Op must be an operand of the recipe");
-    return true;
-  }
-};
-
 /// A recipe for widening vector intrinsics.
 class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
   /// ID of the vector intrinsic to widen.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 24a166bd336d1..142a7080ec761 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -261,20 +261,18 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
                 VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
             return inferScalarType(R->getOperand(0));
           })
+          // VPInstructionWithType must be handled before VPInstruction.
+          .Case<VPInstructionWithType, VPWidenIntrinsicRecipe>(
+              [](const auto *R) { return R->getResultType(); })
           .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
                 VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
               [this](const auto *R) { return inferScalarTypeForRecipe(R); })
-          .Case<VPWidenIntrinsicRecipe>([](const VPWidenIntrinsicRecipe *R) {
-            return R->getResultType();
-          })
           .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
             // TODO: Use info from interleave group.
             return V->getUnderlyingValue()->getType();
           })
           .Case<VPWidenCastRecipe>(
               [](const VPWidenCastRecipe *R) { return R->getResultType(); })
-          .Case<VPScalarCastRecipe>(
-              [](const VPScalarCastRecipe *R) { return R->getResultType(); })
           .Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
             return R->getSCEV()->getType();
           })
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 35e141ed01267..7e4800a324e38 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -142,7 +142,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   switch (getVPDefID()) {
   case VPDerivedIVSC:
   case VPPredInstPHISC:
-  case VPScalarCastSC:
   case VPVectorEndPointerSC:
     return false;
   case VPInstructionSC:
@@ -278,6 +277,11 @@ bool VPRecipeBase::isPhi() const {
           cast<VPInstruction>(this)->getOpcode() == Instruction::PHI);
 }
 
+bool VPRecipeBase::isScalarCast() const {
+  auto *VPI = dyn_cast<VPInstruction>(this);
+  return VPI && Instruction::isCast(VPI->getOpcode());
+}
+
 InstructionCost
 VPPartialReductionRecipe::computeCost(ElementCount VF,
                                       VPCostContext &Ctx) const {
@@ -417,7 +421,7 @@ bool VPInstruction::doesGeneratePerAllLanes() const {
 }
 
 bool VPInstruction::canGenerateScalarForFirstLane() const {
-  if (Instruction::isBinaryOp(getOpcode()))
+  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
     return true;
   if (isSingleScalar() || isVectorToScalar())
     return true;
@@ -908,7 +912,7 @@ void VPInstruction::execute(VPTransformState &State) {
 }
 
 bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
-  if (Instruction::isBinaryOp(getOpcode()))
+  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
     return false;
   switch (getOpcode()) {
   case Instruction::ExtractElement:
@@ -932,7 +936,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
 
 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
-  if (Instruction::isBinaryOp(getOpcode()))
+  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
     return vputils::onlyFirstLaneUsed(this);
 
   switch (getOpcode()) {
@@ -1070,6 +1074,35 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+void VPInstructionWithType::execute(VPTransformState &State) {
+  State.setDebugLocFrom(getDebugLoc());
+  assert(vputils::onlyFirstLaneUsed(this) &&
+         "Codegen only implemented for first lane.");
+  switch (getOpcode()) {
+  case Instruction::ZExt:
+  case Instruction::Trunc: {
+    Value *Op = State.get(getOperand(0), VPLane(0));
+    Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
+                                           Op, ResultTy);
+    State.set(this, Cast, VPLane(0));
+    break;
+  }
+  default:
+    llvm_unreachable("opcode not implemented yet");
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent,
+                                  VPSlotTracker &SlotTracker) const {
+  O << Indent << "EMIT ";
+  printAsOperand(O, SlotTracker);
+  O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
+  printOperands(O, SlotTracker);
+  O << " to " << *ResultTy;
+}
+#endif
+
 VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
   if (auto *Phi = dyn_cast<PHINode>(&I))
     return new VPIRPhi(*Phi);
@@ -2551,37 +2584,6 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-Value *VPScalarCastRecipe ::generate(VPTransformState &State) {
-  assert(vputils::onlyFirstLaneUsed(this) &&
-         "Codegen only implemented for first lane.");
-  switch (Opcode) {
-  case Instruction::SExt:
-  case Instruction::ZExt:
-  case Instruction::Trunc: {
-    // Note: SExt/ZExt not used yet.
-    Value *Op = State.get(getOperand(0), VPLane(0));
-    return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
-  }
-  default:
-    llvm_unreachable("opcode not implemented yet");
-  }
-}
-
-void VPScalarCastRecipe ::execute(VPTransformState &State) {
-  State.set(this, generate(State), VPLane(0));
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent,
-                                VPSlotTracker &SlotTracker) const {
-  O << Indent << "SCALAR-CAST ";
-  printAsOperand(O, SlotTracker);
-  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
-  printOperands(O, SlotTracker);
-  O << " to " << *ResultTy;
-}
-#endif
-
 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
   assert(State.Lane && "Branch on Mask works only on single instance.");
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 475f35f5c40d3..2db4957409c8d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -111,7 +111,11 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
                (isa<LoadInst, StoreInst>(R->getUnderlyingValue())) &&
                all_of(R->operands(), isUniformAcrossVFsAndUFs);
       })
-      .Case<VPScalarCastRecipe, VPWidenCastRecipe>([](const auto *R) {
+      .Case<VPInstruction>([](const auto *VPI) {
+        return VPI->isScalarCast() &&
+               isUniformAcrossVFsAndUFs(VPI->getOperand(0));
+      })
+      .Case<VPWidenCastRecipe>([](const auto *R) {
         // A cast is uniform according to its operand.
         return isUniformAcrossVFsAndUFs(R->getOperand(0));
       })
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 87c5797d9e452..51c53196a1141 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -45,8 +45,7 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) {
     return true;
   if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV))
     return Rep->isUniform();
-  if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe, VPScalarCastRecipe,
-          VPBlendRecipe>(VPV))
+  if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe, VPBlendRecipe>(VPV))
     return all_of(VPV->getDefiningRecipe()->operands(),
                   isUniformAfterVectorization);
   if (auto *VPI = dyn_cast<VPInstruction>(VPV))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index d322fdfa727e4..cbae37d48cd19 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -335,7 +335,6 @@ class VPDef {
     VPReductionSC,
     VPPartialReductionSC,
     VPReplicateSC,
-    VPScalarCastSC,
     VPScalarIVStepsSC,
     VPVectorPointerSC,
     VPVectorEndPointerSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index f7fa659ba6a8a..3f10b1756d7a1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -147,8 +147,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
             [&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
         .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe>(
             [&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
-        .Case<VPScalarCastRecipe>(
-            [&](const VPScalarCastRecipe *S) { return VerifyEVLUse(*S, 0); })
+        .Case<VPInstructionWithType>(
+            [&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
         .Case<VPInstruction>([&](const VPInstruction *I) {
           if (I->getOpcode() == Instruction::PHI)
             return VerifyEVLUse(*I, 1);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
index cfef5153b44f6..2dd12f70f128d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
@@ -35,7 +35,7 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[SMAX]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -92,7 +92,7 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[SMIN]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -149,7 +149,7 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[UMAX]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -206,7 +206,7 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[UMIN]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -260,7 +260,7 @@ define void @vp_ctlz(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[CTLZ]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -312,7 +312,7 @@ define void @vp_cttz(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[CTTZ]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -366,7 +366,7 @@ define void @vp_lrint(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[TRUNC]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -422,7 +422,7 @@ define void @vp_llrint(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[TRUNC]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -476,7 +476,7 @@ define void @vp_abs(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[ABS]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
index ac9ed9f9a1fdb..ada84d4ef833f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
@@ -31,7 +31,7 @@ define void @vp_sext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[SEXT]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -85,7 +85,7 @@ define void @vp_zext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[ZEXT]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -137,7 +137,7 @@ define void @vp_trunc(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[TRUNC]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -189,7 +189,7 @@ define void @vp_fpext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[FPEXT]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -241,7 +241,7 @@ define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[FPTRUNC]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -293,7 +293,7 @@ define void @vp_sitofp(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[SITOFP]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -345,7 +345,7 @@ define void @vp_uitofp(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[UITOFP]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -397,7 +397,7 @@ define void @vp_fptosi(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[FPTOSI]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -449,7 +449,7 @@ define void @vp_fptoui(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[FPTOUI]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -501,7 +501,7 @@ define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[INTTOPTR]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll
index ed25e1e03906b..aea19dd261799 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll
@@ -19,7 +19,7 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; IF-EVL-NEXT: Successor(s): vector.ph
 ; IF-EVL-EMPTY:
 ; IF-EVL: vector.ph:
-; IF-EVL-NEXT:  SCALAR-CAST vp<[[VF32:%[0-9]+]]> = trunc vp<[[VF]]> to i32
+; IF-EVL-NEXT:  EMIT vp<[[VF32:%[0-9]+]]> = trunc vp<[[VF]]> to i32
 ; IF-EVL-NEXT: Successor(s): vector loop
 ; IF-EVL-EMPTY:
 ; IF-EVL: <x1> vector loop: {
@@ -39,7 +39,7 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds nuw ir<%B>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[ADD]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index 6546fccb14fc2..bfaa09241e30b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -52,7 +52,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-NEXT:    WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
 ; IF-EVL-OUTLOOP-NEXT:    WIDEN ir<[[ADD:%.+]]> = add ir<[[LD1]]>, ir<[[RDX_PHI]]>
 ; IF-EVL-OUTLOOP-NEXT:    WIDEN-INTRINSIC vp<[[RDX_SELECT]]> = call llvm.vp.merge(ir<true>, ir<[[ADD]]>, ir<[[RDX_PHI]]>, vp<[[EVL]]>)
-; IF-EVL-OUTLOOP-NEXT:    SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-OUTLOOP-NEXT:    EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-OUTLOOP-NEXT:    EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-OUTLOOP-NEXT:    EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-OUTLOOP-NEXT:    EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -92,7 +92,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT:    vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
 ; IF-EVL-INLOOP-NEXT:    WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
 ; IF-EVL-INLOOP-NEXT:    REDUCE ir<[[ADD:%.+]]> = ir<[[RDX_PHI]]> + vp.reduce.add (ir<[[LD1]]>, vp<[[EVL]]>)
-; IF-EVL-INLOOP-NEXT:    SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-INLOOP-NEXT:    EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-INLOOP-NEXT:    EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-INLOOP-NEXT:    EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-INLOOP-NEXT:    EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
index f6bd96affe885..b0b69c74a2299 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
@@ -40,7 +40,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-NEXT:    CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:    vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
 ; IF-EVL-NEXT:    WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:    SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:    EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:    EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:    EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:    EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
index 2f18b4b817a05..e8d10356dd702 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
@@ -49,7 +49,7 @@
  ; IF-EVL-NEXT:     CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[EVL_PHI]]>
  ; IF-EVL-NEXT:     vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]>
  ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
- ; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+ ; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
  ; IF-EVL-NEXT:     EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
  ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT]]> = add vp<[[IV]]>, ir<[[VFUF]]>
  ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  ir<[[VTC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index 9da4aab9c5ce2..671ed2af52621 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -202,14 +202,14 @@ exit:
 ; DBG-NEXT: Successor(s): vector.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: vector.ph:
-; DBG-NEXT:   SCALAR-CAST vp<[[CAST:%.+]]> = trunc ir<1> to i32
+; DBG-NEXT:   EMIT vp<[[CAST:%.+]]> = trunc ir<1> to i32
 ; DBG-NEXT: Successor(s): vector loop
 ; DBG-EMPTY:
 ; DBG-NEXT: <x1> vector loop: {
 ; DBG-NEXT:   vector.body:
 ; DBG-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
 ; DBG-NEXT:     FIRST-ORDER-RECURRENCE-PHI ir<%for> = phi ir<0>, vp<[[SCALAR_STEPS:.+]]>
-; DBG-NEXT:     SCALAR-CAST vp<[[TRUNC_IV:%.+]]> = trunc vp<[[CAN_IV]]> to i32
+; DBG-NEXT:     EMIT vp<[[TRUNC_IV:%.+]]> = trunc vp<[[CAN_IV]]> to i32
 ; DBG-NEXT:     vp<[[SCALAR_STEPS]]> = SCALAR-STEPS vp<[[TRUNC_IV]]>, vp<[[CAST]]>, vp<[[VF]]
 ; DBG-NEXT:     EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%for>, vp<[[SCALAR_STEPS]]>
 ; DBG-NEXT:     CLONE store vp<[[SPLICE]]>, ir<%dst>

>From 8b7ede5f7e350b440037cf9edd795df8586b37b7 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Thu, 10 Apr 2025 14:34:15 -0700
Subject: [PATCH 34/38] [RISCV] Add coverage for missing vrgather.vi shuffle
 case

If we have a near identity shuffle with a single element repeated, we
manage to match this as a masked vrgather.vi for the two operand
forms, but not the single operand form.  If the scalar being repeated
was a scalar just inserted into the vector, we're also missing a
chance to recognize a vmerge.vxm or vmerge.vim in both cases.
---
 .../RISCV/rvv/fixed-vectors-shuffle-int.ll    | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
index 2020f3dc6bad2..b26fc5653afec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
@@ -1415,3 +1415,67 @@ define <8 x i32> @shuffle_v8i32_locally_repeating_neg(<8 x i32> %a) {
   %res = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 1, i32 0, i32 poison, i32 poison, i32 5, i32 4, i32 6, i32 6>
   ret <8 x i32> %res
 }
+
+define <8 x i8> @identity_splat0(<8 x i8> %v) {
+; CHECK-LABEL: identity_splat0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI88_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI88_0)
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vle8.v v10, (a0)
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %shuf = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 0, i32 0, i32 5, i32 6, i32 7>
+  ret <8 x i8> %shuf
+}
+
+define <8 x i8> @identity_splat2(<8 x i8> %v) {
+; CHECK-LABEL: identity_splat2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI89_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI89_0)
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vle8.v v10, (a0)
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %shuf = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 5, i32 6, i32 7>
+  ret <8 x i8> %shuf
+}
+
+
+define <8 x i8> @vmerge_vxm(<8 x i8> %v, i8 %s) {
+; CHECK-LABEL: vmerge_vxm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(.LCPI90_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI90_0)
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vle8.v v10, (a1)
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, tu, ma
+; CHECK-NEXT:    vmv.s.x v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %ins = insertelement <8 x i8> %v, i8 %s, i32 0
+  %shuf = shufflevector <8 x i8> %ins, <8 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 0, i32 0, i32 5, i32 6, i32 7>
+  ret <8 x i8> %shuf
+}
+
+define <8 x i8> @vmerge_vxm2(<8 x i8> %v, i8 %s) {
+; CHECK-LABEL: vmerge_vxm2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 25
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, tu, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vmv1r.v v9, v8
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vrgather.vi v8, v9, 0, v0.t
+; CHECK-NEXT:    ret
+  %ins = insertelement <8 x i8> %v, i8 %s, i32 0
+  %shuf = shufflevector <8 x i8> %v, <8 x i8> %ins, <8 x i32> <i32 8, i32 1, i32 2, i32 8, i32 8, i32 5, i32 6, i32 7>
+  ret <8 x i8> %shuf
+}
+

>From e581c37907f061228af77f944b05dcb6ab05b92d Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <jeffrey.byrnes at amd.com>
Date: Thu, 10 Apr 2025 14:43:42 -0700
Subject: [PATCH 35/38] [AMDGPU] Make the iterative schedulers selectable via
 amdgpu-sched-strategy (#135042)

Currently, the only way for users to try these schedulers is via
`-misched=`. However, this overrides the default scheduler for all
targets. This causes problems for various toolchains / drivers which
spawn jobs for both x86 and AMDGPU -- e.g. hipcc. On the other hand,
`amdgpu-sched-strategy` only changes the scheduler for the AMDGPU target.
---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp          | 9 +++++++++
 llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll | 4 ++++
 llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll | 1 +
 3 files changed, 14 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2c6729f02ae64..e06d5a08e06a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1088,6 +1088,15 @@ GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
   if (SchedStrategy == "max-memory-clause")
     return createGCNMaxMemoryClauseMachineScheduler(C);
 
+  if (SchedStrategy == "iterative-ilp")
+    return createIterativeILPMachineScheduler(C);
+
+  if (SchedStrategy == "iterative-minreg")
+    return createMinRegScheduler(C);
+
+  if (SchedStrategy == "iterative-maxocc")
+    return createIterativeGCNMaxOccupancyMachineScheduler(C);
+
   return createGCNMaxOccupancyMachineScheduler(C);
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
index 2c9d24ee04ebf..7d771342a598e 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -1,7 +1,11 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MINREG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MINREG %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
 
 ; SI-MINREG: NumSgprs: {{[1-9]$}}
 ; SI-MINREG: NumVgprs: {{[1-9]$}}
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
index 96b40bca5e2e3..ef24996d00274 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=MISCHED %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-sched-strategy=iterative-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s
 
 ; Test the scheduler when only one wave is requested. The result should be high register usage and max ILP.
 

>From 1d0aab7e2572e688538fa6b9c8bedbb25c10763c Mon Sep 17 00:00:00 2001
From: erichkeane <ekeane at nvidia.com>
Date: Thu, 10 Apr 2025 14:47:48 -0700
Subject: [PATCH 36/38] [OpenACC] device_type on set should have only 1
 architecture

Discussions with the OpenACC standard folks and the dialect folks showed
that allowing 'set' to have a 'device_type' clause with more than one
architecture was a mistake, and one that will be fixed in future
revisions of the standard.  Since the dialect requires this anyway,
we'll implement the restriction in advance of standardization.
---
 clang/include/clang/Basic/DiagnosticSemaKinds.td |  3 +++
 clang/lib/Sema/SemaOpenACCClause.cpp             | 10 ++++++++++
 clang/test/SemaOpenACC/set-construct.cpp         |  8 ++++++++
 3 files changed, 21 insertions(+)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index fdf3f8816e746..3cb2731488fab 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13063,6 +13063,9 @@ def err_acc_invalid_modifier
     : Error<"OpenACC '%0' modifier not valid on '%1' clause">;
 def err_acc_invalid_default_type
     : Error<"invalid value %0 in '%1' clause; valid values are %2">;
+def err_acc_device_type_multiple_archs
+    : Error<"OpenACC 'device_type' clause on a 'set' construct only permits "
+            "one architecture">;
 
 // AMDGCN builtins diagnostics
 def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">;
diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp
index 8aa0c52f0ce19..ab25dcfd1a081 100644
--- a/clang/lib/Sema/SemaOpenACCClause.cpp
+++ b/clang/lib/Sema/SemaOpenACCClause.cpp
@@ -1338,6 +1338,16 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceTypeClause(
       checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause))
     return nullptr;
 
+  // Based on discussions, having more than 1 'architecture' on a 'set' is
+  // nonsensical, so we're going to fix the standard to reflect this.  Implement
+  // the limitation, since the Dialect requires this.
+  if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Set &&
+      Clause.getDeviceTypeArchitectures().size() > 1) {
+    SemaRef.Diag(Clause.getDeviceTypeArchitectures()[1].second,
+                 diag::err_acc_device_type_multiple_archs);
+    return nullptr;
+  }
+
   // The list of valid device_type values. Flang also has these hardcoded in
   // openacc_parsers.cpp, as there does not seem to be a reliable backend
   // source. The list below is sourced from Flang, though NVC++ supports only
diff --git a/clang/test/SemaOpenACC/set-construct.cpp b/clang/test/SemaOpenACC/set-construct.cpp
index 9a90ea67b1bd2..ac141d097532b 100644
--- a/clang/test/SemaOpenACC/set-construct.cpp
+++ b/clang/test/SemaOpenACC/set-construct.cpp
@@ -66,4 +66,12 @@ void uses() {
   // expected-error at +2{{'if' clause cannot appear more than once on a 'set' directive}}
   // expected-note at +1{{previous clause is here}}
 #pragma acc set device_type(acc_device_nvidia) if(true) if (true)
+
+  // expected-error at +2{{OpenACC 'device_type' clause on a 'set' construct only permits one architecture}}
+  // expected-error at +1{{OpenACC 'set' construct must have at least one 'default_async', 'device_num', 'device_type' or 'if' clause}}
+#pragma acc set device_type(nvidia, radeon)
+
+  // expected-error at +2{{OpenACC 'device_type' clause on a 'set' construct only permits one architecture}}
+  // expected-error at +1{{OpenACC 'set' construct must have at least one 'default_async', 'device_num', 'device_type' or 'if' clause}}
+#pragma acc set device_type(nonsense, nvidia, radeon)
 }

>From 06eb14768cf7df8447c9ff21650b546c98e24f9e Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96 at programmer.net>
Date: Fri, 11 Apr 2025 00:05:22 +0200
Subject: [PATCH 37/38] [CIR] Upstream ArraySubscriptExpr for fixed size array
 (#134536)

This change adds support for ArraySubscriptExpr on fixed-size ArrayType.

Issue #130197
---
 clang/include/clang/CIR/MissingFeatures.h  |   2 +
 clang/lib/CIR/CodeGen/CIRGenBuilder.cpp    |  40 ++
 clang/lib/CIR/CodeGen/CIRGenBuilder.h      |  13 +
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp       | 138 +++++++
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp |  10 +
 clang/lib/CIR/CodeGen/CIRGenFunction.cpp   |   2 +
 clang/lib/CIR/CodeGen/CIRGenFunction.h     |   2 +
 clang/lib/CIR/CodeGen/CIRGenValue.h        |   2 +
 clang/lib/CIR/CodeGen/CMakeLists.txt       |   1 +
 clang/test/CIR/CodeGen/array.cpp           | 405 ++++++++++++++++-----
 clang/test/CIR/Lowering/array.cpp          |  40 +-
 11 files changed, 556 insertions(+), 99 deletions(-)
 create mode 100644 clang/lib/CIR/CodeGen/CIRGenBuilder.cpp

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index bacb7879a5527..c39590421b647 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -141,6 +141,8 @@ struct MissingFeatures {
   static bool mangledNames() { return false; }
   static bool setDLLStorageClass() { return false; }
   static bool openMP() { return false; }
+  static bool emitCheckedInBoundsGEP() { return false; }
+  static bool preservedAccessIndexRegion() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp
new file mode 100644
index 0000000000000..2f1940e656172
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenBuilder.h"
+
+using namespace clang::CIRGen;
+
+mlir::Value CIRGenBuilderTy::maybeBuildArrayDecay(mlir::Location loc,
+                                                  mlir::Value arrayPtr,
+                                                  mlir::Type eltTy) {
+  const auto arrayPtrTy = mlir::cast<cir::PointerType>(arrayPtr.getType());
+  const auto arrayTy = mlir::dyn_cast<cir::ArrayType>(arrayPtrTy.getPointee());
+
+  if (arrayTy) {
+    const cir::PointerType flatPtrTy = getPointerTo(arrayTy.getEltType());
+    return create<cir::CastOp>(loc, flatPtrTy, cir::CastKind::array_to_ptrdecay,
+                               arrayPtr);
+  }
+
+  assert(arrayPtrTy.getPointee() == eltTy &&
+         "flat pointee type must match original array element type");
+  return arrayPtr;
+}
+
+mlir::Value CIRGenBuilderTy::getArrayElement(mlir::Location arrayLocBegin,
+                                             mlir::Location arrayLocEnd,
+                                             mlir::Value arrayPtr,
+                                             mlir::Type eltTy, mlir::Value idx,
+                                             bool shouldDecay) {
+  mlir::Value basePtr = arrayPtr;
+  if (shouldDecay)
+    basePtr = maybeBuildArrayDecay(arrayLocBegin, arrayPtr, eltTy);
+  const mlir::Type flatPtrTy = basePtr.getType();
+  return create<cir::PtrStrideOp>(arrayLocEnd, flatPtrTy, basePtr, idx);
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 61a747254b3d0..0d8568742e960 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -198,6 +198,19 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
 
     return create<cir::BinOp>(loc, cir::BinOpKind::Div, lhs, rhs);
   }
+
+  /// Create a cir.ptr_stride operation to get access to an array element.
+  /// \p idx is the index of the element to access, \p shouldDecay is true if
+  /// the result should decay to a pointer to the element type.
+  mlir::Value getArrayElement(mlir::Location arrayLocBegin,
+                              mlir::Location arrayLocEnd, mlir::Value arrayPtr,
+                              mlir::Type eltTy, mlir::Value idx,
+                              bool shouldDecay);
+
+  /// Returns a decayed pointer to the first element of the array
+  /// pointed to by \p arrayPtr.
+  mlir::Value maybeBuildArrayDecay(mlir::Location loc, mlir::Value arrayPtr,
+                                   mlir::Type eltTy);
 };
 
 } // namespace clang::CIRGen
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 4c20170d75131..b38ed4d0a14e8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -12,6 +12,7 @@
 
 #include "Address.h"
 #include "CIRGenFunction.h"
+#include "CIRGenModule.h"
 #include "CIRGenValue.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "clang/AST/Attr.h"
@@ -430,6 +431,143 @@ LValue CIRGenFunction::emitUnaryOpLValue(const UnaryOperator *e) {
   llvm_unreachable("Unknown unary operator kind!");
 }
 
+/// If the specified expr is a simple decay from an array to pointer,
+/// return the array subexpression.
+/// FIXME: this could be abstracted into a common AST helper.
+static const Expr *getSimpleArrayDecayOperand(const Expr *e) {
+  // If this isn't just an array->pointer decay, bail out.
+  const auto *castExpr = dyn_cast<CastExpr>(e);
+  if (!castExpr || castExpr->getCastKind() != CK_ArrayToPointerDecay)
+    return nullptr;
+
+  // If this is a decay from variable width array, bail out.
+  const Expr *subExpr = castExpr->getSubExpr();
+  if (subExpr->getType()->isVariableArrayType())
+    return nullptr;
+
+  return subExpr;
+}
+
+static cir::IntAttr getConstantIndexOrNull(mlir::Value idx) {
+  // TODO(cir): should we consider using MLIRs IndexType instead of IntegerAttr?
+  if (auto constantOp = dyn_cast<cir::ConstantOp>(idx.getDefiningOp()))
+    return mlir::dyn_cast<cir::IntAttr>(constantOp.getValue());
+  return {};
+}
+
+static CharUnits getArrayElementAlign(CharUnits arrayAlign, mlir::Value idx,
+                                      CharUnits eltSize) {
+  // If we have a constant index, we can use the exact offset of the
+  // element we're accessing.
+  const cir::IntAttr constantIdx = getConstantIndexOrNull(idx);
+  if (constantIdx) {
+    const CharUnits offset = constantIdx.getValue().getZExtValue() * eltSize;
+    return arrayAlign.alignmentAtOffset(offset);
+  }
+  // Otherwise, use the worst-case alignment for any element.
+  return arrayAlign.alignmentOfArrayElement(eltSize);
+}
+
+static QualType getFixedSizeElementType(const ASTContext &astContext,
+                                        const VariableArrayType *vla) {
+  QualType eltType;
+  do {
+    eltType = vla->getElementType();
+  } while ((vla = astContext.getAsVariableArrayType(eltType)));
+  return eltType;
+}
+
+static mlir::Value emitArraySubscriptPtr(CIRGenFunction &cgf,
+                                         mlir::Location beginLoc,
+                                         mlir::Location endLoc, mlir::Value ptr,
+                                         mlir::Type eltTy, mlir::Value idx,
+                                         bool shouldDecay) {
+  CIRGenModule &cgm = cgf.getCIRGenModule();
+  // TODO(cir): LLVM codegen emits in bound gep check here, is there anything
+  // that would enhance tracking this later in CIR?
+  assert(!cir::MissingFeatures::emitCheckedInBoundsGEP());
+  return cgm.getBuilder().getArrayElement(beginLoc, endLoc, ptr, eltTy, idx,
+                                          shouldDecay);
+}
+
+static Address emitArraySubscriptPtr(CIRGenFunction &cgf,
+                                     mlir::Location beginLoc,
+                                     mlir::Location endLoc, Address addr,
+                                     QualType eltType, mlir::Value idx,
+                                     mlir::Location loc, bool shouldDecay) {
+
+  // Determine the element size of the statically-sized base.  This is
+  // the thing that the indices are expressed in terms of.
+  if (const VariableArrayType *vla =
+          cgf.getContext().getAsVariableArrayType(eltType)) {
+    eltType = getFixedSizeElementType(cgf.getContext(), vla);
+  }
+
+  // We can use that to compute the best alignment of the element.
+  const CharUnits eltSize = cgf.getContext().getTypeSizeInChars(eltType);
+  const CharUnits eltAlign =
+      getArrayElementAlign(addr.getAlignment(), idx, eltSize);
+
+  assert(!cir::MissingFeatures::preservedAccessIndexRegion());
+  const mlir::Value eltPtr =
+      emitArraySubscriptPtr(cgf, beginLoc, endLoc, addr.getPointer(),
+                            addr.getElementType(), idx, shouldDecay);
+  const mlir::Type elementType = cgf.convertTypeForMem(eltType);
+  return Address(eltPtr, elementType, eltAlign);
+}
+
+LValue
+CIRGenFunction::emitArraySubscriptExpr(const clang::ArraySubscriptExpr *e) {
+  if (e->getBase()->getType()->isVectorType() &&
+      !isa<ExtVectorElementExpr>(e->getBase())) {
+    cgm.errorNYI(e->getSourceRange(), "emitArraySubscriptExpr: VectorType");
+    return LValue::makeAddr(Address::invalid(), e->getType(), LValueBaseInfo());
+  }
+
+  if (isa<ExtVectorElementExpr>(e->getBase())) {
+    cgm.errorNYI(e->getSourceRange(),
+                 "emitArraySubscriptExpr: ExtVectorElementExpr");
+    return LValue::makeAddr(Address::invalid(), e->getType(), LValueBaseInfo());
+  }
+
+  if (getContext().getAsVariableArrayType(e->getType())) {
+    cgm.errorNYI(e->getSourceRange(),
+                 "emitArraySubscriptExpr: VariableArrayType");
+    return LValue::makeAddr(Address::invalid(), e->getType(), LValueBaseInfo());
+  }
+
+  if (e->getType()->getAs<ObjCObjectType>()) {
+    cgm.errorNYI(e->getSourceRange(), "emitArraySubscriptExpr: ObjCObjectType");
+    return LValue::makeAddr(Address::invalid(), e->getType(), LValueBaseInfo());
+  }
+
+  // The index must always be an integer, which is not an aggregate.  Emit it
+  // in lexical order (this complexity is, sadly, required by C++17).
+  assert((e->getIdx() == e->getLHS() || e->getIdx() == e->getRHS()) &&
+         "index was neither LHS nor RHS");
+  const mlir::Value idx = emitScalarExpr(e->getIdx());
+  if (const Expr *array = getSimpleArrayDecayOperand(e->getBase())) {
+    LValue arrayLV;
+    if (const auto *ase = dyn_cast<ArraySubscriptExpr>(array))
+      arrayLV = emitArraySubscriptExpr(ase);
+    else
+      arrayLV = emitLValue(array);
+
+    // Propagate the alignment from the array itself to the result.
+    const Address addr = emitArraySubscriptPtr(
+        *this, cgm.getLoc(array->getBeginLoc()), cgm.getLoc(array->getEndLoc()),
+        arrayLV.getAddress(), e->getType(), idx, cgm.getLoc(e->getExprLoc()),
+        /*shouldDecay=*/true);
+
+    return LValue::makeAddr(addr, e->getType(), LValueBaseInfo());
+  }
+
+  // The base must be a pointer; emit it with an estimate of its alignment.
+  cgm.errorNYI(e->getSourceRange(),
+               "emitArraySubscriptExpr: The base must be a pointer");
+  return {};
+}
+
 LValue CIRGenFunction::emitBinaryOperatorLValue(const BinaryOperator *e) {
   // Comma expressions just emit their LHS then their RHS as an l-value.
   if (e->getOpcode() == BO_Comma) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 94b5d955715ae..c2b4110a772a0 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -158,6 +158,16 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
   mlir::Value VisitCastExpr(CastExpr *e);
   mlir::Value VisitCallExpr(const CallExpr *e);
 
+  mlir::Value VisitArraySubscriptExpr(ArraySubscriptExpr *e) {
+    if (e->getBase()->getType()->isVectorType()) {
+      assert(!cir::MissingFeatures::scalableVectors());
+      cgf.getCIRGenModule().errorNYI("VisitArraySubscriptExpr: VectorType");
+      return {};
+    }
+    // Just load the lvalue formed by the subscript expression.
+    return emitLoadOfLValue(e);
+  }
+
   mlir::Value VisitExplicitCastExpr(ExplicitCastExpr *e) {
     return VisitCastExpr(e);
   }
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index b123fc3bb1117..9dace721e7417 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -509,6 +509,8 @@ LValue CIRGenFunction::emitLValue(const Expr *e) {
                                std::string("l-value not implemented for '") +
                                    e->getStmtClassName() + "'");
     return LValue();
+  case Expr::ArraySubscriptExprClass:
+    return emitArraySubscriptExpr(cast<ArraySubscriptExpr>(e));
   case Expr::UnaryOperatorClass:
     return emitUnaryOpLValue(cast<UnaryOperator>(e));
   case Expr::BinaryOperatorClass:
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 311aca8cd3f6f..17aa68492f983 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -434,6 +434,8 @@ class CIRGenFunction : public CIRGenTypeCache {
   /// should be returned.
   RValue emitAnyExpr(const clang::Expr *e);
 
+  LValue emitArraySubscriptExpr(const clang::ArraySubscriptExpr *e);
+
   AutoVarEmission emitAutoVarAlloca(const clang::VarDecl &d);
 
   /// Emit code and set up symbol table for a variable declaration with auto,
diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h
index 1b702daae4b4c..d4d6f5a44622e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenValue.h
+++ b/clang/lib/CIR/CodeGen/CIRGenValue.h
@@ -23,6 +23,8 @@
 
 #include "mlir/IR/Value.h"
 
+#include "clang/CIR/MissingFeatures.h"
+
 namespace clang::CIRGen {
 
 /// This trivial value class is used to represent the result of an
diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
index 16534d9a72587..dc18f7f2af160 100644
--- a/clang/lib/CIR/CodeGen/CMakeLists.txt
+++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
@@ -8,6 +8,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 
 add_clang_library(clangCIR
   CIRGenerator.cpp
+  CIRGenBuilder.cpp
   CIRGenCall.cpp
   CIRGenDecl.cpp
   CIRGenDeclOpenACC.cpp
diff --git a/clang/test/CIR/CodeGen/array.cpp b/clang/test/CIR/CodeGen/array.cpp
index 0d28ebc66f83c..431164c797f76 100644
--- a/clang/test/CIR/CodeGen/array.cpp
+++ b/clang/test/CIR/CodeGen/array.cpp
@@ -1,141 +1,368 @@
-// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o - 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
 
 int a[10];
-// CHECK: cir.global external @a = #cir.zero : !cir.array<!s32i x 10>
+// CIR: cir.global external @a = #cir.zero : !cir.array<!s32i x 10>
+
+// LLVM: @a = dso_local global [10 x i32] zeroinitializer
+
+// OGCG: @a = global [10 x i32] zeroinitializer
 
 int aa[10][5];
-// CHECK: cir.global external @aa = #cir.zero : !cir.array<!cir.array<!s32i x 5> x 10>
+// CIR: cir.global external @aa = #cir.zero : !cir.array<!cir.array<!s32i x 5> x 10>
+
+// LLVM: @aa = dso_local global [10 x [5 x i32]] zeroinitializer
+
+// OGCG: @aa = global [10 x [5 x i32]] zeroinitializer
 
 extern int b[10];
-// CHECK: cir.global external @b = #cir.zero : !cir.array<!s32i x 10>
+// CIR: cir.global external @b = #cir.zero : !cir.array<!s32i x 10>
+
+// LLVM: @b = dso_local global [10 x i32] zeroinitializer
 
 extern int bb[10][5];
-// CHECK: cir.global external @bb = #cir.zero : !cir.array<!cir.array<!s32i x 5> x 10>
+// CIR: cir.global external @bb = #cir.zero : !cir.array<!cir.array<!s32i x 5> x 10>
+
+// LLVM: @bb = dso_local global [10 x [5 x i32]] zeroinitializer
 
 int c[10] = {};
-// CHECK: cir.global external @c = #cir.zero : !cir.array<!s32i x 10>
+// CIR: cir.global external @c = #cir.zero : !cir.array<!s32i x 10>
+
+// LLVM: @c = dso_local global [10 x i32] zeroinitializer
+
+// OGCG: @c = global [10 x i32] zeroinitializer
 
 int d[3] = {1, 2, 3};
-// CHECK: cir.global external @d = #cir.const_array<[#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i]> : !cir.array<!s32i x 3>
+// CIR: cir.global external @d = #cir.const_array<[#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i]> : !cir.array<!s32i x 3>
+
+// LLVM: @d = dso_local global [3 x i32] [i32 1, i32 2, i32 3]
+
+// OGCG: @d = global [3 x i32] [i32 1, i32 2, i32 3]
 
 int dd[3][2] = {{1, 2}, {3, 4}, {5, 6}};
-// CHECK: cir.global external @dd = #cir.const_array<[#cir.const_array<[#cir.int<1> : !s32i, #cir.int<2> : !s32i]> : !cir.array<!s32i x 2>, #cir.const_array<[#cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.array<!s32i x 2>, #cir.const_array<[#cir.int<5> : !s32i, #cir.int<6> : !s32i]> : !cir.array<!s32i x 2>]> : !cir.array<!cir.array<!s32i x 2> x 3>
+// CIR: cir.global external @dd = #cir.const_array<[#cir.const_array<[#cir.int<1> : !s32i, #cir.int<2> : !s32i]> : !cir.array<!s32i x 2>, #cir.const_array<[#cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.array<!s32i x 2>, #cir.const_array<[#cir.int<5> : !s32i, #cir.int<6> : !s32i]> : !cir.array<!s32i x 2>]> : !cir.array<!cir.array<!s32i x 2> x 3>
+
+// LLVM: @dd = dso_local global [3 x [2 x i32]] [
+// LLVM: [2 x i32] [i32 1, i32 2], [2 x i32]
+// LLVM: [i32 3, i32 4], [2 x i32] [i32 5, i32 6]]
+
+// OGCG: @dd = global [3 x [2 x i32]] [
+// OGCG: [2 x i32] [i32 1, i32 2], [2 x i32]
+// OGCG: [i32 3, i32 4], [2 x i32] [i32 5, i32 6]]
 
 int e[10] = {1, 2};
-// CHECK: cir.global external @e = #cir.const_array<[#cir.int<1> : !s32i, #cir.int<2> : !s32i], trailing_zeros> : !cir.array<!s32i x 10>
+// CIR: cir.global external @e = #cir.const_array<[#cir.int<1> : !s32i, #cir.int<2> : !s32i], trailing_zeros> : !cir.array<!s32i x 10>
+
+// LLVM: @e = dso_local global [10 x i32] [i32 1, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0]
 
 int f[5] = {1, 2};
-// CHECK: cir.global external @f = #cir.const_array<[#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.array<!s32i x 5>
+// CIR: cir.global external @f = #cir.const_array<[#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.array<!s32i x 5>
+
+// LLVM: @f = dso_local global [5 x i32] [i32 1, i32 2, i32 0, i32 0, i32 0]
+
+// OGCG: @f = global [5 x i32] [i32 1, i32 2, i32 0, i32 0, i32 0]
+
+// OGCG: @[[FUN2_ARR:.*]] = private unnamed_addr constant [2 x i32] [i32 5, i32 0], align 4
+// OGCG: @[[FUN3_ARR:.*]] = private unnamed_addr constant [2 x i32] [i32 5, i32 6], align 4
+// OGCG: @[[FUN4_ARR:.*]] = private unnamed_addr constant [2 x [1 x i32]] [
+// OGCG: [1 x i32] [i32 5], [1 x i32] [i32 6]], align 4
+// OGCG: @[[FUN5_ARR:.*]] = private unnamed_addr constant [2 x [1 x i32]] [
+// OGCG: [1 x i32] [i32 5], [1 x i32] zeroinitializer], align 4
 
 void func() {
   int arr[10];
-
-  // CHECK: %[[ARR:.*]] = cir.alloca !cir.array<!s32i x 10>, !cir.ptr<!cir.array<!s32i x 10>>, ["arr"]
+  int e = arr[0];
+  int e2 = arr[1];
 }
 
+// CIR: %[[ARR:.*]] = cir.alloca !cir.array<!s32i x 10>, !cir.ptr<!cir.array<!s32i x 10>>, ["arr"]
+// CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
+// CIR: %[[INIT_2:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e2", init]
+// CIR: %[[IDX:.*]] = cir.const #cir.int<0> : !s32i
+// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!s32i x 10>>), !cir.ptr<!s32i>
+// CIR: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr<!s32i>, %[[IDX]] : !s32i), !cir.ptr<!s32i>
+// CIR: %[[TMP:.*]] = cir.load %[[ELE_PTR]] : !cir.ptr<!s32i>, !s32i
+// CIR: cir.store %[[TMP]], %[[INIT]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[IDX:.*]] = cir.const #cir.int<1> : !s32i
+// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!s32i x 10>>), !cir.ptr<!s32i>
+// CIR: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr<!s32i>, %[[IDX]] : !s32i), !cir.ptr<!s32i>
+// CIR: %[[TMP:.*]] = cir.load %[[ELE_PTR]] : !cir.ptr<!s32i>, !s32i
+// CIR: cir.store %[[TMP]], %[[INIT_2]] : !s32i, !cir.ptr<!s32i>
+
+// LLVM: define void @func()
+// LLVM-NEXT: %[[ARR_ALLOCA:.*]] = alloca [10 x i32], i64 1, align 16
+// LLVM-NEXT: %[[INIT:.*]] = alloca i32, i64 1, align 4
+// LLVM-NEXT: %[[INIT_2:.*]] = alloca i32, i64 1, align 4
+// LLVM-NEXT: %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR_ALLOCA]], i32 0
+// LLVM-NEXT: %[[ELE_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 0
+// LLVM-NEXT: %[[TMP:.*]] = load i32, ptr %[[ELE_PTR]], align 4
+// LLVM-NEXT: store i32 %[[TMP]], ptr %[[INIT]], align 4
+// LLVM-NEXT: %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR_ALLOCA]], i32 0
+// LLVM-NEXT: %[[ELE_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
+// LLVM-NEXT: %[[TMP:.*]] = load i32, ptr %[[ELE_PTR]], align 4
+
+// OGCG: %arr = alloca [10 x i32], align 16
+// OGCG: %e = alloca i32, align 4
+// OGCG: %e2 = alloca i32, align 4
+// OGCG: %arrayidx = getelementptr inbounds [10 x i32], ptr %arr, i64 0, i64 0
+// OGCG: %0 = load i32, ptr %arrayidx, align 16
+// OGCG: store i32 %0, ptr %e, align 4
+// OGCG: %arrayidx1 = getelementptr inbounds [10 x i32], ptr %arr, i64 0, i64 1
+// OGCG: %1 = load i32, ptr %arrayidx1, align 4
+// OGCG: store i32 %1, ptr %e2, align 4
+
 void func2() {
   int arr[2] = {5};
-
-  // CHECK: %[[ARR2:.*]] = cir.alloca !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>, ["arr", init]
-  // CHECK: %[[ELE_ALLOCA:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp", init]
-  // CHECK: %[[ARR_2_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR2]] : !cir.ptr<!cir.array<!s32i x 2>>), !cir.ptr<!s32i>
-  // CHECK: %[[V1:.*]] = cir.const #cir.int<5> : !s32i
-  // CHECK: cir.store %[[V1]], %[[ARR_2_PTR]] : !s32i, !cir.ptr<!s32i>
-  // CHECK: %[[OFFSET_0:.*]] = cir.const #cir.int<1> : !s64i
-  // CHECK: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[ARR_2_PTR]] : !cir.ptr<!s32i>, %[[OFFSET_0]] : !s64i), !cir.ptr<!s32i>
-  // CHECK: cir.store %[[ELE_PTR]], %[[ELE_ALLOCA]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-  // CHECK: %[[LOAD_1:.*]] = cir.load %[[ELE_ALLOCA]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-  // CHECK: %[[V2:.*]] = cir.const #cir.int<0> : !s32i
-  // CHECK: cir.store %[[V2]], %[[LOAD_1]] : !s32i, !cir.ptr<!s32i>
-  // CHECK: %[[OFFSET_1:.*]] = cir.const #cir.int<1> : !s64i
-  // CHECK: %[[ELE_1_PTR:.*]] = cir.ptr_stride(%[[LOAD_1]] : !cir.ptr<!s32i>, %[[OFFSET_1]] : !s64i), !cir.ptr<!s32i>
-  // CHECK: cir.store %[[ELE_1_PTR]], %[[ELE_ALLOCA]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
 }
 
+// CIR: %[[ARR2:.*]] = cir.alloca !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>, ["arr", init]
+// CIR: %[[ELE_ALLOCA:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp", init]
+// CIR: %[[ARR_2_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR2]] : !cir.ptr<!cir.array<!s32i x 2>>), !cir.ptr<!s32i>
+// CIR: %[[V1:.*]] = cir.const #cir.int<5> : !s32i
+// CIR: cir.store %[[V1]], %[[ARR_2_PTR]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[OFFSET_0:.*]] = cir.const #cir.int<1> : !s64i
+// CIR: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[ARR_2_PTR]] : !cir.ptr<!s32i>, %[[OFFSET_0]] : !s64i), !cir.ptr<!s32i>
+// CIR: cir.store %[[ELE_PTR]], %[[ELE_ALLOCA]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
+// CIR: %[[LOAD_1:.*]] = cir.load %[[ELE_ALLOCA]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
+// CIR: %[[V2:.*]] = cir.const #cir.int<0> : !s32i
+// CIR: cir.store %[[V2]], %[[LOAD_1]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[OFFSET_1:.*]] = cir.const #cir.int<1> : !s64i
+// CIR: %[[ELE_1_PTR:.*]] = cir.ptr_stride(%[[LOAD_1]] : !cir.ptr<!s32i>, %[[OFFSET_1]] : !s64i), !cir.ptr<!s32i>
+// CIR: cir.store %[[ELE_1_PTR]], %[[ELE_ALLOCA]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
+
+// LLVM: define void @func2()
+// LLVM:  %[[ARR_ALLOCA:.*]] = alloca [2 x i32], i64 1, align 4
+// LLVM:  %[[TMP:.*]] = alloca ptr, i64 1, align 8
+// LLVM:  %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR_ALLOCA]], i32 0
+// LLVM:  store i32 5, ptr %[[ARR_PTR]], align 4
+// LLVM:  %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
+// LLVM:  store ptr %[[ELE_1_PTR]], ptr %[[TMP]], align 8
+// LLVM:  %[[TMP2:.*]] = load ptr, ptr %[[TMP]], align 8
+// LLVM:  store i32 0, ptr %[[TMP2]], align 4
+// LLVM:  %[[ELE_1:.*]] = getelementptr i32, ptr %[[TMP2]], i64 1
+// LLVM:  store ptr %[[ELE_1]], ptr %[[TMP]], align 8
+
+// OGCG: %arr = alloca [2 x i32], align 4
+// OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %arr, ptr align 4 @[[FUN2_ARR]], i64 8, i1 false)
+
 void func3() {
   int arr[2] = {5, 6};
 
-  // CHECK: %[[ARR3:.*]] = cir.alloca !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>, ["arr", init]
-  // CHECK: %[[ARR_3_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR3]] : !cir.ptr<!cir.array<!s32i x 2>>), !cir.ptr<!s32i>
-  // CHECK: %[[V0:.*]] = cir.const #cir.int<5> : !s32i
-  // CHECK: cir.store %[[V0]], %[[ARR_3_PTR]] : !s32i, !cir.ptr<!s32i>
-  // CHECK: %[[OFFSET_0:.*]] = cir.const #cir.int<1> : !s64i
-  // CHECK: %[[ELE_1_PTR:.*]] = cir.ptr_stride(%[[ARR_3_PTR]] : !cir.ptr<!s32i>, %[[OFFSET_0]] : !s64i), !cir.ptr<!s32i>
-  // CHECK: %[[V1:.*]] = cir.const #cir.int<6> : !s32i
-  // CHECK: cir.store %[[V1]], %[[ELE_1_PTR]] : !s32i, !cir.ptr<!s32i>
+  int idx = 1;
+  int e = arr[idx];
 }
 
+// CIR: %[[ARR:.*]] = cir.alloca !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>, ["arr", init]
+// CIR: %[[IDX:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["idx", init]
+// CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
+// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!s32i x 2>>), !cir.ptr<!s32i>
+// CIR: %[[V0:.*]] = cir.const #cir.int<5> : !s32i
+// CIR: cir.store %[[V0]], %[[ARR_PTR]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[OFFSET_0:.*]] = cir.const #cir.int<1> : !s64i
+// CIR: %[[ELE_1_PTR:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr<!s32i>, %[[OFFSET_0]] : !s64i), !cir.ptr<!s32i>
+// CIR: %[[V1:.*]] = cir.const #cir.int<6> : !s32i
+// CIR: cir.store %[[V1]], %[[ELE_1_PTR]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[IDX_V:.*]] = cir.const #cir.int<1> : !s32i
+// CIR: cir.store %[[IDX_V]], %[[IDX]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[TMP_IDX:.*]] = cir.load %[[IDX]] : !cir.ptr<!s32i>, !s32i
+// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!s32i x 2>>), !cir.ptr<!s32i>
+// CIR: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr<!s32i>, %[[TMP_IDX]] : !s32i), !cir.ptr<!s32i>
+// CIR: %[[ELE_TMP:.*]] = cir.load %[[ELE_PTR]] : !cir.ptr<!s32i>, !s32i
+// CIR: cir.store %[[ELE_TMP]], %[[INIT]] : !s32i, !cir.ptr<!s32i>
+
+// LLVM: define void @func3()
+// LLVM:  %[[ARR_ALLOCA:.*]] = alloca [2 x i32], i64 1, align 4
+// LLVM:  %[[IDX:.*]] = alloca i32, i64 1, align 4
+// LLVM:  %[[INIT:.*]] = alloca i32, i64 1, align 4
+// LLVM:  %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR_ALLOCA]], i32 0
+// LLVM:  store i32 5, ptr %[[ARR_PTR]], align 4
+// LLVM:  %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
+// LLVM:  store i32 6, ptr %[[ELE_1_PTR]], align 4
+// LLVM:  store i32 1, ptr %[[IDX]], align 4
+// LLVM:  %[[TMP1:.*]] = load i32, ptr %[[IDX]], align 4
+// LLVM:  %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR_ALLOCA]], i32 0
+// LLVM:  %[[IDX_I64:.*]] = sext i32 %[[TMP1]] to i64
+// LLVM:  %[[ELE:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 %[[IDX_I64]]
+// LLVM:  %[[TMP2:.*]] = load i32, ptr %[[ELE]], align 4
+// LLVM:  store i32 %[[TMP2]], ptr %[[INIT]], align 4
+
+// OGCG:  %arr = alloca [2 x i32], align 4
+// OGCG:  %idx = alloca i32, align 4
+// OGCG:  %e = alloca i32, align 4
+// OGCG:  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %arr, ptr align 4 @[[FUN3_ARR]], i64 8, i1 false)
+// OGCG:  store i32 1, ptr %idx, align 4
+// OGCG:  %0 = load i32, ptr %idx, align 4
+// OGCG:  %idxprom = sext i32 %0 to i64
+// OGCG:  %arrayidx = getelementptr inbounds [2 x i32], ptr %arr, i64 0, i64 %idxprom
+// OGCG:  %1 = load i32, ptr %arrayidx, align 4
+// OGCG:  store i32 %1, ptr %e, align 4
+
 void func4() {
   int arr[2][1] = {{5}, {6}};
-
-  // CHECK: %[[ARR:.*]] = cir.alloca !cir.array<!cir.array<!s32i x 1> x 2>, !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>, ["arr", init]
-  // CHECK: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>), !cir.ptr<!cir.array<!s32i x 1>>
-  // CHECK: %[[ARR_0_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>), !cir.ptr<!s32i>
-  // CHECK: %[[V_0_0:.*]] = cir.const #cir.int<5> : !s32i
-  // CHECK: cir.store %[[V_0_0]], %[[ARR_0_PTR]] : !s32i, !cir.ptr<!s32i>
-  // CHECK: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
-  // CHECK: %[[ARR_1:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, %[[OFFSET]] : !s64i), !cir.ptr<!cir.array<!s32i x 1>>
-  // CHECK: %[[ARR_1_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_1]] : !cir.ptr<!cir.array<!s32i x 1>>), !cir.ptr<!s32i>
-  // CHECK: %[[V_1_0:.*]] = cir.const #cir.int<6> : !s32i
-  // CHECK: cir.store %[[V_1_0]], %[[ARR_1_PTR]] : !s32i, !cir.ptr<!s32i>
+  int e = arr[1][0];
 }
 
+// CIR: %[[ARR:.*]] = cir.alloca !cir.array<!cir.array<!s32i x 1> x 2>, !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>, ["arr", init]
+// CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
+// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>), !cir.ptr<!cir.array<!s32i x 1>>
+// CIR: %[[ARR_0_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>), !cir.ptr<!s32i>
+// CIR: %[[V_0_0:.*]] = cir.const #cir.int<5> : !s32i
+// CIR: cir.store %[[V_0_0]], %[[ARR_0_PTR]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
+// CIR: %[[ARR_1:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, %[[OFFSET]] : !s64i), !cir.ptr<!cir.array<!s32i x 1>>
+// CIR: %[[ARR_1_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_1]] : !cir.ptr<!cir.array<!s32i x 1>>), !cir.ptr<!s32i>
+// CIR: %[[V_1_0:.*]] = cir.const #cir.int<6> : !s32i
+// CIR: cir.store %[[V_1_0]], %[[ARR_1_PTR]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[IDX:.*]] = cir.const #cir.int<0> : !s32i
+// CIR: %[[IDX_1:.*]] = cir.const #cir.int<1> : !s32i
+// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>), !cir.ptr<!cir.array<!s32i x 1>>
+// CIR: %[[ARR_1:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, %[[IDX_1]] : !s32i), !cir.ptr<!cir.array<!s32i x 1>>
+// CIR: %[[ARR_1_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_1]] : !cir.ptr<!cir.array<!s32i x 1>>), !cir.ptr<!s32i>
+// CIR: %[[ELE_0:.*]] = cir.ptr_stride(%[[ARR_1_PTR]] : !cir.ptr<!s32i>, %[[IDX]] : !s32i), !cir.ptr<!s32i>
+// CIR: %[[TMP:.*]] = cir.load %[[ELE_0]] : !cir.ptr<!s32i>, !s32i
+// CIR: cir.store %[[TMP]], %[[INIT]] : !s32i, !cir.ptr<!s32i>
+
+// LLVM: define void @func4()
+// LLVM:  %[[ARR_ALLOCA:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
+// LLVM:  %[[INIT:.*]] = alloca i32, i64 1, align 4
+// LLVM:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0
+// LLVM:  %[[ARR_0_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
+// LLVM:  store i32 5, ptr %[[ARR_0_0]], align 4
+// LLVM:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
+// LLVM:  %[[ARR_1_0:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0
+// LLVM:  store i32 6, ptr %[[ARR_1_0]], align 4
+// LLVM:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0
+// LLVM:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
+// LLVM:  %[[ARR_1_0:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0
+// LLVM:  %[[ELE_PTR:.*]] = getelementptr i32, ptr %[[ARR_1_0]], i64 0
+// LLVM:  %[[TMP:.*]] = load i32, ptr %[[ELE_PTR]], align 4
+// LLVM:  store i32 %[[TMP]], ptr %[[INIT]], align 4
+
+// OGCG: %arr = alloca [2 x [1 x i32]], align 4
+// OGCG: %e = alloca i32, align 4
+// OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %arr, ptr align 4 @[[FUN4_ARR]], i64 8, i1 false)
+// OGCG: %arrayidx = getelementptr inbounds [2 x [1 x i32]], ptr %arr, i64 0, i64 1
+// OGCG: %arrayidx1 = getelementptr inbounds [1 x i32], ptr %arrayidx, i64 0, i64 0
+// OGCG: %0 = load i32, ptr %arrayidx1, align 4
+// OGCG: store i32 %0, ptr %e, align 4
+
 void func5() {
   int arr[2][1] = {{5}};
-
-  // CHECK: %[[ARR:.*]] = cir.alloca !cir.array<!cir.array<!s32i x 1> x 2>, !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>, ["arr", init]
-  // CHECK: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, ["arrayinit.temp", init]
-  // CHECK: %[[ARR_0:.*]] = cir.cast(array_to_ptrdecay, %0 : !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>), !cir.ptr<!cir.array<!s32i x 1>>
-  // CHECK: %[[ARR_0_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_0]] : !cir.ptr<!cir.array<!s32i x 1>>), !cir.ptr<!s32i>
-  // CHECK: %[[V_0_0:.*]] = cir.const #cir.int<5> : !s32i
-  // CHECK: cir.store %[[V_0_0]], %[[ARR_0_PTR]] : !s32i, !cir.ptr<!s32i>
-  // CHECK: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
-  // CHECK: %6 = cir.ptr_stride(%[[ARR_0]] : !cir.ptr<!cir.array<!s32i x 1>>, %[[OFFSET]] : !s64i), !cir.ptr<!cir.array<!s32i x 1>>
-  // CHECK: cir.store %6, %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>
-  // CHECK: %7 = cir.load %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, !cir.ptr<!cir.array<!s32i x 1>>
-  // CHECK: %8 = cir.const #cir.zero : !cir.array<!s32i x 1>
-  // CHECK: cir.store %8, %7 : !cir.array<!s32i x 1>, !cir.ptr<!cir.array<!s32i x 1>>
-  // CHECK: %[[OFFSET_1:.*]] = cir.const #cir.int<1> : !s64i
-  // CHECK: %10 = cir.ptr_stride(%7 : !cir.ptr<!cir.array<!s32i x 1>>, %[[OFFSET_1]] : !s64i), !cir.ptr<!cir.array<!s32i x 1>>
-  // CHECK: cir.store %10, %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>
 }
 
+// CIR: %[[ARR:.*]] = cir.alloca !cir.array<!cir.array<!s32i x 1> x 2>, !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>, ["arr", init]
+// CIR: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, ["arrayinit.temp", init]
+// CIR: %[[ARR_0:.*]] = cir.cast(array_to_ptrdecay, %0 : !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>), !cir.ptr<!cir.array<!s32i x 1>>
+// CIR: %[[ARR_0_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_0]] : !cir.ptr<!cir.array<!s32i x 1>>), !cir.ptr<!s32i>
+// CIR: %[[V_0_0:.*]] = cir.const #cir.int<5> : !s32i
+// CIR: cir.store %[[V_0_0]], %[[ARR_0_PTR]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
+// CIR: %6 = cir.ptr_stride(%[[ARR_0]] : !cir.ptr<!cir.array<!s32i x 1>>, %[[OFFSET]] : !s64i), !cir.ptr<!cir.array<!s32i x 1>>
+// CIR: cir.store %6, %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>
+// CIR: %7 = cir.load %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, !cir.ptr<!cir.array<!s32i x 1>>
+// CIR: %8 = cir.const #cir.zero : !cir.array<!s32i x 1>
+// CIR: cir.store %8, %7 : !cir.array<!s32i x 1>, !cir.ptr<!cir.array<!s32i x 1>>
+// CIR: %[[OFFSET_1:.*]] = cir.const #cir.int<1> : !s64i
+// CIR: %10 = cir.ptr_stride(%7 : !cir.ptr<!cir.array<!s32i x 1>>, %[[OFFSET_1]] : !s64i), !cir.ptr<!cir.array<!s32i x 1>>
+// CIR: cir.store %10, %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>
+
+// LLVM: define void @func5()
+// LLVM:  %[[ARR_ALLOCA:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
+// LLVM:  %[[TMP:.*]] = alloca ptr, i64 1, align 8
+// LLVM:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0
+// LLVM:  %[[ARR_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
+// LLVM:  store i32 5, ptr %[[ARR_0]], align 4
+// LLVM:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
+// LLVM:  store ptr %[[ARR_1]], ptr %[[TMP]], align 8
+// LLVM:  %[[ARR_1_VAL:.*]] = load ptr, ptr %[[TMP]], align 8
+// LLVM:  store [1 x i32] zeroinitializer, ptr %[[ARR_1_VAL]], align 4
+// LLVM:  %[[ARR_1_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_1_VAL]], i64 1
+// LLVM:  store ptr %[[ARR_1_PTR]], ptr %[[TMP]], align 8
+
+// OGCG: %arr = alloca [2 x [1 x i32]], align 4
+// OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %arr, ptr align 4 @[[FUN5_ARR]], i64 8, i1 false)
+
 void func6() {
   int x = 4;
   int arr[2] = { x, 5 };
-
-  // CHECK: %[[VAR:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["x", init]
-  // CHECK: %[[ARR:.*]] = cir.alloca !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>, ["arr", init]
-  // CHECK: %[[V:.*]] = cir.const #cir.int<4> : !s32i
-  // CHECK: cir.store %[[V]], %[[VAR]] : !s32i, !cir.ptr<!s32i>
-  // CHECK: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!s32i x 2>>), !cir.ptr<!s32i>
-  // CHECK: %[[TMP:.*]] = cir.load %[[VAR]] : !cir.ptr<!s32i>, !s32i
-  // CHECK: cir.store %[[TMP]], %[[ARR_PTR]] : !s32i, !cir.ptr<!s32i>
-  // CHECK: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
-  // CHECK: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr<!s32i>, %[[OFFSET]] : !s64i), !cir.ptr<!s32i>
-  // CHECK: %[[V1:.*]] = cir.const #cir.int<5> : !s32i
-  // CHECK: cir.store %[[V1]], %[[ELE_PTR]] : !s32i, !cir.ptr<!s32i>
 }
 
+// CIR: %[[VAR:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["x", init]
+// CIR: %[[ARR:.*]] = cir.alloca !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>, ["arr", init]
+// CIR: %[[V:.*]] = cir.const #cir.int<4> : !s32i
+// CIR: cir.store %[[V]], %[[VAR]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!s32i x 2>>), !cir.ptr<!s32i>
+// CIR: %[[TMP:.*]] = cir.load %[[VAR]] : !cir.ptr<!s32i>, !s32i
+// CIR: cir.store %[[TMP]], %[[ARR_PTR]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
+// CIR: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr<!s32i>, %[[OFFSET]] : !s64i), !cir.ptr<!s32i>
+// CIR: %[[V1:.*]] = cir.const #cir.int<5> : !s32i
+// CIR: cir.store %[[V1]], %[[ELE_PTR]] : !s32i, !cir.ptr<!s32i>
+
+// LLVM: define void @func6()
+// LLVM:  %[[VAR:.*]] = alloca i32, i64 1, align 4
+// LLVM:  %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4
+// LLVM:  store i32 4, ptr %[[VAR]], align 4
+// LLVM:  %[[ELE_0:.*]] = getelementptr i32, ptr %[[ARR]], i32 0
+// LLVM:  %[[TMP:.*]] = load i32, ptr %[[VAR]], align 4
+// LLVM:  store i32 %[[TMP]], ptr %[[ELE_0]], align 4
+// LLVM:  %[[ELE_1:.*]] = getelementptr i32, ptr %[[ELE_0]], i64 1
+// LLVM:  store i32 5, ptr %[[ELE_1]], align 4
+
+// OGCG:  %x = alloca i32, align 4
+// OGCG:  %arr = alloca [2 x i32], align 4
+// OGCG:  store i32 4, ptr %x, align 4
+// OGCG:  %0 = load i32, ptr %x, align 4
+// OGCG:  store i32 %0, ptr %arr, align 4
+// OGCG:  %arrayinit.element = getelementptr inbounds i32, ptr %arr, i64 1
+// OGCG:  store i32 5, ptr %arrayinit.element, align 4
+
 void func7() {
   int* arr[1] = {};
-
-  // CHECK: %[[ARR:.*]] = cir.alloca !cir.array<!cir.ptr<!s32i> x 1>, !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>>, ["arr", init]
-  // CHECK: %[[ARR_TMP:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, ["arrayinit.temp", init]
-  // CHECK: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>>), !cir.ptr<!cir.ptr<!s32i>>
-  // CHECK: cir.store %[[ARR_PTR]], %[[ARR_TMP]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
-  // CHECK: %[[TMP:.*]] = cir.load %[[ARR_TMP]] : !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!s32i>>
-  // CHECK: %[[NULL_PTR:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!s32i>
-  // CHECK: cir.store %[[NULL_PTR]], %[[TMP]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-  // CHECK: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
-  // CHECK: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[TMP]] : !cir.ptr<!cir.ptr<!s32i>>, %[[OFFSET]] : !s64i), !cir.ptr<!cir.ptr<!s32i>>
-  // CHECK: cir.store %[[ELE_PTR]], %[[ARR_TMP]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
 }
 
+// CIR: %[[ARR:.*]] = cir.alloca !cir.array<!cir.ptr<!s32i> x 1>, !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>>, ["arr", init]
+// CIR: %[[ARR_TMP:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, ["arrayinit.temp", init]
+// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>>), !cir.ptr<!cir.ptr<!s32i>>
+// CIR: cir.store %[[ARR_PTR]], %[[ARR_TMP]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
+// CIR: %[[TMP:.*]] = cir.load %[[ARR_TMP]] : !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!s32i>>
+// CIR: %[[NULL_PTR:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!s32i>
+// CIR: cir.store %[[NULL_PTR]], %[[TMP]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
+// CIR: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
+// CIR: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[TMP]] : !cir.ptr<!cir.ptr<!s32i>>, %[[OFFSET]] : !s64i), !cir.ptr<!cir.ptr<!s32i>>
+// CIR: cir.store %[[ELE_PTR]], %[[ARR_TMP]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
+
+// LLVM: define void @func7()
+// LLVM:  %[[ARR:.*]] = alloca [1 x ptr], i64 1, align 8
+// LLVM:  %[[ALLOCA:.*]] = alloca ptr, i64 1, align 8
+// LLVM:  %[[ELE_PTR:.*]] = getelementptr ptr, ptr %[[ARR]], i32 0
+// LLVM:  store ptr %[[ELE_PTR]], ptr %[[ALLOCA]], align 8
+// LLVM:  %[[TMP:.*]] = load ptr, ptr %[[ALLOCA]], align 8
+// LLVM:  store ptr null, ptr %[[TMP]], align 8
+// LLVM:  %[[ELE:.*]] = getelementptr ptr, ptr %[[TMP]], i64 1
+// LLVM:  store ptr %[[ELE]], ptr %[[ALLOCA]], align 8
+
+// OGCG: %[[ARR:.*]] = alloca [1 x ptr], align 8
+// OGCG: call void @llvm.memset.p0.i64(ptr align 8 %[[ARR]], i8 0, i64 8, i1 false)
+
 void func8(int p[10]) {}
-// CHECK: cir.func @func8(%arg0: !cir.ptr<!s32i>
-// CHECK: cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["p", init]
+// CIR: cir.func @func8(%arg0: !cir.ptr<!s32i>
+// CIR: cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["p", init]
+
+// LLVM: define void @func8(ptr {{%.*}})
+// LLVM-NEXT: alloca ptr, i64 1, align 8
+
+// OGCG: alloca ptr, align 8
 
 void func9(int pp[10][5]) {}
-// CHECK: cir.func @func9(%arg0: !cir.ptr<!cir.array<!s32i x 5>>
-// CHECK: cir.alloca !cir.ptr<!cir.array<!s32i x 5>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>>
+// CIR: cir.func @func9(%arg0: !cir.ptr<!cir.array<!s32i x 5>>
+// CIR: cir.alloca !cir.ptr<!cir.array<!s32i x 5>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>>
+
+// LLVM: define void @func9(ptr {{%.*}})
+// LLVM-NEXT: alloca ptr, i64 1, align 8
+
+// OGCG: alloca ptr, align 8
diff --git a/clang/test/CIR/Lowering/array.cpp b/clang/test/CIR/Lowering/array.cpp
index e1c977eb43141..036a7b4f2d613 100644
--- a/clang/test/CIR/Lowering/array.cpp
+++ b/clang/test/CIR/Lowering/array.cpp
@@ -31,12 +31,24 @@ int f[5] = {1, 2};
 
 void func() {
   int arr[10];
+  int e = arr[0];
+  int e2 = arr[1];
 }
 // CHECK: define void @func()
-// CHECK-NEXT: alloca [10 x i32], i64 1, align 16
+// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = alloca [10 x i32], i64 1, align 16
+// CHECK-NEXT: %[[INIT:.*]] = alloca i32, i64 1, align 4
+// CHECK-NEXT: %[[INIT_2:.*]] = alloca i32, i64 1, align 4
+// CHECK-NEXT: %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR_ALLOCA]], i32 0
+// CHECK-NEXT: %[[ELE_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 0
+// CHECK-NEXT: %[[TMP:.*]] = load i32, ptr %[[ELE_PTR]], align 4
+// CHECK-NEXT: store i32 %[[TMP]], ptr %[[INIT]], align 4
+// CHECK-NEXT: %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR_ALLOCA]], i32 0
+// CHECK-NEXT: %[[ELE_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
+// CHECK-NEXT: %[[TMP:.*]] = load i32, ptr %[[ELE_PTR]], align 4
+// CHECK-NEXT: store i32 %[[TMP]], ptr %[[INIT_2]], align 4
 
 void func2() {
-  int arr2[2] = {5};
+  int arr[2] = {5};
 }
 // CHECK: define void @func2()
 // CHECK:  %[[ARR_ALLOCA:.*]] = alloca [2 x i32], i64 1, align 4
@@ -61,19 +73,27 @@ void func3() {
 // CHECK:  store i32 6, ptr %[[ELE_1_PTR]], align 4
 
 void func4() {
-  int arr4[2][1] = {{5}, {6}};
+  int arr[2][1] = {{5}, {6}};
+  int e = arr[1][0];
 }
 // CHECK: define void @func4()
 // CHECK:  %[[ARR_ALLOCA:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
-// CHECK:  %[[ARR_0:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0
-// CHECK:  %[[ARR_0_ELE_0:.*]] = getelementptr i32, ptr %[[ARR_0]], i32 0
-// CHECK:  store i32 5, ptr %[[ARR_0_ELE_0]], align 4
-// CHECK:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %2, i64 1
-// CHECK:  %[[ARR_0_ELE_0:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0
-// CHECK:  store i32 6, ptr %[[ARR_0_ELE_0]], align 4
+// CHECK:  %[[INIT:.*]] = alloca i32, i64 1, align 4
+// CHECK:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0
+// CHECK:  %[[ARR_0_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
+// CHECK:  store i32 5, ptr %[[ARR_0_0]], align 4
+// CHECK:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
+// CHECK:  %[[ARR_1_0:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0
+// CHECK:  store i32 6, ptr %[[ARR_1_0]], align 4
+// CHECK:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0
+// CHECK:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
+// CHECK:  %[[ARR_1_0:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0
+// CHECK:  %[[ELE_PTR:.*]] = getelementptr i32, ptr %[[ARR_1_0]], i64 0
+// CHECK:  %[[TMP:.*]] = load i32, ptr %[[ELE_PTR]], align 4
+// CHECK:  store i32 %[[TMP]], ptr %[[INIT]], align 4
 
 void func5() {
-  int arr5[2][1] = {{5}};
+  int arr[2][1] = {{5}};
 }
 // CHECK: define void @func5()
 // CHECK:  %[[ARR_ALLOCA:.*]] = alloca [2 x [1 x i32]], i64 1, align 4

>From 968a91743ab7fd479d7557de79e6b7235d48ce4f Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 10 Apr 2025 22:13:05 +0000
Subject: [PATCH 38/38] final style changes

---
 llvm/lib/Target/NVPTX/NVVMReflect.cpp         |  4 +-
 .../CodeGen/NVPTX/nvvm-reflect-options.ll     | 55 ++++++++++---------
 2 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 9f867e7d4ea4a..84553786e4f75 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -97,7 +97,7 @@ INITIALIZE_PASS(NVVMReflectLegacyPass, "nvvm-reflect",
                 false)
 
 // Allow users to specify additional key/value pairs to reflect. These key/value
-// pairs are the last to be added to the VarMap, and therefore will take
+// pairs are the last to be added to the ReflectMap, and therefore will take
 // precedence over initial values (i.e. __CUDA_FTZ from module metadata and
 // __CUDA_ARCH from SmVersion).
 static cl::list<std::string> ReflectList(
@@ -105,7 +105,7 @@ static cl::list<std::string> ReflectList(
     cl::desc("A key=value pair. Replace __nvvm_reflect(name) with value."),
     cl::ValueRequired);
 
-// Set the VarMap with, first, the value of __CUDA_FTZ from module metadata, and
+// Set the ReflectMap with, first, the value of __CUDA_FTZ from module metadata, and
 // then the key/value pairs from the command line.
 void NVVMReflect::populateReflectMap(Module &M) {
   if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
index 4b203b35d2fe4..0706882236d86 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll
@@ -1,53 +1,54 @@
 ; Test the NVVM reflect pass functionality: verifying that reflect calls are replaced with 
 ; appropriate values based on command-line options. Verify that we can handle custom reflect arguments
-; that aren't __CUDA_ARCH or __CUDA_FTZ. If that argument is given a value on the command-line, the reflect call should be replaced with that value.
-; Otherwise, the reflect call should be replaced with 0.
+; that aren't __CUDA_ARCH or __CUDA_FTZ. If that argument is given a value on the command-line,
+; the reflect call should be replaced with that value. Otherwise, the reflect call should be replaced with 0.
+
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda \
+; RUN:   -nvvm-reflect-add __CUDA_FTZ=1 -nvvm-reflect-add __CUDA_ARCH=350 %s -S \
+; RUN:   | FileCheck %s --check-prefixes=COMMON,FTZ1,ARCH350,CUSTOM-ABSENT
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda \
+; RUN:   -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 %s -S \
+; RUN:   | FileCheck %s --check-prefixes=COMMON,FTZ0,ARCH520,CUSTOM-ABSENT
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda \
+; RUN:   -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 \
+; RUN:   -nvvm-reflect-add __CUSTOM_VALUE=42 %s -S \
+; RUN:   | FileCheck %s --check-prefixes=COMMON,CUSTOM-PRESENT
+
+; To ensure that command line options override module options, create a copy of this test file 
+; with module options appended and rerun some tests.
 
-; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add __CUDA_FTZ=1 -nvvm-reflect-add __CUDA_ARCH=350 %s -S | FileCheck %s --check-prefix=CHECK-FTZ1 --check-prefix=CHECK-ARCH350 --check-prefix=CHECK-CUSTOM-ABSENT
-; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 %s -S | FileCheck %s --check-prefix=CHECK-FTZ0 --check-prefix=CHECK-ARCH520 --check-prefix=CHECK-CUSTOM-ABSENT
-; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 -nvvm-reflect-add __CUSTOM_VALUE=42 %s -S | FileCheck %s --check-prefix=CHECK-CUSTOM-PRESENT
-
-; To ensure that command line options override module options, create a copy of this test file with module options appended and rerun some tests.
-;
 ; RUN: cat %s > %t.options
 ; RUN: echo '!llvm.module.flags = !{!0}' >> %t.options
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.options
-; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 %t.options -S | FileCheck %s --check-prefix=CHECK-FTZ0 --check-prefix=CHECK-ARCH520
+; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda \
+; RUN:   -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 %t.options -S \
+; RUN:   | FileCheck %s --check-prefixes=COMMON,FTZ0,ARCH520
 
 declare i32 @__nvvm_reflect(ptr)
 @ftz = private unnamed_addr addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
 @arch = private unnamed_addr addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00"
 @custom = private unnamed_addr addrspace(1) constant [15 x i8] c"__CUSTOM_VALUE\00"
 
-; Test handling of __CUDA_FTZ reflect value
+; COMMON-LABEL: define i32 @test_ftz()
+; FTZ1: ret i32 1
+; FTZ0: ret i32 0
 define i32 @test_ftz() {
   %1 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @ftz to ptr))
   ret i32 %1
 }
 
-; CHECK-FTZ1: define i32 @test_ftz()
-; CHECK-FTZ1: ret i32 1
-; CHECK-FTZ0: define i32 @test_ftz()
-; CHECK-FTZ0: ret i32 0
-
-; Test handling of __CUDA_ARCH reflect value
+; COMMON-LABEL: define i32 @test_arch()
+; ARCH350: ret i32 350
+; ARCH520: ret i32 520
 define i32 @test_arch() {
   %1 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @arch to ptr))
   ret i32 %1
 }
 
-; CHECK-ARCH350: define i32 @test_arch()
-; CHECK-ARCH350: ret i32 350
-; CHECK-ARCH520: define i32 @test_arch()
-; CHECK-ARCH520: ret i32 520
-
-; Test handling of a custom reflect value that's not built into the pass
+; COMMON-LABEL: define i32 @test_custom()
+; CUSTOM-ABSENT: ret i32 0
+; CUSTOM-PRESENT: ret i32 42
 define i32 @test_custom() {
   %1 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @custom to ptr))
   ret i32 %1
 }
-
-; CHECK-CUSTOM-ABSENT: define i32 @test_custom()
-; CHECK-CUSTOM-ABSENT: ret i32 0
-; CHECK-CUSTOM-PRESENT: define i32 @test_custom()
-; CHECK-CUSTOM-PRESENT: ret i32 42



More information about the llvm-commits mailing list