[llvm] 20ef8b0 - [AMDGPU] Add `nocreateundeforpoison` annotations (#166450)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 2 10:13:03 PST 2026
Author: Krzysztof Drewniak
Date: 2026-01-02T10:12:58-08:00
New Revision: 20ef8b02855ab3bf71d4be1b2423834e507dfe3b
URL: https://github.com/llvm/llvm-project/commit/20ef8b02855ab3bf71d4be1b2423834e507dfe3b
DIFF: https://github.com/llvm/llvm-project/commit/20ef8b02855ab3bf71d4be1b2423834e507dfe3b.diff
LOG: [AMDGPU] Add `nocreateundeforpoison` annotations (#166450)
This commit goes through IntrinsicsAMDGPU.td and adds
`nocreateundeforpoison` to intrinsics that (to my knowledge) perform
arithmetic operations that are defined everywhere (so no bitfield
extracts and such since those can have invalid inputs, and similarly for permutations).
Added:
Modified:
clang/test/CodeGenOpenCL/builtins-amdgcn.cl
clang/test/Headers/__clang_hip_math.hip
llvm/include/llvm/IR/Intrinsics.td
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/include/llvm/IR/IntrinsicsNVVM.td
llvm/test/CodeGen/AMDGPU/amdgpu-attributor-min-agpr-alloc.ll
llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll
llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test-asan.ll
llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access-asan.ll
llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test-asan.ll
llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll
llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param-asan.ll
llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-no-heap-ptr.ll
llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-asan.ll
llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomic-cmpxchg-asan.ll
llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomicrmw-asan.ll
llvm/test/Instrumentation/AddressSanitizer/asan-pass-second-run.ll
llvm/test/tools/llvm-reduce/remove-attributes-convergent-uncontrolled.ll
Removed:
################################################################################
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index a5132c9114673..80b585513f71a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -948,7 +948,7 @@ void test_read_exec(global ulong* out) {
*out = __builtin_amdgcn_read_exec();
}
-// CHECK: declare i64 @llvm.amdgcn.ballot.i64(i1){{.*}} #[[$NOUNWIND_READONLY:[0-9]+]]
+// CHECK: declare i64 @llvm.amdgcn.ballot.i64(i1){{.*}} #[[$NOUNWIND_READONLY_NOPOISON:[0-9]+]]
// CHECK-LABEL: @test_read_exec_lo(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.ballot.i32(i1 true)
@@ -956,7 +956,7 @@ void test_read_exec_lo(global uint* out) {
*out = __builtin_amdgcn_read_exec_lo();
}
-// CHECK: declare i32 @llvm.amdgcn.ballot.i32(i1){{.*}} #[[$NOUNWIND_READONLY:[0-9]+]]
+// CHECK: declare i32 @llvm.amdgcn.ballot.i32(i1){{.*}} #[[$NOUNWIND_READONLY_NOPOISON:[0-9]+]]
// CHECK-LABEL: @test_read_exec_hi(
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.ballot.i64(i1 true)
@@ -1299,4 +1299,4 @@ void test_set_fpenv(unsigned long env) {
// CHECK-DAG: [[$GRID_RANGE]] = !{i32 1, i32 0}
// CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
-// CHECK-DAG: attributes #[[$NOUNWIND_READONLY]] = { convergent mustprogress nocallback nofree nounwind willreturn memory(none) }
+// CHECK-DAG: attributes #[[$NOUNWIND_READONLY_NOPOISON]] = { convergent mustprogress nocallback nocreateundeforpoison nofree nounwind willreturn memory(none) }
diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip
index 426e5af319cbf..42625ddcf2d31 100644
--- a/clang/test/Headers/__clang_hip_math.hip
+++ b/clang/test/Headers/__clang_hip_math.hip
@@ -8914,7 +8914,7 @@ extern "C" __device__ float test___sinf(float x) {
return __sinf(x);
}
-// DEFAULT-LABEL: define dso_local float @test___tanf(
+// DEFAULT-LABEL: define dso_local noundef float @test___tanf(
// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] {
// DEFAULT-NEXT: [[ENTRY:.*:]]
// DEFAULT-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR15]]
@@ -8932,7 +8932,7 @@ extern "C" __device__ float test___sinf(float x) {
// FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[CALL_I3_I]], [[TMP0]]
// FINITEONLY-NEXT: ret float [[MUL_I]]
//
-// APPROX-LABEL: define dso_local float @test___tanf(
+// APPROX-LABEL: define dso_local noundef float @test___tanf(
// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] {
// APPROX-NEXT: [[ENTRY:.*:]]
// APPROX-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR15]]
@@ -8941,7 +8941,7 @@ extern "C" __device__ float test___sinf(float x) {
// APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[CALL_I3_I]], [[TMP0]]
// APPROX-NEXT: ret float [[MUL_I]]
//
-// NCRDIV-LABEL: define dso_local float @test___tanf(
+// NCRDIV-LABEL: define dso_local noundef float @test___tanf(
// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] {
// NCRDIV-NEXT: [[ENTRY:.*:]]
// NCRDIV-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR15]]
@@ -8950,7 +8950,7 @@ extern "C" __device__ float test___sinf(float x) {
// NCRDIV-NEXT: [[MUL_I:%.*]] = fmul contract float [[CALL_I3_I]], [[TMP0]]
// NCRDIV-NEXT: ret float [[MUL_I]]
//
-// AMDGCNSPIRV-LABEL: define spir_func float @test___tanf(
+// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___tanf(
// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] {
// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]]
// AMDGCNSPIRV-NEXT: [[CALL_I3_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR15]]
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 35a4158a56da9..c6ad37d363d63 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -754,6 +754,19 @@ class MSBuiltin<string name> {
string MSBuiltinName = name;
}
+/// Utility class for intrinsics that
+/// 1. Don't touch memory or any hidden state
+/// 2. Can be freely speculated, and
+/// 3. Will not create undef or poison on defined inputs.
+class PureIntrinsic<list<LLVMType> ret_types,
+ list<LLVMType> param_types = [],
+ list<IntrinsicProperty> intr_properties = [],
+ string name = "",
+ list<SDNodeProperty> sd_properties = []>
+ : DefaultAttrsIntrinsic<ret_types, param_types,
+ intr_properties # [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison],
+ name, sd_properties>;
+
#ifndef TEST_INTRINSICS_SUPPRESS_DEFS
//===--------------- Variable Argument Handling Intrinsics ----------------===//
@@ -1940,8 +1953,7 @@ def int_fake_use : DefaultAttrsIntrinsic<[], [llvm_vararg_ty],
// Intrinsic to mask out bits of a pointer.
// First argument must be pointer or vector of pointer. This is checked by the
// verifier.
-def int_ptrmask: DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_anyint_ty],
- [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>;
+def int_ptrmask: PureIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_anyint_ty]>;
// Intrinsic to wrap a thread local variable.
def int_threadlocal_address : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [LLVMMatchType<0>],
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 2afe89357a991..d951265fcae42 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -397,47 +397,42 @@ def int_amdgcn_s_wait_loadcnt : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_samplecnt : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_storecnt : AMDGPUWaitIntrinsic;
-def int_amdgcn_div_scale : DefaultAttrsIntrinsic<
+def int_amdgcn_div_scale : PureIntrinsic<
// 1st parameter: Numerator
// 2nd parameter: Denominator
// 3rd parameter: Select quotient. Must equal Numerator or Denominator.
// (0 = Denominator, 1 = Numerator).
[llvm_anyfloat_ty, llvm_i1_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]
+ [ImmArg<ArgIndex<2>>]
>;
-def int_amdgcn_div_fmas : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
- [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_div_fmas : PureIntrinsic<[llvm_anyfloat_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty]
>;
-def int_amdgcn_div_fixup : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
- [IntrNoMem, IntrSpeculatable]
->;
+def int_amdgcn_div_fixup : PureIntrinsic<[llvm_anyfloat_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
// Look Up 2.0 / pi src0 with segment select src1[4:0]
-def int_amdgcn_trig_preop : DefaultAttrsIntrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_trig_preop : PureIntrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty]
>;
-def int_amdgcn_sin : DefaultAttrsIntrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>],
- [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_sin : PureIntrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>]
>;
-def int_amdgcn_cos : DefaultAttrsIntrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_cos : PureIntrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>]
>;
// v_log_{f16|f32}, performs log2. f32 version does not handle
// denormals. There is no reason to use this for f16 as it does
// support denormals, and the generic log2 intrinsic should be
// preferred.
-def int_amdgcn_log : DefaultAttrsIntrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_log : PureIntrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>]
>;
// v_exp_{f16|f32} (int_amdgcn_exp was taken by export
@@ -445,17 +440,17 @@ def int_amdgcn_log : DefaultAttrsIntrinsic<
// denormals. There is no reason to use this for f16 as it does
// support denormals, and the generic exp2 intrinsic should be
// preferred.
-def int_amdgcn_exp2 : DefaultAttrsIntrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_exp2 : PureIntrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>]
>;
-def int_amdgcn_log_clamp : DefaultAttrsIntrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_log_clamp : PureIntrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>]
>;
def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">,
- DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, IntrSpeculatable, Commutative]
+ PureIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [Commutative]
>;
// Fused single-precision multiply-add with legacy behaviour for the multiply,
@@ -464,129 +459,144 @@ def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">,
// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and
// has a completely
// different kind of legacy behaviour.)
def int_amdgcn_fma_legacy :
- DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
- [IntrNoMem, IntrSpeculatable, Commutative]
+ PureIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [Commutative]
>;
-def int_amdgcn_rcp : DefaultAttrsIntrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_rcp : PureIntrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>]
>;
def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">,
- DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty],
- [IntrNoMem, IntrSpeculatable]
+ PureIntrinsic<[llvm_float_ty], [llvm_float_ty]
>;
-def int_amdgcn_sqrt : DefaultAttrsIntrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_sqrt : PureIntrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>]
>;
-def int_amdgcn_rsq : DefaultAttrsIntrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_rsq : PureIntrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>]
>;
def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">,
- DefaultAttrsIntrinsic<
- [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]
+ PureIntrinsic<
+ [llvm_float_ty], [llvm_float_ty]
>;
// out = 1.0 / sqrt(a) result clamped to +/- max_float.
-def int_amdgcn_rsq_clamp : DefaultAttrsIntrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>;
+def int_amdgcn_rsq_clamp : PureIntrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>]
+>;
-def int_amdgcn_frexp_mant : DefaultAttrsIntrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_frexp_mant : PureIntrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>]
>;
-def int_amdgcn_frexp_exp : DefaultAttrsIntrinsic<
- [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_frexp_exp : PureIntrinsic<
+ [llvm_anyint_ty], [llvm_anyfloat_ty]
>;
// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0
// and always uses rtz, so is not suitable for implementing the OpenCL
// fract function. It should be ok on VI.
-def int_amdgcn_fract : DefaultAttrsIntrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_fract : PureIntrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>]
>;
def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">,
- DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, IntrSpeculatable]
+ PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]
>;
def int_amdgcn_cvt_pknorm_i16 :
ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">,
- DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, IntrSpeculatable]
+ PureIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty]
>;
def int_amdgcn_cvt_pknorm_u16 :
ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">,
- DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, IntrSpeculatable]
+ PureIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty]
>;
def int_amdgcn_cvt_pk_i16 :
ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">,
- DefaultAttrsIntrinsic<
- [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
+ PureIntrinsic<
+ [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty]
>;
def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">,
- DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
+ PureIntrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty]
>;
-def int_amdgcn_class : DefaultAttrsIntrinsic<
- [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_class : PureIntrinsic<
+ [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty]
>;
def int_amdgcn_fmed3 :
- DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
- [IntrNoMem, IntrSpeculatable]
+ PureIntrinsic<[llvm_anyfloat_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]
>;
def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">,
- DefaultAttrsIntrinsic<[llvm_float_ty],
- [llvm_float_ty, llvm_float_ty, llvm_float_ty],
- [IntrNoMem, IntrSpeculatable]
+ PureIntrinsic<[llvm_float_ty],
+ [llvm_float_ty, llvm_float_ty, llvm_float_ty]
>;
def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">,
- DefaultAttrsIntrinsic<[llvm_float_ty],
- [llvm_float_ty, llvm_float_ty, llvm_float_ty],
- [IntrNoMem, IntrSpeculatable]
+ PureIntrinsic<[llvm_float_ty],
+ [llvm_float_ty, llvm_float_ty, llvm_float_ty]
>;
def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">,
- DefaultAttrsIntrinsic<[llvm_float_ty],
- [llvm_float_ty, llvm_float_ty, llvm_float_ty],
- [IntrNoMem, IntrSpeculatable]
+ PureIntrinsic<[llvm_float_ty],
+ [llvm_float_ty, llvm_float_ty, llvm_float_ty]
>;
def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">,
- DefaultAttrsIntrinsic<[llvm_float_ty],
- [llvm_float_ty, llvm_float_ty, llvm_float_ty],
- [IntrNoMem, IntrSpeculatable]
+ PureIntrinsic<[llvm_float_ty],
+ [llvm_float_ty, llvm_float_ty, llvm_float_ty]
>;
// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz
// should be used.
def int_amdgcn_sffbh :
- DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
- [IntrNoMem, IntrSpeculatable]
+ PureIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]
>;
// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support.
def int_amdgcn_fmad_ftz :
- DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
- [IntrNoMem, IntrSpeculatable]
+ PureIntrinsic<[llvm_anyfloat_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]
>;
+def int_amdgcn_tanh : PureIntrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>]
+>;
+
+def int_amdgcn_cvt_sr_pk_f16_f32 : PureIntrinsic<
+ [llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty]
+>, ClangBuiltin<"__builtin_amdgcn_cvt_sr_pk_f16_f32">;
+
+def int_amdgcn_cvt_sr_pk_bf16_f32 : PureIntrinsic<
+ [llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty]
+>, ClangBuiltin<"__builtin_amdgcn_cvt_sr_pk_bf16_f32">;
+
+def int_amdgcn_cvt_pk_f16_fp8 : PureIntrinsic<
+ [llvm_v2f16_ty], [llvm_i16_ty]
+>, ClangBuiltin<"__builtin_amdgcn_cvt_pk_f16_fp8">;
+
+def int_amdgcn_cvt_pk_f16_bf8 : PureIntrinsic<
+ [llvm_v2f16_ty], [llvm_i16_ty]
+>, ClangBuiltin<"__builtin_amdgcn_cvt_pk_f16_bf8">;
+
+def int_amdgcn_cvt_pk_fp8_f16
+ : PureIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty]>,
+ ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f16">;
+
+def int_amdgcn_cvt_pk_bf8_f16
+ : PureIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty]>,
+ ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f16">;
+
// FIXME: The m0 argument should be moved after the normal arguments
class AMDGPUDSOrderedIntrinsic : Intrinsic<
[llvm_i32_ty],
@@ -625,62 +635,34 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
-def int_amdgcn_tanh : DefaultAttrsIntrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
->;
-
-def int_amdgcn_cvt_sr_pk_f16_f32 : DefaultAttrsIntrinsic<
- [llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable]
->, ClangBuiltin<"__builtin_amdgcn_cvt_sr_pk_f16_f32">;
-
-def int_amdgcn_cvt_sr_pk_bf16_f32 : DefaultAttrsIntrinsic<
- [llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable]
->, ClangBuiltin<"__builtin_amdgcn_cvt_sr_pk_bf16_f32">;
-
-def int_amdgcn_cvt_pk_f16_fp8 : DefaultAttrsIntrinsic<
- [llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrSpeculatable]
->, ClangBuiltin<"__builtin_amdgcn_cvt_pk_f16_fp8">;
-
-def int_amdgcn_cvt_pk_f16_bf8 : DefaultAttrsIntrinsic<
- [llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrSpeculatable]
->, ClangBuiltin<"__builtin_amdgcn_cvt_pk_f16_bf8">;
-
-def int_amdgcn_cvt_pk_fp8_f16
- : DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty],
- [IntrNoMem, IntrSpeculatable]>,
- ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f16">;
-
-def int_amdgcn_cvt_pk_bf8_f16
- : DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty],
- [IntrNoMem, IntrSpeculatable]>,
- ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f16">;
-
// llvm.amdgcn.cvt.sr.fp8.f16 i32 vdst, half src, i32 seed, i32 old, imm byte_sel [0..3]
// byte_sel selects byte to write in vdst.
-def int_amdgcn_cvt_sr_fp8_f16 : DefaultAttrsIntrinsic<
+def int_amdgcn_cvt_sr_fp8_f16 : PureIntrinsic<
[llvm_i32_ty], [llvm_half_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>, Range<ArgIndex<3>, 0, 4>]
+ [ImmArg<ArgIndex<3>>, Range<ArgIndex<3>, 0, 4>]
>, ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f16">;
// llvm.amdgcn.cvt.sr.bf8.f16 i32 vdst, half src, i32 seed, i32 old, imm byte_sel [0..3]
// byte_sel selects byte to write in vdst.
-def int_amdgcn_cvt_sr_bf8_f16 : DefaultAttrsIntrinsic<
+def int_amdgcn_cvt_sr_bf8_f16 : PureIntrinsic<
[llvm_i32_ty], [llvm_half_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>, Range<ArgIndex<3>, 0, 4>]
+ [ImmArg<ArgIndex<3>>, Range<ArgIndex<3>, 0, 4>]
>, ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f16">;
+// Note: these gfx1250 intrinsics are convergent because they read scales from other lanes.
// llvm.amdgcn.cvt.scale.pk32.f16.bf6 v32f16 vdst, v6i32 src0, i32 scale_sel [0..15]
class AMDGPUCvtScaleIntrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
[DstTy], [Src0Ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>, Range<ArgIndex<2>, 0, 16>]
+ [IntrNoMem, IntrConvergent, IntrWillReturn,IntrNoCreateUndefOrPoison,
+ ImmArg<ArgIndex<2>>, Range<ArgIndex<2>, 0, 16>]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
-class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
- [DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
+class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : PureIntrinsic<
+ [DstTy], [Src0Ty, llvm_float_ty]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
-class AMDGPUCvtScaleF32SRIntrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
- [DstTy], [Src0Ty, llvm_i32_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
+class AMDGPUCvtScaleF32SRIntrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : PureIntrinsic<
+ [DstTy], [Src0Ty, llvm_i32_ty, llvm_float_ty]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
def int_amdgcn_cvt_scale_pk8_f16_fp8 : AMDGPUCvtScaleIntrinsic<llvm_v8f16_ty, llvm_v2i32_ty, "cvt_scale_pk8_f16_fp8">;
@@ -699,8 +681,8 @@ def int_amdgcn_cvt_scale_pk16_bf16_fp6 : AMDGPUCvtScaleIntrinsic<llvm_v16bf16_ty
def int_amdgcn_cvt_scale_pk16_f32_fp6 : AMDGPUCvtScaleIntrinsic<llvm_v16f32_ty, llvm_v3i32_ty, "cvt_scale_pk16_f32_fp6">;
def int_amdgcn_cvt_scale_pk16_f32_bf6 : AMDGPUCvtScaleIntrinsic<llvm_v16f32_ty, llvm_v3i32_ty, "cvt_scale_pk16_f32_bf6">;
-class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMType Src1Ty, string name> : DefaultAttrsIntrinsic<
- [DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
+class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMType Src1Ty, string name> : PureIntrinsic<
+ [DstTy], [Src0Ty, Src1Ty, llvm_float_ty]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
def int_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_fp6_f16">;
@@ -748,88 +730,87 @@ def int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm
def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">;
def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">;
-class AMDGPUCvtScaleFP4FP8BF8ToF1632Intrinsic<LLVMType DstTy, string name> : DefaultAttrsIntrinsic<
+class AMDGPUCvtScaleFP4FP8BF8ToF1632Intrinsic<LLVMType DstTy, string name> : PureIntrinsic<
[DstTy],
[llvm_i32_ty, // src
llvm_float_ty, // scale
llvm_i32_ty], // src_sel index [0..3]
- [IntrNoMem,
- ImmArg<ArgIndex<2>>, Range<ArgIndex<2>, 0, 4>]
+ [ImmArg<ArgIndex<2>>, Range<ArgIndex<2>, 0, 4>]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
-class AMDGPUCvtScale_pk_FromFP8BF8Intrinsic<LLVMType DstTy, string name> : DefaultAttrsIntrinsic<
+class AMDGPUCvtScale_pk_FromFP8BF8Intrinsic<LLVMType DstTy, string name> : PureIntrinsic<
[DstTy],
[llvm_i32_ty, // src
llvm_float_ty, // scale
llvm_i1_ty], // src_lo_hi_sel[true false]
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]
+ [ImmArg<ArgIndex<2>>]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
-class AMDGPUCvtScaleF16BF16ToFP8BF8TiedInputIntrinsic<LLVMType SrcTy, string name> : DefaultAttrsIntrinsic<
+class AMDGPUCvtScaleF16BF16ToFP8BF8TiedInputIntrinsic<LLVMType SrcTy, string name> : PureIntrinsic<
[llvm_v2i16_ty],
[llvm_v2i16_ty, // old_vdst
SrcTy, // src
llvm_float_ty, // scale
llvm_i1_ty], // dst_lo_hi_sel[true false]
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
+ [ImmArg<ArgIndex<3>>]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
-class AMDGPUCvtScaleF32ToFP8BF8TiedInputIntrinsic<string name> : DefaultAttrsIntrinsic<
+class AMDGPUCvtScaleF32ToFP8BF8TiedInputIntrinsic<string name> : PureIntrinsic<
[llvm_v2i16_ty],
[llvm_v2i16_ty, // old_vdst
llvm_float_ty, // src0
llvm_float_ty, // src1
llvm_float_ty, // scale
llvm_i1_ty], // dst_lo_hi_sel[true false]
- [IntrNoMem, ImmArg<ArgIndex<4>>]
+ [ImmArg<ArgIndex<4>>]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
-class AMDGPUCvtScaleFP8BF8ToF16TiedInputIntrinsic<LLVMType DstTy, string name> : DefaultAttrsIntrinsic<
+class AMDGPUCvtScaleFP8BF8ToF16TiedInputIntrinsic<LLVMType DstTy, string name> :PureIntrinsic<
[DstTy],
[llvm_v2f16_ty, // old_vdst
llvm_i32_ty, // src
llvm_float_ty, // scale
llvm_i32_ty, // src_sel_index[0..3]
llvm_i1_ty], // dst_lo_hi_sel[true false]
- [IntrNoMem, ImmArg<ArgIndex<3>>, Range<ArgIndex<3>, 0, 4>, ImmArg<ArgIndex<4>>]
+ [ImmArg<ArgIndex<3>>, Range<ArgIndex<3>, 0, 4>, ImmArg<ArgIndex<4>>]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
-class AMDGPUCvtScaleF32ToFP4Intrinsic<string name> : DefaultAttrsIntrinsic<
+class AMDGPUCvtScaleF32ToFP4Intrinsic<string name> : PureIntrinsic<
[llvm_i32_ty],
[llvm_i32_ty, // old_vdst
llvm_float_ty, // src0
llvm_float_ty, // src1
llvm_float_ty, // scale
llvm_i32_ty], // dst_sel_index[0..3]
- [IntrNoMem, ImmArg<ArgIndex<4>>, Range<ArgIndex<4>, 0, 4>]
+ [ImmArg<ArgIndex<4>>, Range<ArgIndex<4>, 0, 4>]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
-class AMDGPUCvtScaleF16ToFP4TiedInputIntrinsic<LLVMType SrcTy, string name> : DefaultAttrsIntrinsic<
+class AMDGPUCvtScaleF16ToFP4TiedInputIntrinsic<LLVMType SrcTy, string name> : PureIntrinsic<
[llvm_i32_ty],
[llvm_i32_ty, // old_vdst
SrcTy, // src
llvm_float_ty, // scale
llvm_i32_ty], // dest_sel_index [0..3]
- [IntrNoMem, ImmArg<ArgIndex<3>>, Range<ArgIndex<3>, 0, 4>]
+ [ImmArg<ArgIndex<3>>, Range<ArgIndex<3>, 0, 4>]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
-class AMDGPUCvtScaleBF16F16F32SRToFP4BF8F8TiedInputIntrinsic<LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
+class AMDGPUCvtScaleBF16F16F32SRToFP4BF8F8TiedInputIntrinsic<LLVMType Src0Ty, string name> : PureIntrinsic<
[llvm_i32_ty],
[llvm_i32_ty, // old_vdst
Src0Ty, // src0
llvm_i32_ty, // seed
llvm_float_ty, // scale
llvm_i32_ty], // dst_sel_index[0..3]
- [IntrNoMem, ImmArg<ArgIndex<4>>, Range<ArgIndex<4>, 0, 4>]
+ [ImmArg<ArgIndex<4>>, Range<ArgIndex<4>, 0, 4>]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
-class AMDGPUCvtScaleSRF32ToBF16F16TiedInputIntrinsic<LLVMType DstTy, string name> : DefaultAttrsIntrinsic<
+class AMDGPUCvtScaleSRF32ToBF16F16TiedInputIntrinsic<LLVMType DstTy, string name> : PureIntrinsic<
[DstTy],
[DstTy, // old_vdst
llvm_float_ty, // src0
llvm_i32_ty, // seed
llvm_i1_ty], // dst_lo_hi_sel[true false]
- [IntrNoMem, ImmArg<ArgIndex<3>>]
+ [ImmArg<ArgIndex<3>>]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
def int_amdgcn_cvt_sr_bf16_f32: AMDGPUCvtScaleSRF32ToBF16F16TiedInputIntrinsic<llvm_v2bf16_ty, "cvt_sr_bf16_f32">;
@@ -898,13 +879,13 @@ def int_amdgcn_cvt_scalef32_sr_fp8_f16: AMDGPUCvtScaleBF16F16F32SRToFP4BF8F8Tied
def int_amdgcn_cvt_scalef32_sr_fp8_f32: AMDGPUCvtScaleBF16F16F32SRToFP4BF8F8TiedInputIntrinsic<llvm_float_ty, "cvt_scalef32_sr_fp8_f32">;
def int_amdgcn_prng_b32 : DefaultAttrsIntrinsic<
- [llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]
+ [llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrNoCreateUndefOrPoison]
>, ClangBuiltin<"__builtin_amdgcn_prng_b32">;
def int_amdgcn_bitop3 :
- DefaultAttrsIntrinsic<[llvm_anyint_ty],
+ PureIntrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]>;
+ [ImmArg<ArgIndex<3>>]>;
} // TargetPrefix = "amdgcn"
@@ -1435,15 +1416,14 @@ let TargetPrefix = "amdgcn" in {
// resources, it can create the fat pointers ptr addrspace(7) and ptr addrspace(9),
// which carry additional offset bits. When this intrinsic is used to create
// these fat pointers, their offset and index fields (if applicable) are zero.
-def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
+def int_amdgcn_make_buffer_rsrc : PureIntrinsic <
[llvm_anyptr_ty],
[llvm_anyptr_ty, // base
llvm_i16_ty, // stride (and swizzle control)
llvm_i64_ty, // NumRecords / extent
llvm_i32_ty], // flags
// Attributes lifted from ptrmask + some extra argument attributes.
- [IntrNoMem, ReadNone<ArgIndex<0>>,
- IntrSpeculatable]>;
+ [ReadNone<ArgIndex<0>>]>;
defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {
@@ -2364,12 +2344,12 @@ def int_amdgcn_live_mask : DefaultAttrsIntrinsic <[llvm_i1_ty],
def int_amdgcn_mbcnt_lo :
ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem]>;
+ [IntrNoMem, IntrNoCreateUndefOrPoison]>;
def int_amdgcn_mbcnt_hi :
ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem]>;
+ [IntrNoMem, IntrNoCreateUndefOrPoison]>;
// llvm.amdgcn.ds.swizzle src offset
def int_amdgcn_ds_swizzle :
@@ -2378,69 +2358,47 @@ def int_amdgcn_ds_swizzle :
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree,
ImmArg<ArgIndex<1>>]>;
-def int_amdgcn_ubfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],
- [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
->;
+def int_amdgcn_ubfe : PureIntrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty]>;
-def int_amdgcn_sbfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],
- [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
->;
+def int_amdgcn_sbfe : PureIntrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty]>;
def int_amdgcn_lerp :
ClangBuiltin<"__builtin_amdgcn_lerp">,
- DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
->;
+ PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
def int_amdgcn_sad_u8 :
ClangBuiltin<"__builtin_amdgcn_sad_u8">,
- DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
->;
+ PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
def int_amdgcn_msad_u8 :
ClangBuiltin<"__builtin_amdgcn_msad_u8">,
- DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
->;
+ PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
def int_amdgcn_sad_hi_u8 :
ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">,
- DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
->;
+ PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
def int_amdgcn_sad_u16 :
ClangBuiltin<"__builtin_amdgcn_sad_u16">,
- DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
->;
+ PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
def int_amdgcn_qsad_pk_u16_u8 :
ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">,
- DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
- [IntrNoMem, IntrSpeculatable]
->;
+ PureIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty]>;
def int_amdgcn_mqsad_pk_u16_u8 :
ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">,
- DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
- [IntrNoMem, IntrSpeculatable]
->;
+ PureIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty]>;
def int_amdgcn_mqsad_u32_u8 :
ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">,
- DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty],
- [IntrNoMem, IntrSpeculatable]
->;
+ PureIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty]>;
def int_amdgcn_cvt_pk_u8_f32 :
ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">,
- DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
->;
+ PureIntrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty]>;
def int_amdgcn_icmp :
Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty],
@@ -2456,7 +2414,7 @@ def int_amdgcn_fcmp :
// in all active lanes, and zero in all inactive lanes.
def int_amdgcn_ballot :
Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
- [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, IntrNoCreateUndefOrPoison]>;
// Inverse of ballot: return the bit corresponding to the current lane from the
// given mask.
@@ -2464,7 +2422,7 @@ def int_amdgcn_ballot :
// This is only defined for dynamically uniform masks and therefore convergent.
def int_amdgcn_inverse_ballot :
Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
- [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, IntrNoCreateUndefOrPoison]>;
// Lowers to S_BITREPLICATE_B64_B32.
// The argument must be uniform; otherwise, the result is undefined.
@@ -2503,7 +2461,7 @@ defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps;
def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
- [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, IntrNoCreateUndefOrPoison]>;
// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
@@ -2530,24 +2488,20 @@ def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">,
// mul24 intrinsics can return i32 or i64.
// When returning i64, they're lowered to a mul24/mulhi24 pair.
-def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_anyint_ty],
- [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_mul_i24 : PureIntrinsic<[llvm_anyint_ty],
+ [llvm_i32_ty, llvm_i32_ty]
>;
-def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_anyint_ty],
- [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_mul_u24 : PureIntrinsic<[llvm_anyint_ty],
+ [llvm_i32_ty, llvm_i32_ty]
>;
-def int_amdgcn_mulhi_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
- [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_mulhi_i24 : PureIntrinsic<[llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty]
>;
-def int_amdgcn_mulhi_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
- [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]
+def int_amdgcn_mulhi_u24 : PureIntrinsic<[llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty]
>;
// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
@@ -2686,14 +2640,14 @@ def int_amdgcn_set_inactive_chain_arg :
// Return if the given flat pointer points to a local memory address.
def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">,
- DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty],
- [IntrNoMem, IntrSpeculatable] // FIXME: This should be captures(ret: address)
+ PureIntrinsic<[llvm_i1_ty], [llvm_ptr_ty]
+ // FIXME: This should be captures(ret: address)
>;
// Return if the given flat pointer points to a prvate memory address.
def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">,
- DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty],
- [IntrNoMem, IntrSpeculatable] // FIXME: This should be captures(ret: address)
+ PureIntrinsic<[llvm_i1_ty], [llvm_ptr_ty]
+ // FIXME: This should be captures(ret: address)
>;
// A uniform tail call to a function with the `amdgpu_cs_chain` or
@@ -2958,7 +2912,8 @@ class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
LLVMMatchType<1>, // %B
LLVMMatchType<0>, // %C
],
- [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree,
+ IntrNoCreateUndefOrPoison]
>;
class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
@@ -2970,7 +2925,8 @@ class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
LLVMMatchType<0>, // %C
llvm_i1_ty, // %high (op_sel) for GFX11, 0 for GFX12
],
- [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree,
+ IntrNoCreateUndefOrPoison]
>;
class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
@@ -2984,7 +2940,8 @@ class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
LLVMMatchType<0>, // %C
llvm_i1_ty, // %clamp
],
- [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree,
+ IntrNoCreateUndefOrPoison]
>;
// WMMA GFX11Only
@@ -3051,7 +3008,7 @@ def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">,
Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
- ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
+ ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree, IntrNoCreateUndefOrPoison]>;
// llvm.amdgcn.permlanex16.var <old> <src0> <src1> <fi> <bound_control>
def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var">,
@@ -3077,7 +3034,8 @@ class AMDGPUSWmmacIntrinsicIdxReuse<LLVMType A, LLVMType B, LLVMType CD, LLVMTyp
llvm_i1_ty, // matrix_a_reuse
llvm_i1_ty, // matrix_b_reuse
],
- [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCreateUndefOrPoison,
+ ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]
>;
class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
@@ -3089,7 +3047,7 @@ class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Ind
LLVMMatchType<0>, // %C
Index // %Sparsity index for A
],
- [IntrNoMem, IntrConvergent, IntrWillReturn]
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCreateUndefOrPoison]
>;
class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
@@ -3104,7 +3062,7 @@ class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType I
Index, // %Sparsity index for A
llvm_i1_ty, // %clamp
],
- [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCreateUndefOrPoison, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
>;
defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX12 = {
@@ -3214,7 +3172,7 @@ def int_amdgcn_global_prefetch : ClangBuiltin<"__builtin_amdgcn_global_prefetch"
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2 :
ClangBuiltin<"__builtin_amdgcn_fdot2">,
- DefaultAttrsIntrinsic<
+ PureIntrinsic<
[llvm_float_ty], // %r
[
llvm_v2f16_ty, // %a
@@ -3222,42 +3180,40 @@ def int_amdgcn_fdot2 :
llvm_float_ty, // %c
llvm_i1_ty // %clamp
],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
+ [ImmArg<ArgIndex<3>>]
>;
// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f16_f16 :
ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">,
- DefaultAttrsIntrinsic<
+ PureIntrinsic<
[llvm_half_ty], // %r
[
llvm_v2f16_ty, // %a
llvm_v2f16_ty, // %b
llvm_half_ty // %c
- ],
- [IntrNoMem, IntrSpeculatable]
+ ]
>;
// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_bf16_bf16 :
ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
- DefaultAttrsIntrinsic<
+ PureIntrinsic<
[llvm_bfloat_ty], // %r
[
llvm_v2bf16_ty, // %a
llvm_v2bf16_ty, // %b
llvm_bfloat_ty // %c
- ],
- [IntrNoMem, IntrSpeculatable]
+ ]
>;
// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f32_bf16 :
ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">,
- DefaultAttrsIntrinsic<
+ PureIntrinsic<
[llvm_float_ty], // %r
[
llvm_v2bf16_ty, // %a
@@ -3265,7 +3221,7 @@ def int_amdgcn_fdot2_f32_bf16 :
llvm_float_ty, // %c
llvm_i1_ty // %clamp
],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
+ [ImmArg<ArgIndex<3>>]
>;
// f32 %r = llvm.amdgcn.fdot2c.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
@@ -3275,7 +3231,7 @@ def int_amdgcn_fdot2_f32_bf16 :
def int_amdgcn_fdot2c_f32_bf16 :
ClangBuiltin<"__builtin_amdgcn_fdot2c_f32_bf16">,
- DefaultAttrsIntrinsic<
+ PureIntrinsic<
[llvm_float_ty], // %r
[
llvm_v2bf16_ty, // %a
@@ -3283,14 +3239,14 @@ def int_amdgcn_fdot2c_f32_bf16 :
llvm_float_ty, // %c
llvm_i1_ty // %clamp
],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
+ [ImmArg<ArgIndex<3>>]
>;
// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_sdot2 :
ClangBuiltin<"__builtin_amdgcn_sdot2">,
- DefaultAttrsIntrinsic<
+ PureIntrinsic<
[llvm_i32_ty], // %r
[
llvm_v2i16_ty, // %a
@@ -3298,14 +3254,14 @@ def int_amdgcn_sdot2 :
llvm_i32_ty, // %c
llvm_i1_ty // %clamp
],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
+ [ImmArg<ArgIndex<3>>]
>;
// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_udot2 :
ClangBuiltin<"__builtin_amdgcn_udot2">,
- DefaultAttrsIntrinsic<
+ PureIntrinsic<
[llvm_i32_ty], // %r
[
llvm_v2i16_ty, // %a
@@ -3313,14 +3269,14 @@ def int_amdgcn_udot2 :
llvm_i32_ty, // %c
llvm_i1_ty // %clamp
],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
+ [ImmArg<ArgIndex<3>>]
>;
// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sdot4 :
ClangBuiltin<"__builtin_amdgcn_sdot4">,
- DefaultAttrsIntrinsic<
+ PureIntrinsic<
[llvm_i32_ty], // %r
[
llvm_i32_ty, // %a
@@ -3328,14 +3284,14 @@ def int_amdgcn_sdot4 :
llvm_i32_ty, // %c
llvm_i1_ty // %clamp
],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
+ [ImmArg<ArgIndex<3>>]
>;
// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_udot4 :
ClangBuiltin<"__builtin_amdgcn_udot4">,
- DefaultAttrsIntrinsic<
+ PureIntrinsic<
[llvm_i32_ty], // %r
[
llvm_i32_ty, // %a
@@ -3343,7 +3299,7 @@ def int_amdgcn_udot4 :
llvm_i32_ty, // %c
llvm_i1_ty // %clamp
],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
+ [ImmArg<ArgIndex<3>>]
>;
// i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp)
@@ -3353,7 +3309,7 @@ def int_amdgcn_udot4 :
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sudot4 :
ClangBuiltin<"__builtin_amdgcn_sudot4">,
- DefaultAttrsIntrinsic<
+ PureIntrinsic<
[llvm_i32_ty], // %r
[
llvm_i1_ty, // %a_sign
@@ -3363,8 +3319,7 @@ def int_amdgcn_sudot4 :
llvm_i32_ty, // %c
llvm_i1_ty // %clamp
],
- [IntrNoMem, IntrSpeculatable,
- ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
+ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
>;
// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
@@ -3372,7 +3327,7 @@ def int_amdgcn_sudot4 :
// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sdot8 :
ClangBuiltin<"__builtin_amdgcn_sdot8">,
- DefaultAttrsIntrinsic<
+ PureIntrinsic<
[llvm_i32_ty], // %r
[
llvm_i32_ty, // %a
@@ -3380,7 +3335,7 @@ def int_amdgcn_sdot8 :
llvm_i32_ty, // %c
llvm_i1_ty // %clamp
],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
+ [ImmArg<ArgIndex<3>>]
>;
// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
@@ -3388,7 +3343,7 @@ def int_amdgcn_sdot8 :
// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_udot8 :
ClangBuiltin<"__builtin_amdgcn_udot8">,
- DefaultAttrsIntrinsic<
+ PureIntrinsic<
[llvm_i32_ty], // %r
[
llvm_i32_ty, // %a
@@ -3396,7 +3351,7 @@ def int_amdgcn_udot8 :
llvm_i32_ty, // %c
llvm_i1_ty // %clamp
],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
+ [ImmArg<ArgIndex<3>>]
>;
// i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp)
@@ -3407,7 +3362,7 @@ def int_amdgcn_udot8 :
// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sudot8 :
ClangBuiltin<"__builtin_amdgcn_sudot8">,
- DefaultAttrsIntrinsic<
+ PureIntrinsic<
[llvm_i32_ty], // %r
[
llvm_i1_ty, // %a_sign
@@ -3417,22 +3372,20 @@ def int_amdgcn_udot8 :
llvm_i32_ty, // %c
llvm_i1_ty // %clamp
],
- [IntrNoMem, IntrSpeculatable,
- ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
+ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
>;
// f32 %r = llvm.amdgcn.dot4.f32.type_a.type_b (v4type_a (as i32) %a, v4type_b (as i32) %b, f32 %c)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
class AMDGPU8bitFloatDot4Intrinsic :
ClangBuiltin<!subst("int", "__builtin", NAME)>,
- DefaultAttrsIntrinsic<
+ PureIntrinsic<
[llvm_float_ty], // %r
[
llvm_i32_ty, // %a
llvm_i32_ty, // %b
llvm_float_ty, // %c
- ],
- [IntrNoMem, IntrSpeculatable]
+ ]
>;
def int_amdgcn_dot4_f32_fp8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
@@ -3450,7 +3403,7 @@ class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
DefaultAttrsIntrinsic<[DestTy],
[SrcABTy, SrcABTy, DestTy,
llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrConvergent, IntrNoMem,
+ [IntrConvergent, IntrNoMem, IntrNoCreateUndefOrPoison,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
@@ -3475,7 +3428,7 @@ class AMDGPUMfmaScaleIntrinsic<LLVMType DestTy> :
llvm_i32_ty, // op_sel (B matrix scale, 2-bits) // TODO: Make i2?
llvm_i32_ty // v_mfma_ld_scale_b32 src1 (B matrix scale)
],
- [IntrConvergent, IntrNoMem,
+ [IntrConvergent, IntrNoMem, IntrNoCreateUndefOrPoison,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<7>>
]>;
@@ -3539,7 +3492,7 @@ class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
DefaultAttrsIntrinsic<[DestTy],
[SrcA, SrcB, DestTy, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty],
- [IntrConvergent, IntrNoMem,
+ [IntrConvergent, IntrNoMem, IntrNoCreateUndefOrPoison,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
@@ -3592,59 +3545,58 @@ def int_amdgcn_cvt_f32_fp8_e5m3 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8_e5m
// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel
// word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes.
def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">,
- DefaultAttrsIntrinsic<[llvm_v2f32_ty],
+ PureIntrinsic<[llvm_v2f32_ty],
[llvm_i32_ty, llvm_i1_ty],
- [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ [ImmArg<ArgIndex<1>>]>;
// llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel.
def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">,
- DefaultAttrsIntrinsic<[llvm_v2f32_ty],
+ PureIntrinsic<[llvm_v2f32_ty],
[llvm_i32_ty, llvm_i1_ty],
- [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ [ImmArg<ArgIndex<1>>]>;
// llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
// word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes.
def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">,
- DefaultAttrsIntrinsic<[llvm_i32_ty],
+ PureIntrinsic<[llvm_i32_ty],
[llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+ [ImmArg<ArgIndex<3>>]>;
// llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">,
- DefaultAttrsIntrinsic<[llvm_i32_ty],
+ PureIntrinsic<[llvm_i32_ty],
[llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+ [ImmArg<ArgIndex<3>>]>;
// llvm.amdgcn.cvt.pk.fp8.f32.e5m3 int vdst, float srcA, float srcB, int old, imm word_sel
def int_amdgcn_cvt_pk_fp8_f32_e5m3 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32_e5m3">,
- DefaultAttrsIntrinsic<[llvm_i32_ty],
+ PureIntrinsic<[llvm_i32_ty],
[llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+ [ImmArg<ArgIndex<3>>]>;
// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
// byte_sel selects byte to write into vdst.
def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">,
- DefaultAttrsIntrinsic<[llvm_i32_ty],
+ PureIntrinsic<[llvm_i32_ty],
[llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+ [ImmArg<ArgIndex<3>>]>;
// llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
- DefaultAttrsIntrinsic<[llvm_i32_ty],
+ PureIntrinsic<[llvm_i32_ty],
[llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+ [ImmArg<ArgIndex<3>>]>;
// llvm.amdgcn.cvt.sr.fp8.f32.e5m3 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
def int_amdgcn_cvt_sr_fp8_f32_e5m3 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32_e5m3">,
- DefaultAttrsIntrinsic<[llvm_i32_ty],
+ PureIntrinsic<[llvm_i32_ty],
[llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+ [ImmArg<ArgIndex<3>>]>;
// llvm.amdgcn.cvt.off.fp32.i4 int srcA
def int_amdgcn_cvt_off_f32_i4: ClangBuiltin<"__builtin_amdgcn_cvt_off_f32_i4">,
- DefaultAttrsIntrinsic<[llvm_float_ty],
- [llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]>;
+ PureIntrinsic<[llvm_float_ty],
+ [llvm_i32_ty]>;
//===----------------------------------------------------------------------===//
// gfx950 intrinsics
@@ -3691,13 +3643,11 @@ def int_amdgcn_permlane32_swap :
// llvm.amdgcn.ashr_pk_i8_i32 int vdst, int src0, int src1 int src2
def int_amdgcn_ashr_pk_i8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_i8_i32">,
- DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]>;
+ PureIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
// llvm.amdgcn.ashr_pk_u8_i32 int vdst, int src0, int src1 int src2
def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
- DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]>;
+ PureIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
//===----------------------------------------------------------------------===//
// gfx1250 intrinsics
@@ -3745,15 +3695,15 @@ def int_amdgcn_cvt_f16_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f16_fp8">,
// llvm.amdgcn.cvt.f16.bf8 half vdst, int srcA, imm byte_sel [0..3]
def int_amdgcn_cvt_f16_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f16_bf8">,
- DefaultAttrsIntrinsic<[llvm_half_ty],
+ PureIntrinsic<[llvm_half_ty],
[llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ [ImmArg<ArgIndex<1>>]>;
def int_amdgcn_sat_pk4_i4_i8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_i4_i8">,
- DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
+ PureIntrinsic<[llvm_i16_ty], [llvm_i32_ty]>;
def int_amdgcn_sat_pk4_u4_u8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_u4_u8">,
- DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
+ PureIntrinsic<[llvm_i16_ty], [llvm_i32_ty]>;
// llvm.amdgcn.permlane.bcast <src0> <src1> <src2>
def int_amdgcn_permlane_bcast : ClangBuiltin<"__builtin_amdgcn_permlane_bcast">,
@@ -3786,20 +3736,17 @@ def int_amdgcn_permlane_idx_gen : ClangBuiltin<"__builtin_amdgcn_permlane_idx_ge
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
def int_amdgcn_perm_pk16_b4_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b4_u4">,
- DefaultAttrsIntrinsic<[llvm_v2i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty],
- [IntrNoMem, IntrSpeculatable]>;
+ PureIntrinsic<[llvm_v2i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty]>;
def int_amdgcn_perm_pk16_b6_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b6_u4">,
- DefaultAttrsIntrinsic<[llvm_v3i32_ty], [llvm_i32_ty, llvm_i64_ty, llvm_v2i32_ty],
- [IntrNoMem, IntrSpeculatable]>;
+ PureIntrinsic<[llvm_v3i32_ty], [llvm_i32_ty, llvm_i64_ty, llvm_v2i32_ty]>;
def int_amdgcn_perm_pk16_b8_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b8_u4">,
- DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_v2i32_ty],
- [IntrNoMem, IntrSpeculatable]>;
+ PureIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_v2i32_ty]>;
class AMDGPUAddMinMax<LLVMType Ty, string Name> : ClangBuiltin<"__builtin_amdgcn_"#Name>,
- DefaultAttrsIntrinsic<[Ty], [Ty, Ty, Ty, llvm_i1_ty /* clamp */],
- [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]>;
+ PureIntrinsic<[Ty], [Ty, Ty, Ty, llvm_i1_ty /* clamp */],
+ [ImmArg<ArgIndex<3>>]>;
def int_amdgcn_add_max_i32 : AMDGPUAddMinMax<llvm_i32_ty, "add_max_i32">;
def int_amdgcn_add_max_u32 : AMDGPUAddMinMax<llvm_i32_ty, "add_max_u32">;
@@ -3883,10 +3830,8 @@ def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback,
// Emit 2.5 ulp, no denormal division. Should only be inserted by
// pass based on !fpmath metadata.
-def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
- [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, IntrSpeculatable]
->;
+def int_amdgcn_fdiv_fast : PureIntrinsic<
+ [llvm_float_ty], [llvm_float_ty, llvm_float_ty]>;
// Async instructions increment ASYNCcnt which is modeled as InaccessibleMem.
class AMDGPUAsyncClusterLoadLDS : Intrinsic <
@@ -3971,7 +3916,7 @@ class AMDGPUWmmaIntrinsicModsAB<LLVMType AB, LLVMType CD> :
llvm_i1_ty, // %clamp
],
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
- IntrWillReturn, IntrNoCallback, IntrNoFree]
+ IntrWillReturn, IntrNoCallback, IntrNoFree, IntrNoCreateUndefOrPoison]
>;
class AMDGPUWmmaIntrinsicModsC<LLVMType AB, LLVMType CD> :
@@ -3986,7 +3931,7 @@ class AMDGPUWmmaIntrinsicModsC<LLVMType AB, LLVMType CD> :
llvm_i1_ty, // matrix_b_reuse
],
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>,
- IntrWillReturn, IntrNoCallback, IntrNoFree]
+ IntrWillReturn, IntrNoCallback, IntrNoFree, IntrNoCreateUndefOrPoison]
>;
class AMDGPUWmmaIntrinsicF4ModsC<LLVMType A, LLVMType B, LLVMType CD> :
@@ -3998,7 +3943,9 @@ class AMDGPUWmmaIntrinsicF4ModsC<LLVMType A, LLVMType B, LLVMType CD> :
llvm_i16_ty, // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs)
LLVMMatchType<0>, // %C
],
- [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<2>>,
+ IntrWillReturn, IntrNoCallback, IntrNoFree, IntrNoCreateUndefOrPoison
+ ]
>;
class AMDGPUWmmaIntrinsicModsAll<LLVMType AB, LLVMType CD> :
@@ -4085,7 +4032,7 @@ class AMDGPUWmmaScaleIntrinsicModsC<LLVMType scale_ty> :
],
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<6>>,
ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<9>>, ImmArg<ArgIndex<10>>, ImmArg<ArgIndex<12>>, ImmArg<ArgIndex<13>>,
- IntrWillReturn, IntrNoCallback, IntrNoFree]
+ IntrWillReturn, IntrNoCallback, IntrNoFree, IntrNoCreateUndefOrPoison]
>;
class AMDGPUWmmaScaleF4IntrinsicModsC<LLVMType scale_ty> :
@@ -4107,7 +4054,7 @@ class AMDGPUWmmaScaleF4IntrinsicModsC<LLVMType scale_ty> :
],
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<7>>,
ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<10>>, ImmArg<ArgIndex<11>>,
- IntrWillReturn, IntrNoCallback, IntrNoFree]
+ IntrWillReturn, IntrNoCallback, IntrNoFree, IntrNoCreateUndefOrPoison]
>;
defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX1250 = {
@@ -4155,7 +4102,8 @@ class AMDGPUSWmmacIntrinsicABIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType I
llvm_i1_ty, // matrix_a_reuse
llvm_i1_ty, // matrix_b_reuse
],
- [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>]
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCreateUndefOrPoison,
+ ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>]
>;
defset list<Intrinsic> AMDGPUSWMMACIntrinsicsGFX1250 = {
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 6daad1e7b05ef..bad655c7c4d1c 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -13,7 +13,7 @@
//===----------------------------------------------------------------------===//
// Guidelines on NVPTX Intrinsic design
//===----------------------------------------------------------------------===//
-//
+//
// The NVPTX intrinsics are used to model instructions in the PTX ISA.
// While simpler intrinsics can represent certain features effectively,
// more complex instructions like TMA and MMA are not as straightforward
@@ -36,15 +36,15 @@
//
// Below are a set of guidelines that may help in choosing
// an appropriate design for the complex intrinsics:
-//
+//
// 1. Each flag argument represents one set of instruction modifiers.
// These flags are compile-time integer constants.
-//
+//
// 2. When an intrinsic uses flags, document it with details of the
// flag usage in the ``NVPTXUsage.rst`` file.
// 3. Annotate all flag arguments with ImmArg<ArgIdx<>>.
// 4. Place the flag arguments at the end of the (actual)argument list.
-//
+//
// 5. Use `i1` for boolean flags and `i8` for others. Usually,
// the `i8` types represent an `enum` encoding the family of
// modifiers.
@@ -60,14 +60,14 @@
// first-order information in the intrinsic name while using flags
// for supplementary details improves readability.
// For example:
-//
+//
// i. For MMA intrinsics, 'dense' vs. 'sparse' is a fundamental feature,
// whereas an optional scaling applied to matrices is relatively secondary.
-//
+//
// ii. For TMAs, the mode of copy (e.g., 'Tile' or 'Im2col') is a first-order
// information, while features like an optional cache hint tend to be
// secondary.
-//
+//
// 8. If there are invalid combinations within a set of modifiers, avoid
// encoding them as flags, as much as possible. This helps reduce the
// need for error handling of unsupported cases in the backend.
@@ -77,13 +77,13 @@
// 9. Similarly, when there are invalid combinations across a set of
// modifiers, avoid encoding them as flags to prevent additional
// complexity in error handling.
-//
+//
// 10. Maintain a consistent design within an intrinsic family, including
// argument ordering as well as the usage and ordering of flags.
// 11. When designing an intrinsic corresponding to an instruction or its variant,
// consider the entire instruction family. This may reveal common features
// that can be modelled consistently across the family.
-//
+//
// In summary, strive to balance the aspects mentioned above, to achieve
// a scalable design with maximum readability.
//===----------------------------------------------------------------------===//
@@ -1266,7 +1266,13 @@ class NVVMBuiltin :
"NVVMBuiltin must be a NVVM intrinsic starting with 'int_nvvm_'";
}
-class PureIntrinsic<list<LLVMType> ret_types,
+// Note(krzysz00): This class is named `NVVMPureIntrinsic` because the
+// `PureIntrinsic` class I added to `Intrinsics.td` also adds the
+// new `nocreateundeforpoison` property (which means that if the operands
+// to the intrinsic aren't undef/poison, the result won't be either). I don't know
+// the NVVM intrinsics and so can't update the annotations. Someone from Nvidia
+// should go through and update these (or swap back to `PureIntrinsic` wholesale).
+class NVVMPureIntrinsic<list<LLVMType> ret_types,
list<LLVMType> param_types = [],
list<IntrinsicProperty> intr_properties = [],
string name = ""> :
@@ -1279,18 +1285,18 @@ let TargetPrefix = "nvvm" in {
// PRMT - permute
//
def int_nvvm_prmt : NVVMBuiltin,
- PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
foreach mode = ["f4e", "b4e"] in
def int_nvvm_prmt_ # mode :
- PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
// Note: these variants also have 2 source operands but only one will ever
// be used so we eliminate the other operand in the IR (0 is used as the
// placeholder in the backend).
foreach mode = ["rc8", "ecl", "ecr", "rc16"] in
def int_nvvm_prmt_ # mode :
- PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>;
//
// Nanosleep
@@ -1392,13 +1398,13 @@ let TargetPrefix = "nvvm" in {
//
foreach sign = ["", "u"] in {
def int_nvvm_sad_ # sign # s : NVVMBuiltin,
- PureIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty]>;
+ NVVMPureIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty]>;
def int_nvvm_sad_ # sign # i : NVVMBuiltin,
- PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
def int_nvvm_sad_ # sign # ll : NVVMBuiltin,
- PureIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty]>;
+ NVVMPureIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty]>;
}
//
@@ -1407,9 +1413,9 @@ let TargetPrefix = "nvvm" in {
foreach op = ["floor", "ceil"] in {
foreach ftz = ["", "_ftz"] in
def int_nvvm_ # op # ftz # _f : NVVMBuiltin,
- PureIntrinsic<[llvm_float_ty], [llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_float_ty], [llvm_float_ty]>;
def int_nvvm_ # op # _d : NVVMBuiltin,
- PureIntrinsic<[llvm_double_ty], [llvm_double_ty]>;
+ NVVMPureIntrinsic<[llvm_double_ty], [llvm_double_ty]>;
}
//
@@ -1417,45 +1423,45 @@ let TargetPrefix = "nvvm" in {
//
foreach ftz = ["", "_ftz"] in
def int_nvvm_fabs # ftz :
- PureIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+ NVVMPureIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
//
// Neg bf16, bf16x2
//
def int_nvvm_neg_bf16 : NVVMBuiltin,
- PureIntrinsic<[llvm_bfloat_ty], [llvm_bfloat_ty]>;
+ NVVMPureIntrinsic<[llvm_bfloat_ty], [llvm_bfloat_ty]>;
def int_nvvm_neg_bf16x2 : NVVMBuiltin,
- PureIntrinsic<[llvm_v2bf16_ty], [llvm_v2bf16_ty]>;
+ NVVMPureIntrinsic<[llvm_v2bf16_ty], [llvm_v2bf16_ty]>;
//
// Round
//
foreach ftz = ["", "_ftz"] in
def int_nvvm_round # ftz # _f : NVVMBuiltin,
- PureIntrinsic<[llvm_float_ty], [llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_float_ty], [llvm_float_ty]>;
def int_nvvm_round_d : NVVMBuiltin,
- PureIntrinsic<[llvm_double_ty], [llvm_double_ty]>;
+ NVVMPureIntrinsic<[llvm_double_ty], [llvm_double_ty]>;
//
// Trunc
//
foreach ftz = ["", "_ftz"] in
def int_nvvm_trunc # ftz # _f : NVVMBuiltin,
- PureIntrinsic<[llvm_float_ty], [llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_float_ty], [llvm_float_ty]>;
def int_nvvm_trunc_d : NVVMBuiltin,
- PureIntrinsic<[llvm_double_ty], [llvm_double_ty]>;
+ NVVMPureIntrinsic<[llvm_double_ty], [llvm_double_ty]>;
//
// Saturate
//
foreach ftz = ["", "_ftz"] in
def int_nvvm_saturate # ftz # _f : NVVMBuiltin,
- PureIntrinsic<[llvm_float_ty], [llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_float_ty], [llvm_float_ty]>;
def int_nvvm_saturate_d : NVVMBuiltin,
- PureIntrinsic<[llvm_double_ty], [llvm_double_ty]>;
+ NVVMPureIntrinsic<[llvm_double_ty], [llvm_double_ty]>;
//
// Exp2 Log2
@@ -1487,19 +1493,19 @@ let TargetPrefix = "nvvm" in {
foreach variant = ["", "_sat", "_relu"] in {
foreach ftz = ["", "_ftz"] in {
def int_nvvm_fma_rn # ftz # variant # _f16 :
- PureIntrinsic<[llvm_half_ty],
+ NVVMPureIntrinsic<[llvm_half_ty],
[llvm_half_ty, llvm_half_ty, llvm_half_ty]>;
def int_nvvm_fma_rn # ftz # variant # _f16x2 :
- PureIntrinsic<[llvm_v2f16_ty],
+ NVVMPureIntrinsic<[llvm_v2f16_ty],
[llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty]>;
def int_nvvm_fma_rn # ftz # variant # _bf16 : NVVMBuiltin,
- PureIntrinsic<[llvm_bfloat_ty],
+ NVVMPureIntrinsic<[llvm_bfloat_ty],
[llvm_bfloat_ty, llvm_bfloat_ty, llvm_bfloat_ty]>;
def int_nvvm_fma_rn # ftz # variant # _bf16x2 : NVVMBuiltin,
- PureIntrinsic<[llvm_v2bf16_ty],
+ NVVMPureIntrinsic<[llvm_v2bf16_ty],
[llvm_v2bf16_ty, llvm_v2bf16_ty, llvm_v2bf16_ty]>;
} // ftz
} // variant
@@ -1508,7 +1514,7 @@ let TargetPrefix = "nvvm" in {
foreach ftz = ["", "_ftz"] in {
foreach sat = ["", "_sat"] in {
def int_nvvm_fma # rnd # ftz # sat # _f : NVVMBuiltin,
- PureIntrinsic<[llvm_float_ty],
+ NVVMPureIntrinsic<[llvm_float_ty],
[llvm_float_ty, llvm_float_ty, llvm_float_ty]>;
} // sat
} // ftz
@@ -1591,10 +1597,10 @@ let TargetPrefix = "nvvm" in {
foreach a_type = ["s", "u"] in {
foreach b_type = ["s", "u"] in {
def int_nvvm_idp4a_ # a_type # _ # b_type :
- PureIntrinsic<[llvm_i32_ty],
+ NVVMPureIntrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
def int_nvvm_idp2a_ # a_type # _ # b_type :
- PureIntrinsic<[llvm_i32_ty],
+ NVVMPureIntrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
[ImmArg<ArgIndex<2>>]>;
}
@@ -1605,7 +1611,7 @@ let TargetPrefix = "nvvm" in {
//
foreach direction = ["l", "r"] in
def int_nvvm_fsh # direction # _clamp :
- PureIntrinsic<[llvm_anyint_ty],
+ NVVMPureIntrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
//
@@ -1613,7 +1619,7 @@ let TargetPrefix = "nvvm" in {
//
foreach sign = ["s", "u"] in
def int_nvvm_flo_ # sign :
- PureIntrinsic<[llvm_i32_ty], [llvm_anyint_ty, llvm_i1_ty],
+ NVVMPureIntrinsic<[llvm_i32_ty], [llvm_anyint_ty, llvm_i1_ty],
[ImmArg<ArgIndex<1>>]>;
//
@@ -1622,94 +1628,94 @@ let TargetPrefix = "nvvm" in {
foreach ext = ["sext", "zext"] in
foreach mode = ["wrap", "clamp"] in
def int_nvvm_ # ext # _ # mode :
- PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>;
//
// BMSK - bit mask
//
foreach mode = ["wrap", "clamp"] in
def int_nvvm_bmsk_ # mode :
- PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>;
//
// FNS - Find the n-th set bit
//
def int_nvvm_fns : NVVMBuiltin,
- PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
//
// Convert
//
- // TODO: All these intrinsics are defined as PureIntrinsic, this attaches the
+ // TODO: All these intrinsics are defined as NVVMPureIntrinsic, this attaches the
// IntrSpeculatable property to them. Consider if some of these should
// have this attribute removed as they may be too expensive.
//
def int_nvvm_lohi_i2d : NVVMBuiltin,
- PureIntrinsic<[llvm_double_ty], [llvm_i32_ty, llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_double_ty], [llvm_i32_ty, llvm_i32_ty]>;
def int_nvvm_d2i_lo : NVVMBuiltin,
- PureIntrinsic<[llvm_i32_ty], [llvm_double_ty]>;
+ NVVMPureIntrinsic<[llvm_i32_ty], [llvm_double_ty]>;
def int_nvvm_d2i_hi : NVVMBuiltin,
- PureIntrinsic<[llvm_i32_ty], [llvm_double_ty]>;
+ NVVMPureIntrinsic<[llvm_i32_ty], [llvm_double_ty]>;
foreach rnd = ["rn", "rz", "rm", "rp"] in {
foreach ftz = ["", "_ftz"] in
def int_nvvm_d2f_ # rnd # ftz : NVVMBuiltin,
- PureIntrinsic<[llvm_float_ty], [llvm_double_ty]>;
+ NVVMPureIntrinsic<[llvm_float_ty], [llvm_double_ty]>;
foreach sign = ["", "u"] in {
def int_nvvm_d2 # sign # i_ # rnd : NVVMBuiltin,
- PureIntrinsic<[llvm_i32_ty], [llvm_double_ty]>;
+ NVVMPureIntrinsic<[llvm_i32_ty], [llvm_double_ty]>;
def int_nvvm_ # sign # i2d_ # rnd : NVVMBuiltin,
- PureIntrinsic<[llvm_double_ty], [llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_double_ty], [llvm_i32_ty]>;
foreach ftz = ["", "_ftz"] in
def int_nvvm_f2 # sign # i_ # rnd # ftz : NVVMBuiltin,
- PureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>;
def int_nvvm_ # sign # i2f_ # rnd : NVVMBuiltin,
- PureIntrinsic<[llvm_float_ty], [llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_float_ty], [llvm_i32_ty]>;
foreach ftz = ["", "_ftz"] in
def int_nvvm_f2 # sign # ll_ # rnd # ftz : NVVMBuiltin,
- PureIntrinsic<[llvm_i64_ty], [llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_i64_ty], [llvm_float_ty]>;
def int_nvvm_d2 # sign # ll_ # rnd : NVVMBuiltin,
- PureIntrinsic<[llvm_i64_ty], [llvm_double_ty]>;
+ NVVMPureIntrinsic<[llvm_i64_ty], [llvm_double_ty]>;
def int_nvvm_ # sign # ll2f_ # rnd : NVVMBuiltin,
- PureIntrinsic<[llvm_float_ty], [llvm_i64_ty]>;
+ NVVMPureIntrinsic<[llvm_float_ty], [llvm_i64_ty]>;
def int_nvvm_ # sign # ll2d_ # rnd : NVVMBuiltin,
- PureIntrinsic<[llvm_double_ty], [llvm_i64_ty]>;
+ NVVMPureIntrinsic<[llvm_double_ty], [llvm_i64_ty]>;
} // sign
} // rnd
foreach ftz = ["", "_ftz"] in {
def int_nvvm_f2h_rn # ftz : NVVMBuiltin,
- PureIntrinsic<[llvm_i16_ty], [llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_i16_ty], [llvm_float_ty]>;
def int_nvvm_bf2h_rn # ftz : NVVMBuiltin,
- PureIntrinsic<[llvm_i16_ty], [llvm_bfloat_ty]>;
+ NVVMPureIntrinsic<[llvm_i16_ty], [llvm_bfloat_ty]>;
}
foreach rnd = ["rn", "rz"] in {
foreach relu = ["", "_relu"] in {
foreach satfinite = ["", "_satfinite"] in {
def int_nvvm_ff2bf16x2_ # rnd # relu # satfinite : NVVMBuiltin,
- PureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>;
def int_nvvm_ff2f16x2_ # rnd # relu # satfinite : NVVMBuiltin,
- PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>;
def int_nvvm_f2bf16_ # rnd # relu # satfinite : NVVMBuiltin,
- PureIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>;
-
+ NVVMPureIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>;
+
def int_nvvm_f2f16_ # rnd # relu # satfinite : NVVMBuiltin,
- PureIntrinsic<[llvm_half_ty], [llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_half_ty], [llvm_float_ty]>;
}
}
}
@@ -1719,78 +1725,78 @@ let TargetPrefix = "nvvm" in {
foreach relu = ["", "_relu"] in {
foreach satfinite = ["", "_satfinite"] in {
def int_nvvm_ff2f16x2_rs # relu # satfinite : NVVMBuiltin,
- PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty]>;
def int_nvvm_ff2bf16x2_rs # relu # satfinite : NVVMBuiltin,
- PureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty]>;
}
}
foreach satfinite = ["", "_satfinite"] in {
def int_nvvm_f2tf32_rna # satfinite : NVVMBuiltin,
- PureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>;
foreach rnd = ["rn", "rz"] in
foreach relu = ["", "_relu"] in
def int_nvvm_f2tf32_ # rnd # relu # satfinite : NVVMBuiltin,
- PureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>;
}
foreach type = ["e4m3x2", "e5m2x2"] in {
foreach relu = ["", "_relu"] in {
def int_nvvm_ff_to_ # type # _rn # relu : NVVMBuiltin,
- PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>;
def int_nvvm_f16x2_to_ # type # _rn # relu : NVVMBuiltin,
- PureIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty]>;
+ NVVMPureIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty]>;
def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin,
- PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>;
+ NVVMPureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>;
}
}
-
+
// RS rounding mode (Stochastic Rounding) conversions for f8x4 types
// The last i32 operand provides the random bits for the conversion
foreach type = ["e4m3x4", "e5m2x4"] in {
foreach relu = ["", "_relu"] in {
def int_nvvm_f32x4_to_ # type # _rs # relu # _satfinite : NVVMBuiltin,
- PureIntrinsic<[llvm_v4i8_ty], [llvm_v4f32_ty, llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_v4i8_ty], [llvm_v4f32_ty, llvm_i32_ty]>;
}
}
// FP4 conversions.
foreach relu = ["", "_relu"] in {
def int_nvvm_ff_to_e2m1x2_rn # relu # _satfinite : NVVMBuiltin,
- PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>;
def int_nvvm_e2m1x2_to_f16x2_rn # relu : NVVMBuiltin,
- PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>;
+ NVVMPureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>;
}
-
+
// RS rounding mode (Stochastic Rounding) conversions for f4x4 type
// The last i32 operand provides the random bits for the conversion
foreach relu = ["", "_relu"] in {
def int_nvvm_f32x4_to_e2m1x4_rs # relu # _satfinite : NVVMBuiltin,
- PureIntrinsic<[llvm_i16_ty], [llvm_v4f32_ty, llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_i16_ty], [llvm_v4f32_ty, llvm_i32_ty]>;
}
// FP6 conversions.
foreach type = ["e2m3x2", "e3m2x2"] in {
foreach relu = ["", "_relu"] in {
def int_nvvm_ff_to_ # type # _rn # relu # _satfinite : NVVMBuiltin,
- PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>;
def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin,
- PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>;
+ NVVMPureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>;
}
}
-
+
// RS rounding mode (Stochastic Rounding) conversions for f6x4 types
// The last i32 operand provides the random bits for the conversion
foreach type = ["e2m3x4", "e3m2x4"] in {
foreach relu = ["", "_relu"] in {
def int_nvvm_f32x4_to_ # type # _rs # relu # _satfinite : NVVMBuiltin,
- PureIntrinsic<[llvm_v4i8_ty], [llvm_v4f32_ty, llvm_i32_ty]>;
+ NVVMPureIntrinsic<[llvm_v4i8_ty], [llvm_v4f32_ty, llvm_i32_ty]>;
}
}
@@ -1799,16 +1805,16 @@ let TargetPrefix = "nvvm" in {
foreach satmode = ["", "_satfinite"] in {
defvar suffix = rmode # satmode;
def int_nvvm_ff_to_ue8m0x2 # suffix : NVVMBuiltin,
- PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>;
+ NVVMPureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>;
def int_nvvm_bf16x2_to_ue8m0x2 # suffix : NVVMBuiltin,
- PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>;
+ NVVMPureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>;
}
}
def int_nvvm_ue8m0x2_to_bf16x2 : NVVMBuiltin,
- PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty]>;
+ NVVMPureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty]>;
//
// Atomic operations
@@ -2121,7 +2127,7 @@ let IntrProperties = [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillRetur
// space when lowered during ISel.
//
def int_nvvm_internal_addrspace_wrap :
- PureIntrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty],
+ NVVMPureIntrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty],
[NoUndef<ArgIndex<0>>, NoUndef<RetIndex>]>;
// Move intrinsics, used in nvvm internally
@@ -2137,20 +2143,20 @@ let IntrProperties = [IntrNoMem] in {
// For getting the handle from a texture or surface variable
def int_nvvm_texsurf_handle
- : PureIntrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_anyptr_ty]>;
+ : NVVMPureIntrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_anyptr_ty]>;
def int_nvvm_texsurf_handle_internal
- : PureIntrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>;
+ : NVVMPureIntrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>;
/// Error / Warn
def int_nvvm_compiler_error : Intrinsic<[], [llvm_anyptr_ty]>;
def int_nvvm_compiler_warn : Intrinsic<[], [llvm_anyptr_ty]>;
-def int_nvvm_reflect : NVVMBuiltin, PureIntrinsic<[llvm_i32_ty], [llvm_ptr_ty]>;
+def int_nvvm_reflect : NVVMBuiltin, NVVMPureIntrinsic<[llvm_i32_ty], [llvm_ptr_ty]>;
// isspacep.{const, global, local, shared}
foreach space = ["const", "global", "local", "shared", "shared_cluster"] in
def int_nvvm_isspacep_ # space : NVVMBuiltin,
- PureIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [NoCapture<ArgIndex<0>>]>;
+ NVVMPureIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [NoCapture<ArgIndex<0>>]>;
//
// Texture Fetch
@@ -2304,7 +2310,7 @@ foreach vec = [TV_I8, TV_I16, TV_I32,
// Accessing special registers.
//
class PTXReadSRegIntrinsicNB_r32<list<IntrinsicProperty> properties = [], string name = "">
- : PureIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>] # properties, name>;
+ : NVVMPureIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>] # properties, name>;
class PTXReadSRegIntrinsic_r32<list<IntrinsicProperty> properties = []>
: PTXReadSRegIntrinsicNB_r32<properties>, NVVMBuiltin;
@@ -2758,7 +2764,7 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] in {
}
def int_nvvm_is_explicit_cluster
- : PureIntrinsic<[llvm_i1_ty], [], [NoUndef<RetIndex>],
+ : NVVMPureIntrinsic<[llvm_i1_ty], [], [NoUndef<RetIndex>],
"llvm.nvvm.is_explicit_cluster">;
// Setmaxnreg inc/dec intrinsics
@@ -2926,7 +2932,7 @@ let IntrProperties = [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<
}
def int_nvvm_prefetch_tensormap : DefaultAttrsIntrinsic<[], [llvm_anyptr_ty]>;
-
+
foreach eviction_priority = ["evict_normal", "evict_last"] in
def int_nvvm_prefetch_global_L2_ # eviction_priority : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>;
@@ -3174,12 +3180,12 @@ def int_nvvm_clusterlaunchcontrol_try_cancel_async_multicast_shared
// clusterlaunchcontrol.query_cancel.is_canceled
def int_nvvm_clusterlaunchcontrol_query_cancel_is_canceled
- : PureIntrinsic<[llvm_i1_ty], [llvm_i128_ty], [],
+ : NVVMPureIntrinsic<[llvm_i1_ty], [llvm_i128_ty], [],
"llvm.nvvm.clusterlaunchcontrol.query_cancel.is_canceled">;
foreach dim = ["x", "y", "z"] in
def int_nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_ # dim
- : PureIntrinsic<[llvm_i32_ty], [llvm_i128_ty], [],
+ : NVVMPureIntrinsic<[llvm_i32_ty], [llvm_i128_ty], [],
"llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid." # dim>;
//
@@ -3203,9 +3209,9 @@ foreach sp = [0, 1] in {
defvar scale_d_imm_range = [ImmArg<scale_d_imm>, Range<scale_d_imm, 0, 16>];
// Check if this is the specific llvm.nvvm.tcgen05.mma.tensor intrinsic.
- defvar is_target_intrinsic = !and(!eq(sp, 0),
- !eq(space, "tensor"),
- !eq(scale_d, 0),
+ defvar is_target_intrinsic = !and(!eq(sp, 0),
+ !eq(space, "tensor"),
+ !eq(scale_d, 0),
!eq(ashift, 0));
defvar base_properties = !listconcat(
@@ -3218,7 +3224,7 @@ foreach sp = [0, 1] in {
]
);
- defvar intrinsic_properties = !if(is_target_intrinsic,
+ defvar intrinsic_properties = !if(is_target_intrinsic,
!listconcat(base_properties,
[ArgInfo<ArgIndex<nargs>, [ArgName<"kind">, ImmArgPrinter<"printTcgen05MMAKind">]>,
ArgInfo<ArgIndex<!add(nargs, 1)>, [ArgName<"cta_group">]>,
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-min-agpr-alloc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-min-agpr-alloc.ll
index f730199e474f3..4db668e05cb21 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-min-agpr-alloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-min-agpr-alloc.ll
@@ -1031,7 +1031,7 @@ attributes #1 = { "amdgpu-waves-per-eu"="1,1" }
; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="1" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="2" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR3]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nocreateundeforpoison nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="4" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="6" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="5" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll
index 2776b9187724c..4e53df3924985 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll
@@ -7,12 +7,14 @@
@lds_3 = external addrspace(3) global [0 x i8], align 4
@lds_4 = external addrspace(3) global [0 x i8], align 8
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] {{\[}}[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
;.
; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
; CHECK: @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 8, !absolute_symbol [[META1:![0-9]+]]
; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 0, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 0, i32 32 } }, no_sanitize_address
-; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
-; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] {{\[}}[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
;.
define void @use_variables() sanitize_address {
; CHECK-LABEL: define void @use_variables(
@@ -248,7 +250,7 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nocreateundeforpoison nofree nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nounwind }
; CHECK: attributes #[[ATTR7]] = { nomerge }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test-asan.ll
index f33b30119754f..32601422c7e67 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test-asan.ll
@@ -66,7 +66,7 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK-NEXT: [[TMP38:%.*]] = and i1 [[TMP34]], [[TMP37]]
; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP38]])
; CHECK-NEXT: [[TMP40:%.*]] = icmp ne i64 [[TMP39]], 0
-; CHECK-NEXT: br i1 [[TMP40]], label [[ASAN_REPORT:%.*]], label [[TMP43:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP40]], label [[ASAN_REPORT:%.*]], label [[TMP43:%.*]], !prof [[PROF3:![0-9]+]]
; CHECK: asan.report:
; CHECK-NEXT: br i1 [[TMP38]], label [[TMP41:%.*]], label [[CONDFREE:%.*]]
; CHECK: 41:
@@ -103,11 +103,12 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nocreateundeforpoison nofree nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind }
; CHECK: attributes #[[ATTR6]] = { nomerge }
;.
; CHECK: [[META0]] = !{i32 0, i32 1}
; CHECK: [[META1]] = !{i32 8, i32 9}
-; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[META2:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access-asan.ll
index 40b1305a3b12c..bad2d8e0fb5f4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access-asan.ll
@@ -8,12 +8,14 @@
@lds_3 = external addrspace(3) global [0 x i8], align 4
@lds_4 = external addrspace(3) global [0 x i8], align 8
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] {{\[}}[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
;.
; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
; CHECK: @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 8, !absolute_symbol [[META1:![0-9]+]]
; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 0, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 0, i32 32 } }, no_sanitize_address
-; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
-; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] {{\[}}[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
;.
define void @use_variables() sanitize_address {
; CHECK-LABEL: define void @use_variables(
@@ -249,7 +251,7 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nocreateundeforpoison nofree nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nounwind }
; CHECK: attributes #[[ATTR7]] = { nomerge }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test-asan.ll
index f2cdc4c812db1..c5985e5cc4df8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test-asan.ll
@@ -88,7 +88,7 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK-NEXT: [[TMP53:%.*]] = and i1 [[TMP49]], [[TMP52]]
; CHECK-NEXT: [[TMP54:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP53]])
; CHECK-NEXT: [[TMP55:%.*]] = icmp ne i64 [[TMP54]], 0
-; CHECK-NEXT: br i1 [[TMP55]], label [[ASAN_REPORT:%.*]], label [[TMP58:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP55]], label [[ASAN_REPORT:%.*]], label [[TMP58:%.*]], !prof [[PROF3:![0-9]+]]
; CHECK: asan.report:
; CHECK-NEXT: br i1 [[TMP53]], label [[TMP56:%.*]], label [[CONDFREE:%.*]]
; CHECK: 56:
@@ -114,7 +114,7 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK-NEXT: [[TMP71:%.*]] = and i1 [[TMP66]], [[TMP70]]
; CHECK-NEXT: [[TMP72:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP71]])
; CHECK-NEXT: [[TMP73:%.*]] = icmp ne i64 [[TMP72]], 0
-; CHECK-NEXT: br i1 [[TMP73]], label [[ASAN_REPORT1:%.*]], label [[TMP76:%.*]], !prof [[PROF2]]
+; CHECK-NEXT: br i1 [[TMP73]], label [[ASAN_REPORT1:%.*]], label [[TMP76:%.*]], !prof [[PROF3]]
; CHECK: asan.report1:
; CHECK-NEXT: br i1 [[TMP71]], label [[TMP74:%.*]], label [[TMP75:%.*]]
; CHECK: 74:
@@ -139,7 +139,7 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK-NEXT: [[TMP88:%.*]] = and i1 [[TMP84]], [[TMP87]]
; CHECK-NEXT: [[TMP89:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP88]])
; CHECK-NEXT: [[TMP90:%.*]] = icmp ne i64 [[TMP89]], 0
-; CHECK-NEXT: br i1 [[TMP90]], label [[ASAN_REPORT2:%.*]], label [[TMP93:%.*]], !prof [[PROF2]]
+; CHECK-NEXT: br i1 [[TMP90]], label [[ASAN_REPORT2:%.*]], label [[TMP93:%.*]], !prof [[PROF3]]
; CHECK: asan.report2:
; CHECK-NEXT: br i1 [[TMP88]], label [[TMP91:%.*]], label [[TMP92:%.*]]
; CHECK: 91:
@@ -164,7 +164,7 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK-NEXT: [[TMP105:%.*]] = and i1 [[TMP101]], [[TMP104]]
; CHECK-NEXT: [[TMP106:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP105]])
; CHECK-NEXT: [[TMP107:%.*]] = icmp ne i64 [[TMP106]], 0
-; CHECK-NEXT: br i1 [[TMP107]], label [[ASAN_REPORT3:%.*]], label [[TMP110:%.*]], !prof [[PROF2]]
+; CHECK-NEXT: br i1 [[TMP107]], label [[ASAN_REPORT3:%.*]], label [[TMP110:%.*]], !prof [[PROF3]]
; CHECK: asan.report3:
; CHECK-NEXT: br i1 [[TMP105]], label [[TMP108:%.*]], label [[TMP109:%.*]]
; CHECK: 108:
@@ -203,11 +203,12 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nocreateundeforpoison nofree nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind }
; CHECK: attributes #[[ATTR6]] = { nomerge }
;.
; CHECK: [[META0]] = !{i32 0, i32 1}
; CHECK: [[META1]] = !{i32 8, i32 9}
-; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[META2:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll
index b9b4c90daea87..fa888a35cb8ba 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll
@@ -7,11 +7,13 @@
@lds_3 = external addrspace(3) global [3 x i8], align 4
@lds_4 = external addrspace(3) global [4 x i8], align 8
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] {{\[}}[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
;.
; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 3, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 4, i32 32 } }, no_sanitize_address
-; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
-; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] {{\[}}[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
;.
define void @use_variables() sanitize_address {
; CHECK-LABEL: define void @use_variables(
@@ -217,7 +219,7 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nocreateundeforpoison nofree nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nounwind }
; CHECK: attributes #[[ATTR7]] = { nomerge }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param-asan.ll
index a70db2259cc3f..a521d9d9d436b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param-asan.ll
@@ -6,10 +6,11 @@
@lds_var = internal addrspace(3) global [1024 x i32] poison, align 4
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel], no_sanitize_address
;.
; CHECK: @llvm.amdgcn.sw.lds.my_kernel = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0:![0-9]+]]
; CHECK: @llvm.amdgcn.sw.lds.my_kernel.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.my_kernel.md.type { %llvm.amdgcn.sw.lds.my_kernel.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.my_kernel.md.item { i32 32, i32 4096, i32 5120 } }, no_sanitize_address
-; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel], no_sanitize_address
;.
define void @my_function(ptr addrspace(3) %lds_arg) sanitize_address {
; CHECK-LABEL: define void @my_function(
@@ -33,7 +34,7 @@ define void @my_function(ptr addrspace(3) %lds_arg) sanitize_address {
; CHECK-NEXT: [[TMP17:%.*]] = and i1 [[TMP12]], [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP17]])
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i64 [[TMP18]], 0
-; CHECK-NEXT: br i1 [[TMP19]], label [[ASAN_REPORT:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP19]], label [[ASAN_REPORT:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]]
; CHECK: asan.report:
; CHECK-NEXT: br i1 [[TMP17]], label [[TMP20:%.*]], label [[TMP21:%.*]]
; CHECK: 20:
@@ -60,7 +61,7 @@ define void @my_function(ptr addrspace(3) %lds_arg) sanitize_address {
; CHECK-NEXT: [[TMP36:%.*]] = and i1 [[TMP31]], [[TMP35]]
; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP36]])
; CHECK-NEXT: [[TMP38:%.*]] = icmp ne i64 [[TMP37]], 0
-; CHECK-NEXT: br i1 [[TMP38]], label [[ASAN_REPORT1:%.*]], label [[TMP41:%.*]], !prof [[PROF1]]
+; CHECK-NEXT: br i1 [[TMP38]], label [[ASAN_REPORT1:%.*]], label [[TMP41:%.*]], !prof [[PROF2]]
; CHECK: asan.report1:
; CHECK-NEXT: br i1 [[TMP36]], label [[TMP39:%.*]], label [[TMP40:%.*]]
; CHECK: 39:
@@ -81,7 +82,7 @@ define void @my_function(ptr addrspace(3) %lds_arg) sanitize_address {
define amdgpu_kernel void @my_kernel() sanitize_address {
; CHECK-LABEL: define amdgpu_kernel void @my_kernel(
-; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
; CHECK-NEXT: WId:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -142,11 +143,12 @@ define amdgpu_kernel void @my_kernel() sanitize_address {
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nocreateundeforpoison nofree nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nounwind }
; CHECK: attributes #[[ATTR7]] = { nomerge }
;.
; CHECK: [[META0]] = !{i32 0, i32 1}
-; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
-; CHECK: [[META2]] = !{i32 0}
+; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[META3]] = !{i32 0}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-no-heap-ptr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-no-heap-ptr.ll
index 73ffcdd783ded..ab3300ea659b8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-no-heap-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-no-heap-ptr.ll
@@ -150,7 +150,7 @@ attributes #1 = { "amdgpu-no-heap-ptr" }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nocreateundeforpoison nofree nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind }
; CHECK: attributes #[[ATTR6]] = { nomerge }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-asan.ll
index 301bda7e0086e..c7550dd9576ec 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-asan.ll
@@ -149,7 +149,7 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nocreateundeforpoison nofree nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind }
; CHECK: attributes #[[ATTR6]] = { nomerge }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomic-cmpxchg-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomic-cmpxchg-asan.ll
index 02a241f947748..15b074c2d9c11 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomic-cmpxchg-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomic-cmpxchg-asan.ll
@@ -122,7 +122,7 @@ define amdgpu_kernel void @atomic_xchg_kernel(ptr addrspace(1) %out, [8 x i32],
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nocreateundeforpoison nofree nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind }
; CHECK: attributes #[[ATTR6]] = { nomerge }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomicrmw-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomicrmw-asan.ll
index b87b3fd824dd3..1b3664bf1e4e7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomicrmw-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomicrmw-asan.ll
@@ -204,7 +204,7 @@ define amdgpu_kernel void @atomicrmw_kernel(ptr addrspace(1) %arg0) sanitize_add
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nocreateundeforpoison nofree nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind }
; CHECK: attributes #[[ATTR6]] = { nomerge }
;.
diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-pass-second-run.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-pass-second-run.ll
index 60ef1079624d7..218a0dfa9b060 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/asan-pass-second-run.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/asan-pass-second-run.ll
@@ -8,9 +8,11 @@ target triple = "x86_64-unknown-linux-gnu"
; Function with sanitize_address is instrumented.
; Function Attrs: nounwind uwtable
;.
+; CHECK: @llvm.used = appending global [1 x ptr] [ptr @asan.module_ctor], section "llvm.metadata"
; CHECK: @___asan_globals_registered = common hidden global i64 0
; CHECK: @__start_asan_globals = extern_weak hidden global i64
; CHECK: @__stop_asan_globals = extern_weak hidden global i64
+; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @asan.module_ctor, ptr @asan.module_ctor }]
;.
define void @instr_sa(ptr %a) sanitize_address {
; CHECK: Function Attrs: sanitize_address
@@ -31,7 +33,7 @@ define void @instr_sa(ptr %a) sanitize_address {
; CHECK-NEXT: [[TMP10:%.*]] = icmp sge i8 [[TMP9]], [[TMP4]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[BB11:.*]], label %[[BB12]]
; CHECK: [[BB11]]:
-; CHECK-NEXT: call void @__asan_report_load4(i64 [[TMP0]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: call void @__asan_report_load4(i64 [[TMP0]]) #[[ATTR3:[0-9]+]]
; CHECK-NEXT: unreachable
; CHECK: [[BB12]]:
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4
@@ -47,8 +49,9 @@ entry:
}
;.
; CHECK: attributes #[[ATTR0]] = { sanitize_address }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; CHECK: attributes #[[ATTR2]] = { nomerge }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind }
+; CHECK: attributes #[[ATTR3]] = { nomerge }
;.
; CHECK: [[META0:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
diff --git a/llvm/test/tools/llvm-reduce/remove-attributes-convergent-uncontrolled.ll b/llvm/test/tools/llvm-reduce/remove-attributes-convergent-uncontrolled.ll
index d408f949db824..64e1588d043ae 100644
--- a/llvm/test/tools/llvm-reduce/remove-attributes-convergent-uncontrolled.ll
+++ b/llvm/test/tools/llvm-reduce/remove-attributes-convergent-uncontrolled.ll
@@ -24,8 +24,8 @@ declare float @convergent.extern.func(float, float) #0
declare float @extern.func(float, float)
declare float @llvm.amdgcn.readfirstlane.f32(float) #1
-; RESULT: attributes #0 = { convergent nocallback nofree nounwind willreturn memory(none) }
+; RESULT: attributes #0 = { convergent nocallback nocreateundeforpoison nofree nounwind willreturn memory(none) }
; RESULT-NOT: attributes
attributes #0 = { convergent nounwind }
-attributes #1 = { convergent nocallback nofree nounwind willreturn memory(none) }
+attributes #1 = { convergent nocallback nocreateundeforpoison nofree nounwind willreturn memory(none) }
More information about the llvm-commits
mailing list