[llvm] [AMDGPU] Allow forming overflow op and folding abd to usubo if it is legal. (PR #156266)

via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 31 17:24:06 PDT 2025


https://github.com/AZero13 created https://github.com/llvm/llvm-project/pull/156266

Because usubo and uaddo are legal on AMDGPU for 32-bit types, we want to use them whenever possible.

>From bc9731755cc77433b989249e4cd10675c78689ce Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Sun, 31 Aug 2025 20:23:15 -0400
Subject: [PATCH] [AMDGPU] Allow forming overflow op and folding abd to usubo
 if it is legal.

Because usubo and uaddo are legal on AMDGPU for 32-bit types, we want to use them whenever possible.
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |   4 +
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |   3 +-
 .../AMDGPU/addrspacecast-constantexpr.ll      |   4 +-
 .../AMDGPU/amdgpu-attributor-no-agpr.ll       |  27 +-
 .../AMDGPU/amdgpu-codegenprepare-sqrt.ll      |  54 +--
 .../AMDGPU/amdgpu-simplify-libcall-ceil.ll    |   4 +-
 .../AMDGPU/amdgpu-simplify-libcall-exp.ll     |  42 +-
 .../AMDGPU/amdgpu-simplify-libcall-exp2.ll    |  42 +-
 .../AMDGPU/amdgpu-simplify-libcall-fabs.ll    |   4 +-
 .../AMDGPU/amdgpu-simplify-libcall-floor.ll   |   4 +-
 .../AMDGPU/amdgpu-simplify-libcall-log.ll     |  42 +-
 .../AMDGPU/amdgpu-simplify-libcall-log10.ll   |  42 +-
 .../AMDGPU/amdgpu-simplify-libcall-log2.ll    |  42 +-
 .../AMDGPU/amdgpu-simplify-libcall-rint.ll    |   4 +-
 .../AMDGPU/amdgpu-simplify-libcall-round.ll   |   4 +-
 .../AMDGPU/amdgpu-simplify-libcall-sincos.ll  |   6 +-
 .../AMDGPU/amdgpu-simplify-libcall-trunc.ll   |   4 +-
 ...-lower-lds-dynamic-indirect-access-asan.ll |   6 +-
 ...pu-sw-lower-lds-dynamic-indirect-access.ll |  11 +-
 ...dgpu-sw-lower-lds-dynamic-lds-test-asan.ll |   5 +-
 .../amdgpu-sw-lower-lds-dynamic-lds-test.ll   |   1 +
 .../AMDGPU/amdgpu-sw-lower-lds-lower-all.ll   |  16 +
 ...ds-multi-static-dynamic-indirect-access.ll |   8 +-
 ...w-lower-lds-multiple-blocks-return-asan.ll |   1 +
 ...gpu-sw-lower-lds-multiple-blocks-return.ll |   1 +
 ...gpu-sw-lower-lds-non-kernel-declaration.ll |   6 +-
 ...lds-static-dynamic-indirect-access-asan.ll |   6 +-
 ...ower-lds-static-dynamic-indirect-access.ll |  11 +-
 ...-lower-lds-static-dynamic-lds-test-asan.ll |  11 +-
 ...pu-sw-lower-lds-static-dynamic-lds-test.ll |   1 +
 ...w-lower-lds-static-indirect-access-asan.ll |   6 +-
 ...tic-indirect-access-function-param-asan.ll |  14 +-
 ...s-static-indirect-access-function-param.ll |   5 +-
 ...er-lds-static-indirect-access-lower-all.ll |  12 +-
 ...r-lds-static-indirect-access-lower-none.ll |   2 +-
 ...-lds-static-indirect-access-nested-asan.ll |  26 +-
 ...lower-lds-static-indirect-access-nested.ll |  26 +-
 ...static-indirect-access-no-kernel-lds-id.ll |   6 +-
 ...gpu-sw-lower-lds-static-indirect-access.ll |   6 +-
 .../amdgpu-sw-lower-lds-static-lds-test.ll    |   1 +
 .../expand-scalar-carry-out-select-user.ll    |  35 +-
 .../CodeGen/AMDGPU/flat_atomics_i32_system.ll |  72 +--
 .../AMDGPU/global_atomics_i32_system.ll       |  72 +--
 .../CodeGen/AMDGPU/private-memory-atomics.ll  |   3 +-
 .../AMDGPU/promote-constOffset-to-imm.ll      | 459 +++++++++---------
 llvm/test/CodeGen/AMDGPU/sad.ll               |  26 +-
 46 files changed, 610 insertions(+), 577 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 438b6ff55c85f..1d8cea1a14c03 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3455,6 +3455,10 @@ class LLVM_ABI TargetLoweringBase {
   /// matching of other patterns.
   virtual bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
                                     bool MathUsed) const {
+    // Form it if it is legal.
+    if (isOperationLegal(Opcode, VT))
+      return true;
+
     // TODO: The default logic is inherited from code in CodeGenPrepare.
     // The opcode should not make a difference by default?
     if (Opcode != ISD::UADDO)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a8c7c16e2fa22..d13011019f3d4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9788,7 +9788,8 @@ SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const {
   // flag if the (scalar) type is illegal as this is more likely to legalize
   // cleanly:
   // abdu(lhs, rhs) -> sub(xor(sub(lhs, rhs), uof(lhs, rhs)), uof(lhs, rhs))
-  if (!IsSigned && VT.isScalarInteger() && !isTypeLegal(VT)) {
+  if (!IsSigned && (isOperationLegal(ISD::USUBO, VT) ||
+                    (VT.isScalarInteger() && !isTypeLegal(VT)))) {
     SDValue USubO =
         DAG.getNode(ISD::USUBO, dl, DAG.getVTList(VT, MVT::i1), {LHS, RHS});
     SDValue Cmp = DAG.getNode(ISD::SIGN_EXTEND, dl, VT, USubO.getValue(1));
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index 42c7b90da63d3..28f55511ebb6f 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -169,6 +169,6 @@ attributes #1 = { nounwind }
 
 ;.
 ; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 7e9cb7adf4fc2..181dab8d4ca79 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -105,7 +105,7 @@ declare void @unknown()
 
 define amdgpu_kernel void @kernel_calls_extern() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
-; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
 ; CHECK-NEXT:    call void @unknown()
 ; CHECK-NEXT:    ret void
 ;
@@ -115,8 +115,8 @@ define amdgpu_kernel void @kernel_calls_extern() {
 
 define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
-; CHECK-SAME: ) #[[ATTR2]] {
-; CHECK-NEXT:    call void @unknown() #[[ATTR6:[0-9]+]]
+; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK-NEXT:    call void @unknown() #[[ATTR7:[0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
   call void @unknown() #0
@@ -125,7 +125,7 @@ define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
 
 define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    call void [[INDIRECT]]()
 ; CHECK-NEXT:    ret void
 ;
@@ -135,8 +135,8 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
 
 define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    call void [[INDIRECT]]() #[[ATTR6]]
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    call void [[INDIRECT]]() #[[ATTR7]]
 ; CHECK-NEXT:    ret void
 ;
   call void %indirect() #0
@@ -254,11 +254,12 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 
 attributes #0 = { "amdgpu-agpr-alloc"="0" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
index 03c84d1193609..f8a38572c1544 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
@@ -6,7 +6,7 @@
 define amdgpu_kernel void @noop_sqrt_fpmath(ptr addrspace(1) %out, float %x) #0 {
 ; CHECK-LABEL: define amdgpu_kernel void @noop_sqrt_fpmath
 ; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT:    [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META0:![0-9]+]]
 ; CHECK-NEXT:    store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -20,9 +20,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32(ptr addrspace(1) %out, float %x) {
 ; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1:[0-9]+]] {
 ; IEEE-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
 ; IEEE-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1:![0-9]+]]
 ; IEEE-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2:![0-9]+]]
 ; IEEE-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
 ; IEEE-NEXT:    [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
 ; IEEE-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -51,7 +51,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32(ptr addrspace(1) %out, float %x) {
 ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1:[0-9]+]] {
 ; DAZ-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
 ; DAZ-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1:![0-9]+]]
 ; DAZ-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
 ; DAZ-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
 ; DAZ-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -88,9 +88,9 @@ define amdgpu_kernel void @sqrt_fpmath_v2f32(ptr addrspace(1) %out, <2 x float>
 ; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] {
 ; IEEE-NEXT:    [[NO_MD:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
 ; IEEE-NEXT:    store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT:    [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !1
+; IEEE-NEXT:    [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META1]]
 ; IEEE-NEXT:    store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT:    [[MD_1ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2
+; IEEE-NEXT:    [[MD_1ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META2]]
 ; IEEE-NEXT:    store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
 ; IEEE-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
 ; IEEE-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
@@ -149,7 +149,7 @@ define amdgpu_kernel void @sqrt_fpmath_v2f32(ptr addrspace(1) %out, <2 x float>
 ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] {
 ; DAZ-NEXT:    [[NO_MD:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
 ; DAZ-NEXT:    store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT:    [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !1
+; DAZ-NEXT:    [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META1]]
 ; DAZ-NEXT:    store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
 ; DAZ-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
 ; DAZ-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
@@ -206,7 +206,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nosub(ptr addrspace(1) %out, fl
 ; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[X:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
 ; CHECK-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1:![0-9]+]]
 ; CHECK-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
 ; CHECK-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -243,9 +243,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero(ptr addrspace(1) %out,
 ; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero) [[X:%.*]]) #[[ATTR1]] {
 ; IEEE-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
 ; IEEE-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
 ; IEEE-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2]]
 ; IEEE-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
 ; IEEE-NEXT:    [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
 ; IEEE-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -274,7 +274,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero(ptr addrspace(1) %out,
 ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero) [[X:%.*]]) #[[ATTR1]] {
 ; DAZ-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
 ; DAZ-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
 ; DAZ-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
 ; DAZ-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
 ; DAZ-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -311,9 +311,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub(ptr addrspace(1)
 ; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero nsub) [[X:%.*]]) #[[ATTR1]] {
 ; IEEE-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
 ; IEEE-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
 ; IEEE-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2]]
 ; IEEE-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
 ; IEEE-NEXT:    [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
 ; IEEE-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -342,7 +342,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub(ptr addrspace(1)
 ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero nsub) [[X:%.*]]) #[[ATTR1]] {
 ; DAZ-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
 ; DAZ-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
 ; DAZ-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
 ; DAZ-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
 ; DAZ-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -379,9 +379,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf(ptr addrsp
 ; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(inf nzero nsub) [[X:%.*]]) #[[ATTR1]] {
 ; IEEE-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
 ; IEEE-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
 ; IEEE-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2]]
 ; IEEE-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
 ; IEEE-NEXT:    [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
 ; IEEE-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -410,7 +410,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf(ptr addrsp
 ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(inf nzero nsub) [[X:%.*]]) #[[ATTR1]] {
 ; DAZ-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
 ; DAZ-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
 ; DAZ-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
 ; DAZ-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
 ; DAZ-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -447,9 +447,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nopsub(ptr addrspace(1) %out, f
 ; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(psub) [[X:%.*]]) #[[ATTR1]] {
 ; IEEE-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
 ; IEEE-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
 ; IEEE-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2]]
 ; IEEE-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
 ; IEEE-NEXT:    [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
 ; IEEE-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -478,7 +478,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nopsub(ptr addrspace(1) %out, f
 ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(psub) [[X:%.*]]) #[[ATTR1]] {
 ; DAZ-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
 ; DAZ-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
 ; DAZ-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
 ; DAZ-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
 ; DAZ-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -515,15 +515,15 @@ define amdgpu_kernel void @sqrt_fpmath_f32_afn(ptr addrspace(1) %out, float %x)
 ; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[NO_MD:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
 ; CHECK-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
 ; CHECK-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    [[MD_1ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT:    [[MD_1ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2:![0-9]+]]
 ; CHECK-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    [[MD_25ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT:    [[MD_25ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath [[META3:![0-9]+]]
 ; CHECK-NEXT:    store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    [[MD_3ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT:    [[MD_3ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath [[META0]]
 ; CHECK-NEXT:    store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    [[MD_2ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT:    [[MD_2ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath [[META4:![0-9]+]]
 ; CHECK-NEXT:    store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -555,7 +555,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_assume_nosub(ptr addrspace(1) %out, f
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[IS_NOT_SUBNORMAL]])
 ; CHECK-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
 ; CHECK-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
 ; CHECK-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
 ; CHECK-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -565,7 +565,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_assume_nosub(ptr addrspace(1) %out, f
 ; CHECK-NEXT:    store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    [[MD_2ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
 ; CHECK-NEXT:    store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    [[MD_3ULP_AFN:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT:    [[MD_3ULP_AFN:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath [[META0]]
 ; CHECK-NEXT:    store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    [[NO_MD_AFN:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
 ; CHECK-NEXT:    store volatile float [[NO_MD_AFN]], ptr addrspace(1) [[OUT]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-ceil.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-ceil.ll
index f5b8c805ad88c..b9d6bbc3b09b2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-ceil.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-ceil.ll
@@ -266,7 +266,7 @@ define <2 x float> @test_ceil_v2f32_preserve_flags(<2 x float> %arg) {
 define float @test_ceil_f32_preserve_flags_md(float %arg) {
 ; CHECK-LABEL: define float @test_ceil_f32_preserve_flags_md
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[CEIL:%.*]] = tail call nnan ninf float @llvm.ceil.f32(float [[ARG]]), !foo !0
+; CHECK-NEXT:    [[CEIL:%.*]] = tail call nnan ninf float @llvm.ceil.f32(float [[ARG]]), !foo [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret float [[CEIL]]
 ;
   %ceil = tail call nnan ninf float @_Z4ceilf(float %arg), !foo !0
@@ -276,7 +276,7 @@ define float @test_ceil_f32_preserve_flags_md(float %arg) {
 define <2 x float> @test_ceil_v2f32_preserve_flags_md(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_ceil_v2f32_preserve_flags_md
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[CEIL:%.*]] = tail call nnan nsz contract <2 x float> @llvm.ceil.v2f32(<2 x float> [[ARG]]), !foo !0
+; CHECK-NEXT:    [[CEIL:%.*]] = tail call nnan nsz contract <2 x float> @llvm.ceil.v2f32(<2 x float> [[ARG]]), !foo [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[CEIL]]
 ;
   %ceil = tail call contract nsz nnan <2 x float> @_Z4ceilDv2_f(<2 x float> %arg), !foo !0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-exp.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-exp.ll
index e06e40405b68a..794133bb4ab38 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-exp.ll
@@ -25,7 +25,7 @@ declare <16 x half> @_Z3expDv16_Dh(<16 x half>)
 define float @test_exp_f32(float %arg) {
 ; CHECK-LABEL: define float @test_exp_f32
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call float @_Z3expf(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call float @_Z3expf(float [[ARG]]), !fpmath [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret float [[EXP]]
 ;
   %exp = tail call float @_Z3expf(float %arg), !fpmath !0
@@ -35,7 +35,7 @@ define float @test_exp_f32(float %arg) {
 define <2 x float> @test_exp_v2f32(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_exp_v2f32
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call <2 x float> @_Z3expDv2_f(<2 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call <2 x float> @_Z3expDv2_f(<2 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[EXP]]
 ;
   %exp = tail call <2 x float> @_Z3expDv2_f(<2 x float> %arg), !fpmath !0
@@ -45,7 +45,7 @@ define <2 x float> @test_exp_v2f32(<2 x float> %arg) {
 define <3 x float> @test_exp_v3f32(<3 x float> %arg) {
 ; CHECK-LABEL: define <3 x float> @test_exp_v3f32
 ; CHECK-SAME: (<3 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call <3 x float> @_Z3expDv3_f(<3 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call <3 x float> @_Z3expDv3_f(<3 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <3 x float> [[EXP]]
 ;
   %exp = tail call <3 x float> @_Z3expDv3_f(<3 x float> %arg), !fpmath !0
@@ -55,7 +55,7 @@ define <3 x float> @test_exp_v3f32(<3 x float> %arg) {
 define <4 x float> @test_exp_v4f32(<4 x float> %arg) {
 ; CHECK-LABEL: define <4 x float> @test_exp_v4f32
 ; CHECK-SAME: (<4 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call <4 x float> @_Z3expDv4_f(<4 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call <4 x float> @_Z3expDv4_f(<4 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <4 x float> [[EXP]]
 ;
   %exp = tail call <4 x float> @_Z3expDv4_f(<4 x float> %arg), !fpmath !0
@@ -65,7 +65,7 @@ define <4 x float> @test_exp_v4f32(<4 x float> %arg) {
 define <8 x float> @test_exp_v8f32(<8 x float> %arg) {
 ; CHECK-LABEL: define <8 x float> @test_exp_v8f32
 ; CHECK-SAME: (<8 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call <8 x float> @_Z3expDv8_f(<8 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call <8 x float> @_Z3expDv8_f(<8 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <8 x float> [[EXP]]
 ;
   %exp = tail call <8 x float> @_Z3expDv8_f(<8 x float> %arg), !fpmath !0
@@ -75,7 +75,7 @@ define <8 x float> @test_exp_v8f32(<8 x float> %arg) {
 define <16 x float> @test_exp_v16f32(<16 x float> %arg) {
 ; CHECK-LABEL: define <16 x float> @test_exp_v16f32
 ; CHECK-SAME: (<16 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call <16 x float> @_Z3expDv16_f(<16 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call <16 x float> @_Z3expDv16_f(<16 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <16 x float> [[EXP]]
 ;
   %exp = tail call <16 x float> @_Z3expDv16_f(<16 x float> %arg), !fpmath !0
@@ -275,7 +275,7 @@ define <16 x half> @test_exp_v16f16(<16 x half> %arg) {
 define float @test_exp_f32_nobuiltin_callsite(float %arg) {
 ; CHECK-LABEL: define float @test_exp_f32_nobuiltin_callsite
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call float @_Z3expf(float [[ARG]]) #[[ATTR6:[0-9]+]], !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call float @_Z3expf(float [[ARG]]) #[[ATTR6:[0-9]+]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP]]
 ;
   %exp = tail call float @_Z3expf(float %arg) #0, !fpmath !0
@@ -285,7 +285,7 @@ define float @test_exp_f32_nobuiltin_callsite(float %arg) {
 define <2 x float> @test_exp_v2f32_nobuiltin_callsite(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_exp_v2f32_nobuiltin_callsite
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call <2 x float> @_Z3expDv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call <2 x float> @_Z3expDv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[EXP]]
 ;
   %exp = tail call <2 x float> @_Z3expDv2_f(<2 x float> %arg) #0, !fpmath !0
@@ -316,7 +316,7 @@ define <2 x float> @test_exp_cr_v2f32_nobuiltin_callsite(<2 x float> %arg) {
 define float @test_exp_f32_nobuiltins(float %arg) #1 {
 ; CHECK-LABEL: define float @test_exp_f32_nobuiltins
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call float @_Z3expf(float [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call float @_Z3expf(float [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP]]
 ;
   %exp = tail call float @_Z3expf(float %arg) #0, !fpmath !0
@@ -326,7 +326,7 @@ define float @test_exp_f32_nobuiltins(float %arg) #1 {
 define <2 x float> @test_exp_v2f32_nobuiltins(<2 x float> %arg) #1 {
 ; CHECK-LABEL: define <2 x float> @test_exp_v2f32_nobuiltins
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call <2 x float> @_Z3expDv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call <2 x float> @_Z3expDv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[EXP]]
 ;
   %exp = tail call <2 x float> @_Z3expDv2_f(<2 x float> %arg) #0, !fpmath !0
@@ -356,7 +356,7 @@ define <2 x float> @test_exp_cr_v2f32_nobuiltins(<2 x float> %arg) #1 {
 define float @test_exp_f32_preserve_flags(float %arg) {
 ; CHECK-LABEL: define float @test_exp_f32_preserve_flags
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call nnan ninf float @llvm.exp.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call nnan ninf float @llvm.exp.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP]]
 ;
   %exp = tail call nnan ninf float @_Z3expf(float %arg), !fpmath !0
@@ -366,7 +366,7 @@ define float @test_exp_f32_preserve_flags(float %arg) {
 define <2 x float> @test_exp_v2f32_preserve_flags(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_exp_v2f32_preserve_flags
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call nnan nsz contract <2 x float> @llvm.exp.v2f32(<2 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call nnan nsz contract <2 x float> @llvm.exp.v2f32(<2 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[EXP]]
 ;
   %exp = tail call contract nsz nnan <2 x float> @_Z3expDv2_f(<2 x float> %arg), !fpmath !0
@@ -376,7 +376,7 @@ define <2 x float> @test_exp_v2f32_preserve_flags(<2 x float> %arg) {
 define float @test_exp_f32_preserve_flags_md(float %arg) {
 ; CHECK-LABEL: define float @test_exp_f32_preserve_flags_md
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call nnan ninf float @llvm.exp.f32(float [[ARG]]), !fpmath !0, !foo !1
+; CHECK-NEXT:    [[EXP:%.*]] = tail call nnan ninf float @llvm.exp.f32(float [[ARG]]), !fpmath [[META0]], !foo [[META1:![0-9]+]]
 ; CHECK-NEXT:    ret float [[EXP]]
 ;
   %exp = tail call nnan ninf float @_Z3expf(float %arg), !fpmath !0, !foo !1
@@ -386,7 +386,7 @@ define float @test_exp_f32_preserve_flags_md(float %arg) {
 define <2 x float> @test_exp_v2f32_preserve_flags_md(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_exp_v2f32_preserve_flags_md
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call nnan nsz contract <2 x float> @llvm.exp.v2f32(<2 x float> [[ARG]]), !fpmath !0, !foo !1
+; CHECK-NEXT:    [[EXP:%.*]] = tail call nnan nsz contract <2 x float> @llvm.exp.v2f32(<2 x float> [[ARG]]), !fpmath [[META0]], !foo [[META1]]
 ; CHECK-NEXT:    ret <2 x float> [[EXP]]
 ;
   %exp = tail call contract nsz nnan <2 x float> @_Z3expDv2_f(<2 x float> %arg), !fpmath !0, !foo !1
@@ -440,7 +440,7 @@ define float @test_libm_exp_f32_fast(float %arg) {
 define float @test_libm_exp_f32_fpmath(float %arg) {
 ; CHECK-LABEL: define float @test_libm_exp_f32_fpmath
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call float @expf(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call float @expf(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP]]
 ;
   %exp = tail call float @expf(float %arg), !fpmath !0
@@ -470,7 +470,7 @@ define double @test_libm_exp_f64_fast(double %arg) {
 define double @test_libm_exp_f64_fpmath(double %arg) {
 ; CHECK-LABEL: define double @test_libm_exp_f64_fpmath
 ; CHECK-SAME: (double [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call double @exp(double [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call double @exp(double [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret double [[EXP]]
 ;
   %exp = tail call double @exp(double %arg), !fpmath !0
@@ -480,7 +480,7 @@ define double @test_libm_exp_f64_fpmath(double %arg) {
 define float @test_exp_f32_fast_noinline(float %arg) {
 ; CHECK-LABEL: define float @test_exp_f32_fast_noinline
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call fast float @_Z3expf(float [[ARG]]) #[[ATTR7:[0-9]+]], !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call fast float @_Z3expf(float [[ARG]]) #[[ATTR7:[0-9]+]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP]]
 ;
   %exp = tail call fast float @_Z3expf(float %arg) #3, !fpmath !0
@@ -490,7 +490,7 @@ define float @test_exp_f32_fast_noinline(float %arg) {
 define float @test_exp_f32_fast_optsize(float %arg) #4 {
 ; CHECK-LABEL: define float @test_exp_f32_fast_optsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call fast float @llvm.exp.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call fast float @llvm.exp.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP]]
 ;
   %exp = tail call fast float @_Z3expf(float %arg), !fpmath !0
@@ -500,7 +500,7 @@ define float @test_exp_f32_fast_optsize(float %arg) #4 {
 define float @test_exp_f32_fast_minsize(float %arg) #5 {
 ; CHECK-LABEL: define float @test_exp_f32_fast_minsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3:[0-9]+]] {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call fast float @llvm.exp.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call fast float @llvm.exp.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP]]
 ;
   %exp = tail call fast float @_Z3expf(float %arg), !fpmath !0
@@ -510,7 +510,7 @@ define float @test_exp_f32_fast_minsize(float %arg) #5 {
 define float @test_exp_f32_nsz_contract_optsize(float %arg) #4 {
 ; CHECK-LABEL: define float @test_exp_f32_nsz_contract_optsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call nsz contract float @llvm.exp.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call nsz contract float @llvm.exp.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP]]
 ;
   %exp = tail call nsz contract float @_Z3expf(float %arg), !fpmath !0
@@ -520,7 +520,7 @@ define float @test_exp_f32_nsz_contract_optsize(float %arg) #4 {
 define float @test_exp_f32_nsz_contract_minsize(float %arg) #5 {
 ; CHECK-LABEL: define float @test_exp_f32_nsz_contract_minsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
-; CHECK-NEXT:    [[EXP:%.*]] = tail call nsz contract float @_Z3expf(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP:%.*]] = tail call nsz contract float @_Z3expf(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP]]
 ;
   %exp = tail call nsz contract float @_Z3expf(float %arg), !fpmath !0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-exp2.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-exp2.ll
index e3c54cc181e5b..efe8466f2f689 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-exp2.ll
@@ -25,7 +25,7 @@ declare <16 x half> @_Z4exp2Dv16_Dh(<16 x half>)
 define float @test_exp2_f32(float %arg) {
 ; CHECK-LABEL: define float @test_exp2_f32
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call float @_Z4exp2f(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call float @_Z4exp2f(float [[ARG]]), !fpmath [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret float [[EXP2]]
 ;
   %exp2 = tail call float @_Z4exp2f(float %arg), !fpmath !0
@@ -35,7 +35,7 @@ define float @test_exp2_f32(float %arg) {
 define <2 x float> @test_exp2_v2f32(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_exp2_v2f32
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call <2 x float> @_Z4exp2Dv2_f(<2 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call <2 x float> @_Z4exp2Dv2_f(<2 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[EXP2]]
 ;
   %exp2 = tail call <2 x float> @_Z4exp2Dv2_f(<2 x float> %arg), !fpmath !0
@@ -45,7 +45,7 @@ define <2 x float> @test_exp2_v2f32(<2 x float> %arg) {
 define <3 x float> @test_exp2_v3f32(<3 x float> %arg) {
 ; CHECK-LABEL: define <3 x float> @test_exp2_v3f32
 ; CHECK-SAME: (<3 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call <3 x float> @_Z4exp2Dv3_f(<3 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call <3 x float> @_Z4exp2Dv3_f(<3 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <3 x float> [[EXP2]]
 ;
   %exp2 = tail call <3 x float> @_Z4exp2Dv3_f(<3 x float> %arg), !fpmath !0
@@ -55,7 +55,7 @@ define <3 x float> @test_exp2_v3f32(<3 x float> %arg) {
 define <4 x float> @test_exp2_v4f32(<4 x float> %arg) {
 ; CHECK-LABEL: define <4 x float> @test_exp2_v4f32
 ; CHECK-SAME: (<4 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call <4 x float> @_Z4exp2Dv4_f(<4 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call <4 x float> @_Z4exp2Dv4_f(<4 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <4 x float> [[EXP2]]
 ;
   %exp2 = tail call <4 x float> @_Z4exp2Dv4_f(<4 x float> %arg), !fpmath !0
@@ -65,7 +65,7 @@ define <4 x float> @test_exp2_v4f32(<4 x float> %arg) {
 define <8 x float> @test_exp2_v8f32(<8 x float> %arg) {
 ; CHECK-LABEL: define <8 x float> @test_exp2_v8f32
 ; CHECK-SAME: (<8 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call <8 x float> @_Z4exp2Dv8_f(<8 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call <8 x float> @_Z4exp2Dv8_f(<8 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <8 x float> [[EXP2]]
 ;
   %exp2 = tail call <8 x float> @_Z4exp2Dv8_f(<8 x float> %arg), !fpmath !0
@@ -75,7 +75,7 @@ define <8 x float> @test_exp2_v8f32(<8 x float> %arg) {
 define <16 x float> @test_exp2_v16f32(<16 x float> %arg) {
 ; CHECK-LABEL: define <16 x float> @test_exp2_v16f32
 ; CHECK-SAME: (<16 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call <16 x float> @_Z4exp2Dv16_f(<16 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call <16 x float> @_Z4exp2Dv16_f(<16 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <16 x float> [[EXP2]]
 ;
   %exp2 = tail call <16 x float> @_Z4exp2Dv16_f(<16 x float> %arg), !fpmath !0
@@ -275,7 +275,7 @@ define <16 x half> @test_exp2_v16f16(<16 x half> %arg) {
 define float @test_exp2_f32_nobuiltin_callsite(float %arg) {
 ; CHECK-LABEL: define float @test_exp2_f32_nobuiltin_callsite
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call float @_Z4exp2f(float [[ARG]]) #[[ATTR6:[0-9]+]], !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call float @_Z4exp2f(float [[ARG]]) #[[ATTR6:[0-9]+]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP2]]
 ;
   %exp2 = tail call float @_Z4exp2f(float %arg) #0, !fpmath !0
@@ -285,7 +285,7 @@ define float @test_exp2_f32_nobuiltin_callsite(float %arg) {
 define <2 x float> @test_exp2_v2f32_nobuiltin_callsite(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_exp2_v2f32_nobuiltin_callsite
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call <2 x float> @_Z4exp2Dv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call <2 x float> @_Z4exp2Dv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[EXP2]]
 ;
   %exp2 = tail call <2 x float> @_Z4exp2Dv2_f(<2 x float> %arg) #0, !fpmath !0
@@ -316,7 +316,7 @@ define <2 x float> @test_exp2_cr_v2f32_nobuiltin_callsite(<2 x float> %arg) {
 define float @test_exp2_f32_nobuiltins(float %arg) #1 {
 ; CHECK-LABEL: define float @test_exp2_f32_nobuiltins
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call float @_Z4exp2f(float [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call float @_Z4exp2f(float [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP2]]
 ;
   %exp2 = tail call float @_Z4exp2f(float %arg) #0, !fpmath !0
@@ -326,7 +326,7 @@ define float @test_exp2_f32_nobuiltins(float %arg) #1 {
 define <2 x float> @test_exp2_v2f32_nobuiltins(<2 x float> %arg) #1 {
 ; CHECK-LABEL: define <2 x float> @test_exp2_v2f32_nobuiltins
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call <2 x float> @_Z4exp2Dv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call <2 x float> @_Z4exp2Dv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[EXP2]]
 ;
   %exp2 = tail call <2 x float> @_Z4exp2Dv2_f(<2 x float> %arg) #0, !fpmath !0
@@ -356,7 +356,7 @@ define <2 x float> @test_exp2_cr_v2f32_nobuiltins(<2 x float> %arg) #1 {
 define float @test_exp2_f32_preserve_flags(float %arg) {
 ; CHECK-LABEL: define float @test_exp2_f32_preserve_flags
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call nnan ninf float @llvm.exp2.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call nnan ninf float @llvm.exp2.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP2]]
 ;
   %exp2 = tail call nnan ninf float @_Z4exp2f(float %arg), !fpmath !0
@@ -366,7 +366,7 @@ define float @test_exp2_f32_preserve_flags(float %arg) {
 define <2 x float> @test_exp2_v2f32_preserve_flags(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_exp2_v2f32_preserve_flags
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call nnan nsz contract <2 x float> @llvm.exp2.v2f32(<2 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call nnan nsz contract <2 x float> @llvm.exp2.v2f32(<2 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[EXP2]]
 ;
   %exp2 = tail call contract nsz nnan <2 x float> @_Z4exp2Dv2_f(<2 x float> %arg), !fpmath !0
@@ -376,7 +376,7 @@ define <2 x float> @test_exp2_v2f32_preserve_flags(<2 x float> %arg) {
 define float @test_exp2_f32_preserve_flags_md(float %arg) {
 ; CHECK-LABEL: define float @test_exp2_f32_preserve_flags_md
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call nnan ninf float @llvm.exp2.f32(float [[ARG]]), !fpmath !0, !foo !1
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call nnan ninf float @llvm.exp2.f32(float [[ARG]]), !fpmath [[META0]], !foo [[META1:![0-9]+]]
 ; CHECK-NEXT:    ret float [[EXP2]]
 ;
   %exp2 = tail call nnan ninf float @_Z4exp2f(float %arg), !fpmath !0, !foo !1
@@ -386,7 +386,7 @@ define float @test_exp2_f32_preserve_flags_md(float %arg) {
 define <2 x float> @test_exp2_v2f32_preserve_flags_md(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_exp2_v2f32_preserve_flags_md
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call nnan nsz contract <2 x float> @llvm.exp2.v2f32(<2 x float> [[ARG]]), !fpmath !0, !foo !1
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call nnan nsz contract <2 x float> @llvm.exp2.v2f32(<2 x float> [[ARG]]), !fpmath [[META0]], !foo [[META1]]
 ; CHECK-NEXT:    ret <2 x float> [[EXP2]]
 ;
   %exp2 = tail call contract nsz nnan <2 x float> @_Z4exp2Dv2_f(<2 x float> %arg), !fpmath !0, !foo !1
@@ -440,7 +440,7 @@ define float @test_libm_exp2_f32_fast(float %arg) {
 define float @test_libm_exp2_f32_fpmath(float %arg) {
 ; CHECK-LABEL: define float @test_libm_exp2_f32_fpmath
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call float @exp2f(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call float @exp2f(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP2]]
 ;
   %exp2 = tail call float @exp2f(float %arg), !fpmath !0
@@ -470,7 +470,7 @@ define double @test_libm_exp2_f64_fast(double %arg) {
 define double @test_libm_exp2_f64_fpmath(double %arg) {
 ; CHECK-LABEL: define double @test_libm_exp2_f64_fpmath
 ; CHECK-SAME: (double [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call double @exp2(double [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call double @exp2(double [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret double [[EXP2]]
 ;
   %exp2 = tail call double @exp2(double %arg), !fpmath !0
@@ -480,7 +480,7 @@ define double @test_libm_exp2_f64_fpmath(double %arg) {
 define float @test_exp2_f32_fast_noinline(float %arg) {
 ; CHECK-LABEL: define float @test_exp2_f32_fast_noinline
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call fast float @_Z4exp2f(float [[ARG]]) #[[ATTR7:[0-9]+]], !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call fast float @_Z4exp2f(float [[ARG]]) #[[ATTR7:[0-9]+]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP2]]
 ;
   %exp2 = tail call fast float @_Z4exp2f(float %arg) #3, !fpmath !0
@@ -490,7 +490,7 @@ define float @test_exp2_f32_fast_noinline(float %arg) {
 define float @test_exp2_f32_fast_optsize(float %arg) #4 {
 ; CHECK-LABEL: define float @test_exp2_f32_fast_optsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call fast float @llvm.exp2.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call fast float @llvm.exp2.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP2]]
 ;
   %exp2 = tail call fast float @_Z4exp2f(float %arg), !fpmath !0
@@ -500,7 +500,7 @@ define float @test_exp2_f32_fast_optsize(float %arg) #4 {
 define float @test_exp2_f32_fast_minsize(float %arg) #5 {
 ; CHECK-LABEL: define float @test_exp2_f32_fast_minsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3:[0-9]+]] {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call fast float @llvm.exp2.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call fast float @llvm.exp2.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP2]]
 ;
   %exp2 = tail call fast float @_Z4exp2f(float %arg), !fpmath !0
@@ -510,7 +510,7 @@ define float @test_exp2_f32_fast_minsize(float %arg) #5 {
 define float @test_exp2_f32_nsz_contract_optsize(float %arg) #4 {
 ; CHECK-LABEL: define float @test_exp2_f32_nsz_contract_optsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call nsz contract float @llvm.exp2.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call nsz contract float @llvm.exp2.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP2]]
 ;
   %exp2 = tail call nsz contract float @_Z4exp2f(float %arg), !fpmath !0
@@ -520,7 +520,7 @@ define float @test_exp2_f32_nsz_contract_optsize(float %arg) #4 {
 define float @test_exp2_f32_nsz_contract_minsize(float %arg) #5 {
 ; CHECK-LABEL: define float @test_exp2_f32_nsz_contract_minsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call nsz contract float @_Z4exp2f(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[EXP2:%.*]] = tail call nsz contract float @_Z4exp2f(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[EXP2]]
 ;
   %exp2 = tail call nsz contract float @_Z4exp2f(float %arg), !fpmath !0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-fabs.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-fabs.ll
index 09308df5ceaf1..0b05b2fb71be4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-fabs.ll
@@ -266,7 +266,7 @@ define <2 x float> @test_fabs_v2f32_preserve_flags(<2 x float> %arg) {
 define float @test_fabs_f32_preserve_flags_md(float %arg) {
 ; CHECK-LABEL: define float @test_fabs_f32_preserve_flags_md
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[FABS:%.*]] = tail call nnan ninf float @llvm.fabs.f32(float [[ARG]]), !foo !0
+; CHECK-NEXT:    [[FABS:%.*]] = tail call nnan ninf float @llvm.fabs.f32(float [[ARG]]), !foo [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret float [[FABS]]
 ;
   %fabs = tail call nnan ninf float @_Z4fabsf(float %arg), !foo !0
@@ -276,7 +276,7 @@ define float @test_fabs_f32_preserve_flags_md(float %arg) {
 define <2 x float> @test_fabs_v2f32_preserve_flags_md(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_fabs_v2f32_preserve_flags_md
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[FABS:%.*]] = tail call nnan nsz contract <2 x float> @llvm.fabs.v2f32(<2 x float> [[ARG]]), !foo !0
+; CHECK-NEXT:    [[FABS:%.*]] = tail call nnan nsz contract <2 x float> @llvm.fabs.v2f32(<2 x float> [[ARG]]), !foo [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[FABS]]
 ;
   %fabs = tail call contract nsz nnan <2 x float> @_Z4fabsDv2_f(<2 x float> %arg), !foo !0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-floor.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-floor.ll
index 216fb2bb4fbbc..fb0d5608fd0bc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-floor.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-floor.ll
@@ -266,7 +266,7 @@ define <2 x float> @test_rint_v2f32_preserve_flags(<2 x float> %arg) {
 define float @test_rint_f32_preserve_flags_md(float %arg) {
 ; CHECK-LABEL: define float @test_rint_f32_preserve_flags_md
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan ninf float @llvm.floor.f32(float [[ARG]]), !foo !0
+; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan ninf float @llvm.floor.f32(float [[ARG]]), !foo [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret float [[RINT]]
 ;
   %rint = tail call nnan ninf float @_Z5floorf(float %arg), !foo !0
@@ -276,7 +276,7 @@ define float @test_rint_f32_preserve_flags_md(float %arg) {
 define <2 x float> @test_rint_v2f32_preserve_flags_md(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_rint_v2f32_preserve_flags_md
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan nsz contract <2 x float> @llvm.floor.v2f32(<2 x float> [[ARG]]), !foo !0
+; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan nsz contract <2 x float> @llvm.floor.v2f32(<2 x float> [[ARG]]), !foo [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[RINT]]
 ;
   %rint = tail call contract nsz nnan <2 x float> @_Z5floorDv2_f(<2 x float> %arg), !foo !0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log.ll
index 61bf5fae73d56..70079144c56e9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log.ll
@@ -25,7 +25,7 @@ declare <16 x half> @_Z3logDv16_Dh(<16 x half>)
 define float @test_log_f32(float %arg) {
 ; CHECK-LABEL: define float @test_log_f32
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call float @_Z3logf(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call float @_Z3logf(float [[ARG]]), !fpmath [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret float [[LOG]]
 ;
   %log = tail call float @_Z3logf(float %arg), !fpmath !0
@@ -35,7 +35,7 @@ define float @test_log_f32(float %arg) {
 define <2 x float> @test_log_v2f32(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_log_v2f32
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call <2 x float> @_Z3logDv2_f(<2 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call <2 x float> @_Z3logDv2_f(<2 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG]]
 ;
   %log = tail call <2 x float> @_Z3logDv2_f(<2 x float> %arg), !fpmath !0
@@ -45,7 +45,7 @@ define <2 x float> @test_log_v2f32(<2 x float> %arg) {
 define <3 x float> @test_log_v3f32(<3 x float> %arg) {
 ; CHECK-LABEL: define <3 x float> @test_log_v3f32
 ; CHECK-SAME: (<3 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call <3 x float> @_Z3logDv3_f(<3 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call <3 x float> @_Z3logDv3_f(<3 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <3 x float> [[LOG]]
 ;
   %log = tail call <3 x float> @_Z3logDv3_f(<3 x float> %arg), !fpmath !0
@@ -55,7 +55,7 @@ define <3 x float> @test_log_v3f32(<3 x float> %arg) {
 define <4 x float> @test_log_v4f32(<4 x float> %arg) {
 ; CHECK-LABEL: define <4 x float> @test_log_v4f32
 ; CHECK-SAME: (<4 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call <4 x float> @_Z3logDv4_f(<4 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call <4 x float> @_Z3logDv4_f(<4 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <4 x float> [[LOG]]
 ;
   %log = tail call <4 x float> @_Z3logDv4_f(<4 x float> %arg), !fpmath !0
@@ -65,7 +65,7 @@ define <4 x float> @test_log_v4f32(<4 x float> %arg) {
 define <8 x float> @test_log_v8f32(<8 x float> %arg) {
 ; CHECK-LABEL: define <8 x float> @test_log_v8f32
 ; CHECK-SAME: (<8 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call <8 x float> @_Z3logDv8_f(<8 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call <8 x float> @_Z3logDv8_f(<8 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <8 x float> [[LOG]]
 ;
   %log = tail call <8 x float> @_Z3logDv8_f(<8 x float> %arg), !fpmath !0
@@ -75,7 +75,7 @@ define <8 x float> @test_log_v8f32(<8 x float> %arg) {
 define <16 x float> @test_log_v16f32(<16 x float> %arg) {
 ; CHECK-LABEL: define <16 x float> @test_log_v16f32
 ; CHECK-SAME: (<16 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call <16 x float> @_Z3logDv16_f(<16 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call <16 x float> @_Z3logDv16_f(<16 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <16 x float> [[LOG]]
 ;
   %log = tail call <16 x float> @_Z3logDv16_f(<16 x float> %arg), !fpmath !0
@@ -275,7 +275,7 @@ define <16 x half> @test_log_v16f16(<16 x half> %arg) {
 define float @test_log_f32_nobuiltin_callsite(float %arg) {
 ; CHECK-LABEL: define float @test_log_f32_nobuiltin_callsite
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call float @_Z3logf(float [[ARG]]) #[[ATTR6:[0-9]+]], !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call float @_Z3logf(float [[ARG]]) #[[ATTR6:[0-9]+]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG]]
 ;
   %log = tail call float @_Z3logf(float %arg) #0, !fpmath !0
@@ -285,7 +285,7 @@ define float @test_log_f32_nobuiltin_callsite(float %arg) {
 define <2 x float> @test_log_v2f32_nobuiltin_callsite(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_log_v2f32_nobuiltin_callsite
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call <2 x float> @_Z3logDv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call <2 x float> @_Z3logDv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG]]
 ;
   %log = tail call <2 x float> @_Z3logDv2_f(<2 x float> %arg) #0, !fpmath !0
@@ -316,7 +316,7 @@ define <2 x float> @test_log_cr_v2f32_nobuiltin_callsite(<2 x float> %arg) {
 define float @test_log_f32_nobuiltins(float %arg) #1 {
 ; CHECK-LABEL: define float @test_log_f32_nobuiltins
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call float @_Z3logf(float [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call float @_Z3logf(float [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG]]
 ;
   %log = tail call float @_Z3logf(float %arg) #0, !fpmath !0
@@ -326,7 +326,7 @@ define float @test_log_f32_nobuiltins(float %arg) #1 {
 define <2 x float> @test_log_v2f32_nobuiltins(<2 x float> %arg) #1 {
 ; CHECK-LABEL: define <2 x float> @test_log_v2f32_nobuiltins
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call <2 x float> @_Z3logDv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call <2 x float> @_Z3logDv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG]]
 ;
   %log = tail call <2 x float> @_Z3logDv2_f(<2 x float> %arg) #0, !fpmath !0
@@ -356,7 +356,7 @@ define <2 x float> @test_log_cr_v2f32_nobuiltins(<2 x float> %arg) #1 {
 define float @test_log_f32_preserve_flags(float %arg) {
 ; CHECK-LABEL: define float @test_log_f32_preserve_flags
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call nnan ninf float @llvm.log.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call nnan ninf float @llvm.log.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG]]
 ;
   %log = tail call nnan ninf float @_Z3logf(float %arg), !fpmath !0
@@ -366,7 +366,7 @@ define float @test_log_f32_preserve_flags(float %arg) {
 define <2 x float> @test_log_v2f32_preserve_flags(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_log_v2f32_preserve_flags
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call nnan nsz contract <2 x float> @llvm.log.v2f32(<2 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call nnan nsz contract <2 x float> @llvm.log.v2f32(<2 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG]]
 ;
   %log = tail call contract nsz nnan <2 x float> @_Z3logDv2_f(<2 x float> %arg), !fpmath !0
@@ -376,7 +376,7 @@ define <2 x float> @test_log_v2f32_preserve_flags(<2 x float> %arg) {
 define float @test_log_f32_preserve_flags_md(float %arg) {
 ; CHECK-LABEL: define float @test_log_f32_preserve_flags_md
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call nnan ninf float @llvm.log.f32(float [[ARG]]), !fpmath !0, !foo !1
+; CHECK-NEXT:    [[LOG:%.*]] = tail call nnan ninf float @llvm.log.f32(float [[ARG]]), !fpmath [[META0]], !foo [[META1:![0-9]+]]
 ; CHECK-NEXT:    ret float [[LOG]]
 ;
   %log = tail call nnan ninf float @_Z3logf(float %arg), !fpmath !0, !foo !1
@@ -386,7 +386,7 @@ define float @test_log_f32_preserve_flags_md(float %arg) {
 define <2 x float> @test_log_v2f32_preserve_flags_md(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_log_v2f32_preserve_flags_md
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call nnan nsz contract <2 x float> @llvm.log.v2f32(<2 x float> [[ARG]]), !fpmath !0, !foo !1
+; CHECK-NEXT:    [[LOG:%.*]] = tail call nnan nsz contract <2 x float> @llvm.log.v2f32(<2 x float> [[ARG]]), !fpmath [[META0]], !foo [[META1]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG]]
 ;
   %log = tail call contract nsz nnan <2 x float> @_Z3logDv2_f(<2 x float> %arg), !fpmath !0, !foo !1
@@ -440,7 +440,7 @@ define float @test_libm_log_f32_fast(float %arg) {
 define float @test_libm_log_f32_fpmath(float %arg) {
 ; CHECK-LABEL: define float @test_libm_log_f32_fpmath
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call float @logf(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call float @logf(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG]]
 ;
   %log = tail call float @logf(float %arg), !fpmath !0
@@ -470,7 +470,7 @@ define double @test_libm_log_f64_fast(double %arg) {
 define double @test_libm_log_f64_fpmath(double %arg) {
 ; CHECK-LABEL: define double @test_libm_log_f64_fpmath
 ; CHECK-SAME: (double [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call double @log(double [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call double @log(double [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret double [[LOG]]
 ;
   %log = tail call double @log(double %arg), !fpmath !0
@@ -480,7 +480,7 @@ define double @test_libm_log_f64_fpmath(double %arg) {
 define float @test_log_f32_fast_noinline(float %arg) {
 ; CHECK-LABEL: define float @test_log_f32_fast_noinline
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call fast float @_Z3logf(float [[ARG]]) #[[ATTR7:[0-9]+]], !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call fast float @_Z3logf(float [[ARG]]) #[[ATTR7:[0-9]+]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG]]
 ;
   %log = tail call fast float @_Z3logf(float %arg) #3, !fpmath !0
@@ -490,7 +490,7 @@ define float @test_log_f32_fast_noinline(float %arg) {
 define float @test_log_f32_fast_optsize(float %arg) #4 {
 ; CHECK-LABEL: define float @test_log_f32_fast_optsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call fast float @llvm.log.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call fast float @llvm.log.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG]]
 ;
   %log = tail call fast float @_Z3logf(float %arg), !fpmath !0
@@ -500,7 +500,7 @@ define float @test_log_f32_fast_optsize(float %arg) #4 {
 define float @test_log_f32_fast_minsize(float %arg) #5 {
 ; CHECK-LABEL: define float @test_log_f32_fast_minsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3:[0-9]+]] {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call fast float @llvm.log.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call fast float @llvm.log.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG]]
 ;
   %log = tail call fast float @_Z3logf(float %arg), !fpmath !0
@@ -510,7 +510,7 @@ define float @test_log_f32_fast_minsize(float %arg) #5 {
 define float @test_log_f32_nsz_contract_optsize(float %arg) #4 {
 ; CHECK-LABEL: define float @test_log_f32_nsz_contract_optsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call nsz contract float @llvm.log.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call nsz contract float @llvm.log.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG]]
 ;
   %log = tail call nsz contract float @_Z3logf(float %arg), !fpmath !0
@@ -520,7 +520,7 @@ define float @test_log_f32_nsz_contract_optsize(float %arg) #4 {
 define float @test_log_f32_nsz_contract_minsize(float %arg) #5 {
 ; CHECK-LABEL: define float @test_log_f32_nsz_contract_minsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
-; CHECK-NEXT:    [[LOG:%.*]] = tail call nsz contract float @_Z3logf(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG:%.*]] = tail call nsz contract float @_Z3logf(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG]]
 ;
   %log = tail call nsz contract float @_Z3logf(float %arg), !fpmath !0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log10.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log10.ll
index 6ae04360dc9a1..f2b27fce663fc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log10.ll
@@ -25,7 +25,7 @@ declare <16 x half> @_Z5log10Dv16_Dh(<16 x half>)
 define float @test_log10_f32(float %arg) {
 ; CHECK-LABEL: define float @test_log10_f32
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call float @_Z5log10f(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call float @_Z5log10f(float [[ARG]]), !fpmath [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret float [[LOG10]]
 ;
   %log10 = tail call float @_Z5log10f(float %arg), !fpmath !0
@@ -35,7 +35,7 @@ define float @test_log10_f32(float %arg) {
 define <2 x float> @test_log10_v2f32(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_log10_v2f32
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call <2 x float> @_Z5log10Dv2_f(<2 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call <2 x float> @_Z5log10Dv2_f(<2 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG10]]
 ;
   %log10 = tail call <2 x float> @_Z5log10Dv2_f(<2 x float> %arg), !fpmath !0
@@ -45,7 +45,7 @@ define <2 x float> @test_log10_v2f32(<2 x float> %arg) {
 define <3 x float> @test_log10_v3f32(<3 x float> %arg) {
 ; CHECK-LABEL: define <3 x float> @test_log10_v3f32
 ; CHECK-SAME: (<3 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call <3 x float> @_Z5log10Dv3_f(<3 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call <3 x float> @_Z5log10Dv3_f(<3 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <3 x float> [[LOG10]]
 ;
   %log10 = tail call <3 x float> @_Z5log10Dv3_f(<3 x float> %arg), !fpmath !0
@@ -55,7 +55,7 @@ define <3 x float> @test_log10_v3f32(<3 x float> %arg) {
 define <4 x float> @test_log10_v4f32(<4 x float> %arg) {
 ; CHECK-LABEL: define <4 x float> @test_log10_v4f32
 ; CHECK-SAME: (<4 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call <4 x float> @_Z5log10Dv4_f(<4 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call <4 x float> @_Z5log10Dv4_f(<4 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <4 x float> [[LOG10]]
 ;
   %log10 = tail call <4 x float> @_Z5log10Dv4_f(<4 x float> %arg), !fpmath !0
@@ -65,7 +65,7 @@ define <4 x float> @test_log10_v4f32(<4 x float> %arg) {
 define <8 x float> @test_log10_v8f32(<8 x float> %arg) {
 ; CHECK-LABEL: define <8 x float> @test_log10_v8f32
 ; CHECK-SAME: (<8 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call <8 x float> @_Z5log10Dv8_f(<8 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call <8 x float> @_Z5log10Dv8_f(<8 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <8 x float> [[LOG10]]
 ;
   %log10 = tail call <8 x float> @_Z5log10Dv8_f(<8 x float> %arg), !fpmath !0
@@ -75,7 +75,7 @@ define <8 x float> @test_log10_v8f32(<8 x float> %arg) {
 define <16 x float> @test_log10_v16f32(<16 x float> %arg) {
 ; CHECK-LABEL: define <16 x float> @test_log10_v16f32
 ; CHECK-SAME: (<16 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call <16 x float> @_Z5log10Dv16_f(<16 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call <16 x float> @_Z5log10Dv16_f(<16 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <16 x float> [[LOG10]]
 ;
   %log10 = tail call <16 x float> @_Z5log10Dv16_f(<16 x float> %arg), !fpmath !0
@@ -275,7 +275,7 @@ define <16 x half> @test_log10_v16f16(<16 x half> %arg) {
 define float @test_log10_f32_nobuiltin_callsite(float %arg) {
 ; CHECK-LABEL: define float @test_log10_f32_nobuiltin_callsite
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call float @_Z5log10f(float [[ARG]]) #[[ATTR6:[0-9]+]], !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call float @_Z5log10f(float [[ARG]]) #[[ATTR6:[0-9]+]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG10]]
 ;
   %log10 = tail call float @_Z5log10f(float %arg) #0, !fpmath !0
@@ -285,7 +285,7 @@ define float @test_log10_f32_nobuiltin_callsite(float %arg) {
 define <2 x float> @test_log10_v2f32_nobuiltin_callsite(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_log10_v2f32_nobuiltin_callsite
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call <2 x float> @_Z5log10Dv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call <2 x float> @_Z5log10Dv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG10]]
 ;
   %log10 = tail call <2 x float> @_Z5log10Dv2_f(<2 x float> %arg) #0, !fpmath !0
@@ -316,7 +316,7 @@ define <2 x float> @test_log10_cr_v2f32_nobuiltin_callsite(<2 x float> %arg) {
 define float @test_log10_f32_nobuiltins(float %arg) #1 {
 ; CHECK-LABEL: define float @test_log10_f32_nobuiltins
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call float @_Z5log10f(float [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call float @_Z5log10f(float [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG10]]
 ;
   %log10 = tail call float @_Z5log10f(float %arg) #0, !fpmath !0
@@ -326,7 +326,7 @@ define float @test_log10_f32_nobuiltins(float %arg) #1 {
 define <2 x float> @test_log10_v2f32_nobuiltins(<2 x float> %arg) #1 {
 ; CHECK-LABEL: define <2 x float> @test_log10_v2f32_nobuiltins
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call <2 x float> @_Z5log10Dv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call <2 x float> @_Z5log10Dv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG10]]
 ;
   %log10 = tail call <2 x float> @_Z5log10Dv2_f(<2 x float> %arg) #0, !fpmath !0
@@ -356,7 +356,7 @@ define <2 x float> @test_log10_cr_v2f32_nobuiltins(<2 x float> %arg) #1 {
 define float @test_log10_f32_preserve_flags(float %arg) {
 ; CHECK-LABEL: define float @test_log10_f32_preserve_flags
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call nnan ninf float @llvm.log10.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call nnan ninf float @llvm.log10.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG10]]
 ;
   %log10 = tail call nnan ninf float @_Z5log10f(float %arg), !fpmath !0
@@ -366,7 +366,7 @@ define float @test_log10_f32_preserve_flags(float %arg) {
 define <2 x float> @test_log10_v2f32_preserve_flags(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_log10_v2f32_preserve_flags
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call nnan nsz contract <2 x float> @llvm.log10.v2f32(<2 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call nnan nsz contract <2 x float> @llvm.log10.v2f32(<2 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG10]]
 ;
   %log10 = tail call contract nsz nnan <2 x float> @_Z5log10Dv2_f(<2 x float> %arg), !fpmath !0
@@ -376,7 +376,7 @@ define <2 x float> @test_log10_v2f32_preserve_flags(<2 x float> %arg) {
 define float @test_log10_f32_preserve_flags_md(float %arg) {
 ; CHECK-LABEL: define float @test_log10_f32_preserve_flags_md
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call nnan ninf float @llvm.log10.f32(float [[ARG]]), !fpmath !0, !foo !1
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call nnan ninf float @llvm.log10.f32(float [[ARG]]), !fpmath [[META0]], !foo [[META1:![0-9]+]]
 ; CHECK-NEXT:    ret float [[LOG10]]
 ;
   %log10 = tail call nnan ninf float @_Z5log10f(float %arg), !fpmath !0, !foo !1
@@ -386,7 +386,7 @@ define float @test_log10_f32_preserve_flags_md(float %arg) {
 define <2 x float> @test_log10_v2f32_preserve_flags_md(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_log10_v2f32_preserve_flags_md
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call nnan nsz contract <2 x float> @llvm.log10.v2f32(<2 x float> [[ARG]]), !fpmath !0, !foo !1
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call nnan nsz contract <2 x float> @llvm.log10.v2f32(<2 x float> [[ARG]]), !fpmath [[META0]], !foo [[META1]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG10]]
 ;
   %log10 = tail call contract nsz nnan <2 x float> @_Z5log10Dv2_f(<2 x float> %arg), !fpmath !0, !foo !1
@@ -440,7 +440,7 @@ define float @test_libm_log10_f32_fast(float %arg) {
 define float @test_libm_log10_f32_fpmath(float %arg) {
 ; CHECK-LABEL: define float @test_libm_log10_f32_fpmath
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call float @log10f(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call float @log10f(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG10]]
 ;
   %log10 = tail call float @log10f(float %arg), !fpmath !0
@@ -470,7 +470,7 @@ define double @test_libm_log10_f64_fast(double %arg) {
 define double @test_libm_log10_f64_fpmath(double %arg) {
 ; CHECK-LABEL: define double @test_libm_log10_f64_fpmath
 ; CHECK-SAME: (double [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call double @log10(double [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call double @log10(double [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret double [[LOG10]]
 ;
   %log10 = tail call double @log10(double %arg), !fpmath !0
@@ -480,7 +480,7 @@ define double @test_libm_log10_f64_fpmath(double %arg) {
 define float @test_log10_f32_fast_noinline(float %arg) {
 ; CHECK-LABEL: define float @test_log10_f32_fast_noinline
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call fast float @_Z5log10f(float [[ARG]]) #[[ATTR7:[0-9]+]], !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call fast float @_Z5log10f(float [[ARG]]) #[[ATTR7:[0-9]+]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG10]]
 ;
   %log10 = tail call fast float @_Z5log10f(float %arg) #3, !fpmath !0
@@ -490,7 +490,7 @@ define float @test_log10_f32_fast_noinline(float %arg) {
 define float @test_log10_f32_fast_optsize(float %arg) #4 {
 ; CHECK-LABEL: define float @test_log10_f32_fast_optsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call fast float @llvm.log10.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call fast float @llvm.log10.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG10]]
 ;
   %log10 = tail call fast float @_Z5log10f(float %arg), !fpmath !0
@@ -500,7 +500,7 @@ define float @test_log10_f32_fast_optsize(float %arg) #4 {
 define float @test_log10_f32_fast_minsize(float %arg) #5 {
 ; CHECK-LABEL: define float @test_log10_f32_fast_minsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3:[0-9]+]] {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call fast float @llvm.log10.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call fast float @llvm.log10.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG10]]
 ;
   %log10 = tail call fast float @_Z5log10f(float %arg), !fpmath !0
@@ -510,7 +510,7 @@ define float @test_log10_f32_fast_minsize(float %arg) #5 {
 define float @test_log10_f32_nsz_contract_optsize(float %arg) #4 {
 ; CHECK-LABEL: define float @test_log10_f32_nsz_contract_optsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call nsz contract float @llvm.log10.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call nsz contract float @llvm.log10.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG10]]
 ;
   %log10 = tail call nsz contract float @_Z5log10f(float %arg), !fpmath !0
@@ -520,7 +520,7 @@ define float @test_log10_f32_nsz_contract_optsize(float %arg) #4 {
 define float @test_log10_f32_nsz_contract_minsize(float %arg) #5 {
 ; CHECK-LABEL: define float @test_log10_f32_nsz_contract_minsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
-; CHECK-NEXT:    [[LOG10:%.*]] = tail call nsz contract float @_Z5log10f(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG10:%.*]] = tail call nsz contract float @_Z5log10f(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG10]]
 ;
   %log10 = tail call nsz contract float @_Z5log10f(float %arg), !fpmath !0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log2.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log2.ll
index 77e12b5d28268..b7818daab4bbe 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log2.ll
@@ -25,7 +25,7 @@ declare <16 x half> @_Z4log2Dv16_Dh(<16 x half>)
 define float @test_log2_f32(float %arg) {
 ; CHECK-LABEL: define float @test_log2_f32
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call float @_Z4log2f(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call float @_Z4log2f(float [[ARG]]), !fpmath [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret float [[LOG2]]
 ;
   %log2 = tail call float @_Z4log2f(float %arg), !fpmath !0
@@ -35,7 +35,7 @@ define float @test_log2_f32(float %arg) {
 define <2 x float> @test_log2_v2f32(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_log2_v2f32
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call <2 x float> @_Z4log2Dv2_f(<2 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call <2 x float> @_Z4log2Dv2_f(<2 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG2]]
 ;
   %log2 = tail call <2 x float> @_Z4log2Dv2_f(<2 x float> %arg), !fpmath !0
@@ -45,7 +45,7 @@ define <2 x float> @test_log2_v2f32(<2 x float> %arg) {
 define <3 x float> @test_log2_v3f32(<3 x float> %arg) {
 ; CHECK-LABEL: define <3 x float> @test_log2_v3f32
 ; CHECK-SAME: (<3 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call <3 x float> @_Z4log2Dv3_f(<3 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call <3 x float> @_Z4log2Dv3_f(<3 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <3 x float> [[LOG2]]
 ;
   %log2 = tail call <3 x float> @_Z4log2Dv3_f(<3 x float> %arg), !fpmath !0
@@ -55,7 +55,7 @@ define <3 x float> @test_log2_v3f32(<3 x float> %arg) {
 define <4 x float> @test_log2_v4f32(<4 x float> %arg) {
 ; CHECK-LABEL: define <4 x float> @test_log2_v4f32
 ; CHECK-SAME: (<4 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call <4 x float> @_Z4log2Dv4_f(<4 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call <4 x float> @_Z4log2Dv4_f(<4 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <4 x float> [[LOG2]]
 ;
   %log2 = tail call <4 x float> @_Z4log2Dv4_f(<4 x float> %arg), !fpmath !0
@@ -65,7 +65,7 @@ define <4 x float> @test_log2_v4f32(<4 x float> %arg) {
 define <8 x float> @test_log2_v8f32(<8 x float> %arg) {
 ; CHECK-LABEL: define <8 x float> @test_log2_v8f32
 ; CHECK-SAME: (<8 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call <8 x float> @_Z4log2Dv8_f(<8 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call <8 x float> @_Z4log2Dv8_f(<8 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <8 x float> [[LOG2]]
 ;
   %log2 = tail call <8 x float> @_Z4log2Dv8_f(<8 x float> %arg), !fpmath !0
@@ -75,7 +75,7 @@ define <8 x float> @test_log2_v8f32(<8 x float> %arg) {
 define <16 x float> @test_log2_v16f32(<16 x float> %arg) {
 ; CHECK-LABEL: define <16 x float> @test_log2_v16f32
 ; CHECK-SAME: (<16 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call <16 x float> @_Z4log2Dv16_f(<16 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call <16 x float> @_Z4log2Dv16_f(<16 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <16 x float> [[LOG2]]
 ;
   %log2 = tail call <16 x float> @_Z4log2Dv16_f(<16 x float> %arg), !fpmath !0
@@ -275,7 +275,7 @@ define <16 x half> @test_log2_v16f16(<16 x half> %arg) {
 define float @test_log2_f32_nobuiltin_callsite(float %arg) {
 ; CHECK-LABEL: define float @test_log2_f32_nobuiltin_callsite
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call float @_Z4log2f(float [[ARG]]) #[[ATTR6:[0-9]+]], !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call float @_Z4log2f(float [[ARG]]) #[[ATTR6:[0-9]+]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG2]]
 ;
   %log2 = tail call float @_Z4log2f(float %arg) #0, !fpmath !0
@@ -285,7 +285,7 @@ define float @test_log2_f32_nobuiltin_callsite(float %arg) {
 define <2 x float> @test_log2_v2f32_nobuiltin_callsite(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_log2_v2f32_nobuiltin_callsite
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call <2 x float> @_Z4log2Dv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call <2 x float> @_Z4log2Dv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG2]]
 ;
   %log2 = tail call <2 x float> @_Z4log2Dv2_f(<2 x float> %arg) #0, !fpmath !0
@@ -316,7 +316,7 @@ define <2 x float> @test_log2_cr_v2f32_nobuiltin_callsite(<2 x float> %arg) {
 define float @test_log2_f32_nobuiltins(float %arg) #1 {
 ; CHECK-LABEL: define float @test_log2_f32_nobuiltins
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call float @_Z4log2f(float [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call float @_Z4log2f(float [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG2]]
 ;
   %log2 = tail call float @_Z4log2f(float %arg) #0, !fpmath !0
@@ -326,7 +326,7 @@ define float @test_log2_f32_nobuiltins(float %arg) #1 {
 define <2 x float> @test_log2_v2f32_nobuiltins(<2 x float> %arg) #1 {
 ; CHECK-LABEL: define <2 x float> @test_log2_v2f32_nobuiltins
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call <2 x float> @_Z4log2Dv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call <2 x float> @_Z4log2Dv2_f(<2 x float> [[ARG]]) #[[ATTR6]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG2]]
 ;
   %log2 = tail call <2 x float> @_Z4log2Dv2_f(<2 x float> %arg) #0, !fpmath !0
@@ -356,7 +356,7 @@ define <2 x float> @test_log2_cr_v2f32_nobuiltins(<2 x float> %arg) #1 {
 define float @test_log2_f32_preserve_flags(float %arg) {
 ; CHECK-LABEL: define float @test_log2_f32_preserve_flags
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call nnan ninf float @llvm.log2.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call nnan ninf float @llvm.log2.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG2]]
 ;
   %log2 = tail call nnan ninf float @_Z4log2f(float %arg), !fpmath !0
@@ -366,7 +366,7 @@ define float @test_log2_f32_preserve_flags(float %arg) {
 define <2 x float> @test_log2_v2f32_preserve_flags(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_log2_v2f32_preserve_flags
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call nnan nsz contract <2 x float> @llvm.log2.v2f32(<2 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call nnan nsz contract <2 x float> @llvm.log2.v2f32(<2 x float> [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG2]]
 ;
   %log2 = tail call contract nsz nnan <2 x float> @_Z4log2Dv2_f(<2 x float> %arg), !fpmath !0
@@ -376,7 +376,7 @@ define <2 x float> @test_log2_v2f32_preserve_flags(<2 x float> %arg) {
 define float @test_log2_f32_preserve_flags_md(float %arg) {
 ; CHECK-LABEL: define float @test_log2_f32_preserve_flags_md
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call nnan ninf float @llvm.log2.f32(float [[ARG]]), !fpmath !0, !foo !1
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call nnan ninf float @llvm.log2.f32(float [[ARG]]), !fpmath [[META0]], !foo [[META1:![0-9]+]]
 ; CHECK-NEXT:    ret float [[LOG2]]
 ;
   %log2 = tail call nnan ninf float @_Z4log2f(float %arg), !fpmath !0, !foo !1
@@ -386,7 +386,7 @@ define float @test_log2_f32_preserve_flags_md(float %arg) {
 define <2 x float> @test_log2_v2f32_preserve_flags_md(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_log2_v2f32_preserve_flags_md
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call nnan nsz contract <2 x float> @llvm.log2.v2f32(<2 x float> [[ARG]]), !fpmath !0, !foo !1
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call nnan nsz contract <2 x float> @llvm.log2.v2f32(<2 x float> [[ARG]]), !fpmath [[META0]], !foo [[META1]]
 ; CHECK-NEXT:    ret <2 x float> [[LOG2]]
 ;
   %log2 = tail call contract nsz nnan <2 x float> @_Z4log2Dv2_f(<2 x float> %arg), !fpmath !0, !foo !1
@@ -440,7 +440,7 @@ define float @test_libm_log2_f32_fast(float %arg) {
 define float @test_libm_log2_f32_fpmath(float %arg) {
 ; CHECK-LABEL: define float @test_libm_log2_f32_fpmath
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call float @log2f(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call float @log2f(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG2]]
 ;
   %log2 = tail call float @log2f(float %arg), !fpmath !0
@@ -470,7 +470,7 @@ define double @test_libm_log2_f64_fast(double %arg) {
 define double @test_libm_log2_f64_fpmath(double %arg) {
 ; CHECK-LABEL: define double @test_libm_log2_f64_fpmath
 ; CHECK-SAME: (double [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call double @log2(double [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call double @log2(double [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret double [[LOG2]]
 ;
   %log2 = tail call double @log2(double %arg), !fpmath !0
@@ -480,7 +480,7 @@ define double @test_libm_log2_f64_fpmath(double %arg) {
 define float @test_log2_f32_fast_noinline(float %arg) {
 ; CHECK-LABEL: define float @test_log2_f32_fast_noinline
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call fast float @_Z4log2f(float [[ARG]]) #[[ATTR7:[0-9]+]], !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call fast float @_Z4log2f(float [[ARG]]) #[[ATTR7:[0-9]+]], !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG2]]
 ;
   %log2 = tail call fast float @_Z4log2f(float %arg) #3, !fpmath !0
@@ -490,7 +490,7 @@ define float @test_log2_f32_fast_noinline(float %arg) {
 define float @test_log2_f32_fast_optsize(float %arg) #4 {
 ; CHECK-LABEL: define float @test_log2_f32_fast_optsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call fast float @llvm.log2.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call fast float @llvm.log2.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG2]]
 ;
   %log2 = tail call fast float @_Z4log2f(float %arg), !fpmath !0
@@ -500,7 +500,7 @@ define float @test_log2_f32_fast_optsize(float %arg) #4 {
 define float @test_log2_f32_fast_minsize(float %arg) #5 {
 ; CHECK-LABEL: define float @test_log2_f32_fast_minsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3:[0-9]+]] {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call fast float @llvm.log2.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call fast float @llvm.log2.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG2]]
 ;
   %log2 = tail call fast float @_Z4log2f(float %arg), !fpmath !0
@@ -510,7 +510,7 @@ define float @test_log2_f32_fast_minsize(float %arg) #5 {
 define float @test_log2_f32_nsz_contract_optsize(float %arg) #4 {
 ; CHECK-LABEL: define float @test_log2_f32_nsz_contract_optsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call nsz contract float @llvm.log2.f32(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call nsz contract float @llvm.log2.f32(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG2]]
 ;
   %log2 = tail call nsz contract float @_Z4log2f(float %arg), !fpmath !0
@@ -520,7 +520,7 @@ define float @test_log2_f32_nsz_contract_optsize(float %arg) #4 {
 define float @test_log2_f32_nsz_contract_minsize(float %arg) #5 {
 ; CHECK-LABEL: define float @test_log2_f32_nsz_contract_minsize
 ; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
-; CHECK-NEXT:    [[LOG2:%.*]] = tail call nsz contract float @_Z4log2f(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    [[LOG2:%.*]] = tail call nsz contract float @_Z4log2f(float [[ARG]]), !fpmath [[META0]]
 ; CHECK-NEXT:    ret float [[LOG2]]
 ;
   %log2 = tail call nsz contract float @_Z4log2f(float %arg), !fpmath !0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rint.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rint.ll
index debf96b2b4bb4..fe4778c1b3877 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rint.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rint.ll
@@ -266,7 +266,7 @@ define <2 x float> @test_rint_v2f32_preserve_flags(<2 x float> %arg) {
 define float @test_rint_f32_preserve_flags_md(float %arg) {
 ; CHECK-LABEL: define float @test_rint_f32_preserve_flags_md
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan ninf float @llvm.rint.f32(float [[ARG]]), !foo !0
+; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan ninf float @llvm.rint.f32(float [[ARG]]), !foo [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret float [[RINT]]
 ;
   %rint = tail call nnan ninf float @_Z4rintf(float %arg), !foo !0
@@ -276,7 +276,7 @@ define float @test_rint_f32_preserve_flags_md(float %arg) {
 define <2 x float> @test_rint_v2f32_preserve_flags_md(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_rint_v2f32_preserve_flags_md
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan nsz contract <2 x float> @llvm.rint.v2f32(<2 x float> [[ARG]]), !foo !0
+; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan nsz contract <2 x float> @llvm.rint.v2f32(<2 x float> [[ARG]]), !foo [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[RINT]]
 ;
   %rint = tail call contract nsz nnan <2 x float> @_Z4rintDv2_f(<2 x float> %arg), !foo !0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-round.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-round.ll
index 41e16ba22485a..18ec7b25e1c96 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-round.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-round.ll
@@ -266,7 +266,7 @@ define <2 x float> @test_rint_v2f32_preserve_flags(<2 x float> %arg) {
 define float @test_rint_f32_preserve_flags_md(float %arg) {
 ; CHECK-LABEL: define float @test_rint_f32_preserve_flags_md
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan ninf float @llvm.round.f32(float [[ARG]]), !foo !0
+; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan ninf float @llvm.round.f32(float [[ARG]]), !foo [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret float [[RINT]]
 ;
   %rint = tail call nnan ninf float @_Z5roundf(float %arg), !foo !0
@@ -276,7 +276,7 @@ define float @test_rint_f32_preserve_flags_md(float %arg) {
 define <2 x float> @test_rint_v2f32_preserve_flags_md(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_rint_v2f32_preserve_flags_md
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan nsz contract <2 x float> @llvm.round.v2f32(<2 x float> [[ARG]]), !foo !0
+; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan nsz contract <2 x float> @llvm.round.v2f32(<2 x float> [[ARG]]), !foo [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[RINT]]
 ;
   %rint = tail call contract nsz nnan <2 x float> @_Z5roundDv2_f(<2 x float> %arg), !foo !0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
index 34777eff0e856..0874d8a3b8a23 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
@@ -881,7 +881,7 @@ entry:
 
 define float @sincos_f32_unused_result_cos(float %x) {
 ; CHECK-LABEL: define float @sincos_f32_unused_result_cos
-; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] {
+; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SIN:%.*]] = tail call contract float @_Z3sinf(float [[X]])
 ; CHECK-NEXT:    ret float [[SIN]]
@@ -896,7 +896,7 @@ entry:
 
 define float @sincos_f32_unused_result_sin(float %x) {
 ; CHECK-LABEL: define float @sincos_f32_unused_result_sin
-; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR6]] {
+; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR7]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COS:%.*]] = tail call contract float @_Z3cosf(float [[X]])
 ; CHECK-NEXT:    ret float [[COS]]
@@ -911,7 +911,7 @@ entry:
 
 define void @sincos_f32_repeated_uses(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) {
 ; CHECK-LABEL: define void @sincos_f32_repeated_uses
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR8:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]])
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-trunc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-trunc.ll
index 0dfd398c0ac99..5a07f87dbef3b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-trunc.ll
@@ -266,7 +266,7 @@ define <2 x float> @test_rint_v2f32_preserve_flags(<2 x float> %arg) {
 define float @test_rint_f32_preserve_flags_md(float %arg) {
 ; CHECK-LABEL: define float @test_rint_f32_preserve_flags_md
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan ninf float @llvm.trunc.f32(float [[ARG]]), !foo !0
+; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan ninf float @llvm.trunc.f32(float [[ARG]]), !foo [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret float [[RINT]]
 ;
   %rint = tail call nnan ninf float @_Z5truncf(float %arg), !foo !0
@@ -276,7 +276,7 @@ define float @test_rint_f32_preserve_flags_md(float %arg) {
 define <2 x float> @test_rint_v2f32_preserve_flags_md(<2 x float> %arg) {
 ; CHECK-LABEL: define <2 x float> @test_rint_v2f32_preserve_flags_md
 ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan nsz contract <2 x float> @llvm.trunc.v2f32(<2 x float> [[ARG]]), !foo !0
+; CHECK-NEXT:    [[RINT:%.*]] = tail call nnan nsz contract <2 x float> @llvm.trunc.v2f32(<2 x float> [[ARG]]), !foo [[META0]]
 ; CHECK-NEXT:    ret <2 x float> [[RINT]]
 ;
   %rint = tail call contract nsz nnan <2 x float> @_Z5truncDv2_f(<2 x float> %arg), !foo !0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll
index 2776b9187724c..ca72b84e5f20b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll
@@ -7,12 +7,14 @@
 @lds_3 = external addrspace(3) global [0 x i8], align 4
 @lds_4 = external addrspace(3) global [0 x i8], align 8
 
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
 ; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
 ; CHECK: @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 8, !absolute_symbol [[META1:![0-9]+]]
 ; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 0, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 0, i32 32 } }, no_sanitize_address
-; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
-; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
 define void @use_variables() sanitize_address {
 ; CHECK-LABEL: define void @use_variables(
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
index 8cbeb80d62335..ccb63cc670650 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
@@ -7,12 +7,14 @@
 @lds_3 = external addrspace(3) global [0 x i8], align 4
 @lds_4 = external addrspace(3) global [0 x i8], align 8
 
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
 ; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
 ; CHECK: @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 8, !absolute_symbol [[META1:![0-9]+]]
 ; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 0, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 0, i32 32 } }, no_sanitize_address
-; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
-; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
 define void @use_variables() sanitize_address {
 ; CHECK-LABEL: define void @use_variables(
@@ -44,7 +46,7 @@ define void @use_variables() sanitize_address {
 
 define amdgpu_kernel void @k0() sanitize_address {
 ; CHECK-LABEL: define amdgpu_kernel void @k0(
-; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
 ; CHECK-NEXT:  WId:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -138,5 +140,6 @@ define amdgpu_kernel void @k0() sanitize_address {
 ;.
 ; CHECK: [[META0]] = !{i32 0, i32 1}
 ; CHECK: [[META1]] = !{i32 8, i32 9}
-; CHECK: [[META2]] = !{i32 0}
+; CHECK: [[META2:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[META3]] = !{i32 0}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test-asan.ll
index f33b30119754f..4cd641b5b5547 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test-asan.ll
@@ -66,7 +66,7 @@ define amdgpu_kernel void @k0() sanitize_address {
 ; CHECK-NEXT:    [[TMP38:%.*]] = and i1 [[TMP34]], [[TMP37]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP38]])
 ; CHECK-NEXT:    [[TMP40:%.*]] = icmp ne i64 [[TMP39]], 0
-; CHECK-NEXT:    br i1 [[TMP40]], label [[ASAN_REPORT:%.*]], label [[TMP43:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP40]], label [[ASAN_REPORT:%.*]], label [[TMP43:%.*]], !prof [[PROF3:![0-9]+]]
 ; CHECK:       asan.report:
 ; CHECK-NEXT:    br i1 [[TMP38]], label [[TMP41:%.*]], label [[CONDFREE:%.*]]
 ; CHECK:       41:
@@ -109,5 +109,6 @@ define amdgpu_kernel void @k0() sanitize_address {
 ;.
 ; CHECK: [[META0]] = !{i32 0, i32 1}
 ; CHECK: [[META1]] = !{i32 8, i32 9}
-; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[META2:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
index 5e90eb0b95219..5e37c9e2327bc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
@@ -84,4 +84,5 @@ define amdgpu_kernel void @k0() sanitize_address {
 ;.
 ; CHECK: [[META0]] = !{i32 0, i32 1}
 ; CHECK: [[META1]] = !{i32 8, i32 9}
+; CHECK: [[META2:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-lower-all.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-lower-all.ll
index f30a382a62c6b..70409254ed41d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-lower-all.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-lower-all.ll
@@ -6,6 +6,12 @@
 @lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
 @lds_2 = internal addrspace(3) global [1 x i32] poison, align 8
 
+;.
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.k1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.sw.lds.k1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k1.md.type { %llvm.amdgcn.sw.lds.k1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k1.md.item { i32 32, i32 4, i32 32 } }, no_sanitize_address
+;.
 define amdgpu_kernel void @k0() sanitize_address {
 ; CHECK-LABEL: define amdgpu_kernel void @k0(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
@@ -125,3 +131,13 @@ define amdgpu_kernel void @k1() {
 
 !llvm.module.flags = !{!0}
 !0 = !{i32 4, !"nosanitize_address", i32 1}
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-lds-size"="8" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
index d0caddb7934a7..3f7edc2901a31 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
@@ -65,7 +65,7 @@ define void @use_variables_2() sanitize_address {
 
 define amdgpu_kernel void @k0() sanitize_address {
 ; CHECK-LABEL: define amdgpu_kernel void @k0(
-; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
 ; CHECK-NEXT:  WId:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -140,7 +140,7 @@ define amdgpu_kernel void @k0() sanitize_address {
 
 define amdgpu_kernel void @k1() sanitize_address {
 ; CHECK-LABEL: define amdgpu_kernel void @k1(
-; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] {
 ; CHECK-NEXT:  WId:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -222,6 +222,6 @@ define amdgpu_kernel void @k1() sanitize_address {
 !0 = !{i32 4, !"nosanitize_address", i32 1}
 
 ;.
-; CHECK: [[META2]] = !{i32 0}
-; CHECK: [[META3]] = !{i32 1}
+; CHECK: [[META3]] = !{i32 0}
+; CHECK: [[META4]] = !{i32 1}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return-asan.ll
index 07baf90e370d1..2c91816419f91 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return-asan.ll
@@ -109,4 +109,5 @@ ret void
 ; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
 ;.
 ; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
index 6848e2c06c1e1..043d0a8852726 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
@@ -109,4 +109,5 @@ ret void
 ; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
 ;.
 ; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll
index a6e6b84bba304..efb808d1e5956 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll
@@ -3,11 +3,13 @@
 @lds = external addrspace(3) global [5 x i8], align 8
 declare void @non_kernel_declaration() sanitize_address
 
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k1], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [1 x ptr addrspace(1)]] [[1 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 1, i32 0)]], no_sanitize_address
 ;.
 ; CHECK: @llvm.amdgcn.sw.lds.k1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
 ; CHECK: @llvm.amdgcn.sw.lds.k1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k1.md.type { %llvm.amdgcn.sw.lds.k1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k1.md.item { i32 32, i32 5, i32 32 } }, no_sanitize_address
-; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k1], no_sanitize_address
-; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [1 x ptr addrspace(1)]] [[1 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 1, i32 0)]], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k1], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [1 x ptr addrspace(1)]] [[1 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 1, i32 0)]], no_sanitize_address
 ;.
 define void @non_kernel_function() sanitize_address {
 ; CHECK-LABEL: define void @non_kernel_function(
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access-asan.ll
index 40b1305a3b12c..1e03371d520dd 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access-asan.ll
@@ -8,12 +8,14 @@
 @lds_3 = external addrspace(3) global [0 x i8], align 4
 @lds_4 = external addrspace(3) global [0 x i8], align 8
 
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
 ; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
 ; CHECK: @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 8, !absolute_symbol [[META1:![0-9]+]]
 ; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 0, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 0, i32 32 } }, no_sanitize_address
-; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
-; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
 define void @use_variables() sanitize_address {
 ; CHECK-LABEL: define void @use_variables(
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
index 0cc49c94e2279..48a7fa2be6195 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
@@ -8,12 +8,14 @@
 @lds_3 = external addrspace(3) global [0 x i8], align 4
 @lds_4 = external addrspace(3) global [0 x i8], align 8
 
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
 ; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
 ; CHECK: @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 8, !absolute_symbol [[META1:![0-9]+]]
 ; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 0, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 0, i32 32 } }, no_sanitize_address
-; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
-; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
 define void @use_variables() sanitize_address {
 ; CHECK-LABEL: define void @use_variables(
@@ -45,7 +47,7 @@ define void @use_variables() sanitize_address {
 
 define amdgpu_kernel void @k0() sanitize_address {
 ; CHECK-LABEL: define amdgpu_kernel void @k0(
-; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
 ; CHECK-NEXT:  WId:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -139,5 +141,6 @@ define amdgpu_kernel void @k0() sanitize_address {
 ;.
 ; CHECK: [[META0]] = !{i32 0, i32 1}
 ; CHECK: [[META1]] = !{i32 8, i32 9}
-; CHECK: [[META2]] = !{i32 0}
+; CHECK: [[META2:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[META3]] = !{i32 0}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test-asan.ll
index f2cdc4c812db1..513d416b41bb1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test-asan.ll
@@ -88,7 +88,7 @@ define amdgpu_kernel void @k0() sanitize_address {
 ; CHECK-NEXT:    [[TMP53:%.*]] = and i1 [[TMP49]], [[TMP52]]
 ; CHECK-NEXT:    [[TMP54:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP53]])
 ; CHECK-NEXT:    [[TMP55:%.*]] = icmp ne i64 [[TMP54]], 0
-; CHECK-NEXT:    br i1 [[TMP55]], label [[ASAN_REPORT:%.*]], label [[TMP58:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP55]], label [[ASAN_REPORT:%.*]], label [[TMP58:%.*]], !prof [[PROF3:![0-9]+]]
 ; CHECK:       asan.report:
 ; CHECK-NEXT:    br i1 [[TMP53]], label [[TMP56:%.*]], label [[CONDFREE:%.*]]
 ; CHECK:       56:
@@ -114,7 +114,7 @@ define amdgpu_kernel void @k0() sanitize_address {
 ; CHECK-NEXT:    [[TMP71:%.*]] = and i1 [[TMP66]], [[TMP70]]
 ; CHECK-NEXT:    [[TMP72:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP71]])
 ; CHECK-NEXT:    [[TMP73:%.*]] = icmp ne i64 [[TMP72]], 0
-; CHECK-NEXT:    br i1 [[TMP73]], label [[ASAN_REPORT1:%.*]], label [[TMP76:%.*]], !prof [[PROF2]]
+; CHECK-NEXT:    br i1 [[TMP73]], label [[ASAN_REPORT1:%.*]], label [[TMP76:%.*]], !prof [[PROF3]]
 ; CHECK:       asan.report1:
 ; CHECK-NEXT:    br i1 [[TMP71]], label [[TMP74:%.*]], label [[TMP75:%.*]]
 ; CHECK:       74:
@@ -139,7 +139,7 @@ define amdgpu_kernel void @k0() sanitize_address {
 ; CHECK-NEXT:    [[TMP88:%.*]] = and i1 [[TMP84]], [[TMP87]]
 ; CHECK-NEXT:    [[TMP89:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP88]])
 ; CHECK-NEXT:    [[TMP90:%.*]] = icmp ne i64 [[TMP89]], 0
-; CHECK-NEXT:    br i1 [[TMP90]], label [[ASAN_REPORT2:%.*]], label [[TMP93:%.*]], !prof [[PROF2]]
+; CHECK-NEXT:    br i1 [[TMP90]], label [[ASAN_REPORT2:%.*]], label [[TMP93:%.*]], !prof [[PROF3]]
 ; CHECK:       asan.report2:
 ; CHECK-NEXT:    br i1 [[TMP88]], label [[TMP91:%.*]], label [[TMP92:%.*]]
 ; CHECK:       91:
@@ -164,7 +164,7 @@ define amdgpu_kernel void @k0() sanitize_address {
 ; CHECK-NEXT:    [[TMP105:%.*]] = and i1 [[TMP101]], [[TMP104]]
 ; CHECK-NEXT:    [[TMP106:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP105]])
 ; CHECK-NEXT:    [[TMP107:%.*]] = icmp ne i64 [[TMP106]], 0
-; CHECK-NEXT:    br i1 [[TMP107]], label [[ASAN_REPORT3:%.*]], label [[TMP110:%.*]], !prof [[PROF2]]
+; CHECK-NEXT:    br i1 [[TMP107]], label [[ASAN_REPORT3:%.*]], label [[TMP110:%.*]], !prof [[PROF3]]
 ; CHECK:       asan.report3:
 ; CHECK-NEXT:    br i1 [[TMP105]], label [[TMP108:%.*]], label [[TMP109:%.*]]
 ; CHECK:       108:
@@ -209,5 +209,6 @@ define amdgpu_kernel void @k0() sanitize_address {
 ;.
 ; CHECK: [[META0]] = !{i32 0, i32 1}
 ; CHECK: [[META1]] = !{i32 8, i32 9}
-; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[META2:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
index e0bfca0f63ca7..73430c29b56b8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
@@ -117,4 +117,5 @@ define amdgpu_kernel void @k0() sanitize_address {
 ;.
 ; CHECK: [[META0]] = !{i32 0, i32 1}
 ; CHECK: [[META1]] = !{i32 8, i32 9}
+; CHECK: [[META2:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll
index b9b4c90daea87..c3f8b8bc176c3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll
@@ -7,11 +7,13 @@
 @lds_3 = external addrspace(3) global [3 x i8], align 4
 @lds_4 = external addrspace(3) global [4 x i8], align 8
 
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
 ; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
 ; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 3, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 4, i32 32 } }, no_sanitize_address
-; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
-; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
 define void @use_variables() sanitize_address {
 ; CHECK-LABEL: define void @use_variables(
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param-asan.ll
index a70db2259cc3f..5ce3bb5209779 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param-asan.ll
@@ -6,10 +6,11 @@
 
 @lds_var = internal addrspace(3) global [1024 x i32] poison, align 4
 
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel], no_sanitize_address
 ;.
 ; CHECK: @llvm.amdgcn.sw.lds.my_kernel = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0:![0-9]+]]
 ; CHECK: @llvm.amdgcn.sw.lds.my_kernel.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.my_kernel.md.type { %llvm.amdgcn.sw.lds.my_kernel.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.my_kernel.md.item { i32 32, i32 4096, i32 5120 } }, no_sanitize_address
-; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel], no_sanitize_address
 ;.
 define void @my_function(ptr addrspace(3) %lds_arg) sanitize_address {
 ; CHECK-LABEL: define void @my_function(
@@ -33,7 +34,7 @@ define void @my_function(ptr addrspace(3) %lds_arg) sanitize_address {
 ; CHECK-NEXT:    [[TMP17:%.*]] = and i1 [[TMP12]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP17]])
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne i64 [[TMP18]], 0
-; CHECK-NEXT:    br i1 [[TMP19]], label [[ASAN_REPORT:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP19]], label [[ASAN_REPORT:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]]
 ; CHECK:       asan.report:
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[TMP20:%.*]], label [[TMP21:%.*]]
 ; CHECK:       20:
@@ -60,7 +61,7 @@ define void @my_function(ptr addrspace(3) %lds_arg) sanitize_address {
 ; CHECK-NEXT:    [[TMP36:%.*]] = and i1 [[TMP31]], [[TMP35]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP36]])
 ; CHECK-NEXT:    [[TMP38:%.*]] = icmp ne i64 [[TMP37]], 0
-; CHECK-NEXT:    br i1 [[TMP38]], label [[ASAN_REPORT1:%.*]], label [[TMP41:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[TMP38]], label [[ASAN_REPORT1:%.*]], label [[TMP41:%.*]], !prof [[PROF2]]
 ; CHECK:       asan.report1:
 ; CHECK-NEXT:    br i1 [[TMP36]], label [[TMP39:%.*]], label [[TMP40:%.*]]
 ; CHECK:       39:
@@ -81,7 +82,7 @@ define void @my_function(ptr addrspace(3) %lds_arg) sanitize_address {
 
 define amdgpu_kernel void @my_kernel() sanitize_address {
 ; CHECK-LABEL: define amdgpu_kernel void @my_kernel(
-; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
 ; CHECK-NEXT:  WId:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -147,6 +148,7 @@ define amdgpu_kernel void @my_kernel() sanitize_address {
 ; CHECK: attributes #[[ATTR7]] = { nomerge }
 ;.
 ; CHECK: [[META0]] = !{i32 0, i32 1}
-; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
-; CHECK: [[META2]] = !{i32 0}
+; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[META3]] = !{i32 0}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
index 55a36f85dc73a..c6734019dd774 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
@@ -35,7 +35,7 @@ define void @my_function(ptr addrspace(3) %lds_arg) sanitize_address {
 
 define amdgpu_kernel void @my_kernel() sanitize_address {
 ; CHECK-LABEL: define amdgpu_kernel void @my_kernel(
-; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
 ; CHECK-NEXT:  WId:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -98,5 +98,6 @@ define amdgpu_kernel void @my_kernel() sanitize_address {
 ; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
 ;.
 ; CHECK: [[META0]] = !{i32 0, i32 1}
-; CHECK: [[META1]] = !{i32 0}
+; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[META2]] = !{i32 0}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-all.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-all.ll
index 4625a7f626f9b..8bce2ebab1868 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-all.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-all.ll
@@ -1,19 +1,23 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
 
-; Test to check if static LDS is lowered correctly when a non-kernel without sanitize_address attr with LDS accesses is called from 
+; Test to check if static LDS is lowered correctly when a non-kernel without sanitize_address attr with LDS accesses is called from
 ; kernel which has sanitize_address attr.
 @lds_1 = internal addrspace(3) global [1 x i8] poison, align 1
 @lds_2 = internal addrspace(3) global [1 x i32] poison, align 2
 @lds_3 = external addrspace(3) global [3 x i8], align 4
 @lds_4 = external addrspace(3) global [4 x i8], align 8
 
-;.
 ; @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
 ; @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 3, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 4, i32 32 } }, no_sanitize_address
 ; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
 ; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 3, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 4, i32 32 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+;.
 define void @use_variables() {
 ; CHECK-LABEL: define void @use_variables() {
 ; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
@@ -125,3 +129,7 @@ define amdgpu_kernel void @k0() sanitize_address {
 ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
 ; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
 ;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[META2]] = !{i32 0}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-none.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-none.ll
index 5dbab5643b929..cb3fe7691524b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-none.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-none.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
 
-; Test to check if LDS is not lowered when a non-kernel with sanitize_address attr and with LDS accesses is called from 
+; Test to check if LDS is not lowered when a non-kernel with sanitize_address attr and with LDS accesses is called from
 ; kernel which doesn't have sanitize_address attr.
 @lds_1 = internal addrspace(3) global [1 x i8] poison, align 1
 @lds_2 = internal addrspace(3) global [1 x i32] poison, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll
index 255dda562c1ea..a22c0f4a5c7b0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll
@@ -6,7 +6,6 @@
 @A = external addrspace(3) global [8 x ptr]
 @B = external addrspace(3) global [0 x i32]
 
-;.
 ; @llvm.amdgcn.sw.lds.kernel_0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
 ; @llvm.amdgcn.sw.lds.kernel_0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_0.md.type { %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address
 ; @llvm.amdgcn.sw.lds.kernel_2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0]]
@@ -20,6 +19,19 @@
 ; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [4 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3], no_sanitize_address
 ; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [4 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0)], [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_3.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0)]], no_sanitize_address
 ;.
+; CHECK: @llvm.amdgcn.sw.lds.kernel_2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_2.md.type { %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.kernel_1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.kernel_1.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_1.md.type { %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.kernel_3 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.kernel_3.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_3.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_3.md.type { %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.kernel_0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_0.md.type { %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [4 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [4 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0)], [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_3.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0)]], no_sanitize_address
+;.
 define amdgpu_kernel void @kernel_0() sanitize_address {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_0(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
@@ -292,4 +304,16 @@ define private ptr @get_B_ptr() sanitize_address {
 ;.
 ; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" }
 ; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8,8" }
+; CHECK: attributes #[[ATTR2]] = { sanitize_address }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1]] = !{i32 8, i32 9}
+; CHECK: [[META2:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[META3]] = !{i32 0}
+; CHECK: [[META4]] = !{i32 1}
+; CHECK: [[META5]] = !{i32 2}
+; CHECK: [[META6]] = !{i32 3}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
index 7184ebbb8faa3..fc6183cb87491 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
@@ -6,7 +6,6 @@
 @A = external addrspace(3) global [8 x ptr]
 @B = external addrspace(3) global [0 x i32]
 
-;.
 ; @llvm.amdgcn.sw.lds.kernel_2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
 ; @llvm.amdgcn.sw.lds.kernel_2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_2.md.type { %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address
 ; @llvm.amdgcn.sw.lds.kernel_1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]]
@@ -20,6 +19,19 @@
 ; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [4 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3], no_sanitize_address
 ; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [4 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0)], [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_3.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0)]], no_sanitize_address
 ;.
+; CHECK: @llvm.amdgcn.sw.lds.kernel_0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_0.md.type { %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.kernel_3 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.kernel_3.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_3.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_3.md.type { %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.kernel_1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.kernel_1.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_1.md.type { %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.kernel_2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_2.md.type { %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [4 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [4 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0)], [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_3.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0)]], no_sanitize_address
+;.
 define amdgpu_kernel void @kernel_0() sanitize_address {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_0(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
@@ -292,4 +304,16 @@ define private ptr @get_B_ptr() sanitize_address {
 ;.
 ; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" }
 ; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8,8" }
+; CHECK: attributes #[[ATTR2]] = { sanitize_address }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1]] = !{i32 8, i32 9}
+; CHECK: [[META2:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[META3]] = !{i32 0}
+; CHECK: [[META4]] = !{i32 1}
+; CHECK: [[META5]] = !{i32 2}
+; CHECK: [[META6]] = !{i32 3}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll
index 704bc9e635294..9378e2e60647d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll
@@ -8,11 +8,13 @@
 @lds_3 = external addrspace(3) global [3 x i8], align 4
 @lds_4 = external addrspace(3) global [4 x i8], align 8
 
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
 ; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
 ; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 3, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 4, i32 32 } }, no_sanitize_address
-; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
-; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
 define void @use_variables() sanitize_address {
 ; CHECK-LABEL: define void @use_variables(
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
index 8f5abe962f8eb..e3681d0d04836 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
@@ -7,11 +7,13 @@
 @lds_3 = external addrspace(3) global [3 x i8], align 4
 @lds_4 = external addrspace(3) global [4 x i8], align 8
 
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
 ; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
 ; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 3, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 4, i32 32 } }, no_sanitize_address
-; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
-; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
 ;.
 define void @use_variables() sanitize_address {
 ; CHECK-LABEL: define void @use_variables(
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
index 066b9429425ac..e03e8af0fabcf 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
@@ -83,4 +83,5 @@ define amdgpu_kernel void @k0() sanitize_address {
 ; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
 ;.
 ; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index e6f02295e67d5..3f699a5ca218b 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -98,14 +98,12 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_i32 s0, s2, s2
-; GFX7-NEXT:    s_cmp_lt_u32 s0, s2
-; GFX7-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX7-NEXT:    v_add_i32_e64 v0, s[0:1], s2, s2
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX7-NEXT:    s_addc_u32 s0, s2, 0
-; GFX7-NEXT:    v_cmp_ge_u32_e32 vcc, s0, v0
+; GFX7-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
 ; GFX7-NEXT:    s_cbranch_vccnz .LBB1_2
 ; GFX7-NEXT:  ; %bb.1: ; %bb0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
@@ -125,13 +123,11 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s0, s2, s2
-; GFX9-NEXT:    s_cmp_lt_u32 s0, s2
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, s2
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX9-NEXT:    s_addc_u32 s0, s2, 0
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, s0, v0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_vccnz .LBB1_2
 ; GFX9-NEXT:  ; %bb.1: ; %bb0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -151,13 +147,11 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_i32 s1, s0, s0
-; GFX10-NEXT:    s_cmp_lt_u32 s1, s0
-; GFX10-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX10-NEXT:    v_add_co_u32 v0, s1, s0, s0
 ; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    s_addc_u32 s0, s0, 0
-; GFX10-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s0, v0
+; GFX10-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX10-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s0
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB1_2
 ; GFX10-NEXT:  ; %bb.1: ; %bb0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
@@ -177,15 +171,12 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_add_i32 s1, s0, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_cmp_lt_u32 s1, s0
-; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT:    v_add_co_u32 v0, s1, s0, s0
 ; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-NEXT:    s_addc_u32 s0, s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s0, v0
+; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
 ; GFX11-NEXT:    s_cbranch_vccnz .LBB1_2
 ; GFX11-NEXT:  ; %bb.1: ; %bb0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index e74ad3d62bea4..47161954cc332 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -8946,8 +8946,7 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-NEXT:  .LBB141_1: ; %atomicrmw.start
 ; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_add_i32_e32 v3, vcc, -1, v4
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN1-NEXT:    v_subrev_i32_e32 v3, vcc, 1, v4
 ; GCN1-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
 ; GCN1-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GCN1-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -8971,8 +8970,7 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
 ; GCN2-NEXT:  .LBB141_1: ; %atomicrmw.start
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_add_u32_e32 v3, vcc, -1, v4
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN2-NEXT:    v_subrev_u32_e32 v3, vcc, 1, v4
 ; GCN2-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
 ; GCN2-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GCN2-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -8996,9 +8994,8 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
 ; GCN3-NEXT:  .LBB141_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN3-NEXT:    v_subrev_co_u32_e32 v3, vcc, 1, v4
 ; GCN3-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
-; GCN3-NEXT:    v_add_u32_e32 v3, -1, v4
 ; GCN3-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
@@ -9027,8 +9024,7 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT:  .LBB142_1: ; %atomicrmw.start
 ; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_add_i32_e32 v3, vcc, -1, v4
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN1-NEXT:    v_subrev_i32_e32 v3, vcc, 1, v4
 ; GCN1-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
 ; GCN1-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GCN1-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -9054,8 +9050,7 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT:  .LBB142_1: ; %atomicrmw.start
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_add_u32_e32 v3, vcc, -1, v4
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN2-NEXT:    v_subrev_u32_e32 v3, vcc, 1, v4
 ; GCN2-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
 ; GCN2-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GCN2-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -9079,9 +9074,8 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:  .LBB142_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN3-NEXT:    v_subrev_co_u32_e32 v3, vcc, 1, v4
 ; GCN3-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
-; GCN3-NEXT:    v_add_u32_e32 v3, -1, v4
 ; GCN3-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
@@ -9110,8 +9104,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v4, v3
-; GCN1-NEXT:    v_add_i32_e32 v3, vcc, -1, v4
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN1-NEXT:    v_subrev_i32_e32 v3, vcc, 1, v4
 ; GCN1-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
 ; GCN1-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GCN1-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -9136,8 +9129,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v4, v3
-; GCN2-NEXT:    v_add_u32_e32 v3, vcc, -1, v4
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN2-NEXT:    v_subrev_u32_e32 v3, vcc, 1, v4
 ; GCN2-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
 ; GCN2-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GCN2-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -9162,9 +9154,8 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v4, v3
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN3-NEXT:    v_subrev_co_u32_e32 v3, vcc, 1, v4
 ; GCN3-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
-; GCN3-NEXT:    v_add_u32_e32 v3, -1, v4
 ; GCN3-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
@@ -9194,8 +9185,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v1, v0
-; GCN1-NEXT:    v_add_i32_e32 v0, vcc, -1, v1
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN1-NEXT:    v_subrev_i32_e32 v0, vcc, 1, v1
 ; GCN1-NEXT:    v_cmp_gt_u32_e64 s[4:5], v1, v2
 ; GCN1-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GCN1-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -9221,8 +9211,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v1, v0
-; GCN2-NEXT:    v_add_u32_e32 v0, vcc, -1, v1
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN2-NEXT:    v_subrev_u32_e32 v0, vcc, 1, v1
 ; GCN2-NEXT:    v_cmp_gt_u32_e64 s[4:5], v1, v2
 ; GCN2-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GCN2-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -9246,9 +9235,8 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v4, v3
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN3-NEXT:    v_subrev_co_u32_e32 v3, vcc, 1, v4
 ; GCN3-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
-; GCN3-NEXT:    v_add_u32_e32 v3, -1, v4
 ; GCN3-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
@@ -9279,8 +9267,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN1-NEXT:  .LBB145_1: ; %atomicrmw.start
 ; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_add_i32_e32 v2, vcc, -1, v3
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN1-NEXT:    v_subrev_i32_e32 v2, vcc, 1, v3
 ; GCN1-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v3
 ; GCN1-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GCN1-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -9307,8 +9294,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN2-NEXT:  .LBB145_1: ; %atomicrmw.start
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_add_u32_e32 v2, vcc, -1, v3
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN2-NEXT:    v_subrev_u32_e32 v2, vcc, 1, v3
 ; GCN2-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v3
 ; GCN2-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GCN2-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -9335,9 +9321,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN3-NEXT:  .LBB145_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN3-NEXT:    v_subrev_co_u32_e32 v2, vcc, 1, v3
 ; GCN3-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v3
-; GCN3-NEXT:    v_add_u32_e32 v2, -1, v3
 ; GCN3-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
@@ -9369,8 +9354,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN1-NEXT:  .LBB146_1: ; %atomicrmw.start
 ; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_add_i32_e32 v2, vcc, -1, v3
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN1-NEXT:    v_subrev_i32_e32 v2, vcc, 1, v3
 ; GCN1-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v3
 ; GCN1-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GCN1-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -9399,8 +9383,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN2-NEXT:  .LBB146_1: ; %atomicrmw.start
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_add_u32_e32 v2, vcc, -1, v3
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN2-NEXT:    v_subrev_u32_e32 v2, vcc, 1, v3
 ; GCN2-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v3
 ; GCN2-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GCN2-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -9427,9 +9410,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN3-NEXT:  .LBB146_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN3-NEXT:    v_subrev_co_u32_e32 v2, vcc, 1, v3
 ; GCN3-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v3
-; GCN3-NEXT:    v_add_u32_e32 v2, -1, v3
 ; GCN3-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
@@ -9463,8 +9445,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v5, v0
-; GCN1-NEXT:    v_add_i32_e32 v0, vcc, -1, v5
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN1-NEXT:    v_subrev_i32_e32 v0, vcc, 1, v5
 ; GCN1-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v5
 ; GCN1-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v0, v3, vcc
@@ -9493,8 +9474,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v5, v0
-; GCN2-NEXT:    v_add_u32_e32 v0, vcc, -1, v5
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN2-NEXT:    v_subrev_u32_e32 v0, vcc, 1, v5
 ; GCN2-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v5
 ; GCN2-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v0, v3, vcc
@@ -9523,9 +9503,8 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v5, v0
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN3-NEXT:    v_subrev_co_u32_e32 v0, vcc, 1, v5
 ; GCN3-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v5
-; GCN3-NEXT:    v_add_u32_e32 v0, -1, v5
 ; GCN3-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v4, v0, v3, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
@@ -9557,8 +9536,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v5, v0
-; GCN1-NEXT:    v_add_i32_e32 v0, vcc, -1, v5
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN1-NEXT:    v_subrev_i32_e32 v0, vcc, 1, v5
 ; GCN1-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v5
 ; GCN1-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v0, v3, vcc
@@ -9587,8 +9565,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v5, v0
-; GCN2-NEXT:    v_add_u32_e32 v0, vcc, -1, v5
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN2-NEXT:    v_subrev_u32_e32 v0, vcc, 1, v5
 ; GCN2-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v5
 ; GCN2-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v0, v3, vcc
@@ -9617,9 +9594,8 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v5, v0
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN3-NEXT:    v_subrev_co_u32_e32 v0, vcc, 1, v5
 ; GCN3-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v5
-; GCN3-NEXT:    v_add_u32_e32 v0, -1, v5
 ; GCN3-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v4, v0, v3, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[4:5] offset:16 glc
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index ffab56847edca..1a45bd978ccc1 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -10195,8 +10195,7 @@ define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:  .LBB144_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, -1, v4
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 1, v4
 ; SI-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
 ; SI-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -10224,8 +10223,7 @@ define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT:  .LBB144_1: ; %atomicrmw.start
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v4
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; VI-NEXT:    v_subrev_u32_e32 v3, vcc, 1, v4
 ; VI-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
 ; VI-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -10249,9 +10247,8 @@ define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT:  .LBB144_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, 1, v4
 ; GFX9-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, -1, v4
 ; GFX9-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
@@ -10282,8 +10279,7 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
 ; SI-NEXT:  .LBB145_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, -1, v4
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 1, v4
 ; SI-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
 ; SI-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -10313,8 +10309,7 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
 ; VI-NEXT:  .LBB145_1: ; %atomicrmw.start
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v4
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; VI-NEXT:    v_subrev_u32_e32 v3, vcc, 1, v4
 ; VI-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
 ; VI-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -10338,9 +10333,8 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
 ; GFX9-NEXT:  .LBB145_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, 1, v4
 ; GFX9-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, -1, v4
 ; GFX9-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
@@ -10374,8 +10368,7 @@ define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, -1, v5
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 1, v5
 ; SI-NEXT:    v_cmp_gt_u32_e64 s[4:5], v5, v2
 ; SI-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
@@ -10403,8 +10396,7 @@ define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v4, v3
-; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v4
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; VI-NEXT:    v_subrev_u32_e32 v3, vcc, 1, v4
 ; VI-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
 ; VI-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -10429,9 +10421,8 @@ define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, 1, v4
 ; GFX9-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, -1, v4
 ; GFX9-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
@@ -10464,8 +10455,7 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, -1, v5
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 1, v5
 ; SI-NEXT:    v_cmp_gt_u32_e64 s[4:5], v5, v2
 ; SI-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
@@ -10495,8 +10485,7 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, v0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v1
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; VI-NEXT:    v_subrev_u32_e32 v0, vcc, 1, v1
 ; VI-NEXT:    v_cmp_gt_u32_e64 s[4:5], v1, v2
 ; VI-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -10520,9 +10509,8 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, 1, v4
 ; GFX9-NEXT:    v_cmp_gt_u32_e64 s[4:5], v4, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, -1, v4
 ; GFX9-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
@@ -10560,8 +10548,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1
 ; SI-NEXT:  .LBB148_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v0, vcc, -1, v1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 1, v1
 ; SI-NEXT:    v_cmp_lt_u32_e64 s[36:37], s34, v1
 ; SI-NEXT:    s_or_b64 vcc, vcc, s[36:37]
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -10597,8 +10584,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1
 ; VI-NEXT:  .LBB148_1: ; %atomicrmw.start
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v3
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; VI-NEXT:    v_subrev_u32_e32 v2, vcc, 1, v3
 ; VI-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v3
 ; VI-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -10624,9 +10610,8 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1
 ; GFX9-NEXT:  .LBB148_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, 1, v1
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, -1, v1
 ; GFX9-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
@@ -10663,8 +10648,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr
 ; SI-NEXT:  .LBB149_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v0, vcc, -1, v1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 1, v1
 ; SI-NEXT:    v_cmp_lt_u32_e64 s[36:37], s34, v1
 ; SI-NEXT:    s_or_b64 vcc, vcc, s[36:37]
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -10702,8 +10686,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr
 ; VI-NEXT:  .LBB149_1: ; %atomicrmw.start
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v3
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; VI-NEXT:    v_subrev_u32_e32 v2, vcc, 1, v3
 ; VI-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v3
 ; VI-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -10729,9 +10712,8 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr
 ; GFX9-NEXT:  .LBB149_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, 1, v1
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, -1, v1
 ; GFX9-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
@@ -10771,8 +10753,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v4, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v0, vcc, -1, v4
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 1, v4
 ; SI-NEXT:    v_cmp_lt_u32_e64 s[36:37], s34, v4
 ; SI-NEXT:    s_or_b64 vcc, vcc, s[36:37]
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v2, vcc
@@ -10809,8 +10790,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v5, v0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v5
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; VI-NEXT:    v_subrev_u32_e32 v0, vcc, 1, v5
 ; VI-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v5
 ; VI-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v0, v3, vcc
@@ -10836,9 +10816,8 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, 1, v4
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v4
-; GFX9-NEXT:    v_add_u32_e32 v0, -1, v4
 ; GFX9-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v2, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[3:4], s[4:5] glc
@@ -10876,8 +10855,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v4, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v0, vcc, -1, v4
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 1, v4
 ; SI-NEXT:    v_cmp_lt_u32_e64 s[36:37], s34, v4
 ; SI-NEXT:    s_or_b64 vcc, vcc, s[36:37]
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v2, vcc
@@ -10914,8 +10892,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v5, v0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v5
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; VI-NEXT:    v_subrev_u32_e32 v0, vcc, 1, v5
 ; VI-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v5
 ; VI-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v0, v3, vcc
@@ -10941,9 +10918,8 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, 1, v4
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[34:35], s6, v4
-; GFX9-NEXT:    v_add_u32_e32 v0, -1, v4
 ; GFX9-NEXT:    s_or_b64 vcc, vcc, s[34:35]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v2, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[3:4], s[4:5] offset:16 glc
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index 24a4d8fbde200..3c6a6a25c38ef 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -620,8 +620,7 @@ define i32 @atomicrmw_dec_private_i32(ptr addrspace(5) %ptr) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, -1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, 1, v1
 ; GCN-NEXT:    v_cmp_lt_u32_e64 s[4:5], 4, v1
 ; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, 4, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index b1e05158b6212..64c7a6f03af7a 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -377,63 +377,63 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v11, 0
-; GFX8-NEXT:    s_movk_i32 s0, 0x7f
+; GFX8-NEXT:    v_mov_b32_e32 v13, 0x7f
 ; GFX8-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX8-NEXT:    ; Child Loop BB1_2 Depth 2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    s_mov_b32 s1, 0
+; GFX8-NEXT:    s_mov_b32 s0, 0
 ; GFX8-NEXT:  .LBB1_2: ; %for.body
 ; GFX8-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xffffb000, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v3, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[4:5]
+; GFX8-NEXT:    flat_load_dwordx2 v[14:15], v[4:5]
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0xffffb800, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, -1, v3, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[6:7]
+; GFX8-NEXT:    flat_load_dwordx2 v[16:17], v[6:7]
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xffffc000, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v3, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[4:5]
+; GFX8-NEXT:    flat_load_dwordx2 v[18:19], v[4:5]
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0xffffc800, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, -1, v3, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[6:7], v[6:7]
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xffffd000, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v19, vcc, 0xffffd800, v2
-; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, -1, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v21, vcc, 0xffffe000, v2
-; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, -1, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, 0xffffd800, v2
+; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, -1, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, 0xffffe000, v2
+; GFX8-NEXT:    v_addc_u32_e32 v23, vcc, -1, v3, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[4:5]
-; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[19:20]
-; GFX8-NEXT:    s_addk_i32 s1, 0x2000
-; GFX8-NEXT:    s_cmp_gt_u32 s1, 0x3fffff
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[20:21]
+; GFX8-NEXT:    s_addk_i32 s0, 0x2000
+; GFX8-NEXT:    s_cmp_gt_u32 s0, 0x3fffff
 ; GFX8-NEXT:    s_waitcnt vmcnt(5)
-; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v13, v10
-; GFX8-NEXT:    v_addc_u32_e32 v24, vcc, v14, v11, vcc
+; GFX8-NEXT:    v_add_u32_e32 v24, vcc, v14, v10
+; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, v15, v11, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0xffffe800, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, -1, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 0xfffff000, v2
-; GFX8-NEXT:    flat_load_dwordx2 v[19:20], v[21:22]
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xfffff000, v2
+; GFX8-NEXT:    flat_load_dwordx2 v[20:21], v[22:23]
 ; GFX8-NEXT:    flat_load_dwordx2 v[10:11], v[10:11]
-; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, -1, v3, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, -1, v3, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(6)
-; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v15, v23
-; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, v16, v24, vcc
-; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0xfffff800, v2
-; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[13:14]
-; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, -1, v3, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[15:16]
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v16, v24
+; GFX8-NEXT:    v_addc_u32_e32 v23, vcc, v17, v25, vcc
+; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 0xfffff800, v2
+; GFX8-NEXT:    flat_load_dwordx2 v[14:15], v[14:15]
+; GFX8-NEXT:    v_addc_u32_e32 v17, vcc, -1, v3, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[16:17], v[16:17]
 ; GFX8-NEXT:    s_waitcnt vmcnt(7)
-; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v17, v21
-; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, v18, v22, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v18, v22
+; GFX8-NEXT:    v_addc_u32_e32 v23, vcc, v19, v23, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[18:19], v[2:3]
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x10000, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(7)
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v21
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v7, v22, vcc
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v22
+; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v7, v23, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(6)
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
 ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
@@ -441,30 +441,27 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(4)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v19, v4
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v20, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v20, v4
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v21, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(3)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v10, v4
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v11, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v13, v4
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v14, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v14, v4
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v15, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v15, v4
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v16, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v16, v4
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v17, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v17, v4
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v18, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v18, v4
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v19, v5, vcc
 ; GFX8-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX8-NEXT:  ; %bb.3: ; %while.cond.loopexit
 ; GFX8-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; GFX8-NEXT:    s_add_i32 s1, s0, -1
-; GFX8-NEXT:    s_cmp_eq_u32 s0, 0
-; GFX8-NEXT:    s_cbranch_scc1 .LBB1_5
-; GFX8-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
-; GFX8-NEXT:    s_mov_b32 s0, s1
-; GFX8-NEXT:    s_branch .LBB1_1
-; GFX8-NEXT:  .LBB1_5: ; %while.end
+; GFX8-NEXT:    v_subrev_u32_e32 v13, vcc, 1, v13
+; GFX8-NEXT:    s_and_b64 vcc, exec, vcc
+; GFX8-NEXT:    s_cbranch_vccz .LBB1_1
+; GFX8-NEXT:  ; %bb.4: ; %while.end
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s35
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s34, v12
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -503,7 +500,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX900-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX900-NEXT:    v_mov_b32_e32 v5, 0
-; GFX900-NEXT:    s_movk_i32 s5, 0x7f
+; GFX900-NEXT:    v_mov_b32_e32 v7, 0x7f
 ; GFX900-NEXT:    s_movk_i32 s2, 0xd000
 ; GFX900-NEXT:    s_movk_i32 s3, 0xe000
 ; GFX900-NEXT:    s_movk_i32 s4, 0xf000
@@ -512,76 +509,73 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX900-NEXT:    ; Child Loop BB1_2 Depth 2
 ; GFX900-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX900-NEXT:    v_mov_b32_e32 v2, v0
-; GFX900-NEXT:    s_mov_b32 s6, 0
+; GFX900-NEXT:    s_mov_b32 s5, 0
 ; GFX900-NEXT:  .LBB1_2: ; %for.body
 ; GFX900-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX900-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, 0xffffb000, v2
-; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, -1, v3, vcc
-; GFX900-NEXT:    global_load_dwordx2 v[9:10], v[2:3], off offset:-4096
-; GFX900-NEXT:    global_load_dwordx2 v[11:12], v[2:3], off offset:-2048
-; GFX900-NEXT:    v_add_co_u32_e32 v13, vcc, 0xffffc000, v2
-; GFX900-NEXT:    global_load_dwordx2 v[7:8], v[7:8], off
-; GFX900-NEXT:    v_addc_co_u32_e32 v14, vcc, -1, v3, vcc
-; GFX900-NEXT:    global_load_dwordx2 v[17:18], v[13:14], off offset:-2048
-; GFX900-NEXT:    global_load_dwordx2 v[19:20], v[13:14], off
-; GFX900-NEXT:    v_add_co_u32_e32 v15, vcc, s2, v2
-; GFX900-NEXT:    v_addc_co_u32_e32 v16, vcc, -1, v3, vcc
-; GFX900-NEXT:    v_add_co_u32_e32 v13, vcc, s3, v2
-; GFX900-NEXT:    global_load_dwordx2 v[15:16], v[15:16], off offset:-2048
-; GFX900-NEXT:    v_addc_co_u32_e32 v14, vcc, -1, v3, vcc
-; GFX900-NEXT:    s_addk_i32 s6, 0x2000
-; GFX900-NEXT:    s_cmp_gt_u32 s6, 0x3fffff
+; GFX900-NEXT:    v_add_co_u32_e32 v8, vcc, 0xffffb000, v2
+; GFX900-NEXT:    v_addc_co_u32_e32 v9, vcc, -1, v3, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off offset:-4096
+; GFX900-NEXT:    global_load_dwordx2 v[12:13], v[2:3], off offset:-2048
+; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, 0xffffc000, v2
+; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[8:9], off
+; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
+; GFX900-NEXT:    global_load_dwordx2 v[20:21], v[14:15], off
+; GFX900-NEXT:    v_add_co_u32_e32 v16, vcc, s2, v2
+; GFX900-NEXT:    v_addc_co_u32_e32 v17, vcc, -1, v3, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, s3, v2
+; GFX900-NEXT:    global_load_dwordx2 v[16:17], v[16:17], off offset:-2048
+; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
+; GFX900-NEXT:    s_addk_i32 s5, 0x2000
+; GFX900-NEXT:    s_cmp_gt_u32 s5, 0x3fffff
 ; GFX900-NEXT:    s_waitcnt vmcnt(3)
-; GFX900-NEXT:    v_add_co_u32_e32 v21, vcc, v7, v4
-; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v5, vcc
-; GFX900-NEXT:    global_load_dwordx2 v[7:8], v[13:14], off offset:-4096
+; GFX900-NEXT:    v_add_co_u32_e32 v22, vcc, v8, v4
+; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[14:15], off offset:-4096
 ; GFX900-NEXT:    s_waitcnt vmcnt(3)
-; GFX900-NEXT:    v_add_co_u32_e64 v23, s[0:1], v17, v21
-; GFX900-NEXT:    v_addc_co_u32_e64 v24, s[0:1], v18, v5, s[0:1]
-; GFX900-NEXT:    global_load_dwordx2 v[17:18], v[13:14], off offset:-2048
-; GFX900-NEXT:    global_load_dwordx2 v[21:22], v[13:14], off
+; GFX900-NEXT:    v_add_co_u32_e64 v24, s[0:1], v18, v22
+; GFX900-NEXT:    v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1]
+; GFX900-NEXT:    global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
+; GFX900-NEXT:    global_load_dwordx2 v[22:23], v[14:15], off
 ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v2
 ; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
 ; GFX900-NEXT:    global_load_dwordx2 v[4:5], v[4:5], off offset:-2048
 ; GFX900-NEXT:    s_waitcnt vmcnt(5)
-; GFX900-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v23
-; GFX900-NEXT:    global_load_dwordx2 v[13:14], v[2:3], off
-; GFX900-NEXT:    v_addc_co_u32_e32 v20, vcc, v20, v24, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v24
+; GFX900-NEXT:    global_load_dwordx2 v[14:15], v[2:3], off
+; GFX900-NEXT:    v_addc_co_u32_e32 v21, vcc, v21, v25, vcc
 ; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, 0x10000, v2
 ; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(5)
-; GFX900-NEXT:    v_add_co_u32_e32 v15, vcc, v15, v19
-; GFX900-NEXT:    v_addc_co_u32_e32 v16, vcc, v16, v20, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v16, vcc, v16, v20
+; GFX900-NEXT:    v_addc_co_u32_e32 v17, vcc, v17, v21, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(4)
-; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v15
-; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v16, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v16
+; GFX900-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v17, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(3)
-; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, v17, v7
-; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, v18, v8, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v8, vcc, v18, v8
+; GFX900-NEXT:    v_addc_co_u32_e32 v9, vcc, v19, v9, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(2)
-; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, v21, v7
-; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, v22, v8, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v8, vcc, v22, v8
+; GFX900-NEXT:    v_addc_co_u32_e32 v9, vcc, v23, v9, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(1)
-; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v7
-; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v8, vcc
-; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v9, v4
-; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v10, v5, vcc
-; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v11, v4
-; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v12, v5, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
+; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v9, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v10, v4
+; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
+; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v13, v5, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v13, v4
-; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v14, v5, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v14, v4
+; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
 ; GFX900-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX900-NEXT:  ; %bb.3: ; %while.cond.loopexit
 ; GFX900-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; GFX900-NEXT:    s_add_i32 s0, s5, -1
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 0
-; GFX900-NEXT:    s_cbranch_scc1 .LBB1_5
-; GFX900-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
-; GFX900-NEXT:    s_mov_b32 s5, s0
-; GFX900-NEXT:    s_branch .LBB1_1
-; GFX900-NEXT:  .LBB1_5: ; %while.end
+; GFX900-NEXT:    v_subrev_co_u32_e32 v7, vcc, 1, v7
+; GFX900-NEXT:    s_and_b64 vcc, exec, vcc
+; GFX900-NEXT:    s_cbranch_vccz .LBB1_1
+; GFX900-NEXT:  ; %bb.4: ; %while.end
 ; GFX900-NEXT:    v_mov_b32_e32 v1, s35
 ; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v6
 ; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -612,7 +606,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-NEXT:    s_movk_i32 s1, 0x7f
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0x7f
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xfe000000, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 3, v6
 ; GFX10-NEXT:    v_add_co_u32 v0, s0, v0, s34
@@ -624,77 +618,74 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    ; Child Loop BB1_2 Depth 2
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0
-; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:    s_mov_b32 s1, 0
 ; GFX10-NEXT:  .LBB1_2: ; %for.body
 ; GFX10-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v4, 0xffffb800
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v4, 0xffffc800
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v13, vcc_lo, v4, 0xffffd800
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v14, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v17, vcc_lo, v4, 0xffffe800
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v4, 0xffffb800
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, -1, v5, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v4, 0xffffc800
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, -1, v5, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, v4, 0xffffd800
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, -1, v5, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v18, vcc_lo, v4, 0xffffe800
 ; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    global_load_dwordx2 v[11:12], v[7:8], off offset:-2048
-; GFX10-NEXT:    global_load_dwordx2 v[15:16], v[9:10], off offset:-2048
-; GFX10-NEXT:    global_load_dwordx2 v[19:20], v[13:14], off offset:-2048
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v21, vcc_lo, 0xfffff000, v4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, -1, v5, vcc_lo
+; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[8:9], off offset:-2048
+; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[10:11], off offset:-2048
+; GFX10-NEXT:    global_load_dwordx2 v[20:21], v[14:15], off offset:-2048
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, -1, v5, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v22, vcc_lo, 0xfffff000, v4
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v23, vcc_lo, -1, v5, vcc_lo
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_dwordx2 v[23:24], v[17:18], off offset:-2048
-; GFX10-NEXT:    global_load_dwordx2 v[7:8], v[7:8], off
-; GFX10-NEXT:    global_load_dwordx2 v[9:10], v[9:10], off
-; GFX10-NEXT:    global_load_dwordx2 v[13:14], v[13:14], off
-; GFX10-NEXT:    global_load_dwordx2 v[25:26], v[17:18], off
-; GFX10-NEXT:    global_load_dwordx2 v[27:28], v[21:22], off
-; GFX10-NEXT:    global_load_dwordx2 v[29:30], v[4:5], off offset:-2048
-; GFX10-NEXT:    global_load_dwordx2 v[31:32], v[4:5], off
+; GFX10-NEXT:    global_load_dwordx2 v[24:25], v[18:19], off offset:-2048
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[8:9], off
+; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off
+; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[14:15], off
+; GFX10-NEXT:    global_load_dwordx2 v[26:27], v[18:19], off
+; GFX10-NEXT:    global_load_dwordx2 v[28:29], v[22:23], off
+; GFX10-NEXT:    global_load_dwordx2 v[30:31], v[4:5], off offset:-2048
+; GFX10-NEXT:    global_load_dwordx2 v[32:33], v[4:5], off
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x10000, v4
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
-; GFX10-NEXT:    s_addk_i32 s2, 0x2000
-; GFX10-NEXT:    s_cmp_gt_u32 s2, 0x3fffff
+; GFX10-NEXT:    s_addk_i32 s1, 0x2000
+; GFX10-NEXT:    s_cmp_gt_u32 s1, 0x3fffff
 ; GFX10-NEXT:    s_waitcnt vmcnt(10)
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v11, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v12, v3, s0
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v12, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v13, v3, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v7, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v8, v3, s0
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v15, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v16, v3, s0
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v8, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v9, v3, s0
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v16, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v17, v3, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v9, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v10, v3, s0
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v19, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v20, v3, s0
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v10, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v11, v3, s0
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v20, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v21, v3, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v13, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v14, v3, s0
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v23, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v24, v3, s0
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v14, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v15, v3, s0
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v24, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v25, v3, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v25, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v26, v3, s0
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v26, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v27, v3, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v27, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v28, v3, s0
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v28, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v29, v3, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v29, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v30, v3, s0
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v30, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v31, v3, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v31, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v32, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v32, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v33, v3, vcc_lo
 ; GFX10-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX10-NEXT:  ; %bb.3: ; %while.cond.loopexit
 ; GFX10-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; GFX10-NEXT:    s_add_i32 s0, s1, -1
-; GFX10-NEXT:    s_cmp_eq_u32 s1, 0
-; GFX10-NEXT:    s_cbranch_scc1 .LBB1_5
-; GFX10-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
-; GFX10-NEXT:    s_mov_b32 s1, s0
-; GFX10-NEXT:    s_branch .LBB1_1
-; GFX10-NEXT:  .LBB1_5: ; %while.end
+; GFX10-NEXT:    v_sub_co_u32 v7, s0, v7, 1
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX10-NEXT:    s_cbranch_vccz .LBB1_1
+; GFX10-NEXT:  ; %bb.4: ; %while.end
 ; GFX10-NEXT:    v_add_co_u32 v0, s0, s34, v6
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s35, 0, s0
 ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
@@ -731,7 +722,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v1
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], 0, 0
-; GFX90A-NEXT:    s_movk_i32 s3, 0x7f
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x7f
 ; GFX90A-NEXT:    s_movk_i32 s0, 0xd000
 ; GFX90A-NEXT:    s_movk_i32 s1, 0xe000
 ; GFX90A-NEXT:    s_movk_i32 s2, 0xf000
@@ -739,7 +730,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX90A-NEXT:    ; Child Loop BB1_2 Depth 2
 ; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    s_mov_b32 s4, 0
+; GFX90A-NEXT:    s_mov_b32 s3, 0
 ; GFX90A-NEXT:  .LBB1_2: ; %for.body
 ; GFX90A-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
@@ -766,49 +757,46 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX90A-NEXT:    global_load_dwordx2 v[30:31], v[6:7], off
 ; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, 0x10000, v6
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX90A-NEXT:    s_addk_i32 s4, 0x2000
-; GFX90A-NEXT:    s_cmp_gt_u32 s4, 0x3fffff
+; GFX90A-NEXT:    s_addk_i32 s3, 0x2000
+; GFX90A-NEXT:    s_cmp_gt_u32 s3, 0x3fffff
 ; GFX90A-NEXT:    s_waitcnt vmcnt(8)
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v12, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v13, v5, vcc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(7)
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v18, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v19, v4, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v18, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v19, v5, vcc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(6)
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v20, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v21, v4, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v20, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v21, v5, vcc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(5)
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v16, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v17, v4, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v16, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v17, v5, vcc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(4)
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v24, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v25, v4, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v24, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v25, v5, vcc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(3)
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v26, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v27, v4, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v26, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v27, v5, vcc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(2)
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v28, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v29, v4, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v28, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v29, v5, vcc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(1)
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v14, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v15, v4, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v8, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v4, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v10, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v11, v4, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v14, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v10, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v30, v1
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v30, v4
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
 ; GFX90A-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX90A-NEXT:  ; %bb.3: ; %while.cond.loopexit
 ; GFX90A-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; GFX90A-NEXT:    s_add_i32 s4, s3, -1
-; GFX90A-NEXT:    s_cmp_eq_u32 s3, 0
-; GFX90A-NEXT:    s_cbranch_scc1 .LBB1_5
-; GFX90A-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
-; GFX90A-NEXT:    s_mov_b32 s3, s4
-; GFX90A-NEXT:    s_branch .LBB1_1
-; GFX90A-NEXT:  .LBB1_5: ; %while.end
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v1, vcc, 1, v1
+; GFX90A-NEXT:    s_and_b64 vcc, exec, vcc
+; GFX90A-NEXT:    s_cbranch_vccz .LBB1_1
+; GFX90A-NEXT:  ; %bb.4: ; %while.end
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s35
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v0
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -828,8 +816,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 17, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0xff, v0
-; GFX11-NEXT:    s_movk_i32 s1, 0x7f
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v7, 0x7f
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xfe000000, v1
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 3, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -843,95 +831,92 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    ; Child Loop BB1_2 Depth 2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT:    s_mov_b32 s2, 0
+; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:  .LBB1_2: ; %for.body
 ; GFX11-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v7, vcc_lo, v4, 0xffffc000
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, -1, v5, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v9, vcc_lo, 0xffffc000, v4
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v4, 0xffffc000
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, -1, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, 0xffffc000, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v10, null, -1, v5, vcc_lo
-; GFX11-NEXT:    global_load_b64 v[13:14], v[7:8], off offset:-4096
-; GFX11-NEXT:    v_add_co_u32 v11, vcc_lo, 0xffffd000, v4
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v12, null, -1, v5, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v15, vcc_lo, v4, 0xffffe000
-; GFX11-NEXT:    global_load_b64 v[9:10], v[9:10], off offset:-2048
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v16, null, -1, v5, vcc_lo
-; GFX11-NEXT:    global_load_b64 v[11:12], v[11:12], off offset:-2048
-; GFX11-NEXT:    v_add_co_u32 v17, vcc_lo, 0xffffe000, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v11, null, -1, v5, vcc_lo
+; GFX11-NEXT:    global_load_b64 v[14:15], v[8:9], off offset:-4096
+; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, 0xffffd000, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v13, null, -1, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v16, vcc_lo, v4, 0xffffe000
+; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off offset:-2048
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v17, null, -1, v5, vcc_lo
+; GFX11-NEXT:    global_load_b64 v[12:13], v[12:13], off offset:-2048
+; GFX11-NEXT:    v_add_co_u32 v18, vcc_lo, 0xffffe000, v4
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[19:20], v[15:16], off offset:-4096
-; GFX11-NEXT:    global_load_b64 v[7:8], v[7:8], off
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v18, null, -1, v5, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v21, vcc_lo, 0xfffff000, v4
+; GFX11-NEXT:    global_load_b64 v[20:21], v[16:17], off offset:-4096
+; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v19, null, -1, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v22, vcc_lo, 0xfffff000, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v22, null, -1, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v23, null, -1, v5, vcc_lo
 ; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    global_load_b64 v[17:18], v[17:18], off offset:-2048
-; GFX11-NEXT:    global_load_b64 v[15:16], v[15:16], off
-; GFX11-NEXT:    global_load_b64 v[21:22], v[21:22], off offset:-2048
-; GFX11-NEXT:    global_load_b64 v[23:24], v[4:5], off offset:-4096
-; GFX11-NEXT:    global_load_b64 v[25:26], v[4:5], off offset:-2048
-; GFX11-NEXT:    global_load_b64 v[27:28], v[4:5], off
+; GFX11-NEXT:    global_load_b64 v[18:19], v[18:19], off offset:-2048
+; GFX11-NEXT:    global_load_b64 v[16:17], v[16:17], off
+; GFX11-NEXT:    global_load_b64 v[22:23], v[22:23], off offset:-2048
+; GFX11-NEXT:    global_load_b64 v[24:25], v[4:5], off offset:-4096
+; GFX11-NEXT:    global_load_b64 v[26:27], v[4:5], off offset:-2048
+; GFX11-NEXT:    global_load_b64 v[28:29], v[4:5], off
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x10000, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT:    s_addk_i32 s2, 0x2000
-; GFX11-NEXT:    s_cmp_gt_u32 s2, 0x3fffff
+; GFX11-NEXT:    s_addk_i32 s1, 0x2000
+; GFX11-NEXT:    s_cmp_gt_u32 s1, 0x3fffff
 ; GFX11-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v13, v2
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v14, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v14, v3, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v15, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v9, v2
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v10, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v10, v3, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v11, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v7, v2
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v8, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v8, v3, s0
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v11, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v9, v3, s0
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v12, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v12, v3, s0
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v19, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v13, v3, s0
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v20, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v20, v3, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v21, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v17, v2
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v18, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v18, v3, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v19, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v15, v2
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v16, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v16, v3, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v17, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v21, v2
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v22, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v22, v3, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v23, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v23, v2
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v24, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v24, v3, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v25, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v25, v2
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v26, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v26, v3, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v27, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v27, v2
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v28, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v28, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v29, v3, vcc_lo
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX11-NEXT:  ; %bb.3: ; %while.cond.loopexit
 ; GFX11-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; GFX11-NEXT:    s_add_i32 s0, s1, -1
-; GFX11-NEXT:    s_cmp_eq_u32 s1, 0
-; GFX11-NEXT:    s_cbranch_scc1 .LBB1_5
-; GFX11-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
-; GFX11-NEXT:    s_mov_b32 s1, s0
-; GFX11-NEXT:    s_branch .LBB1_1
-; GFX11-NEXT:  .LBB1_5: ; %while.end
+; GFX11-NEXT:    v_sub_co_u32 v7, s0, v7, 1
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_vccz .LBB1_1
+; GFX11-NEXT:  ; %bb.4: ; %while.end
 ; GFX11-NEXT:    v_add_co_u32 v0, s0, s34, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s35, 0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 0b58b328bbfb6..68c33487b0596 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -67,9 +67,9 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b,
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-NEXT:    v_sad_u32 v2, s0, v0, v1
+; GCN-NEXT:    v_sad_u32 v2, s1, v0, v1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
@@ -249,10 +249,10 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_addc_u32 s21, s21, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s3, s0, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-NEXT:    v_sad_u32 v3, s0, v0, v1
+; GCN-NEXT:    v_sad_u32 v3, s1, v0, v1
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[20:23], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
@@ -284,8 +284,8 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
 ; GCN-NEXT:    s_add_u32 s20, s20, s17
 ; GCN-NEXT:    s_addc_u32 s21, s21, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_min_u32 s3, s0, s1
-; GCN-NEXT:    s_max_u32 s0, s0, s1
+; GCN-NEXT:    s_min_u32 s3, s1, s0
+; GCN-NEXT:    s_max_u32 s0, s1, s0
 ; GCN-NEXT:    s_sub_i32 s0, s0, s3
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
@@ -583,17 +583,17 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_sub_i32 s3, s0, s3
-; GCN-NEXT:    s_sub_i32 s6, s1, s0
-; GCN-NEXT:    s_cmp_lt_u32 s1, s0
-; GCN-NEXT:    s_cselect_b32 s0, s3, s6
-; GCN-NEXT:    s_add_i32 s0, s0, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_sub_i32 s0, s0, s3
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, s2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
   %icmp0 = icmp ugt i32 %a, %b



More information about the llvm-commits mailing list