[llvm] c8db069 - AMDGPU: Use common check prefix in atomic expand test

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 19 06:46:27 PDT 2024


Author: Matt Arsenault
Date: 2024-04-19T15:46:01+02:00
New Revision: c8db069253a8783ebce0eb6c244551b0b42e997e

URL: https://github.com/llvm/llvm-project/commit/c8db069253a8783ebce0eb6c244551b0b42e997e
DIFF: https://github.com/llvm/llvm-project/commit/c8db069253a8783ebce0eb6c244551b0b42e997e.diff

LOG: AMDGPU: Use common check prefix in atomic expand test

Added: 
    

Modified: 
    llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
index 9010dec31c1610..8c7d8f5be88e75 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefix=CI %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefix=GFX9 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefix=GFX908 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefix=GFX90A %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefix=GFX940 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefix=GFX11 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,CI %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX908 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX90A %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX940 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX11 %s
 
 define void @test_atomicrmw_fadd_f32_global_no_use_unsafe(ptr addrspace(1) %ptr, float %value) #0 {
 ; CI-LABEL: @test_atomicrmw_fadd_f32_global_no_use_unsafe(
@@ -1179,698 +1179,133 @@ define float @test_atomicrmw_fadd_f32_local(ptr addrspace(3) %ptr, float %value)
 }
 
 define half @test_atomicrmw_fadd_f16_flat(ptr %ptr, half %value) {
-; CI-LABEL: @test_atomicrmw_fadd_f16_flat(
-; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CI-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CI-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CI-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; CI-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CI-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CI-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; CI-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; CI-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CI-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CI-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; CI-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; CI-NEXT:    ret half [[TMP7]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_f16_flat(
-; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX9-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX9-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX9-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX9-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX9-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX9-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX9-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX9-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX9-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX9-NEXT:    ret half [[TMP7]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_f16_flat(
-; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX908-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX908-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX908-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX908-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX908-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX908-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX908-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX908-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX908-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX908-NEXT:    ret half [[TMP7]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_f16_flat(
-; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX90A-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX90A-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX90A-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX90A-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX90A-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX90A-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX90A-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX90A-NEXT:    ret half [[TMP7]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_f16_flat(
-; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX940-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX940-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX940-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX940-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX940-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX940-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX940-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX940-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX940-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX940-NEXT:    ret half [[TMP7]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_f16_flat(
-; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX11-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX11-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX11-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX11-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX11-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX11-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX11-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX11-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX11-NEXT:    ret half [[TMP7]]
+; ALL-LABEL: @test_atomicrmw_fadd_f16_flat(
+; ALL-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; ALL-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; ALL-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; ALL-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; ALL-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; ALL-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; ALL-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
+; ALL-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
+; ALL-NEXT:    ret half [[TMP7]]
 ;
   %res = atomicrmw fadd ptr %ptr, half %value seq_cst
   ret half %res
 }
 
 define half @test_atomicrmw_fadd_f16_global(ptr addrspace(1) %ptr, half %value) {
-; CI-LABEL: @test_atomicrmw_fadd_f16_global(
-; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CI-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CI-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CI-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; CI-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CI-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CI-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; CI-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; CI-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CI-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CI-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; CI-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; CI-NEXT:    ret half [[TMP7]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_f16_global(
-; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX9-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX9-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX9-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX9-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX9-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX9-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX9-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX9-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX9-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX9-NEXT:    ret half [[TMP7]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_f16_global(
-; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX908-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX908-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX908-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX908-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX908-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX908-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX908-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX908-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX908-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX908-NEXT:    ret half [[TMP7]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_f16_global(
-; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX90A-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX90A-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX90A-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX90A-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX90A-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX90A-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX90A-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX90A-NEXT:    ret half [[TMP7]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_f16_global(
-; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX940-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX940-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX940-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX940-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX940-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX940-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX940-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX940-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX940-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX940-NEXT:    ret half [[TMP7]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_f16_global(
-; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX11-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX11-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX11-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX11-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX11-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX11-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX11-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX11-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX11-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX11-NEXT:    ret half [[TMP7]]
+; ALL-LABEL: @test_atomicrmw_fadd_f16_global(
+; ALL-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; ALL-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; ALL-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; ALL-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; ALL-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; ALL-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; ALL-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
+; ALL-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
+; ALL-NEXT:    ret half [[TMP7]]
 ;
   %res = atomicrmw fadd ptr addrspace(1) %ptr, half %value seq_cst
   ret half %res
 }
 
 define half @test_atomicrmw_fadd_f16_global_align4(ptr addrspace(1) %ptr, half %value) {
-; CI-LABEL: @test_atomicrmw_fadd_f16_global_align4(
-; CI-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CI-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; CI-NEXT:    [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]]
-; CI-NEXT:    [[TMP3:%.*]] = bitcast half [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CI-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; CI-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half
-; CI-NEXT:    ret half [[TMP5]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_f16_global_align4(
-; GFX9-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX9-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX9-NEXT:    [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]]
-; GFX9-NEXT:    [[TMP3:%.*]] = bitcast half [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX9-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half
-; GFX9-NEXT:    ret half [[TMP5]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_f16_global_align4(
-; GFX908-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX908-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX908-NEXT:    [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]]
-; GFX908-NEXT:    [[TMP3:%.*]] = bitcast half [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half
-; GFX908-NEXT:    ret half [[TMP5]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_f16_global_align4(
-; GFX90A-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX90A-NEXT:    [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]]
-; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast half [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half
-; GFX90A-NEXT:    ret half [[TMP5]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_f16_global_align4(
-; GFX940-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX940-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX940-NEXT:    [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]]
-; GFX940-NEXT:    [[TMP3:%.*]] = bitcast half [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX940-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half
-; GFX940-NEXT:    ret half [[TMP5]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_f16_global_align4(
-; GFX11-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX11-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX11-NEXT:    [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]]
-; GFX11-NEXT:    [[TMP3:%.*]] = bitcast half [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half
-; GFX11-NEXT:    ret half [[TMP5]]
+; ALL-LABEL: @test_atomicrmw_fadd_f16_global_align4(
+; ALL-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half
+; ALL-NEXT:    [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast half [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half
+; ALL-NEXT:    ret half [[TMP5]]
 ;
   %res = atomicrmw fadd ptr addrspace(1) %ptr, half %value seq_cst, align 4
   ret half %res
 }
 
 define half @test_atomicrmw_fadd_f16_local(ptr addrspace(3) %ptr, half %value) {
-; CI-LABEL: @test_atomicrmw_fadd_f16_local(
-; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; CI-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; CI-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; CI-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CI-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CI-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; CI-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; CI-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CI-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; CI-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; CI-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; CI-NEXT:    ret half [[TMP7]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX9-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX9-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX9-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX9-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX9-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX9-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX9-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX9-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX9-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX9-NEXT:    ret half [[TMP7]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX908-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX908-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX908-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX908-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX908-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX908-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX908-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX908-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX908-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX908-NEXT:    ret half [[TMP7]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX90A-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX90A-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX90A-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX90A-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX90A-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX90A-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX90A-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX90A-NEXT:    ret half [[TMP7]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX940-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX940-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX940-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX940-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX940-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX940-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX940-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX940-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX940-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX940-NEXT:    ret half [[TMP7]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX11-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX11-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX11-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX11-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX11-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX11-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX11-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX11-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX11-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX11-NEXT:    ret half [[TMP7]]
+; ALL-LABEL: @test_atomicrmw_fadd_f16_local(
+; ALL-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; ALL-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; ALL-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; ALL-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; ALL-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; ALL-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
+; ALL-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; ALL-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
+; ALL-NEXT:    ret half [[TMP7]]
 ;
   %res = atomicrmw fadd ptr addrspace(3) %ptr, half %value seq_cst
   ret half %res
@@ -2481,1789 +1916,344 @@ define float @test_atomicrmw_fadd_f32_local_strictfp(ptr addrspace(3) %ptr, floa
 }
 
 define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_local(
-; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; CI-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; CI-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; CI-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CI-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CI-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; CI-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CI-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; CI-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; CI-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; CI-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX9-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX9-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX9-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX9-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX9-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX9-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX9-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX9-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX9-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX908-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX908-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX908-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX908-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX908-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX908-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX908-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX908-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX908-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX90A-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX90A-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX90A-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX90A-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX90A-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX90A-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX90A-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX940-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX940-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX940-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX940-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX940-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX940-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX940-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX940-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX940-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX11-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX11-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX11-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX11-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX11-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX11-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX11-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX11-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX11-NEXT:    ret bfloat [[TMP7]]
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_local(
+; ALL-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; ALL-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; ALL-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; ALL-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; ALL-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; ALL-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; ALL-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
+; ALL-NEXT:    ret bfloat [[TMP7]]
 ;
   %res = atomicrmw fadd ptr addrspace(3) %ptr, bfloat %value monotonic
   ret bfloat %res
 }
 
 define bfloat @test_atomicrmw_fadd_bf16_local_align4(ptr addrspace(3) %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_local_align4(
-; CI-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CI-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; CI-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CI-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; CI-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; CI-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_local_align4(
-; GFX9-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX9-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX9-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX9-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX9-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_local_align4(
-; GFX908-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX908-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX908-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX908-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_local_align4(
-; GFX90A-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX90A-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_local_align4(
-; GFX940-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX940-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX940-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX940-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX940-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_local_align4(
-; GFX11-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX11-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX11-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX11-NEXT:    ret bfloat [[TMP5]]
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_local_align4(
+; ALL-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
+; ALL-NEXT:    ret bfloat [[TMP5]]
 ;
   %res = atomicrmw fadd ptr addrspace(3) %ptr, bfloat %value monotonic, align 4
   ret bfloat %res
 }
 
 define bfloat @test_atomicrmw_fadd_bf16_global_agent(ptr addrspace(1) %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_global_agent(
-; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CI-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CI-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CI-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; CI-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CI-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CI-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; CI-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CI-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CI-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; CI-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; CI-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_global_agent(
-; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX9-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX9-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX9-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX9-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX9-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX9-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX9-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX9-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX9-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_global_agent(
-; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX908-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX908-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX908-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX908-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX908-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX908-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX908-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX908-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX908-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_global_agent(
-; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX90A-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX90A-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX90A-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX90A-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX90A-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX90A-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX90A-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_global_agent(
-; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX940-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX940-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX940-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX940-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX940-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX940-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX940-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX940-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX940-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_global_agent(
-; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX11-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX11-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX11-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX11-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX11-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX11-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX11-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX11-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX11-NEXT:    ret bfloat [[TMP7]]
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_global_agent(
+; ALL-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; ALL-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; ALL-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; ALL-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; ALL-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; ALL-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; ALL-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
+; ALL-NEXT:    ret bfloat [[TMP7]]
 ;
   %res = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %value syncscope("agent") monotonic
   ret bfloat %res
 }
 
 define bfloat @test_atomicrmw_fadd_bf16_global_agent_align4(ptr addrspace(1) %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_global_agent_align4(
-; CI-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CI-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; CI-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CI-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; CI-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; CI-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_global_agent_align4(
-; GFX9-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX9-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX9-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX9-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX9-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_global_agent_align4(
-; GFX908-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX908-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX908-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX908-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_global_agent_align4(
-; GFX90A-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX90A-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_global_agent_align4(
-; GFX940-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX940-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX940-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX940-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX940-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_global_agent_align4(
-; GFX11-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX11-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX11-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX11-NEXT:    ret bfloat [[TMP5]]
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_global_agent_align4(
+; ALL-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
+; ALL-NEXT:    ret bfloat [[TMP5]]
 ;
   %res = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %value syncscope("agent") monotonic, align 4
   ret bfloat %res
 }
 
 define bfloat @test_atomicrmw_fadd_bf16_global_system(ptr addrspace(1) %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_global_system(
-; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CI-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CI-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CI-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; CI-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CI-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CI-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; CI-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CI-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CI-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; CI-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; CI-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_global_system(
-; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX9-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX9-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX9-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX9-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX9-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX9-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX9-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX9-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX9-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_global_system(
-; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX908-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX908-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX908-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX908-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX908-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX908-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX908-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX908-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX908-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_global_system(
-; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX90A-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX90A-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX90A-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX90A-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX90A-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX90A-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX90A-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_global_system(
-; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX940-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX940-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX940-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX940-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX940-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX940-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX940-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX940-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX940-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_global_system(
-; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; GFX11-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX11-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX11-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX11-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX11-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX11-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX11-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX11-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX11-NEXT:    ret bfloat [[TMP7]]
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_global_system(
+; ALL-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; ALL-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; ALL-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; ALL-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; ALL-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; ALL-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; ALL-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
+; ALL-NEXT:    ret bfloat [[TMP7]]
 ;
   %res = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %value monotonic
   ret bfloat %res
 }
 
 define bfloat @test_atomicrmw_fadd_bf16_global_system_align4(ptr addrspace(1) %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_global_system_align4(
-; CI-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CI-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; CI-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CI-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; CI-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; CI-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_global_system_align4(
-; GFX9-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX9-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX9-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX9-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX9-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_global_system_align4(
-; GFX908-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX908-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX908-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX908-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_global_system_align4(
-; GFX90A-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX90A-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_global_system_align4(
-; GFX940-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX940-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX940-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX940-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX940-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_global_system_align4(
-; GFX11-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX11-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX11-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX11-NEXT:    ret bfloat [[TMP5]]
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_global_system_align4(
+; ALL-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
+; ALL-NEXT:    ret bfloat [[TMP5]]
 ;
   %res = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %value monotonic, align 4
   ret bfloat %res
 }
 
 define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bfloat %value) #2 {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6]]
-; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; CI-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; CI-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; CI-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CI-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CI-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT:    [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]]
-; CI-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CI-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; CI-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; CI-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; CI-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6]]
-; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX9-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX9-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX9-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX9-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX9-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT:    [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]]
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX9-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX9-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX9-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX9-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6]]
-; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX908-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX908-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX908-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX908-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX908-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT:    [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]]
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX908-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX908-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX908-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX908-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6:[0-9]+]]
-; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX90A-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX90A-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX90A-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT:    [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]]
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX90A-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX90A-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX90A-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX90A-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6:[0-9]+]]
-; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX940-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX940-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX940-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX940-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX940-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT:    [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]]
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX940-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX940-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX940-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX940-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6]]
-; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX11-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX11-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX11-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX11-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX11-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT:    [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]]
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX11-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX11-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX11-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX11-NEXT:    ret bfloat [[TMP7]]
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
+; ALL-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6:[0-9]+]]
+; ALL-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; ALL-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; ALL-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; ALL-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; ALL-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT:    [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]]
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; ALL-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
+; ALL-NEXT:    ret bfloat [[TMP7]]
 ;
   %res = atomicrmw fadd ptr addrspace(3) %ptr, bfloat %value monotonic
   ret bfloat %res
 }
 
 define bfloat @test_atomicrmw_fadd_bf16_flat_agent(ptr %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_flat_agent(
-; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CI-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CI-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CI-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; CI-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CI-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CI-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; CI-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CI-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CI-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; CI-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; CI-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_flat_agent(
-; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX9-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX9-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX9-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX9-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX9-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX9-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX9-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX9-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX9-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_flat_agent(
-; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX908-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX908-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX908-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX908-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX908-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX908-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX908-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX908-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX908-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_flat_agent(
-; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX90A-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX90A-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX90A-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX90A-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX90A-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX90A-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX90A-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_flat_agent(
-; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX940-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX940-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX940-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX940-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX940-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX940-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX940-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX940-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX940-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_flat_agent(
-; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX11-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX11-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX11-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX11-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX11-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX11-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX11-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX11-NEXT:    ret bfloat [[TMP7]]
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_flat_agent(
+; ALL-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; ALL-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; ALL-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; ALL-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; ALL-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; ALL-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; ALL-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
+; ALL-NEXT:    ret bfloat [[TMP7]]
 ;
   %res = atomicrmw fadd ptr %ptr, bfloat %value syncscope("agent") monotonic
   ret bfloat %res
 }
 
 define bfloat @test_atomicrmw_fadd_bf16_flat_agent_align4(ptr %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_flat_agent_align4(
-; CI-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CI-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; CI-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CI-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; CI-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; CI-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_flat_agent_align4(
-; GFX9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX9-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX9-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX9-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX9-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_flat_agent_align4(
-; GFX908-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX908-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX908-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX908-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_flat_agent_align4(
-; GFX90A-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX90A-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_flat_agent_align4(
-; GFX940-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX940-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX940-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX940-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX940-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_flat_agent_align4(
-; GFX11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX11-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX11-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX11-NEXT:    ret bfloat [[TMP5]]
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_flat_agent_align4(
+; ALL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
+; ALL-NEXT:    ret bfloat [[TMP5]]
 ;
   %res = atomicrmw fadd ptr %ptr, bfloat %value syncscope("agent") monotonic, align 4
   ret bfloat %res
 }
 
 define bfloat @test_atomicrmw_fadd_bf16_flat_system(ptr %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_flat_system(
-; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CI-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CI-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CI-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; CI-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CI-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CI-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; CI-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CI-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CI-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; CI-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; CI-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_flat_system(
-; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX9-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX9-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX9-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX9-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX9-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX9-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX9-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX9-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX9-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX9-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_flat_system(
-; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX908-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX908-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX908-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX908-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX908-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX908-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX908-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX908-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX908-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX908-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_flat_system(
-; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX90A-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX90A-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX90A-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX90A-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX90A-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX90A-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX90A-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_flat_system(
-; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX940-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX940-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX940-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX940-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX940-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX940-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX940-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX940-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX940-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX940-NEXT:    ret bfloat [[TMP7]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_flat_system(
-; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX11-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX11-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX11-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX11-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX11-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX11-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX11-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX11-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX11-NEXT:    ret bfloat [[TMP7]]
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_flat_system(
+; ALL-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; ALL-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; ALL-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; ALL-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; ALL-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; ALL-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; ALL-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; ALL-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
+; ALL-NEXT:    ret bfloat [[TMP7]]
 ;
   %res = atomicrmw fadd ptr %ptr, bfloat %value monotonic
   ret bfloat %res
 }
 
 define bfloat @test_atomicrmw_fadd_bf16_flat_system_align4(ptr %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_flat_system_align4(
-; CI-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
-; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CI:       atomicrmw.start:
-; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CI-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; CI-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CI-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; CI-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI:       atomicrmw.end:
-; CI-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; CI-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; CI-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_flat_system_align4(
-; GFX9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
-; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX9:       atomicrmw.start:
-; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX9-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX9-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX9-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX9-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX9-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX9-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_flat_system_align4(
-; GFX908-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
-; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX908:       atomicrmw.start:
-; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX908-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX908-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX908-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX908-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_flat_system_align4(
-; GFX90A-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
-; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX90A:       atomicrmw.start:
-; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX90A-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_flat_system_align4(
-; GFX940-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
-; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX940:       atomicrmw.start:
-; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX940-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX940-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX940-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX940-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX940-NEXT:    ret bfloat [[TMP5]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_flat_system_align4(
-; GFX11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
-; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; GFX11:       atomicrmw.start:
-; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX11-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX11-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX11-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX11-NEXT:    ret bfloat [[TMP5]]
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_flat_system_align4(
+; ALL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
+; ALL-NEXT:    ret bfloat [[TMP5]]
 ;
   %res = atomicrmw fadd ptr %ptr, bfloat %value monotonic, align 4
   ret bfloat %res


        


More information about the llvm-commits mailing list