[llvm] [AMDGPU] gfx1250 V_{MIN|MAX}_{I|U}64 opcodes (PR #151256)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 29 16:51:11 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Stanislav Mekhanoshin (rampitec)
<details>
<summary>Changes</summary>
---
Patch is 141.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151256.diff
13 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+24-7)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+12-4)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+3)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+4)
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+11)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll (+192)
- (modified) llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll (+136-168)
- (modified) llvm/test/CodeGen/AMDGPU/max.ll (+320)
- (modified) llvm/test/CodeGen/AMDGPU/min.ll (+531)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+48)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+48)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s (+44)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+48)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index f16351fac9e2e..44cad8a862141 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1342,13 +1342,30 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
if (ST.hasVOP3PInsts()) {
- getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElements(0, S16, 2)
- .minScalar(0, S16)
- .widenScalarToNextPow2(0)
- .scalarize(0)
- .lower();
+ getActionDefinitionsBuilder(G_ABS)
+ .legalFor({S32, S16, V2S16})
+ .clampMaxNumElements(0, S16, 2)
+ .minScalar(0, S16)
+ .widenScalarToNextPow2(0)
+ .scalarize(0)
+ .lower();
+ if (ST.hasIntMinMax64()) {
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({S32, S16, S64, V2S16})
+ .clampMaxNumElements(0, S16, 2)
+ .minScalar(0, S16)
+ .widenScalarToNextPow2(0)
+ .scalarize(0)
+ .lower();
+ } else {
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({S32, S16, V2S16})
+ .clampMaxNumElements(0, S16, 2)
+ .minScalar(0, S16)
+ .widenScalarToNextPow2(0)
+ .scalarize(0)
+ .lower();
+ }
} else {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32, S16})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 306443d25a74f..6c40eb5aa7e6c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4009,10 +4009,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_SADDE:
case AMDGPU::G_USUBE:
case AMDGPU::G_SSUBE:
- case AMDGPU::G_SMIN:
- case AMDGPU::G_SMAX:
- case AMDGPU::G_UMIN:
- case AMDGPU::G_UMAX:
case AMDGPU::G_ABS:
case AMDGPU::G_SHUFFLE_VECTOR:
case AMDGPU::G_SBFX:
@@ -4022,6 +4018,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
+ case AMDGPU::G_SMIN:
+ case AMDGPU::G_SMAX:
+ case AMDGPU::G_UMIN:
+ case AMDGPU::G_UMAX:
+ if (isSALUMapping(MI)) {
+ // There are no scalar 64-bit min and max, use vector instruction instead.
+ if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 64 &&
+ Subtarget.hasIntMinMax64())
+ return getDefaultMappingVOP(MI);
+ return getDefaultMappingSOP(MI);
+ }
+ return getDefaultMappingVOP(MI);
case AMDGPU::G_FADD:
case AMDGPU::G_FSUB:
case AMDGPU::G_FMUL:
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index b824c66931288..fba1c5a0f05e1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1532,6 +1532,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// instructions.
bool hasMadU64U32NoCarry() const { return GFX1250Insts; }
+ // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
+ bool hasIntMinMax64() const { return GFX1250Insts; }
+
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fbaf9bc452790..579ca96a76200 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -909,6 +909,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
Custom);
}
+ if (Subtarget->hasIntMinMax64())
+ setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i64,
+ Legal);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 2fef4f029951b..5586dd872fef5 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -202,6 +202,13 @@ defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, f
} // End SchedRW = [WriteDoubleAdd]
} // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1
+let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in {
+defm V_MAX_I64 : VOP3Inst <"v_max_i64", VOP_I64_I64_I64_DPP, smax>;
+defm V_MAX_U64 : VOP3Inst <"v_max_u64", VOP_I64_I64_I64_DPP, umax>;
+defm V_MIN_I64 : VOP3Inst <"v_min_i64", VOP_I64_I64_I64_DPP, smin>;
+defm V_MIN_U64 : VOP3Inst <"v_min_u64", VOP_I64_I64_I64_DPP, umin>;
+} // End SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd]
+
} // End isReMaterializable = 1
let Uses = [MODE, VCC, EXEC] in {
@@ -1810,6 +1817,10 @@ defm V_BITOP3_B32 : VOP3_Real_BITOP3_gfx1250<0x234>;
defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250<0x235>;
defm V_MAD_NC_U64_U32 : VOP3Only_Realtriple_gfx1250<0x2fa>;
defm V_MAD_NC_I64_I32 : VOP3Only_Realtriple_gfx1250<0x2fb>;
+defm V_MIN_U64 : VOP3Only_Realtriple_gfx1250<0x318>;
+defm V_MAX_U64 : VOP3Only_Realtriple_gfx1250<0x319>;
+defm V_MIN_I64 : VOP3Only_Realtriple_gfx1250<0x31a>;
+defm V_MAX_I64 : VOP3Only_Realtriple_gfx1250<0x31b>;
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">;
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll
new file mode 100644
index 0000000000000..43c8f46f98cfc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll
@@ -0,0 +1,192 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck %s
+
+declare i64 @llvm.umin.i64(i64, i64)
+declare i64 @llvm.umax.i64(i64, i64)
+declare i64 @llvm.smin.i64(i64, i64)
+declare i64 @llvm.smax.i64(i64, i64)
+declare i64 @llvm.abs.i64(i64, i1)
+
+declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>)
+declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>)
+declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>)
+declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>)
+
+define i64 @test_umin_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: test_umin_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: v_min_u64 v[0:1], v[0:1], v[2:3]
+; CHECK-NEXT: s_set_pc_i64 s[30:31]
+ %r = call i64 @llvm.umin.i64(i64 %a, i64 %b)
+ ret i64 %r
+}
+
+define i64 @test_umax_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: test_umax_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: v_max_u64 v[0:1], v[0:1], v[2:3]
+; CHECK-NEXT: s_set_pc_i64 s[30:31]
+ %r = call i64 @llvm.umax.i64(i64 %a, i64 %b)
+ ret i64 %r
+}
+
+define i64 @test_smin_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: test_smin_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: v_min_i64 v[0:1], v[0:1], v[2:3]
+; CHECK-NEXT: s_set_pc_i64 s[30:31]
+ %r = call i64 @llvm.smin.i64(i64 %a, i64 %b)
+ ret i64 %r
+}
+
+define i64 @test_smax_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: test_smax_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: v_max_i64 v[0:1], v[0:1], v[2:3]
+; CHECK-NEXT: s_set_pc_i64 s[30:31]
+ %r = call i64 @llvm.smax.i64(i64 %a, i64 %b)
+ ret i64 %r
+}
+
+define <4 x i64> @test_umin_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_umin_v4i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: v_min_u64 v[0:1], v[0:1], v[8:9]
+; CHECK-NEXT: v_min_u64 v[2:3], v[2:3], v[10:11]
+; CHECK-NEXT: v_min_u64 v[4:5], v[4:5], v[12:13]
+; CHECK-NEXT: v_min_u64 v[6:7], v[6:7], v[14:15]
+; CHECK-NEXT: s_set_pc_i64 s[30:31]
+ %r = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %a, <4 x i64> %b)
+ ret <4 x i64> %r
+}
+
+define <4 x i64> @test_umax_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_umax_v4i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: v_max_u64 v[0:1], v[0:1], v[8:9]
+; CHECK-NEXT: v_max_u64 v[2:3], v[2:3], v[10:11]
+; CHECK-NEXT: v_max_u64 v[4:5], v[4:5], v[12:13]
+; CHECK-NEXT: v_max_u64 v[6:7], v[6:7], v[14:15]
+; CHECK-NEXT: s_set_pc_i64 s[30:31]
+ %r = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %a, <4 x i64> %b)
+ ret <4 x i64> %r
+}
+
+define <4 x i64> @test_smin_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_smin_v4i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: v_min_i64 v[0:1], v[0:1], v[8:9]
+; CHECK-NEXT: v_min_i64 v[2:3], v[2:3], v[10:11]
+; CHECK-NEXT: v_min_i64 v[4:5], v[4:5], v[12:13]
+; CHECK-NEXT: v_min_i64 v[6:7], v[6:7], v[14:15]
+; CHECK-NEXT: s_set_pc_i64 s[30:31]
+ %r = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %a, <4 x i64> %b)
+ ret <4 x i64> %r
+}
+
+define <4 x i64> @test_smax_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_smax_v4i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: v_max_i64 v[0:1], v[0:1], v[8:9]
+; CHECK-NEXT: v_max_i64 v[2:3], v[2:3], v[10:11]
+; CHECK-NEXT: v_max_i64 v[4:5], v[4:5], v[12:13]
+; CHECK-NEXT: v_max_i64 v[6:7], v[6:7], v[14:15]
+; CHECK-NEXT: s_set_pc_i64 s[30:31]
+ %r = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %a, <4 x i64> %b)
+ ret <4 x i64> %r
+}
+
+define i64 @test_abs_i64(i64 %a) {
+; CHECK-LABEL: test_abs_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2
+; CHECK-NEXT: s_set_pc_i64 s[30:31]
+ %r = call i64 @llvm.abs.i64(i64 %a, i1 0)
+ ret i64 %r
+}
+
+define amdgpu_ps i64 @test_umin_i64_s(i64 inreg %a, i64 inreg %b) {
+; CHECK-LABEL: test_umin_i64_s:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_min_u64 v[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
+; CHECK-NEXT: ; return to shader part epilog
+ %r = call i64 @llvm.umin.i64(i64 %a, i64 %b)
+ ret i64 %r
+}
+
+define amdgpu_ps i64 @test_umax_i64_s(i64 inreg %a, i64 inreg %b) {
+; CHECK-LABEL: test_umax_i64_s:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_max_u64 v[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
+; CHECK-NEXT: ; return to shader part epilog
+ %r = call i64 @llvm.umax.i64(i64 %a, i64 %b)
+ ret i64 %r
+}
+
+define amdgpu_ps i64 @test_smin_i64_s(i64 inreg %a, i64 inreg %b) {
+; CHECK-LABEL: test_smin_i64_s:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_min_i64 v[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
+; CHECK-NEXT: ; return to shader part epilog
+ %r = call i64 @llvm.smin.i64(i64 %a, i64 %b)
+ ret i64 %r
+}
+
+define amdgpu_ps i64 @test_smax_i64_s(i64 inreg %a, i64 inreg %b) {
+; CHECK-LABEL: test_smax_i64_s:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_max_i64 v[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
+; CHECK-NEXT: ; return to shader part epilog
+ %r = call i64 @llvm.smax.i64(i64 %a, i64 %b)
+ ret i64 %r
+}
+
+define amdgpu_ps i64 @test_abs_i64_s(i64 inreg %a) {
+; CHECK-LABEL: test_abs_i64_s:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_ashr_i32 s2, s1, 31
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT: s_mov_b32 s3, s2
+; CHECK-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: ; return to shader part epilog
+ %r = call i64 @llvm.abs.i64(i64 %a, i1 0)
+ ret i64 %r
+}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index bd4ee037a7c6c..2ff66c9b9017a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -281,7 +281,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
-; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
+; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -322,7 +322,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
-; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off
+; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -370,7 +370,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
-; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
+; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -414,7 +414,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
-; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off
+; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -458,7 +458,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off
+; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn:
@@ -491,7 +491,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc_lo
-; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off
+; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -531,7 +531,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off
+; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128:
@@ -567,7 +567,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc_lo
-; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off
+; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -673,7 +673,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
+; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB18_5
@@ -715,7 +715,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
+; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB18_5
@@ -764,7 +764,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
+; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB19_5
@@ -809,7 +809,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
+; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB19_5
@@ -855,7 +855,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/151256
More information about the llvm-commits
mailing list