[llvm] [AMDGPU] CodeGen for GFX12 64-bit scalar add/sub (PR #75070)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 11 08:55:19 PST 2023
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
Changes:
- [AMDGPU] Generate add/sub checks and add GFX12 coverage
- [AMDGPU] CodeGen for GFX12 64-bit scalar add/sub
---
Patch is 82.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75070.diff
6 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+17-7)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+2)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+39-28)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll (+107)
- (modified) llvm/test/CodeGen/AMDGPU/add.ll (+1321-74)
- (modified) llvm/test/CodeGen/AMDGPU/sub.ll (+229-3)
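
At a glance, the codegen effect (distilled from the new addsubu64.ll test in the diff below): a 64-bit scalar add that previously expanded to an s_add_u32 / s_addc_u32 pair now selects a single s_add_nc_u64 on GFX12.

```llvm
; From the addsubu64.ll test added below. On GFX11 this compiles to
; s_add_u32 + s_addc_u32; on GFX12 it becomes a single
;   s_add_nc_u64 s[0:1], s[6:7], s[0:1]
define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
entry:
  %add = add i64 %a, %b
  store i64 %add, i64 addrspace(1)* %out
  ret void
}
```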
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ccdbd3216e2604..2cf804e3348e80 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -681,13 +681,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
// Full set of gfx9 features.
- getActionDefinitionsBuilder({G_ADD, G_SUB})
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
- .scalarize(0)
- .minScalar(0, S16)
- .widenScalarToNextMultipleOf(0, 32)
- .maxScalar(0, S32);
+ if (ST.hasScalarAddSub64()) {
+ getActionDefinitionsBuilder({G_ADD, G_SUB})
+ .legalFor({S64, S32, S16, V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .maxScalar(0, S32);
+ } else {
+ getActionDefinitionsBuilder({G_ADD, G_SUB})
+ .legalFor({S32, S16, V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .maxScalar(0, S32);
+ }
getActionDefinitionsBuilder(G_MUL)
.legalFor({S32, S16, V2S16})
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 94b9e49b765a6f..2eed024af60ae6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -677,6 +677,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return AddNoCarryInsts;
}
+ bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
+
bool hasUnpackedD16VMem() const {
return HasUnpackedD16VMem;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..84da45a8ed2e37 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4555,40 +4555,51 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
}
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ // For targets older than GFX12, we emit a sequence of 32-bit operations.
+ // For GFX12, we emit s_add_u64 and s_sub_u64.
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
-
MachineOperand &Dest = MI.getOperand(0);
MachineOperand &Src0 = MI.getOperand(1);
MachineOperand &Src1 = MI.getOperand(2);
-
- Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
- MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
- MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
- MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
- MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
-
- unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
- unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
- BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
- BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
- .addReg(DestSub0)
- .addImm(AMDGPU::sub0)
- .addReg(DestSub1)
- .addImm(AMDGPU::sub1);
+ if (Subtarget->hasScalarAddSub64()) {
+ unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
+ BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
+ .addReg(Src0.getReg())
+ .addReg(Src1.getReg());
+ } else {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+ unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+ unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+ BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
+ .add(Src0Sub0)
+ .add(Src1Sub0);
+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
+ .add(Src0Sub1)
+ .add(Src1Sub1);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ }
MI.eraseFromParent();
return BB;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
new file mode 100644
index 00000000000000..b850c37c4a2810
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+; GFX11-LABEL: s_add_u64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s0, s6, s0
+; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_add_u64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+entry:
+ %add = add i64 %a, %b
+ store i64 %add, i64 addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @v_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+; GCN-LABEL: v_add_u64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
+; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+entry:
+ %add = add i64 %a, %b
+ store i64 %add, i64 addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @s_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+; GFX11-LABEL: s_sub_u64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s0, s6, s0
+; GFX11-NEXT: s_subb_u32 s1, s7, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_sub_u64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+entry:
+ %sub = sub i64 %a, %b
+ store i64 %sub, i64 addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @v_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+; GCN-LABEL: v_sub_u64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v4
+; GCN-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+entry:
+ %sub = sub i64 %a, %b
+ store i64 %sub, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index 4b6891e7aa20d5..34a676bffcfe31 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -1,14 +1,89 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,FUNC %s
-
-; FUNC-LABEL: {{^}}s_add_i32:
-; GCN: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}}
-; GCN: v_mov_b32_e32 v[[V_REG:[0-9]+]], s[[REG]]
-; GCN: buffer_store_{{dword|b32}} v[[V_REG]],
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+
define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; GFX6-LABEL: s_add_i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: s_add_i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_add_i32 s0, s2, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_add_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s2, s2, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_add_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_add_i32 s2, s2, s3
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_add_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_i32 s2, s2, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_add_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_add_co_i32 s2, s2, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
%a = load i32, ptr addrspace(1) %in
%b = load i32, ptr addrspace(1) %b_ptr
@@ -17,10 +92,96 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
ret void
}
-; FUNC-LABEL: {{^}}s_add_v2i32:
-; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
-; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GFX6-LABEL: s_add_v2i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s5, s7
+; GFX6-NEXT: s_add_i32 s4, s4, s6
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: s_add_v2i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_add_i32 s0, s5, s7
+; GFX8-NEXT: s_add_i32 s1, s4, s6
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_add_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s2, s5, s7
+; GFX9-NEXT: s_add_i32 s3, s4, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_add_v2i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_add_i32 s2, s4, s6
+; GFX10-NEXT: s_add_i32 s3, s5, s7
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_add_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_i32 s2, s4, s6
+; GFX11-NEXT: s_add_i32 s3, s5, s7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_add_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_add_co_i32 s2, s4, s6
+; GFX12-NEXT: s_add_co_i32 s3, s5, s7
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
%a = load <2 x i32>, ptr addrspace(1) %in
%b = load <2 x i32>, ptr addrspace(1) %b_ptr
@@ -29,12 +190,118 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}s_add_v4i32:
-; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
-; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
-; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
-; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GFX6-LABEL: s_add_v4i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX6-NEXT: s_mov_b32 s11, 0xf000
+; GFX6-NEXT: s_mov_b32 s10, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s3, s3, s7
+; GFX6-NEXT: s_add_i32 s2, s2, s6
+; GFX6-NEXT: s_add_i32 s1, s1, s5
+; GFX6-NEXT: s_add_i32 s0, s0, s4
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: v_mov_b32_e32 v2, s2
+; GFX6-NEXT: v_mov_b32_e32 v3, s3
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: s_add_v4i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GF...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/75070
More information about the llvm-commits mailing list