[llvm] [AMDGPU] Fix SDWA commuting (PR #106920)
Yaxun Liu via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 4 10:57:39 PDT 2024
https://github.com/yxsamliu updated https://github.com/llvm/llvm-project/pull/106920
>From ccc3ecad74173d6599a9d89abeab5db24fb384ce Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Sun, 1 Sep 2024 10:12:19 -0400
Subject: [PATCH] [AMDGPU] Fix SDWA commuting
SDWA insts miss reverse opcode, which causes them to be treated as commutable
with default reverse opcode i.e. their own opcode. As a result, SWDA F16
sub A, B and Sub B, A are merged by machine CSE. The correct behavior is
to merged sub A, B and subrev B, A instead of sub B, A. This issues caused
failures in rocFFT tests.
Another issue is that src0_sel and src1_sel are not swapped when SDWA insts
are commuted.
Verified that this fixes rocFFT tests failure.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +
llvm/lib/Target/AMDGPU/VOP2Instructions.td | 13 ++--
llvm/test/CodeGen/AMDGPU/commute-op-sel.mir | 17 +++++
llvm/test/CodeGen/AMDGPU/sdwa-commute.ll | 46 ++++++++++++
llvm/test/CodeGen/AMDGPU/sdwa-cse.mir | 78 +++++++++++++++++++++
5 files changed, 152 insertions(+), 5 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-cse.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index b379447dd8d4c8..0d153df5c3977c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2788,6 +2788,9 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
Src1, AMDGPU::OpName::src1_modifiers);
+ swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
+ AMDGPU::OpName::src1_sel);
+
CommutedMI->setDesc(get(CommutedOpcode));
}
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index cdc32149249610..639f9189cbe7e3 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -167,9 +167,11 @@ multiclass VOP2Inst_e64<string opName,
}
multiclass VOP2Inst_sdwa<string opName,
- VOPProfile P> {
+ VOPProfile P,
+ string revOp = opName> {
if P.HasExtSDWA then
- def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>;
}
multiclass VOP2Inst<string opName,
@@ -178,7 +180,7 @@ multiclass VOP2Inst<string opName,
string revOp = opName> :
VOP2Inst_e32<opName, P, node, revOp>,
VOP2Inst_e64<opName, P, node, revOp>,
- VOP2Inst_sdwa<opName, P> {
+ VOP2Inst_sdwa<opName, P, revOp> {
if P.HasExtDPP then
def _dpp : VOP2_DPP_Pseudo <opName, P>;
}
@@ -225,7 +227,7 @@ multiclass VOP2Inst_VOPD<string opName,
string revOp = opName> :
VOP2Inst_e32_VOPD<opName, P, VOPDOp, VOPDName, node, revOp>,
VOP2Inst_e64<opName, P, node, revOp>,
- VOP2Inst_sdwa<opName, P> {
+ VOP2Inst_sdwa<opName, P, revOp> {
if P.HasExtDPP then
def _dpp : VOP2_DPP_Pseudo <opName, P>;
}
@@ -243,7 +245,8 @@ multiclass VOP2bInst <string opName,
}
if P.HasExtSDWA then
- def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)> {
let AsmMatchConverter = "cvtSdwaVOP2b";
}
if P.HasExtDPP then
diff --git a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
new file mode 100644
index 00000000000000..b9397f9d5d4ddc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
@@ -0,0 +1,17 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=machine-cse -verify-machineinstrs %s -o - 2>&1 | FileCheck --check-prefix=GCN %s
+
+# GCN-LABEL: name: test_machine_cse_op_sel
+# GCN: %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
+# GCN: %3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec
+# GCN: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
+---
+name: test_machine_cse_op_sel
+body: |
+ bb.0:
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
+ %3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec
+ DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
new file mode 100644
index 00000000000000..94bc6d46b2395b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s
+
+%ret_struct = type { half, half }
+
+define void @extracted_values(ptr %ret_struct, ptr addrspace(3) %arg0, ptr addrspace(3) %arg1, ptr addrspace(3) %arg2, ptr addrspace(3) %arg3) {
+; CHECK-LABEL: extracted_values:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b32 v3, v3
+; CHECK-NEXT: ds_read_b32 v4, v4
+; CHECK-NEXT: ds_read_b32 v2, v2
+; CHECK-NEXT: ds_read_b32 v5, v5
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: v_sub_f16_sdwa v6, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; CHECK-NEXT: v_sub_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_sub_f16_sdwa v7, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; CHECK-NEXT: v_sub_f16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; CHECK-NEXT: v_add_f16_e32 v4, v6, v7
+; CHECK-NEXT: v_add_f16_e32 v2, v3, v2
+; CHECK-NEXT: flat_store_short v[0:1], v4
+; CHECK-NEXT: flat_store_short v[0:1], v2 offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %tmp0 = load <2 x half>, ptr addrspace(3) %arg1, align 4
+ %tmp1 = extractelement <2 x half> %tmp0, i64 1
+ %tmp2 = load <2 x half>, ptr addrspace(3) %arg2, align 4
+ %tmp3 = extractelement <2 x half> %tmp2, i64 1
+ %tmp4 = fsub contract half %tmp1, %tmp3
+ %tmp5 = load <2 x half>, ptr addrspace(3) %arg0, align 4
+ %tmp6 = extractelement <2 x half> %tmp5, i64 1
+ %tmp7 = load <2 x half>, ptr addrspace(3) %arg3, align 4
+ %tmp8 = extractelement <2 x half> %tmp7, i64 1
+ %tmp9 = fsub contract half %tmp6, %tmp8
+ %tmp10 = fadd contract half %tmp4, %tmp9
+ %tmp11 = fsub contract half %tmp3, %tmp1
+ %tmp12 = fsub contract half %tmp8, %tmp6
+ %tmp13 = fadd contract half %tmp11, %tmp12
+ %field_ptr = getelementptr %ret_struct, ptr %ret_struct, i32 0, i32 0
+ store half %tmp10, ptr %field_ptr, align 2
+ %field_ptr1 = getelementptr %ret_struct, ptr %ret_struct, i32 0, i32 1
+ store half %tmp13, ptr %field_ptr1, align 2
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-cse.mir b/llvm/test/CodeGen/AMDGPU/sdwa-cse.mir
new file mode 100644
index 00000000000000..147a6f30253ee9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-cse.mir
@@ -0,0 +1,78 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=machine-cse -verify-machineinstrs %s -o - 2>&1 | FileCheck --check-prefix=GCN %s
+
+# GCN-LABEL: name: test_machine_cse_subtraction_sdwa_f16_no_merge
+# GCN: %2:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, 0, 6, 0, 5, 5, implicit $mode, implicit $exec
+# GCN: %3:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %1.sub0, 0, %0.sub0, 0, 0, 6, 0, 5, 5, implicit $mode, implicit $exec
+# GCN: %5:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %2, %4, implicit $mode, implicit $exec
+# GCN: %6:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %3, %4, implicit $mode, implicit $exec
+# GCN: DS_WRITE2_B32_gfx9 undef %7:vgpr_32, %5, %6, 0, 1, 0, implicit $exec
+---
+name: test_machine_cse_subtraction_sdwa_f16_no_merge
+body: |
+ bb.0:
+ %0:vreg_64 = IMPLICIT_DEF
+ %1:vreg_64 = IMPLICIT_DEF
+ %2:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, 0, 6, 0, 5, 5, implicit $mode, implicit $exec
+ %3:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %1.sub0, 0, %0.sub0, 0, 0, 6, 0, 5, 5, implicit $mode, implicit $exec
+ %4:vgpr_32 = IMPLICIT_DEF
+ %5:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %2, %4, implicit $mode, implicit $exec
+ %6:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %3, %4, implicit $mode, implicit $exec
+ DS_WRITE2_B32_gfx9 undef %7:vgpr_32, %5, %6, 0, 1, 0, implicit $exec
+...
+
+# GCN-LABEL: name: test_machine_cse_subtraction_sdwa_f16_merge_same_src_sel
+# GCN: %2:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, 0, 6, 0, 5, 5, implicit $mode, implicit $exec
+# GCN: %5:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %2, %4, implicit $mode, implicit $exec
+# GCN: DS_WRITE2_B32_gfx9 undef %7:vgpr_32, %5, %5, 0, 1, 0, implicit $exec
+---
+name: test_machine_cse_subtraction_sdwa_f16_merge_same_src_sel
+body: |
+ bb.0:
+ %0:vreg_64 = IMPLICIT_DEF
+ %1:vreg_64 = IMPLICIT_DEF
+ %2:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, 0, 6, 0, 5, 5, implicit $mode, implicit $exec
+ %3:vgpr_32 = contract nofpexcept V_SUBREV_F16_sdwa 0, %1.sub0, 0, %0.sub0, 0, 0, 6, 0, 5, 5, implicit $mode, implicit $exec
+ %4:vgpr_32 = IMPLICIT_DEF
+ %5:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %2, %4, implicit $mode, implicit $exec
+ %6:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %3, %4, implicit $mode, implicit $exec
+ DS_WRITE2_B32_gfx9 undef %7:vgpr_32, %5, %6, 0, 1, 0, implicit $exec
+...
+
+# GCN-LABEL: name: test_machine_cse_subtraction_sdwa_f16_merge_diff_src_sel
+# GCN: %2:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, 0, 6, 0, 6, 5, implicit $mode, implicit $exec
+# GCN: %5:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %2, %4, implicit $mode, implicit $exec
+# GCN: DS_WRITE2_B32_gfx9 undef %7:vgpr_32, %5, %5, 0, 1, 0, implicit $exec
+---
+name: test_machine_cse_subtraction_sdwa_f16_merge_diff_src_sel
+body: |
+ bb.0:
+ %0:vreg_64 = IMPLICIT_DEF
+ %1:vreg_64 = IMPLICIT_DEF
+ %2:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, 0, 6, 0, 6, 5, implicit $mode, implicit $exec
+ %3:vgpr_32 = contract nofpexcept V_SUBREV_F16_sdwa 0, %1.sub0, 0, %0.sub0, 0, 0, 6, 0, 5, 6, implicit $mode, implicit $exec
+ %4:vgpr_32 = IMPLICIT_DEF
+ %5:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %2, %4, implicit $mode, implicit $exec
+ %6:vgpr_32 = contract nofpexcept V_ADD_F16_e32 %3, %4, implicit $mode, implicit $exec
+ DS_WRITE2_B32_gfx9 undef %7:vgpr_32, %5, %6, 0, 1, 0, implicit $exec
+...
+
+# GCN-LABEL: name: test_machine_cse_subtraction_sdwa_f16_op_sel
+# GCN: %2:vgpr_32 = V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, op_sel=0, src_sel1=5, src_sel2=5, implicit $mode, implicit $exec
+# GCN: %3:vgpr_32 = V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, op_sel=1, src_sel1=5, src_sel2=5, implicit $mode, implicit $exec
+# GCN: %5:vgpr_32 = V_ADD_F16_e32 %2, %4, implicit $mode, implicit $exec
+# GCN: %6:vgpr_32 = V_ADD_F16_e32 %3, %4, implicit $mode, implicit $exec
+# GCN: DS_WRITE2_B32_gfx9 undef %7:vgpr_32, %5, %6, 0, 1, 0, implicit $exec
+---
+name: test_machine_cse_subtraction_sdwa_f16_op_sel
+body: |
+ bb.0:
+ %0:vreg_64 = IMPLICIT_DEF
+ %1:vreg_64 = IMPLICIT_DEF
+ %2:vgpr_32 = V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, op_sel=0, src_sel1=5, src_sel2=5, implicit $mode, implicit $exec
+ %3:vgpr_32 = V_SUB_F16_sdwa 0, %0.sub0, 0, %1.sub0, 0, op_sel=1, src_sel1=5, src_sel2=5, implicit $mode, implicit $exec
+ %4:vgpr_32 = IMPLICIT_DEF
+ %5:vgpr_32 = V_ADD_F16_e32 %2, %4, implicit $mode, implicit $exec
+ %6:vgpr_32 = V_ADD_F16_e32 %3, %4, implicit $mode, implicit $exec
+ DS_WRITE2_B32_gfx9 undef %7:vgpr_32, %5, %6, 0, 1, 0, implicit $exec
+...
+
More information about the llvm-commits
mailing list