[llvm] [AMDGPU] Added isCommutable attribute to V_ADD_NC_U16 (PR #111789)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 21 21:50:40 PDT 2024
https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/111789
>From 1caeb5d78537379b9166bf7635db2b3864a73cb7 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Mon, 7 Oct 2024 12:03:22 +0530
Subject: [PATCH 1/2] added swap for imm values and global values
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 47 ++++++++++++++++++++++++--
1 file changed, 45 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0d153df5c3977c..38970a7a0ef86b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2742,6 +2742,50 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
return &MI;
}
+static MachineInstr *swapNonRegOperands(MachineInstr &MI,
+ MachineOperand &NonRegOp1,
+ MachineOperand &NonRegOp2) {
+ if (NonRegOp1.isImm() && NonRegOp2.isImm()){
+ auto TargetFlags = NonRegOp1.getTargetFlags();
+ auto NonRegVal = NonRegOp1.getImm();
+
+ NonRegOp1.setImm(NonRegOp2.getImm());
+ NonRegOp2.setImm(NonRegVal);
+ NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
+ NonRegOp2.setTargetFlags(TargetFlags);
+ }
+ // --> Still working on the FrameInfo case :)
+ // else if (NonRegOp1.isFI() && NonRegOp2.isFI()){
+ // auto TargetFlags = NonRegOp 1.getTargetFlags();
+ // auto FrameIndex = NonRegOp1.getIndex();
+ // NonRegOp1.ChangeToFrameIndex(NonRegOp2.getIndex());
+ // NonRegOp2.ChangeToFrameIndex(FrameIndex);
+ // NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
+ // NonRegOp2.setTargetFlags(TargetFlags);
+ // }
+ else if (NonRegOp1.isGlobal() && NonRegOp2.isImm()){
+ auto TargetFlags = NonRegOp1.getTargetFlags();
+ auto GlobalVal = NonRegOp1.getGlobal();
+ auto GlobalOffset = NonRegOp1.getOffset();
+ NonRegOp1.ChangeToImmediate(NonRegOp2.getImm());
+ NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
+ NonRegOp2.ChangeToGA(GlobalVal, GlobalOffset, TargetFlags);
+ NonRegOp2.setTargetFlags(TargetFlags);
+ }
+ else if (NonRegOp1.isImm() && NonRegOp2.isGlobal()){
+ auto TargetFlags = NonRegOp2.getTargetFlags();
+ auto GlobalVal = NonRegOp2.getGlobal();
+ auto GlobalOffset = NonRegOp2.getOffset();
+ NonRegOp2.ChangeToImmediate(NonRegOp1.getImm());
+ NonRegOp2.setTargetFlags(NonRegOp1.getTargetFlags());
+ NonRegOp1.ChangeToGA(GlobalVal, GlobalOffset, TargetFlags);
+ NonRegOp1.setTargetFlags(TargetFlags);
+ }
+ else
+ return nullptr;
+ return &MI;
+}
+
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned Src0Idx,
unsigned Src1Idx) const {
@@ -2780,8 +2824,7 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
if (isOperandLegal(MI, Src1Idx, &Src0))
CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
} else {
- // FIXME: Found two non registers to commute. This does happen.
- return nullptr;
+ CommutedMI = swapNonRegOperands(MI, Src1, Src0);
}
if (CommutedMI) {
>From 7f0f063fb32762a635306733df285adb670a8361 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Sun, 20 Oct 2024 13:40:33 +0530
Subject: [PATCH 2/2] Marked V_MAD_*/V_FMA_* as commutable.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 29 +----------------
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 24 ++++++++++----
llvm/test/CodeGen/AMDGPU/cmp_shrink.mir | 2 +-
llvm/test/CodeGen/AMDGPU/commute-op-sel.mir | 10 +++---
llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 8 ++---
.../eliminate-frame-index-v-add-co-u32.mir | 8 ++---
.../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll | 32 +++++++++----------
llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 2 +-
8 files changed, 50 insertions(+), 65 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 38970a7a0ef86b..9fb68b602544c3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2754,33 +2754,6 @@ static MachineInstr *swapNonRegOperands(MachineInstr &MI,
NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
NonRegOp2.setTargetFlags(TargetFlags);
}
- // --> Still working on the FrameInfo case :)
- // else if (NonRegOp1.isFI() && NonRegOp2.isFI()){
- // auto TargetFlags = NonRegOp 1.getTargetFlags();
- // auto FrameIndex = NonRegOp1.getIndex();
- // NonRegOp1.ChangeToFrameIndex(NonRegOp2.getIndex());
- // NonRegOp2.ChangeToFrameIndex(FrameIndex);
- // NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
- // NonRegOp2.setTargetFlags(TargetFlags);
- // }
- else if (NonRegOp1.isGlobal() && NonRegOp2.isImm()){
- auto TargetFlags = NonRegOp1.getTargetFlags();
- auto GlobalVal = NonRegOp1.getGlobal();
- auto GlobalOffset = NonRegOp1.getOffset();
- NonRegOp1.ChangeToImmediate(NonRegOp2.getImm());
- NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
- NonRegOp2.ChangeToGA(GlobalVal, GlobalOffset, TargetFlags);
- NonRegOp2.setTargetFlags(TargetFlags);
- }
- else if (NonRegOp1.isImm() && NonRegOp2.isGlobal()){
- auto TargetFlags = NonRegOp2.getTargetFlags();
- auto GlobalVal = NonRegOp2.getGlobal();
- auto GlobalOffset = NonRegOp2.getOffset();
- NonRegOp2.ChangeToImmediate(NonRegOp1.getImm());
- NonRegOp2.setTargetFlags(NonRegOp1.getTargetFlags());
- NonRegOp1.ChangeToGA(GlobalVal, GlobalOffset, TargetFlags);
- NonRegOp1.setTargetFlags(TargetFlags);
- }
else
return nullptr;
return &MI;
@@ -2824,7 +2797,7 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
if (isOperandLegal(MI, Src1Idx, &Src0))
CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
} else {
- CommutedMI = swapNonRegOperands(MI, Src1, Src0);
+ CommutedMI = swapNonRegOperands(MI, Src0, Src1);
}
if (CommutedMI) {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 78ca7a2f258cb3..1774522bcc8602 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -332,7 +332,9 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
let FPDPRounding = 1 in {
let Predicates = [Has16BitInsts, isGFX8Only] in {
defm V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
- defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma>;
+ let isCommutable = 1 in {
+ defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma>;
+ } // End isCommutable = 1
} // End Predicates = [Has16BitInsts, isGFX8Only]
let SubtargetPredicate = isGFX9Plus in {
@@ -344,10 +346,14 @@ let FPDPRounding = 1 in {
let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in {
-defm V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
-defm V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
+let isCommutable = 1 in {
+ defm V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
+ defm V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
+} // End isCommutable = 1
let FPDPRounding = 1 in {
- defm V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fmad>;
+ let isCommutable = 1 in{
+ defm V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fmad>;
+ } // End isCommutable = 1
let Uses = [MODE, M0, EXEC] in {
let OtherPredicates = [isNotGFX90APlus] in
// For some reason the intrinsic operands are in a different order
@@ -639,8 +645,10 @@ let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
defm V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
defm V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
-defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
-defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
+let isCommutable = 1 in{
+ defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
+ defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
+} // End isCommutable = 1
defm V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
defm V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
@@ -871,7 +879,9 @@ let SubtargetPredicate = isGFX10Plus in {
def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
}
- defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
+ let isCommutable = 1 in {
+ defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
+ }
defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_e64>;
diff --git a/llvm/test/CodeGen/AMDGPU/cmp_shrink.mir b/llvm/test/CodeGen/AMDGPU/cmp_shrink.mir
index 9b3579b43a38a3..ae3fa153f381ae 100644
--- a/llvm/test/CodeGen/AMDGPU/cmp_shrink.mir
+++ b/llvm/test/CodeGen/AMDGPU/cmp_shrink.mir
@@ -7,6 +7,6 @@ name: not_shrink_icmp
body: |
bb.0:
; GCN-LABEL: name: not_shrink_icmp
- ; GCN: S_CMP_GT_I32 1, 65, implicit-def $scc
+ ; GCN: S_CMP_LT_I32 65, 1, implicit-def $scc
S_CMP_GT_I32 1, 65, implicit-def $scc
...
diff --git a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
index b9397f9d5d4ddc..8394650a92266a 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
+++ b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
@@ -1,13 +1,15 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=machine-cse -verify-machineinstrs %s -o - 2>&1 | FileCheck --check-prefix=GCN %s
-# GCN-LABEL: name: test_machine_cse_op_sel
-# GCN: %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
-# GCN: %3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec
-# GCN: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
---
name: test_machine_cse_op_sel
body: |
bb.0:
+ ; GCN-LABEL: name: test_machine_cse_op_sel
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[DEF]], 0, [[DEF1]], 1, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, [[V_ADD_NC_U16_e64_]], [[V_ADD_NC_U16_e64_]], 0, 1, 0, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index f3f749b5c054b3..1ca62c34ab0a24 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1669,8 +1669,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
-; GFX10-NEXT: v_add_nc_u16 v1, v1, 0x900
-; GFX10-NEXT: v_add_nc_u16 v5, v2, 0x900
+; GFX10-NEXT: v_add_nc_u16 v1, 0x900, v1
+; GFX10-NEXT: v_add_nc_u16 v5, 0x900, v2
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
@@ -1733,10 +1733,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT: v_add_nc_u16 v2, v2, 0x900
+; GFX11-NEXT: v_add_nc_u16 v2, 0x900, v2
; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u16 v1, v1, 0x900
+; GFX11-NEXT: v_add_nc_u16 v1, 0x900, v1
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v2
; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir
index e7e56c1a3fdaa3..c18f8a6b500be7 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir
@@ -920,13 +920,13 @@ body: |
; MUBUFW64-NEXT: {{ $}}
; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
- ; MUBUFW64-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
- ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def $vcc, implicit $exec
+ ; MUBUFW64-NEXT: $vgpr1 = V_MOV_B32_e32 12, implicit $exec
+ ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $exec
; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc
;
; FLATSCRW64-LABEL: name: v_add_co_u32_e32__inline_imm__fi_offset0__kernel__live_vcc
- ; FLATSCRW64: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
- ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def $vcc, implicit $exec
+ ; FLATSCRW64: $vgpr1 = V_MOV_B32_e32 12, implicit $exec
+ ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $exec
; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc
renamable $vgpr0 = V_ADD_CO_U32_e32 12, %stack.0, implicit-def $vcc, implicit $exec
SI_RETURN implicit $vgpr0, implicit $vcc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index f86c8294ab3c00..548c2df03d62c5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -399,7 +399,7 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0
-; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f00, v1
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -410,7 +410,7 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0
-; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -464,7 +464,7 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f00, v1
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -475,7 +475,7 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -1350,7 +1350,7 @@ define i1 @isnormal_bf16(bfloat %x) {
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1359,7 +1359,7 @@ define i1 @isnormal_bf16(bfloat %x) {
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1404,7 +1404,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX10CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1413,7 +1413,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX11CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1466,7 +1466,7 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX10CHECK-NEXT: v_cmp_lt_u16_e64 s4, 0x7eff, v1
; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -1477,7 +1477,7 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX11CHECK-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1
; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -1531,7 +1531,7 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0
-; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX10CHECK-NEXT: v_cmp_lt_u16_e64 s4, 0x7eff, v1
; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -1542,7 +1542,7 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0
-; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX11CHECK-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1
; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -2571,7 +2571,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v0
; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s5, 0x7f80, v0
-; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s6, 0x7f, v1
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
@@ -2589,7 +2589,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0
; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s1, 0x7f80, v0
-; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s2, 0x7f, v1
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
@@ -2671,7 +2671,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX10CHECK-NEXT: v_add_nc_u16 v1, v0, -1
-; GFX10CHECK-NEXT: v_add_nc_u16 v2, v0, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v2, 0xff80, v0
; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s5, 0x7fbf, v0
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f, v1
@@ -2687,7 +2687,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX11CHECK-NEXT: v_add_nc_u16 v1, v0, -1
-; GFX11CHECK-NEXT: v_add_nc_u16 v2, v0, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v2, 0xff80, v0
; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s1, 0x7fbf, v0
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v1
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 7a1f05f56a7517..b649bf4015e7c2 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -409,7 +409,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x3e7
+; GFX11-NEXT: v_add_nc_u16 v2, 0x3e7, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
More information about the llvm-commits mailing list