[llvm] 7c58d63 - [AMDGPU] Add commute for some VOP3 inst (#121326)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 21 20:08:29 PST 2025
Author: Shoreshen
Date: 2025-01-22T11:08:26+07:00
New Revision: 7c58d6363a40fc6d1cdf6a147da8f3bb0d4f96ec
URL: https://github.com/llvm/llvm-project/commit/7c58d6363a40fc6d1cdf6a147da8f3bb0d4f96ec
DIFF: https://github.com/llvm/llvm-project/commit/7c58d6363a40fc6d1cdf6a147da8f3bb0d4f96ec.diff
LOG: [AMDGPU] Add commute for some VOP3 inst (#121326)
Add commute support for some VOP3 instructions, allow commuting when both
operands are inline constants, and adjust tests accordingly.
Fixes #111205
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/lib/Target/AMDGPU/VOP3Instructions.td
llvm/test/CodeGen/AMDGPU/carryout-selection.ll
llvm/test/CodeGen/AMDGPU/cmp_shrink.mir
llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
llvm/test/CodeGen/AMDGPU/ctlz.ll
llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 8fc32d9e60bf20..5c20f28b3d9de2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2749,6 +2749,63 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
return &MI;
}
+static MachineInstr *swapImmOperands(MachineInstr &MI,
+ MachineOperand &NonRegOp1,
+ MachineOperand &NonRegOp2) {
+ unsigned TargetFlags = NonRegOp1.getTargetFlags();
+ int64_t NonRegVal = NonRegOp1.getImm();
+
+ NonRegOp1.setImm(NonRegOp2.getImm());
+ NonRegOp2.setImm(NonRegVal);
+ NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
+ NonRegOp2.setTargetFlags(TargetFlags);
+ return &MI;
+}
+
+bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
+ const MachineOperand *MO0, unsigned OpIdx1,
+ const MachineOperand *MO1) const {
+ const MCInstrDesc &InstDesc = MI.getDesc();
+ const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
+ const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
+ const TargetRegisterClass *DefinedRC1 =
+ OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr;
+ const TargetRegisterClass *DefinedRC0 =
+ OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo0.RegClass) : nullptr;
+
+ unsigned Opc = MI.getOpcode();
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+
+ // The swap must not breach constant bus or literal limits.
+ // It may move a literal to a position other than src0, which is not allowed
+ // pre-gfx10. However, most test cases need literals in Src0 for VOP.
+ // FIXME: After gfx9, a literal can be placed somewhere other than Src0.
+ if (isVALU(MI)) {
+ if ((int)OpIdx0 == Src0Idx && !MO0->isReg() &&
+ !isInlineConstant(*MO0, OpInfo1))
+ return false;
+ if ((int)OpIdx1 == Src0Idx && !MO1->isReg() &&
+ !isInlineConstant(*MO1, OpInfo0))
+ return false;
+ }
+
+ if (OpIdx1 != Src0Idx && MO0->isReg()) {
+ if (!DefinedRC1)
+ return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
+ return isLegalRegOperand(MI, OpIdx1, *MO0);
+ }
+ if (OpIdx0 != Src0Idx && MO1->isReg()) {
+ if (!DefinedRC0)
+ return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
+ return isLegalRegOperand(MI, OpIdx0, *MO1);
+ }
+
+ // No need to check 64-bit literals since swapping does not bring new
+ // 64-bit literals into current instruction to fold to 32-bit
+
+ return isImmOperandLegal(MI, OpIdx1, *MO0);
+}
+
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned Src0Idx,
unsigned Src1Idx) const {
@@ -2770,21 +2827,20 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
MachineOperand &Src0 = MI.getOperand(Src0Idx);
MachineOperand &Src1 = MI.getOperand(Src1Idx);
-
+ if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) {
+ return nullptr;
+ }
MachineInstr *CommutedMI = nullptr;
if (Src0.isReg() && Src1.isReg()) {
- if (isOperandLegal(MI, Src1Idx, &Src0)) {
- // Be sure to copy the source modifiers to the right place.
- CommutedMI
- = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
- }
-
+ // Be sure to copy the source modifiers to the right place.
+ CommutedMI =
+ TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
} else if (Src0.isReg() && !Src1.isReg()) {
- if (isOperandLegal(MI, Src1Idx, &Src0))
- CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
+ CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
} else if (!Src0.isReg() && Src1.isReg()) {
- if (isOperandLegal(MI, Src1Idx, &Src0))
- CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
+ CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
+ } else if (Src0.isImm() && Src1.isImm()) {
+ CommutedMI = swapImmOperands(MI, Src0, Src1);
} else {
// FIXME: Found two non registers to commute. This does happen.
return nullptr;
@@ -5817,6 +5873,49 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
return RC->hasSuperClassEq(DRC);
}
+bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
+ const MachineOperand &MO) const {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
+ unsigned Opc = MI.getOpcode();
+
+ if (!isLegalRegOperand(MRI, OpInfo, MO))
+ return false;
+
+ // check Accumulate GPR operand
+ bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
+ if (IsAGPR && !ST.hasMAIInsts())
+ return false;
+ if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
+ (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
+ return false;
+ // Atomics should have both vdst and vdata either vgpr or agpr.
+ const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ const int DataIdx = AMDGPU::getNamedOperandIdx(
+ Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
+ if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
+ MI.getOperand(DataIdx).isReg() &&
+ RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
+ return false;
+ if ((int)OpIdx == DataIdx) {
+ if (VDstIdx != -1 &&
+ RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
+ return false;
+ // DS instructions with 2 src operands also must have tied RC.
+ const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
+ if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
+ RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
+ return false;
+ }
+
+ // Check V_ACCVGPR_WRITE_B32_e64
+ if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
+ (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
+ RI.isSGPRReg(MRI, MO.getReg()))
+ return false;
+ return true;
+}
+
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
const MCOperandInfo &OpInfo,
const MachineOperand &MO) const {
@@ -5879,40 +5978,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
if (MO->isReg()) {
if (!DefinedRC)
return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
- if (!isLegalRegOperand(MRI, OpInfo, *MO))
- return false;
- bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
- if (IsAGPR && !ST.hasMAIInsts())
- return false;
- unsigned Opc = MI.getOpcode();
- if (IsAGPR &&
- (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
- (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
- return false;
- // Atomics should have both vdst and vdata either vgpr or agpr.
- const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
- const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
- isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
- if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
- MI.getOperand(DataIdx).isReg() &&
- RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
- return false;
- if ((int)OpIdx == DataIdx) {
- if (VDstIdx != -1 &&
- RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
- return false;
- // DS instructions with 2 src operands also must have tied RC.
- const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::data1);
- if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
- RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
- return false;
- }
- if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
- (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
- RI.isSGPRReg(MRI, MO->getReg()))
- return false;
- return true;
+ return isLegalRegOperand(MI, OpIdx, *MO);
}
if (MO->isImm()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index d49939bf81b106..a609c9abbad015 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -193,7 +193,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
bool swapSourceModifiers(MachineInstr &MI,
MachineOperand &Src0, unsigned Src0OpName,
MachineOperand &Src1, unsigned Src1OpName) const;
-
+ bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx,
+ const MachineOperand *fromMO, unsigned toIdx,
+ const MachineOperand *toMO) const;
MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx0,
unsigned OpIdx1) const override;
@@ -1218,11 +1220,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
const MachineOperand &MO) const;
/// Check if \p MO (a register operand) is a legal register for the
- /// given operand description.
+ /// given operand description or operand index.
+ /// The operand-index version performs additional legality checks.
bool isLegalRegOperand(const MachineRegisterInfo &MRI,
const MCOperandInfo &OpInfo,
const MachineOperand &MO) const;
-
+ bool isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
+ const MachineOperand &MO) const;
/// Legalize operands in \p MI by either commuting it or inserting a
/// copy of src1.
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index e96369b5e6e240..947ac5c27620f0 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -335,7 +335,9 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
let FPDPRounding = 1 in {
let Predicates = [Has16BitInsts, isGFX8Only] in {
defm V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
- defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma>;
+ let isCommutable = 1 in {
+ defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma>;
+ } // End isCommutable = 1
} // End Predicates = [Has16BitInsts, isGFX8Only]
let SubtargetPredicate = isGFX9Plus in {
@@ -639,8 +641,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>;
-defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
-defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
+let isCommutable = 1 in {
+ defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
+ defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
+} // End isCommutable = 1
defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>;
defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>;
@@ -1254,8 +1258,9 @@ let SubtargetPredicate = isGFX10Plus in {
def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
}
-
- defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>;
+ let isCommutable = 1 in {
+ defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>;
+ } // End isCommutable = 1
defm V_SUB_NC_U16 : VOP3Inst_t16 <"v_sub_nc_u16", VOP_I16_I16_I16, sub>;
} // End SubtargetPredicate = isGFX10Plus
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index fc896150591528..cdea4fd158b04c 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -355,7 +355,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1010-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
-; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0, 0x1234, s2
+; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0x1234, 0, s2
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: s_endpgm
@@ -365,7 +365,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1030W32-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s2
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W32-NEXT: s_endpgm
@@ -375,7 +375,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1030W64-NEXT: v_add_co_u32 v0, s[2:3], 0x56789876, v0
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[2:3]
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3]
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W64-NEXT: s_endpgm
@@ -387,7 +387,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cmp_shrink.mir b/llvm/test/CodeGen/AMDGPU/cmp_shrink.mir
index 9b3579b43a38a3..ae3fa153f381ae 100644
--- a/llvm/test/CodeGen/AMDGPU/cmp_shrink.mir
+++ b/llvm/test/CodeGen/AMDGPU/cmp_shrink.mir
@@ -7,6 +7,6 @@ name: not_shrink_icmp
body: |
bb.0:
; GCN-LABEL: name: not_shrink_icmp
- ; GCN: S_CMP_GT_I32 1, 65, implicit-def $scc
+ ; GCN: S_CMP_LT_I32 65, 1, implicit-def $scc
S_CMP_GT_I32 1, 65, implicit-def $scc
...
diff --git a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
index b9397f9d5d4ddc..9274c995dde92d 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
+++ b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
@@ -1,12 +1,11 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=machine-cse -verify-machineinstrs %s -o - 2>&1 | FileCheck --check-prefix=GCN %s
-# GCN-LABEL: name: test_machine_cse_op_sel
-# GCN: %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
-# GCN: %3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec
-# GCN: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
---
-name: test_machine_cse_op_sel
+name: test_machine_cse_op_sel_v_add_nc_u16
body: |
+ ; GCN-LABEL: name: test_machine_cse_op_sel_v_add_nc_u16
+ ; GCN: %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %2, 0, 1, 0, implicit $exec
bb.0:
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
@@ -15,3 +14,110 @@ body: |
DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
...
+---
+name: test_machine_cse_op_sel_const_v_add_nc_u16
+body: |
+ ; GCN-LABEL: name: test_machine_cse_op_sel_const_v_add_nc_u16
+ ; GCN: %0:vgpr_32 = V_ADD_NC_U16_e64 0, 64, 0, -3, 1, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %2:vgpr_32, %0, %0, 0, 1, 0, implicit $exec
+ bb.0:
+ %1:vgpr_32 = V_ADD_NC_U16_e64 0, 64, 0, -3, 1, 0, implicit $mode, implicit $exec
+ %2:vgpr_32 = V_ADD_NC_U16_e64 0, -3, 0, 64, 1, 0, implicit $mode, implicit $exec
+ DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %1, %2, 0, 1, 0, implicit $exec
+...
+
+---
+name: test_machine_cse_op_sel_v_fma_f16
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: test_machine_cse_op_sel_v_fma_f16
+ ; GCN: %3:vgpr_32 = nofpexcept V_FMA_F16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %3, 0, 1, 0, implicit $exec
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr2
+ %3:vgpr_32 = nofpexcept V_FMA_F16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+ %4:vgpr_32 = nofpexcept V_FMA_F16_e64 0, %1, 0, %0, 0, %2, 0, 0, implicit $mode, implicit $exec
+ DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %4, 0, 1, 0, implicit $exec
+...
+
+---
+name: test_machine_cse_op_sel_const_v_fma_f16
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: test_machine_cse_op_sel_const_v_fma_f16
+ ; GCN: %1:vgpr_32 = nofpexcept V_FMA_F16_e64 0, 3481272320, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %1, 0, 1, 0, implicit $exec
+ bb.0:
+ liveins: $vgpr0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = nofpexcept V_FMA_F16_e64 0, 1, 0, 3481272320, 0, %0, 0, 0, implicit $mode, implicit $exec
+ %2:vgpr_32 = nofpexcept V_FMA_F16_e64 0, 3481272320, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec
+ DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %2, 0, 1, 0, implicit $exec
+...
+
+---
+name: test_machine_cse_op_sel_v_mad_u16
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: test_machine_cse_op_sel_v_mad_u16
+ ; GCN: %3:vgpr_32 = V_MAD_U32_U16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %3, 0, 1, 0, implicit $exec
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr2
+ %3:vgpr_32 = V_MAD_U32_U16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+ %4:vgpr_32 = V_MAD_U32_U16_e64 0, %1, 0, %0, 0, %2, 0, 0, implicit $mode, implicit $exec
+ DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %4, 0, 1, 0, implicit $exec
+...
+
+---
+name: test_machine_cse_op_sel_const_v_mad_u16
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: test_machine_cse_op_sel_const_v_mad_u16
+ ; GCN: %1:vgpr_32 = V_MAD_U32_U16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %1, 0, 1, 0, implicit $exec
+ bb.0:
+ liveins: $vgpr0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = V_MAD_U32_U16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec
+ %2:vgpr_32 = V_MAD_U32_U16_e64 0, 64, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec
+ DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %2, 0, 1, 0, implicit $exec
+...
+
+---
+name: test_machine_cse_op_sel_v_mad_i16
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: test_machine_cse_op_sel_v_mad_i16
+ ; GCN: %3:vgpr_32 = V_MAD_I32_I16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %3, 0, 1, 0, implicit $exec
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr2
+ %3:vgpr_32 = V_MAD_I32_I16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+ %4:vgpr_32 = V_MAD_I32_I16_e64 0, %1, 0, %0, 0, %2, 0, 0, implicit $mode, implicit $exec
+ DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %4, 0, 1, 0, implicit $exec
+...
+
+---
+name: test_machine_cse_op_sel_const_v_mad_i16
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: test_machine_cse_op_sel_const_v_mad_i16
+ ; GCN: %1:vgpr_32 = V_MAD_I32_I16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %1, 0, 1, 0, implicit $exec
+ bb.0:
+ liveins: $vgpr0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = V_MAD_I32_I16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec
+ %2:vgpr_32 = V_MAD_I32_I16_e64 0, 64, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec
+ DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %2, 0, 1, 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 3019d4d298eb45..b4d450a90d5950 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -1566,7 +1566,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffe8
+; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffe8, v1
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
@@ -1807,7 +1807,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffe7
+; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffe7, v1
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index b897e1feed5d56..fec020a296b9b4 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1657,8 +1657,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
-; GFX10-NEXT: v_add_nc_u16 v1, v1, 0x900
-; GFX10-NEXT: v_add_nc_u16 v5, v2, 0x900
+; GFX10-NEXT: v_add_nc_u16 v1, 0x900, v1
+; GFX10-NEXT: v_add_nc_u16 v5, 0x900, v2
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
@@ -1723,10 +1723,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT: v_add_nc_u16 v2, v2, 0x900
+; GFX11-NEXT: v_add_nc_u16 v2, 0x900, v2
; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u16 v1, v1, 0x900
+; GFX11-NEXT: v_add_nc_u16 v1, 0x900, v1
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v2
; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index f416131e3d3140..480d978fa530b4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -397,7 +397,7 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0
-; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f00, v1
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -408,7 +408,7 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0
-; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -462,7 +462,7 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f00, v1
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -473,7 +473,7 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -1348,7 +1348,7 @@ define i1 @isnormal_bf16(bfloat %x) {
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1357,7 +1357,7 @@ define i1 @isnormal_bf16(bfloat %x) {
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1402,7 +1402,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX10CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1411,7 +1411,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX11CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1464,7 +1464,7 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX10CHECK-NEXT: v_cmp_lt_u16_e64 s4, 0x7eff, v1
; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -1475,7 +1475,7 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX11CHECK-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1
; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -1529,7 +1529,7 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0
-; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX10CHECK-NEXT: v_cmp_lt_u16_e64 s4, 0x7eff, v1
; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -1540,7 +1540,7 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0
-; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX11CHECK-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1
; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -2569,7 +2569,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v0
; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s5, 0x7f80, v0
-; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s6, 0x7f, v1
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
@@ -2587,7 +2587,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0
; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s1, 0x7f80, v0
-; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s2, 0x7f, v1
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
@@ -2669,7 +2669,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX10CHECK-NEXT: v_add_nc_u16 v1, v0, -1
-; GFX10CHECK-NEXT: v_add_nc_u16 v2, v0, 0xff80
+; GFX10CHECK-NEXT: v_add_nc_u16 v2, 0xff80, v0
; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s5, 0x7fbf, v0
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f, v1
@@ -2685,7 +2685,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX11CHECK-NEXT: v_add_nc_u16 v1, v0, -1
-; GFX11CHECK-NEXT: v_add_nc_u16 v2, v0, 0xff80
+; GFX11CHECK-NEXT: v_add_nc_u16 v2, 0xff80, v0
; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s1, 0x7fbf, v0
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v1
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 7e3634fdf4ebba..01528cdf7c1254 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -1329,7 +1329,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0
+; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1
; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
;
@@ -1368,7 +1368,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, 0xffc0
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v1.l, 0xffc0, v1.l
; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
;
@@ -1381,7 +1381,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, v1, 0xffc0
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, 0xffc0, v1
; GFX11-GISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-GISEL-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1514,7 +1514,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: global_load_ushort v1, v1, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0
+; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1
; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
@@ -1561,7 +1561,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 0xffc0
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xffc0, v0.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-GISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -1577,7 +1577,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, v1, 0xffc0
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, 0xffc0, v1
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-GISEL-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -1746,8 +1746,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0
-; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v2, 0xffc0
+; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1
+; GFX10-GISEL-NEXT: v_add_nc_u16 v2, 0xffc0, v2
; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-GISEL-NEXT: global_store_short v0, v2, s[0:1]
@@ -1808,8 +1808,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, 0xffc0
-; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.l, 0xffc0
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v1.l, 0xffc0, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v2.l, 0xffc0, v2.l
; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1] dlc
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1] dlc
@@ -1827,8 +1827,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, v1, 0xffc0
-; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v2, 0xffc0
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, 0xffc0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, 0xffc0, v2
; GFX11-GISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] dlc
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1] dlc
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index dd03fb62b8ebb0..82fae44e208186 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -397,7 +397,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x3e7
+; GFX11-NEXT: v_add_nc_u16 v2, 0x3e7, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
More information about the llvm-commits
mailing list