[llvm] [AMDGPU] In v_cndmask, try to reuse a register that holds the constant from the compare. (PR #131146)
Daniil Fukalov via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 17 14:04:20 PDT 2025
https://github.com/dfukalov updated https://github.com/llvm/llvm-project/pull/131146
From 1f8eabd4b3f5b357281bbed8789c7680de8a1754 Mon Sep 17 00:00:00 2001
From: Daniil Fukalov <dfukalov at gmail.com>
Date: Thu, 13 Mar 2025 14:46:19 +0100
Subject: [PATCH 1/3] [AMDGPU] In v_cndmask, try to reuse a register that holds
the constant from the compare.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
For some targets, the optimization `X == Const ? X : Y` → `X == Const ? Const : Y`
forces the constant to be materialized a second time for the `v_cndmask`, costing
an extra register.
This patch detects such cases and instead reuses the register already compared
against the constant, which is known to hold the `Const` value whenever the
cndmask selects it.
For SWDEV-506659.
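To make the transform concrete, here is the i32 case from the tests below:

  %icmp = icmp eq i32 %arg, 424242
  %select = select i1 %icmp, i32 424242, i32 %arg1

On gfx1030 the literal can be encoded directly into the v_cndmask, but on gfx900
it cannot, so it would otherwise be materialized a second time. Since the
constant result is selected exactly when %arg compares equal to it, the cndmask
can reuse v0 instead:

  s_mov_b32 s4, 0x67932
  v_cmp_ne_u32_e32 vcc, s4, v0
  v_cndmask_b32_e32 v0, v0, v1, vcc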
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 73 +++++++-
.../CodeGen/AMDGPU/fold-cndmask-select.ll | 171 ++++++++++++++++++
2 files changed, 240 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 91df516b80857..f2857cd381c7e 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1411,15 +1411,80 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
return false;
+ // Try to match the optimized pattern Y == Const ? Const : Z. If Const cannot
+ // be directly encoded in the cndmask, reuse the register operand of the
+ // comparison instruction, which holds the Const value whenever it is selected.
+ auto tryFoldCndMaskCmp =
+ [&](MachineOperand *SrcOp, std::optional<int64_t> SrcImm,
+ unsigned CmpOpcodes[4], AMDGPU::OpName CmpValName) -> bool {
+ // Only process register operands whose value is a known immediate.
+ if (!SrcImm || !SrcOp->isReg())
+ return false;
+
+ // Find the predicate of the cndmask instruction.
+ MachineOperand *PredOp = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ if (!PredOp || !PredOp->isReg())
+ return false;
+
+ MachineInstr *PredI = MRI->getVRegDef(PredOp->getReg());
+ if (!PredI || !PredI->isCompare())
+ return false;
+
+ unsigned CmpOpc = PredI->getOpcode();
+
+ if (CmpOpc != CmpOpcodes[0] && CmpOpc != CmpOpcodes[1] &&
+ CmpOpc != CmpOpcodes[2] && CmpOpc != CmpOpcodes[3])
+ return false;
+
+ // Check if the immediate value of the source operand matches the immediate
+ // value of either the first or second operand of the comparison
+ // instruction.
+ MachineOperand *SubstOp = nullptr;
+ std::optional<int64_t> CmpValImm = getImmOrMaterializedImm(
+ *TII->getNamedOperand(*PredI, AMDGPU::OpName::src0));
+ if (CmpValImm && *CmpValImm == *SrcImm) {
+ SubstOp = TII->getNamedOperand(*PredI, AMDGPU::OpName::src1);
+ } else {
+ CmpValImm = getImmOrMaterializedImm(
+ *TII->getNamedOperand(*PredI, AMDGPU::OpName::src1));
+ if (CmpValImm && *CmpValImm == *SrcImm) {
+ SubstOp = TII->getNamedOperand(*PredI, AMDGPU::OpName::src0);
+ } else {
+ return false;
+ }
+ }
+
+ if (!SubstOp || !SubstOp->isReg())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
+ SrcOp->setReg(SubstOp->getReg());
+ LLVM_DEBUG(dbgs() << MI);
+ return true;
+ };
+
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
if (!Src1->isIdenticalTo(*Src0)) {
- std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
- if (!Src1Imm)
- return false;
+ // Try to fold with not-equal comparisons
+ unsigned NECmpOpcodes[4] = {
+ AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_LG_F32_e64,
+ AMDGPU::V_CMP_NE_I32_e64, AMDGPU::V_CMP_NE_U32_e64};
std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
- if (!Src0Imm || *Src0Imm != *Src1Imm)
+ if (tryFoldCndMaskCmp(Src0, Src0Imm, NECmpOpcodes, AMDGPU::OpName::src1))
+ return true;
+
+ // Try to fold with equal comparisons
+ unsigned EQCmpOpcodes[4] = {
+ AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64,
+ AMDGPU::V_CMP_EQ_I32_e64, AMDGPU::V_CMP_EQ_U32_e64};
+
+ std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
+ if (tryFoldCndMaskCmp(Src1, Src1Imm, EQCmpOpcodes, AMDGPU::OpName::src0))
+ return true;
+
+ if (!Src0Imm || !Src1Imm || *Src0Imm != *Src1Imm)
return false;
}
diff --git a/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll b/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
new file mode 100644
index 0000000000000..0b7106448bb7c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
+
+define float @f32_oeq_v_i(float %arg, float %arg1) {
+; GFX9-LABEL: f32_oeq_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
+; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f32_oeq_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x3e7ae148, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp oeq float %arg, 0x3FCF5C2900000000
+ %select = select i1 %fcmp, float 0x3FCF5C2900000000, float %arg1
+ ret float %select
+}
+
+define float @f32_oeq_i_v(float %arg, float %arg1) {
+; GFX9-LABEL: f32_oeq_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
+; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f32_oeq_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x3e7ae148, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp oeq float 0x3FCF5C2900000000, %arg
+ %select = select i1 %fcmp, float 0x3FCF5C2900000000, float %arg1
+ ret float %select
+}
+
+define float @f32_one_v_i(float %arg, float %arg1) {
+; GFX9-LABEL: f32_one_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
+; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f32_one_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x3e7ae148, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp one float %arg, 0x3FCF5C2900000000
+ %select = select i1 %fcmp, float %arg1, float 0x3FCF5C2900000000
+ ret float %select
+}
+
+define float @f32_one_i_v(float %arg, float %arg1) {
+; GFX9-LABEL: f32_one_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
+; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f32_one_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x3e7ae148, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp one float %arg, 0x3FCF5C2900000000
+ %select = select i1 %fcmp, float %arg1, float 0x3FCF5C2900000000
+ ret float %select
+}
+
+define i32 @i32_eq_v_i(i32 %arg, i32 %arg1) {
+; GFX9-LABEL: i32_eq_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x67932
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i32_eq_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp eq i32 %arg, 424242
+ %select = select i1 %icmp, i32 424242, i32 %arg1
+ ret i32 %select
+}
+
+define i32 @i32_eq_i_v(i32 %arg, i32 %arg1) {
+; GFX9-LABEL: i32_eq_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x67932
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i32_eq_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp eq i32 424242, %arg
+ %select = select i1 %icmp, i32 424242, i32 %arg1
+ ret i32 %select
+}
+
+define i32 @i32_ne_v_i(i32 %arg, i32 %arg1) {
+; GFX9-LABEL: i32_ne_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x67932
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i32_ne_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp ne i32 %arg, 424242
+ %select = select i1 %icmp, i32 %arg1, i32 424242
+ ret i32 %select
+}
+
+define i32 @i32_ne_i_v(i32 %arg, i32 %arg1) {
+; GFX9-LABEL: i32_ne_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x67932
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i32_ne_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp ne i32 424242, %arg
+ %select = select i1 %icmp, i32 %arg1, i32 424242
+ ret i32 %select
+}
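A note on the opcode pairing above: v_cndmask_b32 dst, src0, src1, cc selects
src1 when cc is set and src0 otherwise. An equality compare therefore guards the
src1 path, and a not-equal compare the src0 path, which is why the NE opcode
list is tried against src0 of the cndmask and the EQ list against src1:

  cc = (X != Const):  dst = cc ? Y : Const  -->  dst = cc ? Y : X
  cc = (X == Const):  dst = cc ? Const : Y  -->  dst = cc ? X : Y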
From ffad02749be7735fd33a716285f64d17ebad3ea7 Mon Sep 17 00:00:00 2001
From: Daniil Fukalov <dfukalov at gmail.com>
Date: Thu, 13 Mar 2025 15:43:30 +0100
Subject: [PATCH 2/3] Update llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll b/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
index 0b7106448bb7c..aba198b142765 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10
define float @f32_oeq_v_i(float %arg, float %arg1) {
; GFX9-LABEL: f32_oeq_v_i:
From c5881f2a83c32446a543ed8991e8527a146149c3 Mon Sep 17 00:00:00 2001
From: Daniil Fukalov <dfukalov at gmail.com>
Date: Mon, 17 Mar 2025 22:02:59 +0100
Subject: [PATCH 3/3] Add half and i16 type support
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 21 +-
.../CodeGen/AMDGPU/fold-cndmask-select.ll | 191 ++++++++++++++++++
2 files changed, 204 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index f2857cd381c7e..0c60830ebddd0 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1416,7 +1416,7 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
// value from the comparison instruction.
auto tryFoldCndMaskCmp =
[&](MachineOperand *SrcOp, std::optional<int64_t> SrcImm,
- unsigned CmpOpcodes[4], AMDGPU::OpName CmpValName) -> bool {
+ ArrayRef<unsigned> CmpOpcodes, AMDGPU::OpName CmpValName) -> bool {
// We'll try to process only register operands with known values.
if (!SrcImm || !SrcOp->isReg())
return false;
@@ -1432,8 +1432,10 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
unsigned CmpOpc = PredI->getOpcode();
- if (CmpOpc != CmpOpcodes[0] && CmpOpc != CmpOpcodes[1] &&
- CmpOpc != CmpOpcodes[2] && CmpOpc != CmpOpcodes[3])
+ // Check if the comparison instruction is one of the expected ones.
+ const auto *CmpOpcI = find_if(
+ CmpOpcodes, [CmpOpc](unsigned Opcode) { return Opcode == CmpOpc; });
+ if (CmpOpcI == CmpOpcodes.end())
return false;
// Check if the immediate value of the source operand matches the immediate
@@ -1467,18 +1469,21 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
if (!Src1->isIdenticalTo(*Src0)) {
// Try to fold with not-equal comparisons
- unsigned NECmpOpcodes[4] = {
+ unsigned NECmpOpcodes[] = {
AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_LG_F32_e64,
- AMDGPU::V_CMP_NE_I32_e64, AMDGPU::V_CMP_NE_U32_e64};
+ AMDGPU::V_CMP_NE_I32_e64, AMDGPU::V_CMP_NE_U32_e64,
+ AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_I16_e64,
+ AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_LG_F16_e64};
std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
if (tryFoldCndMaskCmp(Src0, Src0Imm, NECmpOpcodes, AMDGPU::OpName::src1))
return true;
// Try to fold with equal comparisons
- unsigned EQCmpOpcodes[4] = {
- AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64,
- AMDGPU::V_CMP_EQ_I32_e64, AMDGPU::V_CMP_EQ_U32_e64};
+ unsigned EQCmpOpcodes[] = {
+ AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_I32_e64,
+ AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U16_e64,
+ AMDGPU::V_CMP_EQ_I16_e64, AMDGPU::V_CMP_EQ_F16_e64};
std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
if (tryFoldCndMaskCmp(Src1, Src1Imm, EQCmpOpcodes, AMDGPU::OpName::src0))
diff --git a/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll b/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
index aba198b142765..5bb0b561230b9 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
@@ -2,6 +2,29 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10
+define bfloat @bf16_oeq_v_i(bfloat %arg, bfloat %arg1) {
+; GFX9-LABEL: bf16_oeq_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: s_mov_b32 s4, 0x42420000
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, s4, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: bf16_oeq_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x42420000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp oeq bfloat %arg, 0xR4242
+ %select = select i1 %fcmp, bfloat %arg, bfloat %arg1
+ ret bfloat %select
+}
+
define float @f32_oeq_v_i(float %arg, float %arg1) {
; GFX9-LABEL: f32_oeq_v_i:
; GFX9: ; %bb.0: ; %bb
@@ -86,6 +109,90 @@ bb:
ret float %select
}
+define half @f16_oeq_v_i(half %arg, half %arg1) {
+; GFX9-LABEL: f16_oeq_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x5140
+; GFX9-NEXT: v_cmp_neq_f16_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f16_oeq_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_neq_f16_e32 vcc_lo, 0x5140, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x5140, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp oeq half %arg, 42.0
+ %select = select i1 %fcmp, half 42.0, half %arg1
+ ret half %select
+}
+
+define half @f16_oeq_i_v(half %arg, half %arg1) {
+; GFX9-LABEL: f16_oeq_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x5140
+; GFX9-NEXT: v_cmp_neq_f16_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f16_oeq_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_neq_f16_e32 vcc_lo, 0x5140, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x5140, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp oeq half 42.0, %arg
+ %select = select i1 %fcmp, half 42.0, half %arg1
+ ret half %select
+}
+
+define half @f16_one_v_i(half %arg, half %arg1) {
+; GFX9-LABEL: f16_one_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x5140
+; GFX9-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f16_one_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0x5140, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x5140, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp one half %arg, 42.0
+ %select = select i1 %fcmp, half %arg1, half 42.0
+ ret half %select
+}
+
+define half @f16_one_i_v(half %arg, half %arg1) {
+; GFX9-LABEL: f16_one_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x5140
+; GFX9-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f16_one_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0x5140, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x5140, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp one half %arg, 42.0
+ %select = select i1 %fcmp, half %arg1, half 42.0
+ ret half %select
+}
+
define i32 @i32_eq_v_i(i32 %arg, i32 %arg1) {
; GFX9-LABEL: i32_eq_v_i:
; GFX9: ; %bb.0: ; %bb
@@ -169,3 +276,87 @@ bb:
%select = select i1 %icmp, i32 %arg1, i32 424242
ret i32 %select
}
+
+define i16 @i16_eq_v_i(i16 %arg, i16 %arg1) {
+; GFX9-LABEL: i16_eq_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x1092
+; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i16_eq_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp eq i16 %arg, 4242
+ %select = select i1 %icmp, i16 4242, i16 %arg1
+ ret i16 %select
+}
+
+define i16 @i16_eq_i_v(i16 %arg, i16 %arg1) {
+; GFX9-LABEL: i16_eq_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x1092
+; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i16_eq_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp eq i16 4242, %arg
+ %select = select i1 %icmp, i16 4242, i16 %arg1
+ ret i16 %select
+}
+
+define i16 @i16_ne_v_i(i16 %arg, i16 %arg1) {
+; GFX9-LABEL: i16_ne_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x1092
+; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i16_ne_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp ne i16 %arg, 4242
+ %select = select i1 %icmp, i16 %arg1, i16 4242
+ ret i16 %select
+}
+
+define i16 @i16_ne_i_v(i16 %arg, i16 %arg1) {
+; GFX9-LABEL: i16_ne_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x1092
+; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i16_ne_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp ne i16 4242, %arg
+ %select = select i1 %icmp, i16 %arg1, i16 4242
+ ret i16 %select
+}
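The 16-bit cases fold the same way; e.g. for f16_oeq_v_i above, the gfx900
output reuses v0 for the half constant 42.0 (0x5140) instead of materializing
it again:

  s_movk_i32 s4, 0x5140
  v_cmp_neq_f16_e32 vcc, s4, v0
  v_cndmask_b32_e32 v0, v0, v1, vcc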