[llvm] [AMDGPU] Switch V_CNDMASK operands to shrink it into VOP2 (PR #135162)

Tue Apr 15 02:00:43 PDT 2025

================
@@ -831,6 +836,183 @@ bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
   return true;
 }
 
+/// Returns the opcode of the compare that produces the logically inverted
+/// result of \p MI, or 0 if the opcode of \p MI is not in the table below.
+///
+/// Only the _e64 forms of the unsigned 32/64-bit integer compares and of the
+/// 32/64-bit floating-point compares are listed; every entry maps an _e64
+/// opcode to an _e64 opcode of the same operand type, and the mapping is
+/// symmetric (inverting twice yields the original opcode).
+///
+/// Floating-point compares are mapped to their N-prefixed counterparts
+/// (EQ <-> NEQ, GE <-> NGE, ...) and O <-> U rather than to the opposite
+/// ordered relation.
+/// NOTE(review): presumably this keeps the result exactly inverted for NaN
+/// inputs as well - confirm against the ISA documentation.
+unsigned SIShrinkInstructions::getInverseCompareOpcode(MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  // unsigned 32
+  case AMDGPU::V_CMP_EQ_U32_e64:
+    return AMDGPU::V_CMP_NE_U32_e64;
+  case AMDGPU::V_CMP_NE_U32_e64:
+    return AMDGPU::V_CMP_EQ_U32_e64;
+  case AMDGPU::V_CMP_GE_U32_e64:
+    return AMDGPU::V_CMP_LT_U32_e64;
+  case AMDGPU::V_CMP_LE_U32_e64:
+    return AMDGPU::V_CMP_GT_U32_e64;
+  case AMDGPU::V_CMP_GT_U32_e64:
+    return AMDGPU::V_CMP_LE_U32_e64;
+  case AMDGPU::V_CMP_LT_U32_e64:
+    return AMDGPU::V_CMP_GE_U32_e64;
+  // unsigned 64
+  case AMDGPU::V_CMP_EQ_U64_e64:
+    return AMDGPU::V_CMP_NE_U64_e64;
+  case AMDGPU::V_CMP_NE_U64_e64:
+    return AMDGPU::V_CMP_EQ_U64_e64;
+  case AMDGPU::V_CMP_GE_U64_e64:
+    return AMDGPU::V_CMP_LT_U64_e64;
+  case AMDGPU::V_CMP_LE_U64_e64:
+    return AMDGPU::V_CMP_GT_U64_e64;
+  case AMDGPU::V_CMP_GT_U64_e64:
+    return AMDGPU::V_CMP_LE_U64_e64;
+  case AMDGPU::V_CMP_LT_U64_e64:
+    return AMDGPU::V_CMP_GE_U64_e64;
+  // float 32
+  case AMDGPU::V_CMP_EQ_F32_e64:
+    return AMDGPU::V_CMP_NEQ_F32_e64;
+  case AMDGPU::V_CMP_NEQ_F32_e64:
+    return AMDGPU::V_CMP_EQ_F32_e64;
+  case AMDGPU::V_CMP_GE_F32_e64:
+    return AMDGPU::V_CMP_NGE_F32_e64;
+  case AMDGPU::V_CMP_NGE_F32_e64:
+    return AMDGPU::V_CMP_GE_F32_e64;
+  case AMDGPU::V_CMP_LE_F32_e64:
+    return AMDGPU::V_CMP_NLE_F32_e64;
+  case AMDGPU::V_CMP_NLE_F32_e64:
+    return AMDGPU::V_CMP_LE_F32_e64;
+  case AMDGPU::V_CMP_GT_F32_e64:
+    return AMDGPU::V_CMP_NGT_F32_e64;
+  case AMDGPU::V_CMP_NGT_F32_e64:
+    return AMDGPU::V_CMP_GT_F32_e64;
+  case AMDGPU::V_CMP_LT_F32_e64:
+    return AMDGPU::V_CMP_NLT_F32_e64;
+  case AMDGPU::V_CMP_NLT_F32_e64:
+    return AMDGPU::V_CMP_LT_F32_e64;
+  case AMDGPU::V_CMP_LG_F32_e64:
+    return AMDGPU::V_CMP_NLG_F32_e64;
+  case AMDGPU::V_CMP_NLG_F32_e64:
+    return AMDGPU::V_CMP_LG_F32_e64;
+  case AMDGPU::V_CMP_O_F32_e64:
+    return AMDGPU::V_CMP_U_F32_e64;
+  case AMDGPU::V_CMP_U_F32_e64:
+    return AMDGPU::V_CMP_O_F32_e64;
+  // float 64
+  case AMDGPU::V_CMP_EQ_F64_e64:
+    return AMDGPU::V_CMP_NEQ_F64_e64;
+  case AMDGPU::V_CMP_NEQ_F64_e64:
+    return AMDGPU::V_CMP_EQ_F64_e64;
+  case AMDGPU::V_CMP_GE_F64_e64:
+    return AMDGPU::V_CMP_NGE_F64_e64;
+  case AMDGPU::V_CMP_NGE_F64_e64:
+    return AMDGPU::V_CMP_GE_F64_e64;
+  case AMDGPU::V_CMP_LE_F64_e64:
+    return AMDGPU::V_CMP_NLE_F64_e64;
+  case AMDGPU::V_CMP_NLE_F64_e64:
+    return AMDGPU::V_CMP_LE_F64_e64;
+  case AMDGPU::V_CMP_GT_F64_e64:
+    return AMDGPU::V_CMP_NGT_F64_e64;
+  case AMDGPU::V_CMP_NGT_F64_e64:
+    return AMDGPU::V_CMP_GT_F64_e64;
+  case AMDGPU::V_CMP_LT_F64_e64:
+    return AMDGPU::V_CMP_NLT_F64_e64;
+  case AMDGPU::V_CMP_NLT_F64_e64:
+    return AMDGPU::V_CMP_LT_F64_e64;
+  case AMDGPU::V_CMP_LG_F64_e64:
+    return AMDGPU::V_CMP_NLG_F64_e64;
+  case AMDGPU::V_CMP_NLG_F64_e64:
+    return AMDGPU::V_CMP_LG_F64_e64;
+  case AMDGPU::V_CMP_O_F64_e64:
+    return AMDGPU::V_CMP_U_F64_e64;
+  case AMDGPU::V_CMP_U_F64_e64:
+    return AMDGPU::V_CMP_O_F64_e64;
+  default:
+    // Not a compare this table knows how to invert.
+    return 0;
+  }
+}
+
+bool SIShrinkInstructions::shouldSwapCndOperands(
+    MachineInstr &MI, SmallVector<MachineOperand *, 4> &UsesToProcess) const {
+  auto AllUses = MRI->use_nodbg_operands(MI.getOperand(0).getReg());
+  bool ShouldSwap = false;
+
+  for (auto &Use : AllUses) {
+    MachineInstr *UseInst = Use.getParent();
+    if (UseInst->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
+      return false;
+    MachineOperand &Src0 = UseInst->getOperand(2);
+    MachineOperand &Src1 = UseInst->getOperand(4);
+
+    bool Src0Imm = Src0.isImm();
+    bool Src1Imm = Src1.isImm();
+
+    if (!Src1Imm && Src0Imm)
+      return false;
----------------
mbrkusanin wrote:

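This bails out as soon as a single use is found for which swapping the operands is not profitable. Maybe we should instead consider whether swapping is profitable as a whole, across all uses. In the example below, two of the three selects would benefit from the swap and only one would not: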

```
define amdgpu_cs <3 x float> @test(i32 %cmp, float %arg0, float %arg1, float %arg2) {
  %vcc = icmp eq i32 %cmp, 0
  %val0 = select i1 %vcc, float 0.0, float %arg0
  %val1 = select i1 %vcc, float 0.0, float %arg1
  %val2 = select i1 %vcc, float %arg2, float 0.0
  %ret0 = insertelement <3 x float> poison, float %val0, i32 0
  %ret1 = insertelement <3 x float> %ret0, float %val1, i32 1
  %ret2 = insertelement <3 x float> %ret1, float %val2, i32 2
  ret <3 x float> %ret2
}
```

https://github.com/llvm/llvm-project/pull/135162