[llvm] 2b59600 - [RISCV] Branchless lowering for select (and (x , 0x1) == 0), y, (z ^ y) ) and select (and (x , 0x1) == 0), y, (z | y) )

Fri Sep 30 08:24:42 PDT 2022

Author: Philip Reames
Date: 2022-09-30T08:24:32-07:00
New Revision: 2b5960028ead83ec56c5623b2b8c8c12d91d6ed0

URL: https://github.com/llvm/llvm-project/commit/2b5960028ead83ec56c5623b2b8c8c12d91d6ed0
DIFF: https://github.com/llvm/llvm-project/commit/2b5960028ead83ec56c5623b2b8c8c12d91d6ed0.diff

LOG: [RISCV] Branchless lowering for select (and (x , 0x1) == 0), y, (z ^ y) ) and select (and (x , 0x1) == 0), y, (z | y) )

This code is directly ported from the X86 backend, which applies the same rewrite (along with several others). I plan to look more closely at the other branchless variants from X86 to see whether any are worth porting in future changes.

The motivation here is the coremark crc8 routine from https://github.com/eembc/coremark/blob/main/core_util.c#L165. This patch significantly reduces the number of unpredictable branches in that workload.

Differential Revision: https://reviews.llvm.org/D134881

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/test/CodeGen/RISCV/select.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index ba3eb9258edb6..228270777e3d5 100644

--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9011,14 +9011,70 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     SDValue LHS = N->getOperand(0);
     SDValue RHS = N->getOperand(1);
     SDValue CC = N->getOperand(2);
+    ISD::CondCode CCVal = cast<CondCodeSDNode>(CC)->get();
     SDValue TrueV = N->getOperand(3);
     SDValue FalseV = N->getOperand(4);
     SDLoc DL(N);
+    EVT VT = N->getValueType(0);
 
     // If the True and False values are the same, we don't need a select_cc.
     if (TrueV == FalseV)
       return TrueV;
 
+    // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
+    // (select (and (x , 0x1) != 0), (z ^ y), y ) -> (-(and (x , 0x1)) & z ) ^ y
+    // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
+    // (select (and (x , 0x1) != 0), (z | y), y ) -> (-(and (x , 0x1)) & z ) | y
+    if (isNullConstant(RHS) && (CCVal == ISD::SETEQ || CCVal == ISD::SETNE) &&
+        LHS.getOpcode() == ISD::AND && isOneConstant(LHS.getOperand(1))) {
+      unsigned Opcode;
+      SDValue Src1, Src2;
+      // Returns true if one select arm is an XOR or OR whose operand is the
+      // other arm, i.e. (y, z op y) for SETEQ or (z op y, y) for SETNE. On
+      // success, sets Src1 = z, Src2 = y and Opcode = op.
+      auto isOrXorPattern = [&]() {
+        if (CCVal == ISD::SETEQ &&
+            (FalseV.getOpcode() == ISD::XOR || FalseV.getOpcode() == ISD::OR) &&
+            (FalseV.getOperand(0) == TrueV || FalseV.getOperand(1) == TrueV)) {
+          Src1 = FalseV.getOperand(0) == TrueV ?
+            FalseV.getOperand(1) : FalseV.getOperand(0);
+          Src2 = TrueV;
+          Opcode = FalseV.getOpcode();
+          return true;
+        }
+        if (CCVal == ISD::SETNE &&
+            (TrueV.getOpcode() == ISD::XOR || TrueV.getOpcode() == ISD::OR) &&
+            (TrueV.getOperand(0) == FalseV || TrueV.getOperand(1) == FalseV)) {
+          Src1 = TrueV.getOperand(0) == FalseV ?
+            TrueV.getOperand(1) : TrueV.getOperand(0);
+          Src2 = FalseV;
+          Opcode = TrueV.getOpcode();
+          return true;
+        }
+
+        return false;
+      };
+
+      if (isOrXorPattern()) {
+        SDValue Neg;
+        unsigned int CmpSz = LHS.getSimpleValueType().getSizeInBits();
+        // We need a mask of all zeros or all ones with the same size as the
+        // other operands.
+        if (CmpSz > VT.getSizeInBits())
+          Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
+        else if (CmpSz < VT.getSizeInBits())
+          Neg = DAG.getNode(ISD::AND, DL, VT,
+                            DAG.getNode(ISD::ANY_EXTEND, DL, VT, LHS),
+                            DAG.getConstant(1, DL, VT));
+        else
+          Neg = LHS;
+        SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+                                   Neg); // -(and (x, 0x1))
+        SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
+        return DAG.getNode(Opcode, DL, VT, And, Src2);           // And Op y
+      }
+    }
+
     if (combine_CC(LHS, RHS, CC, DL, DAG, Subtarget))
       return DAG.getNode(RISCVISD::SELECT_CC, DL, N->getValueType(0),
                          {LHS, RHS, CC, TrueV, FalseV});

diff  --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll
index 95ad71abb3156..ceec72bea8f88 100644
--- a/llvm/test/CodeGen/RISCV/select.ll
+++ b/llvm/test/CodeGen/RISCV/select.ll
@@ -1,16 +1,23 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s
 
 define i16 @select_xor_1(i16 %A, i8 %cond) {
-; CHECK-LABEL: select_xor_1:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    andi a1, a1, 1
-; CHECK-NEXT:    beqz a1, .LBB0_2
-; CHECK-NEXT:  # %bb.1: # %entry
-; CHECK-NEXT:    xori a0, a0, 43
-; CHECK-NEXT:  .LBB0_2: # %entry
-; CHECK-NEXT:    ret
+; RV32-LABEL: select_xor_1:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    andi a1, a1, 1
+; RV32-NEXT:    neg a1, a1
+; RV32-NEXT:    andi a1, a1, 43
+; RV32-NEXT:    xor a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: select_xor_1:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    andi a1, a1, 1
+; RV64-NEXT:    negw a1, a1
+; RV64-NEXT:    andi a1, a1, 43
+; RV64-NEXT:    xor a0, a1, a0
+; RV64-NEXT:    ret
 entry:
  %and = and i8 %cond, 1
  %cmp10 = icmp eq i8 %and, 0
@@ -22,14 +29,21 @@ entry:
 ; Equivalent to above, but with icmp ne (and %cond, 1), 1 instead of
 ; icmp eq (and %cond, 1), 0
 define i16 @select_xor_1b(i16 %A, i8 %cond) {
-; CHECK-LABEL: select_xor_1b:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    andi a1, a1, 1
-; CHECK-NEXT:    beqz a1, .LBB1_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    xori a0, a0, 43
-; CHECK-NEXT:  .LBB1_2: # %entry
-; CHECK-NEXT:    ret
+; RV32-LABEL: select_xor_1b:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    andi a1, a1, 1
+; RV32-NEXT:    neg a1, a1
+; RV32-NEXT:    andi a1, a1, 43
+; RV32-NEXT:    xor a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: select_xor_1b:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    andi a1, a1, 1
+; RV64-NEXT:    negw a1, a1
+; RV64-NEXT:    andi a1, a1, 43
+; RV64-NEXT:    xor a0, a1, a0
+; RV64-NEXT:    ret
 entry:
  %and = and i8 %cond, 1
  %cmp10 = icmp ne i8 %and, 1
@@ -42,10 +56,9 @@ define i32 @select_xor_2(i32 %A, i32 %B, i8 %cond) {
 ; CHECK-LABEL: select_xor_2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andi a2, a2, 1
-; CHECK-NEXT:    beqz a2, .LBB2_2
-; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:  .LBB2_2: # %entry
 ; CHECK-NEXT:    ret
 entry:
  %and = and i8 %cond, 1
@@ -61,10 +74,9 @@ define i32 @select_xor_2b(i32 %A, i32 %B, i8 %cond) {
 ; CHECK-LABEL: select_xor_2b:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andi a2, a2, 1
-; CHECK-NEXT:    beqz a2, .LBB3_2
-; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:  .LBB3_2: # %entry
 ; CHECK-NEXT:    ret
 entry:
  %and = and i8 %cond, 1
@@ -78,10 +90,9 @@ define i32 @select_or(i32 %A, i32 %B, i8 %cond) {
 ; CHECK-LABEL: select_or:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andi a2, a2, 1
-; CHECK-NEXT:    beqz a2, .LBB4_2
-; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    or a0, a1, a0
-; CHECK-NEXT:  .LBB4_2: # %entry
 ; CHECK-NEXT:    ret
 entry:
  %and = and i8 %cond, 1
@@ -97,10 +108,9 @@ define i32 @select_or_b(i32 %A, i32 %B, i8 %cond) {
 ; CHECK-LABEL: select_or_b:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andi a2, a2, 1
-; CHECK-NEXT:    beqz a2, .LBB5_2
-; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    or a0, a1, a0
-; CHECK-NEXT:  .LBB5_2: # %entry
 ; CHECK-NEXT:    ret
 entry:
  %and = and i8 %cond, 1
@@ -114,10 +124,9 @@ define i32 @select_or_1(i32 %A, i32 %B, i32 %cond) {
 ; CHECK-LABEL: select_or_1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andi a2, a2, 1
-; CHECK-NEXT:    beqz a2, .LBB6_2
-; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    or a0, a1, a0
-; CHECK-NEXT:  .LBB6_2: # %entry
 ; CHECK-NEXT:    ret
 entry:
  %and = and i32 %cond, 1
@@ -133,10 +142,9 @@ define i32 @select_or_1b(i32 %A, i32 %B, i32 %cond) {
 ; CHECK-LABEL: select_or_1b:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andi a2, a2, 1
-; CHECK-NEXT:    beqz a2, .LBB7_2
-; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    or a0, a1, a0
-; CHECK-NEXT:  .LBB7_2: # %entry
 ; CHECK-NEXT:    ret
 entry:
  %and = and i32 %cond, 1