[llvm] [SelectionDAG] Make `(a & x) | (~a & y) -> (a & (x ^ y)) ^ y` available for all targets (PR #137641)

via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 28 07:45:22 PDT 2025


llvmbot wrote:



@llvm/pr-subscribers-backend-x86

Author: Iris Shi (el-ev)

<details>
<summary>Changes</summary>

Closes #83637.

By the way, judging from the test updates, the existing fold on the X86 target doesn't seem to have been firing.
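For reference, the rewrite relies on the standard masked-merge identity `(m & x) | (~m & y) == ((x ^ y) & m) ^ y`. Here is a minimal C++ sketch (my own illustration, not part of the patch) that exhaustively checks the identity over all 8-bit operand triples:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Check (m & x) | (~m & y) == ((x ^ y) & m) ^ y for every 8-bit triple.
  // The identity is bit-parallel, so holding at one width implies it holds
  // at any integer width.
  for (unsigned m = 0; m < 256; ++m)
    for (unsigned x = 0; x < 256; ++x)
      for (unsigned y = 0; y < 256; ++y) {
        uint8_t Merged = static_cast<uint8_t>((m & x) | (~m & y));
        uint8_t Folded = static_cast<uint8_t>(((x ^ y) & m) ^ y);
        assert(Merged == Folded);
      }
  return 0;
}
```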


---

Patch is 34.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137641.diff


8 Files Affected:

- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+55) 
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (-58) 
- (added) llvm/test/CodeGen/RISCV/fold-masked-merge.ll (+302) 
- (modified) llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll (+25-37) 
- (modified) llvm/test/CodeGen/X86/bitselect.ll (+40-36) 
- (modified) llvm/test/CodeGen/X86/fold-masked-merge.ll (+10-20) 
- (modified) llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll (+11-15) 
- (modified) llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll (+36-46) 


``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 282dc4470238d..7a52acb31d2ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8108,6 +8108,57 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
   return SDValue();
 }
 
+static SDValue foldMaskedMergeImpl(SDValue AndL0, SDValue AndR0, SDValue AndL1,
+                                   SDValue AndR1, const SDLoc &DL,
+                                   SelectionDAG &DAG) {
+  if (!isBitwiseNot(AndL0, true) || !AndL0->hasOneUse())
+    return SDValue();
+  SDValue NotOp = AndL0->getOperand(0);
+  if (NotOp == AndR1)
+    std::swap(AndR1, AndL1);
+  if (NotOp != AndL1)
+    return SDValue();
+
+  // (~NotOp & AndR0) | (NotOp & AndR1)
+  // --> ((AndR0 ^ AndR1) & NotOp) ^ AndR0
+  EVT VT = AndL1->getValueType(0);
+  SDValue FreezeAndR0 = DAG.getNode(ISD::FREEZE, SDLoc(), VT, AndR0);
+  SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, AndR1, FreezeAndR0);
+  SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
+  SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, FreezeAndR0);
+  return Xor1;
+}
+
+/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
+/// equivalent `((x ^ y) & m) ^ y` pattern.
+/// This is typically a better representation for targets without a fused
+/// "and-not" operation.
+static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG,
+                               const SDLoc &DL) {
+  // Note that masked-merge variants using XOR or ADD expressions are
+  // normalized to OR by InstCombine so we only check for OR.
+  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
+  SDValue N0 = Node->getOperand(0);
+  if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
+    return SDValue();
+  SDValue N1 = Node->getOperand(1);
+  if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
+    return SDValue();
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  SDValue N10 = N1->getOperand(0);
+  SDValue N11 = N1->getOperand(1);
+  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
+    return Result;
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -8286,6 +8337,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
     if (SDValue R = foldLogicTreeOfShifts(N, N0, N1, DAG))
       return R;
 
+  if (!TLI.hasAndNot(SDValue(N, 0)) && VT.isScalarInteger() && VT != MVT::i1)
+    if (SDValue R = foldMaskedMerge(N, DAG, DL))
+      return R;
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b07843523a15b..4f01345f73f94 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52005,59 +52005,6 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
 }
 
-static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
-                                   SDValue And1_L, SDValue And1_R,
-                                   const SDLoc &DL, SelectionDAG &DAG) {
-  if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
-    return SDValue();
-  SDValue NotOp = And0_L->getOperand(0);
-  if (NotOp == And1_R)
-    std::swap(And1_R, And1_L);
-  if (NotOp != And1_L)
-    return SDValue();
-
-  // (~(NotOp) & And0_R) | (NotOp & And1_R)
-  // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
-  EVT VT = And1_L->getValueType(0);
-  SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
-  SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
-  SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
-  SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
-  return Xor1;
-}
-
-/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
-/// equivalent `((x ^ y) & m) ^ y)` pattern.
-/// This is typically a better representation for  targets without a fused
-/// "and-not" operation. This function is intended to be called from a
-/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
-static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
-  // Note that masked-merge variants using XOR or ADD expressions are
-  // normalized to OR by InstCombine so we only check for OR.
-  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
-  SDValue N0 = Node->getOperand(0);
-  if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
-    return SDValue();
-  SDValue N1 = Node->getOperand(1);
-  if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
-    return SDValue();
-
-  SDLoc DL(Node);
-  SDValue N00 = N0->getOperand(0);
-  SDValue N01 = N0->getOperand(1);
-  SDValue N10 = N1->getOperand(0);
-  SDValue N11 = N1->getOperand(1);
-  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
-    return Result;
-  return SDValue();
-}
-
 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
 /// with CMP+{ADC, SBB}.
@@ -52461,11 +52408,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // We should fold "masked merge" patterns when `andn` is not available.
-  if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
-    if (SDValue R = foldMaskedMerge(N, DAG))
-      return R;
-
   if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
     return R;
 
diff --git a/llvm/test/CodeGen/RISCV/fold-masked-merge.ll b/llvm/test/CodeGen/RISCV/fold-masked-merge.ll
new file mode 100644
index 0000000000000..631b7109281e5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/fold-masked-merge.ll
@@ -0,0 +1,302 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV32,RV32I
+; RUN: llc -mtriple=riscv64 < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV64,RV64I
+; RUN: llc -mtriple=riscv32 -mattr=+zbb < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV32,RV32ZBB
+; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB
+;
+; Test that the masked-merge pattern is lowered to an "xor; and; xor"
+; sequence, or to an "and; andn; or" sequence when and-not is available.
+
+define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-I-LABEL: masked_merge0:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a1, a1, a2
+; CHECK-I-NEXT:    and a0, a1, a0
+; CHECK-I-NEXT:    xor a0, a0, a2
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge0:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
+; CHECK-I-LABEL: masked_merge1:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a1, a1, a2
+; CHECK-I-NEXT:    and a0, a1, a0
+; CHECK-I-NEXT:    xor a0, a0, a2
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge1:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i16 %a0, %a1
+  %not = xor i16 %a0, -1
+  %and1 = and i16 %a2, %not
+  %or = or i16 %and0, %and1
+  ret i16 %or
+}
+
+define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
+; CHECK-I-LABEL: masked_merge2:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    mv a0, a1
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge2:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    andn a2, a1, a0
+; CHECK-ZBB-NEXT:    and a0, a1, a0
+; CHECK-ZBB-NEXT:    or a0, a2, a0
+; CHECK-ZBB-NEXT:    ret
+  %not = xor i8 %a0, -1
+  %and0 = and i8 %not, %a1
+  %and1 = and i8 %a1, %a0
+  %or = or i8 %and0, %and1
+  ret i8 %or
+}
+
+define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) {
+; RV32I-LABEL: masked_merge3:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a5, a5
+; RV32I-NEXT:    not a4, a4
+; RV32I-NEXT:    xor a3, a3, a5
+; RV32I-NEXT:    xor a2, a2, a4
+; RV32I-NEXT:    not a2, a2
+; RV32I-NEXT:    not a3, a3
+; RV32I-NEXT:    and a0, a2, a0
+; RV32I-NEXT:    and a1, a3, a1
+; RV32I-NEXT:    xor a0, a0, a4
+; RV32I-NEXT:    xor a1, a1, a5
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: masked_merge3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    not a2, a2
+; RV64I-NEXT:    xor a1, a1, a2
+; RV64I-NEXT:    not a1, a1
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    xor a0, a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32ZBB-LABEL: masked_merge3:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a6, a0
+; RV32ZBB-NEXT:    not a7, a1
+; RV32ZBB-NEXT:    andn a1, a1, a3
+; RV32ZBB-NEXT:    andn a0, a0, a2
+; RV32ZBB-NEXT:    andn a2, a7, a5
+; RV32ZBB-NEXT:    andn a3, a6, a4
+; RV32ZBB-NEXT:    or a0, a3, a0
+; RV32ZBB-NEXT:    or a1, a2, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64ZBB-LABEL: masked_merge3:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    not a3, a0
+; RV64ZBB-NEXT:    andn a2, a3, a2
+; RV64ZBB-NEXT:    andn a0, a0, a1
+; RV64ZBB-NEXT:    or a0, a2, a0
+; RV64ZBB-NEXT:    ret
+  %v0 = xor i64 %a1, -1
+  %v1 = xor i64 %a2, -1
+  %not = xor i64 %a0, -1
+  %and0 = and i64 %not, %v1
+  %and1 = and i64 %v0, %a0
+  %or = or i64 %and0, %and1
+  ret i64 %or
+}
+
+define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
+; RV32-LABEL: not_a_masked_merge0:
+; RV32:       # %bb.0:
+; RV32-NEXT:    and a1, a0, a1
+; RV32-NEXT:    neg a0, a0
+; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: not_a_masked_merge0:
+; RV64:       # %bb.0:
+; RV64-NEXT:    and a1, a0, a1
+; RV64-NEXT:    negw a0, a0
+; RV64-NEXT:    and a0, a0, a2
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not_a_not = sub i32 0, %a0
+  %and1 = and i32 %not_a_not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-I-LABEL: not_a_masked_merge1:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a0, a0, a1
+; CHECK-I-NEXT:    not a1, a3
+; CHECK-I-NEXT:    and a1, a1, a2
+; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: not_a_masked_merge1:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a0, a0, a1
+; CHECK-ZBB-NEXT:    andn a1, a2, a3
+; CHECK-ZBB-NEXT:    or a0, a0, a1
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a3, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-I-LABEL: not_a_masked_merge2:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    or a1, a0, a1
+; CHECK-I-NEXT:    not a0, a0
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    or a0, a1, a0
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: not_a_masked_merge2:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    or a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    ret
+  %not_an_and0 = or i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %not_an_and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-I-LABEL: not_a_masked_merge3:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a1, a0, a1
+; CHECK-I-NEXT:    xor a0, a0, a2
+; CHECK-I-NEXT:    not a0, a0
+; CHECK-I-NEXT:    or a0, a1, a0
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: not_a_masked_merge3:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    xor a0, a0, a2
+; CHECK-ZBB-NEXT:    orn a0, a1, a0
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %not_an_and1 = xor i32 %not, %a2
+  %or = or i32 %and0, %not_an_and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-LABEL: not_a_masked_merge4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a2, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; CHECK-I-LABEL: masked_merge_no_transform0:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a1, a0, a1
+; CHECK-I-NEXT:    not a0, a0
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    or a0, a1, a0
+; CHECK-I-NEXT:    sw a1, 0(a3)
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge_no_transform0:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    sw a1, 0(a3)
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %and0, ptr %p1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; CHECK-I-LABEL: masked_merge_no_transform1:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a1, a0, a1
+; CHECK-I-NEXT:    not a4, a0
+; CHECK-I-NEXT:    and a0, a4, a2
+; CHECK-I-NEXT:    or a0, a1, a0
+; CHECK-I-NEXT:    sw a4, 0(a3)
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge_no_transform1:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    not a4, a0
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    sw a4, 0(a3)
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %not, ptr %p1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; CHECK-I-LABEL: masked_merge_no_transform2:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a1, a0, a1
+; CHECK-I-NEXT:    not a0, a0
+; CHECK-I-NEXT:    and a2, a0, a2
+; CHECK-I-NEXT:    or a0, a1, a2
+; CHECK-I-NEXT:    sw a2, 0(a3)
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge_no_transform2:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a2, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a2
+; CHECK-ZBB-NEXT:    sw a2, 0(a3)
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %and1, ptr %p1
+  ret i32 %or
+}
diff --git a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
index 1517e524a7f78..efc8243df71e0 100644
--- a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
@@ -8,16 +8,13 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \
 ; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB
 
-; TODO: Should we convert these to X ^ ((X ^ Y) & M) form when Zbb isn't
-; present?
 
 define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-I-LABEL: out8:
 ; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    not a2, a2
-; CHECK-I-NEXT:    and a1, a1, a2
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out8:
@@ -36,10 +33,9 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 ; CHECK-I-LABEL: out16:
 ; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    not a2, a2
-; CHECK-I-NEXT:    and a1, a1, a2
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out16:
@@ -58,10 +54,9 @@ define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out32:
 ; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    not a2, a2
-; CHECK-I-NEXT:    and a1, a1, a2
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out32:
@@ -80,22 +75,19 @@ define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 define i64 @out64(i64 %x, i64 %y, i64 %mask) {
 ; RV32I-LABEL: out64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    xor a0, a0, a2
+; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    not a4, a4
-; RV32I-NEXT:    not a5, a5
-; RV32I-NEXT:    and a3, a3, a5
-; RV32I-NEXT:    and a2, a2, a4
-; RV32I-NEXT:    or a0, a0, a2
-; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    xor a0, a0, a2
+; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: out64:
 ; RV64I:       # %bb.0:
+; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    not a2, a2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: out64:
@@ -660,10 +652,9 @@ define i32 @in_constant_varx_mone_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_varx_42:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    not a1, a2
-; CHECK-I-NEXT:    and a0, a2, a0
-; CHECK-I-NEXT:    andi a1, a1, 42
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xori a0, a0, 42
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    xori a0, a0, 42
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_varx_42:
@@ -704,10 +695,9 @@ define i32 @in_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_varx_42_invmask:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    not a1, a2
-; CHECK-I-NEXT:    and a0, a1, a0
-; CHECK-I-NEXT:    andi a1, a2, 42
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xori a1, a0, 42
+; CHECK-I-NEXT:    and a1, a1, a2
+; CHECK-I-NEXT:    xor a0, a1, a0
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_varx_42_invmask:
@@ -812,10 +802,9 @@ define i32 @in_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_42_vary:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    not a0, a2
-; CHECK-I-NEXT:    andi a2, a2, 42
-; CHECK-I-NEXT:    and a0, a0, a1
-; CHECK-I-NEXT:    or a0, a2, a0
+; CHECK-I-NEXT:    xori a0, a1, 42
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_42_vary:
@@ -855,10 +844,9 @@ define i32 @in_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_42_vary_invmask:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    not a0, a2
-; CHECK-I-NEXT:    andi a0, a0, 42
-; CHECK-I-NEXT:    and a1, a2, a1
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xori a0, a1, 42
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    xori a0, a0, 42
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_42_vary_invmask:
diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll
index 2922113b14ea9..48733b206d446 100644
--- a/llvm/test/CodeGen/X86/bitselect.ll
+++ b/llvm/test/CodeGen/X86/bitselect.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been auto...
[truncated]

``````````

</details>
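To make the profitability tradeoff concrete, here are the two forms as plain C++ (illustrative only; the function names are my own). The unfolded masked merge needs four instructions (a not, two ands, an or) on targets without a fused and-not, while the folded form needs only three (two xors, one and); with an and-not instruction such as `andn`, the unfolded form is also three instructions and has a shorter critical path, which is why the generic combine is gated on `TLI.hasAndNot`:

```cpp
#include <cstdint>

// Unfolded form: not + and + and + or -- four instructions without a fused
// and-not, but only three (and + andn + or) when the target provides one.
uint32_t masked_merge(uint32_t m, uint32_t x, uint32_t y) {
  return (m & x) | (~m & y);
}

// Folded form produced by the new combine: xor + and + xor -- always three
// instructions, so it wins exactly when no and-not instruction is available.
uint32_t masked_merge_folded(uint32_t m, uint32_t x, uint32_t y) {
  return ((x ^ y) & m) ^ y;
}
```

Note also that the combiner freezes `y` (the `AndR0` operand) before reusing it: the replacement expression uses `y` twice where the original used it once, and without a `freeze` the two uses of a poison value could be refined inconsistently.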


https://github.com/llvm/llvm-project/pull/137641


More information about the llvm-commits mailing list