[llvm] [RISCV] Add basic scalar support for MERGE, MVM, and MVMN from P extension (PR #180677)

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 9 21:08:46 PST 2026


https://github.com/topperc created https://github.com/llvm/llvm-project/pull/180677

These are 3 variations of the same operation with a different operand
tied to the destination register. We need to pick the one that
minimizes the number of mvs.

To do this we take the approach used by AArch64 to select between
BIT, BIF, and BSL, which are the same operations. We define a pseudo
with no tied constraint and expand it after register allocation based
on where the destination register ended up. If the destination
register is none of the operands, we'll insert a mv.

I've replaced RISCVISD::MVM with RISCVISD::MERGE and updated the operand
order accordingly. I find the MERGE name easier to read so I've made it
the canonical name.

Ideally we could use commuteInstructionImpl and the TwoAddressInstructionPass
to select the opcode before register allocation. That only works if
you can commute exactly 2 operands and maybe change the opcode in the MI
representation of any of the forms to get to either of the other 2 forms.
That is not possible. We'd need to define 3 more pseudoinstructions
with different permutations.

With the current approach it might be possible that we insert a mv
not because all of the operand registers were needed by later instructions,
but because the register allocator needed to put the result in a
different register. It's possible a different allocation for other
instructions might have avoided the mv.

I wrote the patch based on the AArch64 implementation, but the tests were generated
by AI.

>From d25eb8381a15a370c0038082a36f6962e5ab7689 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Mon, 9 Feb 2026 20:06:47 -0800
Subject: [PATCH 1/2] Pre-commit tests

---
 llvm/test/CodeGen/RISCV/rv32p.ll | 154 +++++++++++++++++++++++++++++++
 llvm/test/CodeGen/RISCV/rv64p.ll | 154 +++++++++++++++++++++++++++++++
 2 files changed, 308 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/rv32p.ll b/llvm/test/CodeGen/RISCV/rv32p.ll
index 1e31983df0b8c..ba59c2d200fa9 100644
--- a/llvm/test/CodeGen/RISCV/rv32p.ll
+++ b/llvm/test/CodeGen/RISCV/rv32p.ll
@@ -725,3 +725,157 @@ define void @wmaccu_multiple_uses(i32 %a, i32 %b, i64 %c, ptr %out1, ptr %out2)
   store i64 %mul, ptr %out2
   ret void
 }
+
+; Test bitwise merge: (mask & b) | (~mask & a)
+define i32 @merge_i32(i32 %mask, i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: merge_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a2, a0, a2
+; CHECK-NEXT:    andn a0, a1, a0
+; CHECK-NEXT:    or a0, a2, a0
+; CHECK-NEXT:    ret
+  %and1 = and i32 %mask, %b
+  %not = xor i32 %mask, -1
+  %and2 = and i32 %not, %a
+  %or = or i32 %and1, %and2
+  ret i32 %or
+}
+
+; Test MERGE with swapped a/b arguments
+define i32 @merge_i32_2(i32 %mask, i32 %b, i32 %a) nounwind {
+; CHECK-LABEL: merge_i32_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a1, a0, a1
+; CHECK-NEXT:    andn a0, a2, a0
+; CHECK-NEXT:    or a0, a1, a0
+; CHECK-NEXT:    ret
+  %and1 = and i32 %mask, %b
+  %not = xor i32 %mask, -1
+  %and2 = and i32 %not, %a
+  %or = or i32 %and1, %and2
+  ret i32 %or
+}
+
+; Test MVM: result overwrites rs1 (%a)
+define i32 @mvm_i32(i32 %a, i32 %mask, i32 %b) nounwind {
+; CHECK-LABEL: mvm_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a2, a1, a2
+; CHECK-NEXT:    andn a0, a0, a1
+; CHECK-NEXT:    or a0, a2, a0
+; CHECK-NEXT:    ret
+  %and1 = and i32 %mask, %b
+  %not = xor i32 %mask, -1
+  %and2 = and i32 %not, %a
+  %or = or i32 %and1, %and2
+  ret i32 %or
+}
+
+; Test MVM with mask as last argument
+define i32 @mvm_i32_2(i32 %a, i32 %b, i32 %mask) nounwind {
+; CHECK-LABEL: mvm_i32_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a1, a2, a1
+; CHECK-NEXT:    andn a0, a0, a2
+; CHECK-NEXT:    or a0, a1, a0
+; CHECK-NEXT:    ret
+  %and1 = and i32 %mask, %b
+  %not = xor i32 %mask, -1
+  %and2 = and i32 %not, %a
+  %or = or i32 %and1, %and2
+  ret i32 %or
+}
+
+; Test MVMN: result overwrites rs2 (%b)
+define i32 @mvmn_i32(i32 %b, i32 %mask, i32 %a) nounwind {
+; CHECK-LABEL: mvmn_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    andn a1, a2, a1
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    ret
+  %and1 = and i32 %mask, %b
+  %not = xor i32 %mask, -1
+  %and2 = and i32 %not, %a
+  %or = or i32 %and1, %and2
+  ret i32 %or
+}
+
+; Test MVMN with mask as last argument
+define i32 @mvmn_i32_2(i32 %b, i32 %a, i32 %mask) nounwind {
+; CHECK-LABEL: mvmn_i32_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a0, a2, a0
+; CHECK-NEXT:    andn a1, a1, a2
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    ret
+  %and1 = and i32 %mask, %b
+  %not = xor i32 %mask, -1
+  %and2 = and i32 %not, %a
+  %or = or i32 %and1, %and2
+  ret i32 %or
+}
+
+; Test case where none of the source operands can be overwritten,
+; requiring a mv before merge
+define i32 @merge_i32_mv(i32 %mask, i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: merge_i32_mv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a3, a0, a2
+; CHECK-NEXT:    andn a4, a1, a0
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    or a3, a3, a4
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a0, a3, a0
+; CHECK-NEXT:    ret
+  %and1 = and i32 %mask, %b
+  %not = xor i32 %mask, -1
+  %and2 = and i32 %not, %a
+  %or = or i32 %and1, %and2
+  %sum1 = add i32 %or, %mask
+  %sum2 = add i32 %sum1, %a
+  %sum3 = add i32 %sum2, %b
+  ret i32 %sum3
+}
+
+; Test alternate merge pattern: (a ^ b) & mask ^ a
+define i32 @merge_xor_i32(i32 %mask, i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: merge_xor_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andn a1, a1, a0
+; CHECK-NEXT:    and a0, a2, a0
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    ret
+  %xor1 = xor i32 %a, %b
+  %and = and i32 %xor1, %mask
+  %xor2 = xor i32 %and, %a
+  ret i32 %xor2
+}
+
+; Test alternate merge pattern with different argument order for MVM
+define i32 @mvm_xor_i32(i32 %a, i32 %mask, i32 %b) nounwind {
+; CHECK-LABEL: mvm_xor_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andn a0, a0, a1
+; CHECK-NEXT:    and a1, a2, a1
+; CHECK-NEXT:    or a0, a1, a0
+; CHECK-NEXT:    ret
+  %xor1 = xor i32 %a, %b
+  %and = and i32 %xor1, %mask
+  %xor2 = xor i32 %and, %a
+  ret i32 %xor2
+}
+
+; Test alternate merge pattern with different argument order for MVMN
+define i32 @mvmn_xor_i32(i32 %b, i32 %mask, i32 %a) nounwind {
+; CHECK-LABEL: mvmn_xor_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andn a2, a2, a1
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    or a0, a0, a2
+; CHECK-NEXT:    ret
+  %xor1 = xor i32 %a, %b
+  %and = and i32 %xor1, %mask
+  %xor2 = xor i32 %and, %a
+  ret i32 %xor2
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64p.ll b/llvm/test/CodeGen/RISCV/rv64p.ll
index 2d6d615d9f7b9..db0fea014d32b 100644
--- a/llvm/test/CodeGen/RISCV/rv64p.ll
+++ b/llvm/test/CodeGen/RISCV/rv64p.ll
@@ -378,3 +378,157 @@ define i128 @srxi_i128(i128 %x) {
   %a = lshr i128 %x, 49
   ret i128 %a
 }
+
+; Test bitwise merge: (mask & b) | (~mask & a)
+define i64 @merge_i64(i64 %mask, i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: merge_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a2, a0, a2
+; CHECK-NEXT:    andn a0, a1, a0
+; CHECK-NEXT:    or a0, a2, a0
+; CHECK-NEXT:    ret
+  %and1 = and i64 %mask, %b
+  %not = xor i64 %mask, -1
+  %and2 = and i64 %not, %a
+  %or = or i64 %and1, %and2
+  ret i64 %or
+}
+
+; Test MERGE with swapped a/b arguments
+define i64 @merge_i64_2(i64 %mask, i64 %b, i64 %a) nounwind {
+; CHECK-LABEL: merge_i64_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a1, a0, a1
+; CHECK-NEXT:    andn a0, a2, a0
+; CHECK-NEXT:    or a0, a1, a0
+; CHECK-NEXT:    ret
+  %and1 = and i64 %mask, %b
+  %not = xor i64 %mask, -1
+  %and2 = and i64 %not, %a
+  %or = or i64 %and1, %and2
+  ret i64 %or
+}
+
+; Test MVM: result overwrites rs1 (%a)
+define i64 @mvm_i64(i64 %a, i64 %mask, i64 %b) nounwind {
+; CHECK-LABEL: mvm_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a2, a1, a2
+; CHECK-NEXT:    andn a0, a0, a1
+; CHECK-NEXT:    or a0, a2, a0
+; CHECK-NEXT:    ret
+  %and1 = and i64 %mask, %b
+  %not = xor i64 %mask, -1
+  %and2 = and i64 %not, %a
+  %or = or i64 %and1, %and2
+  ret i64 %or
+}
+
+; Test MVM with mask as last argument
+define i64 @mvm_i64_2(i64 %a, i64 %b, i64 %mask) nounwind {
+; CHECK-LABEL: mvm_i64_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a1, a2, a1
+; CHECK-NEXT:    andn a0, a0, a2
+; CHECK-NEXT:    or a0, a1, a0
+; CHECK-NEXT:    ret
+  %and1 = and i64 %mask, %b
+  %not = xor i64 %mask, -1
+  %and2 = and i64 %not, %a
+  %or = or i64 %and1, %and2
+  ret i64 %or
+}
+
+; Test MVMN: result overwrites rs2 (%b)
+define i64 @mvmn_i64(i64 %b, i64 %mask, i64 %a) nounwind {
+; CHECK-LABEL: mvmn_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    andn a1, a2, a1
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    ret
+  %and1 = and i64 %mask, %b
+  %not = xor i64 %mask, -1
+  %and2 = and i64 %not, %a
+  %or = or i64 %and1, %and2
+  ret i64 %or
+}
+
+; Test MVMN with mask as last argument
+define i64 @mvmn_i64_2(i64 %b, i64 %a, i64 %mask) nounwind {
+; CHECK-LABEL: mvmn_i64_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a0, a2, a0
+; CHECK-NEXT:    andn a1, a1, a2
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    ret
+  %and1 = and i64 %mask, %b
+  %not = xor i64 %mask, -1
+  %and2 = and i64 %not, %a
+  %or = or i64 %and1, %and2
+  ret i64 %or
+}
+
+; Test case where none of the source operands can be overwritten,
+; requiring a mv before merge
+define i64 @merge_i64_mv(i64 %mask, i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: merge_i64_mv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a3, a0, a2
+; CHECK-NEXT:    andn a4, a1, a0
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    or a3, a3, a4
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a0, a3, a0
+; CHECK-NEXT:    ret
+  %and1 = and i64 %mask, %b
+  %not = xor i64 %mask, -1
+  %and2 = and i64 %not, %a
+  %or = or i64 %and1, %and2
+  %sum1 = add i64 %or, %mask
+  %sum2 = add i64 %sum1, %a
+  %sum3 = add i64 %sum2, %b
+  ret i64 %sum3
+}
+
+; Test alternate merge pattern: (a ^ b) & mask ^ a
+define i64 @merge_xor_i64(i64 %mask, i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: merge_xor_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andn a1, a1, a0
+; CHECK-NEXT:    and a0, a2, a0
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    ret
+  %xor1 = xor i64 %a, %b
+  %and = and i64 %xor1, %mask
+  %xor2 = xor i64 %and, %a
+  ret i64 %xor2
+}
+
+; Test alternate merge pattern with different argument order for MVM
+define i64 @mvm_xor_i64(i64 %a, i64 %mask, i64 %b) nounwind {
+; CHECK-LABEL: mvm_xor_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andn a0, a0, a1
+; CHECK-NEXT:    and a1, a2, a1
+; CHECK-NEXT:    or a0, a1, a0
+; CHECK-NEXT:    ret
+  %xor1 = xor i64 %a, %b
+  %and = and i64 %xor1, %mask
+  %xor2 = xor i64 %and, %a
+  ret i64 %xor2
+}
+
+; Test alternate merge pattern with different argument order for MVMN
+define i64 @mvmn_xor_i64(i64 %b, i64 %mask, i64 %a) nounwind {
+; CHECK-LABEL: mvmn_xor_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andn a2, a2, a1
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    or a0, a0, a2
+; CHECK-NEXT:    ret
+  %xor1 = xor i64 %a, %b
+  %and = and i64 %xor1, %mask
+  %xor2 = xor i64 %and, %a
+  ret i64 %xor2
+}

>From 341bee8e544f00ca5fd52dac885b4706dc3a44eb Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Mon, 9 Feb 2026 16:23:11 -0800
Subject: [PATCH 2/2] [RISCV] Add basic scalar support for MERGE, MVM, and
 MVMN.

These are 3 variations of the same operation with a different operand
tied to the destination register. We need to pick the one that
minimizes the number of mvs.

To do this we take the approach used by AArch64 to select between
BIT, BIF, and BSL, which are the same operations. We define a pseudo
with no tied constraint and expand it after register allocation based
on where the destination register ended up. If the destination
register is none of the operands, we'll insert a mv.

I've replaced RISCVISD::MVM with RISCVISD::MERGE and updated the operand
order accordingly. I find the MERGE name easier to read so I've made it
the canonical name.

Ideally we could use commuteInstructionImpl and the TwoAddressInstructionPass
to select the opcode before register allocation. That only works if
you can commute exactly 2 operands and maybe change the opcode in the MI
representation of any of the forms to get to either of the other 2 forms.
That is not possible. We'd need to define 3 more pseudoinstructions
with different permutations.

With the current approach it might be possible that we insert a mv
not because all of the operand registers were needed by later instructions,
but because the register allocator needed to put the result in a
different register. It's possible a different allocation for other
instructions might have avoided the mv.

I wrote the patch based on the AArch64 implementation, but the tests were generated
by AI.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  2 +-
 llvm/lib/Target/RISCV/RISCVInstrInfoP.td      | 22 ++++--
 .../RISCV/RISCVPostRAExpandPseudoInsts.cpp    | 74 +++++++++++++++++++
 llvm/test/CodeGen/RISCV/rv32p.ll              | 43 +++--------
 llvm/test/CodeGen/RISCV/rv64p.ll              | 43 +++--------
 5 files changed, 114 insertions(+), 70 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 975baa7e2e504..d6d604d288fc6 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -10758,7 +10758,7 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                        DAG.getConstant(ShiftAmt, DL, XLenVT));
       SDValue Mask = DAG.getConstant(PosMask, DL, XLenVT);
       SDValue Result =
-          DAG.getNode(RISCVISD::MVM, DL, XLenVT, Vec, ShiftedVal, Mask);
+          DAG.getNode(RISCVISD::MERGE, DL, XLenVT, Mask, Vec, ShiftedVal);
       return DAG.getBitcast(VecVT, Result);
     }
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index 774e1e024a4be..6f9c167e20a1d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -1517,13 +1517,14 @@ def riscv_mulhr : RVSDNode<"MULHR", SDTIntBinOp>;
 def riscv_mulhru : RVSDNode<"MULHRU", SDTIntBinOp>;
 def riscv_mulhrsu : RVSDNode<"MULHRSU", SDTIntBinOp>;
 
-def SDT_RISCVMVM : SDTypeProfile<1, 3, [SDTCisInt<0>,
-                                        SDTCisSameAs<0, 1>,
-                                        SDTCisSameAs<0, 2>,
-                                        SDTCisSameAs<0, 3>]>;
-def riscv_mvm : RVSDNode<"MVM", SDT_RISCVMVM>;
+def SDT_RISCVMERGE : SDTypeProfile<1, 3, [SDTCisInt<0>,
+                                          SDTCisSameAs<0, 1>,
+                                          SDTCisSameAs<0, 2>,
+                                          SDTCisSameAs<0, 3>]>;
+def riscv_merge : RVSDNode<"MERGE", SDT_RISCVMERGE>;
 
 let Predicates = [HasStdExtP] in {
+
   def : PatGpr<abs, ABS>;
   def : PatGpr<ctls, CLS>;
 
@@ -1532,9 +1533,16 @@ let Predicates = [HasStdExtP] in {
   def : Pat<(XLenVT (fshr GPR:$rs1, GPR:$rd, shiftMaskXLen:$rs2)),
             (SRX GPR:$rd, GPR:$rs1, shiftMaskXLen:$rs2)>;
 
+  // Pseudo version of MERGE without the tied constraint. Will be expanded to
+  // MERGE, MVM, or MVMN after register allocation.
+  def PseudoMERGE : Pseudo<(outs GPR:$dst), (ins GPR:$rd, GPR:$rs1, GPR:$rs2),
+                           []>;
+  def : Pat<(XLenVT (or (and GPR:$rd, GPR:$rs2), (and (not GPR:$rd), GPR:$rs1))),
+            (PseudoMERGE GPR:$rd, GPR:$rs1, GPR:$rs2)>;
+
   // Pattern for insert_vector_elt
-  def : Pat<(XLenVT (riscv_mvm GPR:$rd, GPR:$rs1, GPR:$rs2)),
-            (MVM GPR:$rd, GPR:$rs1, GPR:$rs2)>;
+  def : Pat<(XLenVT (riscv_merge GPR:$rd, GPR:$rs1, GPR:$rs2)),
+            (PseudoMERGE GPR:$rd, GPR:$rs1, GPR:$rs2)>;
 
   // Basic 8-bit arithmetic patterns
   def: Pat<(XLenVecI8VT (add GPR:$rs1, GPR:$rs2)), (PADD_B GPR:$rs1, GPR:$rs2)>;
diff --git a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
index 08e2b835547ca..8e2832f654fd8 100644
--- a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
@@ -43,6 +43,7 @@ class RISCVPostRAExpandPseudo : public MachineFunctionPass {
                 MachineBasicBlock::iterator &NextMBBI);
   bool expandMovImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
   bool expandMovAddr(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+  bool expandMERGE(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
 };
 
 char RISCVPostRAExpandPseudo::ID = 0;
@@ -76,6 +77,8 @@ bool RISCVPostRAExpandPseudo::expandMI(MachineBasicBlock &MBB,
     return expandMovImm(MBB, MBBI);
   case RISCV::PseudoMovAddr:
     return expandMovAddr(MBB, MBBI);
+  case RISCV::PseudoMERGE:
+    return expandMERGE(MBB, MBBI);
   default:
     return false;
   }
@@ -118,6 +121,77 @@ bool RISCVPostRAExpandPseudo::expandMovAddr(MachineBasicBlock &MBB,
   return true;
 }
 
+/// Transfer implicit operands on the pseudo instruction to the
+/// instructions created from the expansion.
+static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
+                           MachineInstrBuilder &DefMI) {
+  const MCInstrDesc &Desc = OldMI.getDesc();
+  for (const MachineOperand &MO :
+       llvm::drop_begin(OldMI.operands(), Desc.getNumOperands())) {
+    assert(MO.isReg() && MO.getReg());
+    if (MO.isUse())
+      UseMI.add(MO);
+    else
+      DefMI.add(MO);
+  }
+}
+
+// Expand PseudoMERGE to MERGE, MVM, or MVMN.
+bool RISCVPostRAExpandPseudo::expandMERGE(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI) {
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+
+  Register DstReg = MI.getOperand(0).getReg();
+  if (DstReg == MI.getOperand(3).getReg()) {
+    // Expand to MVMN
+    auto I = BuildMI(MBB, MBBI, DL, TII->get(RISCV::MVMN))
+                 .add(MI.getOperand(0))
+                 .add(MI.getOperand(3))
+                 .add(MI.getOperand(2))
+                 .add(MI.getOperand(1));
+    transferImpOps(*MBBI, I, I);
+  } else if (DstReg == MBBI->getOperand(2).getReg()) {
+    // Expand to MVM
+    auto I = BuildMI(MBB, MBBI, DL, TII->get(RISCV::MVM))
+                 .add(MI.getOperand(0))
+                 .add(MI.getOperand(2))
+                 .add(MI.getOperand(3))
+                 .add(MI.getOperand(1));
+    transferImpOps(*MBBI, I, I);
+  } else if (DstReg == MI.getOperand(1).getReg()) {
+    // Expand to MERGE
+    auto I = BuildMI(MBB, MBBI, DL, TII->get(RISCV::MERGE))
+                 .add(MI.getOperand(0))
+                 .add(MI.getOperand(1))
+                 .add(MI.getOperand(2))
+                 .add(MI.getOperand(3));
+    transferImpOps(*MBBI, I, I);
+  } else {
+    // Use an additional move.
+    RegState RegState =
+        getRenamableRegState(MI.getOperand(1).isRenamable()) |
+        getKillRegState(MI.getOperand(1).isKill() &&
+                        MI.getOperand(1).getReg() !=
+                            MI.getOperand(2).getReg() &&
+                        MI.getOperand(1).getReg() != MI.getOperand(3).getReg());
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(RISCV::ADDI))
+        .addDef(DstReg, getRenamableRegState(MI.getOperand(0).isRenamable()))
+        .addReg(MI.getOperand(1).getReg(), RegState)
+        .addImm(0);
+    auto I = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(RISCV::MERGE))
+                 .add(MI.getOperand(0))
+                 .addReg(DstReg,
+                         RegState::Kill | getRenamableRegState(
+                                              MI.getOperand(0).isRenamable()))
+                 .add(MI.getOperand(2))
+                 .add(MI.getOperand(3));
+    transferImpOps(*MBBI, I, I);
+  }
+  MI.eraseFromParent();
+  return true;
+}
+
 } // end of anonymous namespace
 
 INITIALIZE_PASS(RISCVPostRAExpandPseudo, "riscv-post-ra-expand-pseudo",
diff --git a/llvm/test/CodeGen/RISCV/rv32p.ll b/llvm/test/CodeGen/RISCV/rv32p.ll
index ba59c2d200fa9..e4d4c68109dea 100644
--- a/llvm/test/CodeGen/RISCV/rv32p.ll
+++ b/llvm/test/CodeGen/RISCV/rv32p.ll
@@ -730,9 +730,7 @@ define void @wmaccu_multiple_uses(i32 %a, i32 %b, i64 %c, ptr %out1, ptr %out2)
 define i32 @merge_i32(i32 %mask, i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: merge_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a2, a0, a2
-; CHECK-NEXT:    andn a0, a1, a0
-; CHECK-NEXT:    or a0, a2, a0
+; CHECK-NEXT:    merge a0, a1, a2
 ; CHECK-NEXT:    ret
   %and1 = and i32 %mask, %b
   %not = xor i32 %mask, -1
@@ -745,9 +743,7 @@ define i32 @merge_i32(i32 %mask, i32 %a, i32 %b) nounwind {
 define i32 @merge_i32_2(i32 %mask, i32 %b, i32 %a) nounwind {
 ; CHECK-LABEL: merge_i32_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a1, a0, a1
-; CHECK-NEXT:    andn a0, a2, a0
-; CHECK-NEXT:    or a0, a1, a0
+; CHECK-NEXT:    merge a0, a2, a1
 ; CHECK-NEXT:    ret
   %and1 = and i32 %mask, %b
   %not = xor i32 %mask, -1
@@ -760,9 +756,7 @@ define i32 @merge_i32_2(i32 %mask, i32 %b, i32 %a) nounwind {
 define i32 @mvm_i32(i32 %a, i32 %mask, i32 %b) nounwind {
 ; CHECK-LABEL: mvm_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a2, a1, a2
-; CHECK-NEXT:    andn a0, a0, a1
-; CHECK-NEXT:    or a0, a2, a0
+; CHECK-NEXT:    mvm a0, a2, a1
 ; CHECK-NEXT:    ret
   %and1 = and i32 %mask, %b
   %not = xor i32 %mask, -1
@@ -775,9 +769,7 @@ define i32 @mvm_i32(i32 %a, i32 %mask, i32 %b) nounwind {
 define i32 @mvm_i32_2(i32 %a, i32 %b, i32 %mask) nounwind {
 ; CHECK-LABEL: mvm_i32_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a1, a2, a1
-; CHECK-NEXT:    andn a0, a0, a2
-; CHECK-NEXT:    or a0, a1, a0
+; CHECK-NEXT:    mvm a0, a1, a2
 ; CHECK-NEXT:    ret
   %and1 = and i32 %mask, %b
   %not = xor i32 %mask, -1
@@ -790,9 +782,7 @@ define i32 @mvm_i32_2(i32 %a, i32 %b, i32 %mask) nounwind {
 define i32 @mvmn_i32(i32 %b, i32 %mask, i32 %a) nounwind {
 ; CHECK-LABEL: mvmn_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    andn a1, a2, a1
-; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    mvmn a0, a2, a1
 ; CHECK-NEXT:    ret
   %and1 = and i32 %mask, %b
   %not = xor i32 %mask, -1
@@ -805,9 +795,7 @@ define i32 @mvmn_i32(i32 %b, i32 %mask, i32 %a) nounwind {
 define i32 @mvmn_i32_2(i32 %b, i32 %a, i32 %mask) nounwind {
 ; CHECK-LABEL: mvmn_i32_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a0, a2, a0
-; CHECK-NEXT:    andn a1, a1, a2
-; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    mvmn a0, a1, a2
 ; CHECK-NEXT:    ret
   %and1 = and i32 %mask, %b
   %not = xor i32 %mask, -1
@@ -821,12 +809,11 @@ define i32 @mvmn_i32_2(i32 %b, i32 %a, i32 %mask) nounwind {
 define i32 @merge_i32_mv(i32 %mask, i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: merge_i32_mv:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a3, a0, a2
-; CHECK-NEXT:    andn a4, a1, a0
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:    merge a3, a1, a2
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    or a3, a3, a4
-; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a0, a3, a0
+; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    ret
   %and1 = and i32 %mask, %b
   %not = xor i32 %mask, -1
@@ -842,9 +829,7 @@ define i32 @merge_i32_mv(i32 %mask, i32 %a, i32 %b) nounwind {
 define i32 @merge_xor_i32(i32 %mask, i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: merge_xor_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andn a1, a1, a0
-; CHECK-NEXT:    and a0, a2, a0
-; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    merge a0, a1, a2
 ; CHECK-NEXT:    ret
   %xor1 = xor i32 %a, %b
   %and = and i32 %xor1, %mask
@@ -856,9 +841,7 @@ define i32 @merge_xor_i32(i32 %mask, i32 %a, i32 %b) nounwind {
 define i32 @mvm_xor_i32(i32 %a, i32 %mask, i32 %b) nounwind {
 ; CHECK-LABEL: mvm_xor_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andn a0, a0, a1
-; CHECK-NEXT:    and a1, a2, a1
-; CHECK-NEXT:    or a0, a1, a0
+; CHECK-NEXT:    mvm a0, a2, a1
 ; CHECK-NEXT:    ret
   %xor1 = xor i32 %a, %b
   %and = and i32 %xor1, %mask
@@ -870,9 +853,7 @@ define i32 @mvm_xor_i32(i32 %a, i32 %mask, i32 %b) nounwind {
 define i32 @mvmn_xor_i32(i32 %b, i32 %mask, i32 %a) nounwind {
 ; CHECK-LABEL: mvmn_xor_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andn a2, a2, a1
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    or a0, a0, a2
+; CHECK-NEXT:    mvmn a0, a2, a1
 ; CHECK-NEXT:    ret
   %xor1 = xor i32 %a, %b
   %and = and i32 %xor1, %mask
diff --git a/llvm/test/CodeGen/RISCV/rv64p.ll b/llvm/test/CodeGen/RISCV/rv64p.ll
index db0fea014d32b..53ca8476034a1 100644
--- a/llvm/test/CodeGen/RISCV/rv64p.ll
+++ b/llvm/test/CodeGen/RISCV/rv64p.ll
@@ -383,9 +383,7 @@ define i128 @srxi_i128(i128 %x) {
 define i64 @merge_i64(i64 %mask, i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: merge_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a2, a0, a2
-; CHECK-NEXT:    andn a0, a1, a0
-; CHECK-NEXT:    or a0, a2, a0
+; CHECK-NEXT:    merge a0, a1, a2
 ; CHECK-NEXT:    ret
   %and1 = and i64 %mask, %b
   %not = xor i64 %mask, -1
@@ -398,9 +396,7 @@ define i64 @merge_i64(i64 %mask, i64 %a, i64 %b) nounwind {
 define i64 @merge_i64_2(i64 %mask, i64 %b, i64 %a) nounwind {
 ; CHECK-LABEL: merge_i64_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a1, a0, a1
-; CHECK-NEXT:    andn a0, a2, a0
-; CHECK-NEXT:    or a0, a1, a0
+; CHECK-NEXT:    merge a0, a2, a1
 ; CHECK-NEXT:    ret
   %and1 = and i64 %mask, %b
   %not = xor i64 %mask, -1
@@ -413,9 +409,7 @@ define i64 @merge_i64_2(i64 %mask, i64 %b, i64 %a) nounwind {
 define i64 @mvm_i64(i64 %a, i64 %mask, i64 %b) nounwind {
 ; CHECK-LABEL: mvm_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a2, a1, a2
-; CHECK-NEXT:    andn a0, a0, a1
-; CHECK-NEXT:    or a0, a2, a0
+; CHECK-NEXT:    mvm a0, a2, a1
 ; CHECK-NEXT:    ret
   %and1 = and i64 %mask, %b
   %not = xor i64 %mask, -1
@@ -428,9 +422,7 @@ define i64 @mvm_i64(i64 %a, i64 %mask, i64 %b) nounwind {
 define i64 @mvm_i64_2(i64 %a, i64 %b, i64 %mask) nounwind {
 ; CHECK-LABEL: mvm_i64_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a1, a2, a1
-; CHECK-NEXT:    andn a0, a0, a2
-; CHECK-NEXT:    or a0, a1, a0
+; CHECK-NEXT:    mvm a0, a1, a2
 ; CHECK-NEXT:    ret
   %and1 = and i64 %mask, %b
   %not = xor i64 %mask, -1
@@ -443,9 +435,7 @@ define i64 @mvm_i64_2(i64 %a, i64 %b, i64 %mask) nounwind {
 define i64 @mvmn_i64(i64 %b, i64 %mask, i64 %a) nounwind {
 ; CHECK-LABEL: mvmn_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    andn a1, a2, a1
-; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    mvmn a0, a2, a1
 ; CHECK-NEXT:    ret
   %and1 = and i64 %mask, %b
   %not = xor i64 %mask, -1
@@ -458,9 +448,7 @@ define i64 @mvmn_i64(i64 %b, i64 %mask, i64 %a) nounwind {
 define i64 @mvmn_i64_2(i64 %b, i64 %a, i64 %mask) nounwind {
 ; CHECK-LABEL: mvmn_i64_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a0, a2, a0
-; CHECK-NEXT:    andn a1, a1, a2
-; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    mvmn a0, a1, a2
 ; CHECK-NEXT:    ret
   %and1 = and i64 %mask, %b
   %not = xor i64 %mask, -1
@@ -474,12 +462,11 @@ define i64 @mvmn_i64_2(i64 %b, i64 %a, i64 %mask) nounwind {
 define i64 @merge_i64_mv(i64 %mask, i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: merge_i64_mv:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a3, a0, a2
-; CHECK-NEXT:    andn a4, a1, a0
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:    merge a3, a1, a2
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    or a3, a3, a4
-; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a0, a3, a0
+; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    ret
   %and1 = and i64 %mask, %b
   %not = xor i64 %mask, -1
@@ -495,9 +482,7 @@ define i64 @merge_i64_mv(i64 %mask, i64 %a, i64 %b) nounwind {
 define i64 @merge_xor_i64(i64 %mask, i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: merge_xor_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andn a1, a1, a0
-; CHECK-NEXT:    and a0, a2, a0
-; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    merge a0, a1, a2
 ; CHECK-NEXT:    ret
   %xor1 = xor i64 %a, %b
   %and = and i64 %xor1, %mask
@@ -509,9 +494,7 @@ define i64 @merge_xor_i64(i64 %mask, i64 %a, i64 %b) nounwind {
 define i64 @mvm_xor_i64(i64 %a, i64 %mask, i64 %b) nounwind {
 ; CHECK-LABEL: mvm_xor_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andn a0, a0, a1
-; CHECK-NEXT:    and a1, a2, a1
-; CHECK-NEXT:    or a0, a1, a0
+; CHECK-NEXT:    mvm a0, a2, a1
 ; CHECK-NEXT:    ret
   %xor1 = xor i64 %a, %b
   %and = and i64 %xor1, %mask
@@ -523,9 +506,7 @@ define i64 @mvm_xor_i64(i64 %a, i64 %mask, i64 %b) nounwind {
 define i64 @mvmn_xor_i64(i64 %b, i64 %mask, i64 %a) nounwind {
 ; CHECK-LABEL: mvmn_xor_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andn a2, a2, a1
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    or a0, a0, a2
+; CHECK-NEXT:    mvmn a0, a2, a1
 ; CHECK-NEXT:    ret
   %xor1 = xor i64 %a, %b
   %and = and i64 %xor1, %mask



More information about the llvm-commits mailing list