[llvm] 56ca11e - [RISCV] Add an MIR pass to replace redundant sext.w instructions with copies.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 6 08:23:57 PST 2022


Author: Craig Topper
Date: 2022-01-06T08:23:42-08:00
New Revision: 56ca11e31e6ad4fda6476ee65537529963a2708e

URL: https://github.com/llvm/llvm-project/commit/56ca11e31e6ad4fda6476ee65537529963a2708e
DIFF: https://github.com/llvm/llvm-project/commit/56ca11e31e6ad4fda6476ee65537529963a2708e.diff

LOG: [RISCV] Add an MIR pass to replace redundant sext.w instructions with copies.

Function calls and compare instructions tend to cause sext.w
instructions to be inserted. If we make good use of W instructions,
these operations can often end up being redundant. We don't always
detect these during SelectionDAG due to things like phis. There are
also some cases caused by failure to turn extload into sextload in
SelectionDAG. extload selects to LW allowing later sext.ws to become
redundant.

This patch adds a pass that examines the input of sext.w instructions trying
to determine if it is already sign extended, either by finding a
W instruction or another instruction that produces a sign extended result,
or by looking through instructions that propagate sign bits. It uses
a worklist and visited set to search as far back as necessary.

Reviewed By: asb, kito-cheng

Differential Revision: https://reviews.llvm.org/D116397

Added: 
    llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
    llvm/test/CodeGen/RISCV/sextw-removal.ll

Modified: 
    llvm/lib/Target/RISCV/CMakeLists.txt
    llvm/lib/Target/RISCV/RISCV.h
    llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
    llvm/test/CodeGen/RISCV/atomic-rmw.ll
    llvm/test/CodeGen/RISCV/atomic-signext.ll
    llvm/test/CodeGen/RISCV/rv64zbs.ll
    llvm/test/CodeGen/RISCV/split-offsets.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 8eb3a9abc5fc..d1b0863d439b 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -35,6 +35,7 @@ add_llvm_target(RISCVCodeGen
   RISCVMergeBaseOffset.cpp
   RISCVRegisterBankInfo.cpp
   RISCVRegisterInfo.cpp
+  RISCVSExtWRemoval.cpp
   RISCVSubtarget.cpp
   RISCVTargetMachine.cpp
   RISCVTargetObjectFile.cpp

diff  --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index b415c9f35e7f..03462240fd93 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -40,6 +40,9 @@ FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM);
 FunctionPass *createRISCVGatherScatterLoweringPass();
 void initializeRISCVGatherScatterLoweringPass(PassRegistry &);
 
+FunctionPass *createRISCVSExtWRemovalPass();
+void initializeRISCVSExtWRemovalPass(PassRegistry &);
+
 FunctionPass *createRISCVMergeBaseOffsetOptPass();
 void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &);
 

diff  --git a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
new file mode 100644
index 000000000000..c7df709be5f5
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
@@ -0,0 +1,266 @@
+//===-------------- RISCVSExtWRemoval.cpp - MI sext.w Removal -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass removes unneeded sext.w instructions at the MI level.
+//
+//===---------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVSubtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-sextw-removal"
+
+STATISTIC(NumRemovedSExtW, "Number of removed sign-extensions");
+
+static cl::opt<bool> DisableSExtWRemoval("riscv-disable-sextw-removal",
+                                         cl::desc("Disable removal of sext.w"),
+                                         cl::init(false), cl::Hidden);
+namespace {
+
+class RISCVSExtWRemoval : public MachineFunctionPass {
+public:
+  static char ID;
+
+  RISCVSExtWRemoval() : MachineFunctionPass(ID) {
+    initializeRISCVSExtWRemovalPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return "RISCV sext.w Removal"; }
+};
+
+} // end anonymous namespace
+
+char RISCVSExtWRemoval::ID = 0;
+INITIALIZE_PASS(RISCVSExtWRemoval, DEBUG_TYPE, "RISCV sext.w Removal", false,
+                false)
+
+FunctionPass *llvm::createRISCVSExtWRemovalPass() {
+  return new RISCVSExtWRemoval();
+}
+
+// This function returns true if the machine instruction always outputs a value
+// where bits 63:32 match bit 31.
+// TODO: Allocate a bit in TSFlags for the W instructions?
+// TODO: Add other W instructions.
+static bool isSignExtendingOpW(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case RISCV::LUI:
+  case RISCV::LW:
+  case RISCV::ADDW:
+  case RISCV::ADDIW:
+  case RISCV::SUBW:
+  case RISCV::MULW:
+  case RISCV::SLLW:
+  case RISCV::SLLIW:
+  case RISCV::SRAW:
+  case RISCV::SRAIW:
+  case RISCV::SRLW:
+  case RISCV::SRLIW:
+  case RISCV::DIVW:
+  case RISCV::DIVUW:
+  case RISCV::REMW:
+  case RISCV::REMUW:
+  case RISCV::ROLW:
+  case RISCV::RORW:
+  case RISCV::RORIW:
+  case RISCV::CLZW:
+  case RISCV::CTZW:
+  case RISCV::CPOPW:
+  case RISCV::FCVT_W_H:
+  case RISCV::FCVT_WU_H:
+  case RISCV::FCVT_W_S:
+  case RISCV::FCVT_WU_S:
+  case RISCV::FCVT_W_D:
+  case RISCV::FCVT_WU_D:
+  // The following aren't W instructions, but are either sign extended from a
+  // smaller size or put zeros in bits 63:31.
+  case RISCV::LBU:
+  case RISCV::LHU:
+  case RISCV::LB:
+  case RISCV::LH:
+  case RISCV::SEXTB:
+  case RISCV::SEXTH:
+  case RISCV::ZEXTH_RV64:
+    return true;
+  }
+
+  // The LI pattern ADDI rd, X0, imm is sign extended.
+  if (MI.getOpcode() == RISCV::ADDI && MI.getOperand(1).isReg() &&
+      MI.getOperand(1).getReg() == RISCV::X0)
+    return true;
+
+  // An ANDI with an 11 bit immediate will zero bits 63:11.
+  if (MI.getOpcode() == RISCV::ANDI && isUInt<11>(MI.getOperand(2).getImm()))
+    return true;
+
+  // Copying from X0 produces zero.
+  if (MI.getOpcode() == RISCV::COPY && MI.getOperand(1).getReg() == RISCV::X0)
+    return true;
+
+  return false;
+}
+
+static bool isSignExtendedW(const MachineInstr &OrigMI,
+                            MachineRegisterInfo &MRI) {
+
+  SmallPtrSet<const MachineInstr *, 4> Visited;
+  SmallVector<const MachineInstr *, 4> Worklist;
+
+  Worklist.push_back(&OrigMI);
+
+  while (!Worklist.empty()) {
+    const MachineInstr *MI = Worklist.pop_back_val();
+
+    // If we already visited this instruction, we don't need to check it again.
+    if (!Visited.insert(MI).second)
+      continue;
+
+    // If this is a sign extending operation we don't need to look any further.
+    if (isSignExtendingOpW(*MI))
+      continue;
+
+    // Is this an instruction that propagates sign extend.
+    switch (MI->getOpcode()) {
+    default:
+      // Unknown opcode, give up.
+      return false;
+    case RISCV::COPY: {
+      Register SrcReg = MI->getOperand(1).getReg();
+
+      // TODO: Handle arguments and returns from calls?
+
+      // If this is a copy from another register, check its source instruction.
+      if (!SrcReg.isVirtual())
+        return false;
+      const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+      if (!SrcMI)
+        return false;
+
+      // Add SrcMI to the worklist.
+      Worklist.push_back(SrcMI);
+      break;
+    }
+    case RISCV::ANDI:
+    case RISCV::ORI:
+    case RISCV::XORI: {
+      // Logical operations use a sign extended 12-bit immediate. We just need
+      // to check if the other operand is sign extended.
+      Register SrcReg = MI->getOperand(1).getReg();
+      if (!SrcReg.isVirtual())
+        return false;
+      const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+      if (!SrcMI)
+        return false;
+
+      // Add SrcMI to the worklist.
+      Worklist.push_back(SrcMI);
+      break;
+    }
+    case RISCV::AND:
+    case RISCV::OR:
+    case RISCV::XOR:
+    case RISCV::ANDN:
+    case RISCV::ORN:
+    case RISCV::XNOR:
+    case RISCV::MAX:
+    case RISCV::MAXU:
+    case RISCV::MIN:
+    case RISCV::MINU:
+    case RISCV::PHI: {
+      // If all incoming values are sign-extended, the output of AND, OR, XOR,
+      // MIN, MAX, or PHI is also sign-extended.
+
+      // The input registers for PHI are operand 1, 3, ...
+      // The input registers for others are operand 1 and 2.
+      unsigned E = 3, D = 1;
+      if (MI->getOpcode() == RISCV::PHI) {
+        E = MI->getNumOperands();
+        D = 2;
+      }
+
+      for (unsigned I = 1; I != E; I += D) {
+        if (!MI->getOperand(I).isReg())
+          return false;
+
+        Register SrcReg = MI->getOperand(I).getReg();
+        if (!SrcReg.isVirtual())
+          return false;
+        const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+        if (!SrcMI)
+          return false;
+
+        // Add SrcMI to the worklist.
+        Worklist.push_back(SrcMI);
+      }
+
+      break;
+    }
+    }
+  }
+
+  // If we get here, then every node we visited produces a sign extended value
+  // or propagated sign extended values. So the result must be sign extended.
+  return true;
+}
+
+bool RISCVSExtWRemoval::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()) || DisableSExtWRemoval)
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
+
+  if (!ST.is64Bit())
+    return false;
+
+  bool MadeChange = false;
+  for (MachineBasicBlock &MBB : MF) {
+    for (auto I = MBB.begin(), IE = MBB.end(); I != IE;) {
+      MachineInstr *MI = &*I++;
+
+      // We're looking for the sext.w pattern ADDIW rd, rs1, 0.
+      if (MI->getOpcode() != RISCV::ADDIW || !MI->getOperand(2).isImm() ||
+          MI->getOperand(2).getImm() != 0 || !MI->getOperand(1).isReg())
+        continue;
+
+      // Input should be a virtual register.
+      Register SrcReg = MI->getOperand(1).getReg();
+      if (!SrcReg.isVirtual())
+        continue;
+
+      const MachineInstr &SrcMI = *MRI.getVRegDef(SrcReg);
+      if (!isSignExtendedW(SrcMI, MRI))
+        continue;
+
+      Register DstReg = MI->getOperand(0).getReg();
+      if (!MRI.constrainRegClass(SrcReg, MRI.getRegClass(DstReg)))
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n");
+      MRI.replaceRegWith(DstReg, SrcReg);
+      MRI.clearKillFlags(SrcReg);
+      MI->eraseFromParent();
+      ++NumRemovedSExtW;
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}

diff  --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b421eba8d442..db5e2f1eeb6f 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -39,6 +39,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   initializeGlobalISel(*PR);
   initializeRISCVGatherScatterLoweringPass(*PR);
   initializeRISCVMergeBaseOffsetOptPass(*PR);
+  initializeRISCVSExtWRemovalPass(*PR);
   initializeRISCVExpandPseudoPass(*PR);
   initializeRISCVInsertVSETVLIPass(*PR);
 }
@@ -140,6 +141,7 @@ class RISCVPassConfig : public TargetPassConfig {
   void addPreEmitPass() override;
   void addPreEmitPass2() override;
   void addPreSched2() override;
+  void addMachineSSAOptimization() override;
   void addPreRegAlloc() override;
 };
 } // namespace
@@ -194,6 +196,13 @@ void RISCVPassConfig::addPreEmitPass2() {
   addPass(createRISCVExpandAtomicPseudoPass());
 }
 
+void RISCVPassConfig::addMachineSSAOptimization() {
+  TargetPassConfig::addMachineSSAOptimization();
+
+  if (TM->getTargetTriple().getArch() == Triple::riscv64)
+    addPass(createRISCVSExtWRemovalPass());
+}
+
 void RISCVPassConfig::addPreRegAlloc() {
   if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createRISCVMergeBaseOffsetOptPass());

diff  --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
index 4c897b29ef1e..c311f2477dcf 100644
--- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
@@ -11134,9 +11134,8 @@ define i32 @atomicrmw_max_i32_monotonic(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB145_4
 ; RV64I-NEXT:  .LBB145_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s1, a0, .LBB145_1
+; RV64I-NEXT:    blt s1, a3, .LBB145_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB145_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -11224,9 +11223,8 @@ define i32 @atomicrmw_max_i32_acquire(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB146_4
 ; RV64I-NEXT:  .LBB146_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s1, a0, .LBB146_1
+; RV64I-NEXT:    blt s1, a3, .LBB146_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB146_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -11314,9 +11312,8 @@ define i32 @atomicrmw_max_i32_release(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB147_4
 ; RV64I-NEXT:  .LBB147_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s1, a0, .LBB147_1
+; RV64I-NEXT:    blt s1, a3, .LBB147_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB147_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -11404,9 +11401,8 @@ define i32 @atomicrmw_max_i32_acq_rel(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB148_4
 ; RV64I-NEXT:  .LBB148_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s1, a0, .LBB148_1
+; RV64I-NEXT:    blt s1, a3, .LBB148_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB148_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -11494,9 +11490,8 @@ define i32 @atomicrmw_max_i32_seq_cst(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB149_4
 ; RV64I-NEXT:  .LBB149_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s1, a0, .LBB149_1
+; RV64I-NEXT:    blt s1, a3, .LBB149_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB149_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -11584,9 +11579,8 @@ define i32 @atomicrmw_min_i32_monotonic(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB150_4
 ; RV64I-NEXT:  .LBB150_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s1, a0, .LBB150_1
+; RV64I-NEXT:    bge s1, a3, .LBB150_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB150_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -11674,9 +11668,8 @@ define i32 @atomicrmw_min_i32_acquire(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB151_4
 ; RV64I-NEXT:  .LBB151_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s1, a0, .LBB151_1
+; RV64I-NEXT:    bge s1, a3, .LBB151_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB151_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -11764,9 +11757,8 @@ define i32 @atomicrmw_min_i32_release(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB152_4
 ; RV64I-NEXT:  .LBB152_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s1, a0, .LBB152_1
+; RV64I-NEXT:    bge s1, a3, .LBB152_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB152_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -11854,9 +11846,8 @@ define i32 @atomicrmw_min_i32_acq_rel(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB153_4
 ; RV64I-NEXT:  .LBB153_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s1, a0, .LBB153_1
+; RV64I-NEXT:    bge s1, a3, .LBB153_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB153_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -11944,9 +11935,8 @@ define i32 @atomicrmw_min_i32_seq_cst(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB154_4
 ; RV64I-NEXT:  .LBB154_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s1, a0, .LBB154_1
+; RV64I-NEXT:    bge s1, a3, .LBB154_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB154_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -12034,9 +12024,8 @@ define i32 @atomicrmw_umax_i32_monotonic(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB155_4
 ; RV64I-NEXT:  .LBB155_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s1, a0, .LBB155_1
+; RV64I-NEXT:    bltu s1, a3, .LBB155_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB155_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -12124,9 +12113,8 @@ define i32 @atomicrmw_umax_i32_acquire(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB156_4
 ; RV64I-NEXT:  .LBB156_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s1, a0, .LBB156_1
+; RV64I-NEXT:    bltu s1, a3, .LBB156_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB156_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -12214,9 +12202,8 @@ define i32 @atomicrmw_umax_i32_release(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB157_4
 ; RV64I-NEXT:  .LBB157_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s1, a0, .LBB157_1
+; RV64I-NEXT:    bltu s1, a3, .LBB157_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB157_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -12304,9 +12291,8 @@ define i32 @atomicrmw_umax_i32_acq_rel(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB158_4
 ; RV64I-NEXT:  .LBB158_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s1, a0, .LBB158_1
+; RV64I-NEXT:    bltu s1, a3, .LBB158_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB158_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -12394,9 +12380,8 @@ define i32 @atomicrmw_umax_i32_seq_cst(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB159_4
 ; RV64I-NEXT:  .LBB159_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s1, a0, .LBB159_1
+; RV64I-NEXT:    bltu s1, a3, .LBB159_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB159_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -12484,9 +12469,8 @@ define i32 @atomicrmw_umin_i32_monotonic(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB160_4
 ; RV64I-NEXT:  .LBB160_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s1, a0, .LBB160_1
+; RV64I-NEXT:    bgeu s1, a3, .LBB160_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB160_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -12574,9 +12558,8 @@ define i32 @atomicrmw_umin_i32_acquire(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB161_4
 ; RV64I-NEXT:  .LBB161_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s1, a0, .LBB161_1
+; RV64I-NEXT:    bgeu s1, a3, .LBB161_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB161_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -12664,9 +12647,8 @@ define i32 @atomicrmw_umin_i32_release(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB162_4
 ; RV64I-NEXT:  .LBB162_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s1, a0, .LBB162_1
+; RV64I-NEXT:    bgeu s1, a3, .LBB162_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB162_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -12754,9 +12736,8 @@ define i32 @atomicrmw_umin_i32_acq_rel(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB163_4
 ; RV64I-NEXT:  .LBB163_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s1, a0, .LBB163_1
+; RV64I-NEXT:    bgeu s1, a3, .LBB163_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB163_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
@@ -12844,9 +12825,8 @@ define i32 @atomicrmw_umin_i32_seq_cst(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB164_4
 ; RV64I-NEXT:  .LBB164_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s1, a0, .LBB164_1
+; RV64I-NEXT:    bgeu s1, a3, .LBB164_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB164_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2

diff  --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll
index a5adcf5e4802..53bcd15c9ac9 100644
--- a/llvm/test/CodeGen/RISCV/atomic-signext.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll
@@ -2565,15 +2565,14 @@ define signext i32 @atomicrmw_max_i32_monotonic(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB32_4
 ; RV64I-NEXT:  .LBB32_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s1, a0, .LBB32_1
+; RV64I-NEXT:    blt s1, a3, .LBB32_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB32_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
 ; RV64I-NEXT:    j .LBB32_1
 ; RV64I-NEXT:  .LBB32_4: # %atomicrmw.end
-; RV64I-NEXT:    sext.w a0, a3
+; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -2655,15 +2654,14 @@ define signext i32 @atomicrmw_min_i32_monotonic(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB33_4
 ; RV64I-NEXT:  .LBB33_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s1, a0, .LBB33_1
+; RV64I-NEXT:    bge s1, a3, .LBB33_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB33_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
 ; RV64I-NEXT:    j .LBB33_1
 ; RV64I-NEXT:  .LBB33_4: # %atomicrmw.end
-; RV64I-NEXT:    sext.w a0, a3
+; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -2745,15 +2743,14 @@ define signext i32 @atomicrmw_umax_i32_monotonic(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB34_4
 ; RV64I-NEXT:  .LBB34_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s1, a0, .LBB34_1
+; RV64I-NEXT:    bltu s1, a3, .LBB34_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB34_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
 ; RV64I-NEXT:    j .LBB34_1
 ; RV64I-NEXT:  .LBB34_4: # %atomicrmw.end
-; RV64I-NEXT:    sext.w a0, a3
+; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -2835,15 +2832,14 @@ define signext i32 @atomicrmw_umin_i32_monotonic(i32 *%a, i32 %b) nounwind {
 ; RV64I-NEXT:    bnez a0, .LBB35_4
 ; RV64I-NEXT:  .LBB35_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sext.w a0, a3
 ; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s1, a0, .LBB35_1
+; RV64I-NEXT:    bgeu s1, a3, .LBB35_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB35_2 Depth=1
 ; RV64I-NEXT:    mv a2, s2
 ; RV64I-NEXT:    j .LBB35_1
 ; RV64I-NEXT:  .LBB35_4: # %atomicrmw.end
-; RV64I-NEXT:    sext.w a0, a3
+; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/rv64zbs.ll b/llvm/test/CodeGen/RISCV/rv64zbs.ll
index 98afc93b981b..4b74780c28c2 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbs.ll
@@ -57,7 +57,6 @@ define signext i32 @bclr_i32_load(i32* %p, i32 signext %b) nounwind {
 ; RV64I-NEXT:    sllw a1, a2, a1
 ; RV64I-NEXT:    not a1, a1
 ; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    sext.w a0, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBS-LABEL: bclr_i32_load:
@@ -67,7 +66,6 @@ define signext i32 @bclr_i32_load(i32* %p, i32 signext %b) nounwind {
 ; RV64ZBS-NEXT:    sllw a1, a2, a1
 ; RV64ZBS-NEXT:    not a1, a1
 ; RV64ZBS-NEXT:    and a0, a1, a0
-; RV64ZBS-NEXT:    sext.w a0, a0
 ; RV64ZBS-NEXT:    ret
   %a = load i32, i32* %p
   %shl = shl i32 1, %b
@@ -161,7 +159,6 @@ define signext i32 @bset_i32_load(i32* %p, i32 signext %b) nounwind {
 ; RV64I-NEXT:    li a2, 1
 ; RV64I-NEXT:    sllw a1, a2, a1
 ; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    sext.w a0, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBS-LABEL: bset_i32_load:
@@ -170,7 +167,6 @@ define signext i32 @bset_i32_load(i32* %p, i32 signext %b) nounwind {
 ; RV64ZBS-NEXT:    li a2, 1
 ; RV64ZBS-NEXT:    sllw a1, a2, a1
 ; RV64ZBS-NEXT:    or a0, a1, a0
-; RV64ZBS-NEXT:    sext.w a0, a0
 ; RV64ZBS-NEXT:    ret
   %a = load i32, i32* %p
   %shl = shl i32 1, %b
@@ -292,7 +288,6 @@ define signext i32 @binv_i32_load(i32* %p, i32 signext %b) nounwind {
 ; RV64I-NEXT:    li a2, 1
 ; RV64I-NEXT:    sllw a1, a2, a1
 ; RV64I-NEXT:    xor a0, a1, a0
-; RV64I-NEXT:    sext.w a0, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBS-LABEL: binv_i32_load:
@@ -301,7 +296,6 @@ define signext i32 @binv_i32_load(i32* %p, i32 signext %b) nounwind {
 ; RV64ZBS-NEXT:    li a2, 1
 ; RV64ZBS-NEXT:    sllw a1, a2, a1
 ; RV64ZBS-NEXT:    xor a0, a1, a0
-; RV64ZBS-NEXT:    sext.w a0, a0
 ; RV64ZBS-NEXT:    ret
   %a = load i32, i32* %p
   %shl = shl i32 1, %b

diff  --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll
new file mode 100644
index 000000000000..64fc0bf09a66
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll
@@ -0,0 +1,318 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV64I
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+experimental-zbb,+f \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV64ZBB
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+experimental-zbb,+f \
+; RUN:   -riscv-disable-sextw-removal | FileCheck %s --check-prefix=NOREMOVAL
+
+define void @test1(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    addi sp, sp, -32
+; CHECK-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    mv s0, a1
+; CHECK-NEXT:    sraw s1, a0, a1
+; CHECK-NEXT:  .LBB0_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    mv a0, s1
+; CHECK-NEXT:    call bar@plt
+; CHECK-NEXT:    sllw s1, s1, s0
+; CHECK-NEXT:    bnez a0, .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    addi sp, sp, 32
+; CHECK-NEXT:    ret
+;
+; NOREMOVAL-LABEL: test1:
+; NOREMOVAL:       # %bb.0: # %bb
+; NOREMOVAL-NEXT:    addi sp, sp, -32
+; NOREMOVAL-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; NOREMOVAL-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; NOREMOVAL-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; NOREMOVAL-NEXT:    mv s0, a1
+; NOREMOVAL-NEXT:    sraw s1, a0, a1
+; NOREMOVAL-NEXT:  .LBB0_1: # %bb2
+; NOREMOVAL-NEXT:    # =>This Inner Loop Header: Depth=1
+; NOREMOVAL-NEXT:    sext.w a0, s1
+; NOREMOVAL-NEXT:    call bar@plt
+; NOREMOVAL-NEXT:    sllw s1, s1, s0
+; NOREMOVAL-NEXT:    bnez a0, .LBB0_1
+; NOREMOVAL-NEXT:  # %bb.2: # %bb7
+; NOREMOVAL-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; NOREMOVAL-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; NOREMOVAL-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; NOREMOVAL-NEXT:    addi sp, sp, 32
+; NOREMOVAL-NEXT:    ret
+bb:
+  %i = ashr i32 %arg, %arg1
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+  %i4 = tail call signext i32 @bar(i32 signext %i3)
+  %i5 = shl i32 %i3, %arg1
+  %i6 = icmp eq i32 %i4, 0
+  br i1 %i6, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  ret void
+}
+
+declare signext i32 @bar(i32 signext)
+
+; The load here will be an anyext load in isel and sext.w will be emitted for
+; the ret. Make sure we can look through logic ops to prove the sext.w is
+; unnecessary.
+define signext i32 @test2(i32* %p, i32 signext %b) nounwind {
+; RV64I-LABEL: test2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    sllw a1, a2, a1
+; RV64I-NEXT:    not a1, a1
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: test2:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    lw a0, 0(a0)
+; RV64ZBB-NEXT:    li a2, 1
+; RV64ZBB-NEXT:    sllw a1, a2, a1
+; RV64ZBB-NEXT:    andn a0, a0, a1
+; RV64ZBB-NEXT:    ret
+;
+; NOREMOVAL-LABEL: test2:
+; NOREMOVAL:       # %bb.0:
+; NOREMOVAL-NEXT:    lw a0, 0(a0)
+; NOREMOVAL-NEXT:    li a2, 1
+; NOREMOVAL-NEXT:    sllw a1, a2, a1
+; NOREMOVAL-NEXT:    andn a0, a0, a1
+; NOREMOVAL-NEXT:    sext.w a0, a0
+; NOREMOVAL-NEXT:    ret
+  %a = load i32, i32* %p
+  %shl = shl i32 1, %b
+  %neg = xor i32 %shl, -1
+  %and1 = and i32 %neg, %a
+  ret i32 %and1
+}
+
+define signext i32 @test3(i32* %p, i32 signext %b) nounwind {
+; RV64I-LABEL: test3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    sllw a1, a2, a1
+; RV64I-NEXT:    not a1, a1
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: test3:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    lw a0, 0(a0)
+; RV64ZBB-NEXT:    li a2, 1
+; RV64ZBB-NEXT:    sllw a1, a2, a1
+; RV64ZBB-NEXT:    orn a0, a0, a1
+; RV64ZBB-NEXT:    ret
+;
+; NOREMOVAL-LABEL: test3:
+; NOREMOVAL:       # %bb.0:
+; NOREMOVAL-NEXT:    lw a0, 0(a0)
+; NOREMOVAL-NEXT:    li a2, 1
+; NOREMOVAL-NEXT:    sllw a1, a2, a1
+; NOREMOVAL-NEXT:    orn a0, a0, a1
+; NOREMOVAL-NEXT:    sext.w a0, a0
+; NOREMOVAL-NEXT:    ret
+  %a = load i32, i32* %p
+  %shl = shl i32 1, %b
+  %neg = xor i32 %shl, -1
+  %and1 = or i32 %neg, %a
+  ret i32 %and1
+}
+
+define signext i32 @test4(i32* %p, i32 signext %b) nounwind {
+; RV64I-LABEL: test4:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    sllw a1, a2, a1
+; RV64I-NEXT:    xor a0, a1, a0
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: test4:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    lw a0, 0(a0)
+; RV64ZBB-NEXT:    li a2, 1
+; RV64ZBB-NEXT:    sllw a1, a2, a1
+; RV64ZBB-NEXT:    xnor a0, a1, a0
+; RV64ZBB-NEXT:    ret
+;
+; NOREMOVAL-LABEL: test4:
+; NOREMOVAL:       # %bb.0:
+; NOREMOVAL-NEXT:    lw a0, 0(a0)
+; NOREMOVAL-NEXT:    li a2, 1
+; NOREMOVAL-NEXT:    sllw a1, a2, a1
+; NOREMOVAL-NEXT:    xnor a0, a1, a0
+; NOREMOVAL-NEXT:    sext.w a0, a0
+; NOREMOVAL-NEXT:    ret
+  %a = load i32, i32* %p
+  %shl = shl i32 1, %b
+  %neg = xor i32 %shl, -1
+  %and1 = xor i32 %neg, %a
+  ret i32 %and1
+}
+
+; Make sure we don't put a sext.w before bar when using cpopw.
+define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
+; RV64I-LABEL: test5:
+; RV64I:       # %bb.0: # %bb
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sraw a0, a0, a1
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    addiw s2, a1, 1365
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    addiw s1, a1, 819
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw s3, a1, -241
+; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw s0, a1, 257
+; RV64I-NEXT:  .LBB4_1: # %bb2
+; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT:    call bar@plt
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    and a0, a0, s2
+; RV64I-NEXT:    subw a0, a1, a0
+; RV64I-NEXT:    and a2, a0, s1
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, s1
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    and a0, a0, s3
+; RV64I-NEXT:    mulw a0, a0, s0
+; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    bnez a1, .LBB4_1
+; RV64I-NEXT:  # %bb.2: # %bb7
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: test5:
+; RV64ZBB:       # %bb.0: # %bb
+; RV64ZBB-NEXT:    addi sp, sp, -16
+; RV64ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64ZBB-NEXT:    sraw a0, a0, a1
+; RV64ZBB-NEXT:  .LBB4_1: # %bb2
+; RV64ZBB-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64ZBB-NEXT:    call bar@plt
+; RV64ZBB-NEXT:    mv a1, a0
+; RV64ZBB-NEXT:    cpopw a0, a0
+; RV64ZBB-NEXT:    bnez a1, .LBB4_1
+; RV64ZBB-NEXT:  # %bb.2: # %bb7
+; RV64ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64ZBB-NEXT:    addi sp, sp, 16
+; RV64ZBB-NEXT:    ret
+;
+; NOREMOVAL-LABEL: test5:
+; NOREMOVAL:       # %bb.0: # %bb
+; NOREMOVAL-NEXT:    addi sp, sp, -16
+; NOREMOVAL-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; NOREMOVAL-NEXT:    sraw a1, a0, a1
+; NOREMOVAL-NEXT:  .LBB4_1: # %bb2
+; NOREMOVAL-NEXT:    # =>This Inner Loop Header: Depth=1
+; NOREMOVAL-NEXT:    sext.w a0, a1
+; NOREMOVAL-NEXT:    call bar@plt
+; NOREMOVAL-NEXT:    cpopw a1, a0
+; NOREMOVAL-NEXT:    bnez a0, .LBB4_1
+; NOREMOVAL-NEXT:  # %bb.2: # %bb7
+; NOREMOVAL-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; NOREMOVAL-NEXT:    addi sp, sp, 16
+; NOREMOVAL-NEXT:    ret
+bb:
+  %i = ashr i32 %arg, %arg1
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+  %i4 = tail call signext i32 @bar(i32 signext %i3)
+  %i5 = tail call i32 @llvm.ctpop.i32(i32 %i4)
+  %i6 = icmp eq i32 %i4, 0
+  br i1 %i6, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  ret void
+}
+
+declare i32 @llvm.ctpop.i32(i32)
+
+define void @test6(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test6:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sraw a0, a0, a1
+; CHECK-NEXT:    fmv.w.x ft0, zero
+; CHECK-NEXT:    fsw ft0, 4(sp) # 4-byte Folded Spill
+; CHECK-NEXT:  .LBB5_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    call baz@plt
+; CHECK-NEXT:    fmv.w.x ft0, a0
+; CHECK-NEXT:    flw ft1, 4(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    feq.s a1, ft0, ft1
+; CHECK-NEXT:    fcvt.w.s a0, ft0, rtz
+; CHECK-NEXT:    beqz a1, .LBB5_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; NOREMOVAL-LABEL: test6:
+; NOREMOVAL:       # %bb.0: # %bb
+; NOREMOVAL-NEXT:    addi sp, sp, -16
+; NOREMOVAL-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; NOREMOVAL-NEXT:    sraw a0, a0, a1
+; NOREMOVAL-NEXT:    fmv.w.x ft0, zero
+; NOREMOVAL-NEXT:    fsw ft0, 4(sp) # 4-byte Folded Spill
+; NOREMOVAL-NEXT:  .LBB5_1: # %bb2
+; NOREMOVAL-NEXT:    # =>This Inner Loop Header: Depth=1
+; NOREMOVAL-NEXT:    sext.w a0, a0
+; NOREMOVAL-NEXT:    call baz@plt
+; NOREMOVAL-NEXT:    fmv.w.x ft0, a0
+; NOREMOVAL-NEXT:    flw ft1, 4(sp) # 4-byte Folded Reload
+; NOREMOVAL-NEXT:    feq.s a1, ft0, ft1
+; NOREMOVAL-NEXT:    fcvt.w.s a0, ft0, rtz
+; NOREMOVAL-NEXT:    beqz a1, .LBB5_1
+; NOREMOVAL-NEXT:  # %bb.2: # %bb7
+; NOREMOVAL-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; NOREMOVAL-NEXT:    addi sp, sp, 16
+; NOREMOVAL-NEXT:    ret
+bb:
+  %i = ashr i32 %arg, %arg1
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+  %i4 = tail call float @baz(i32 signext %i3)
+  %i5 = fptosi float %i4 to i32
+  %i6 = fcmp oeq float %i4, zeroinitializer
+  br i1 %i6, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  ret void
+}
+declare float @baz(i32 signext %i3)

diff  --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll
index 2587cc11bad7..d08157b824c1 100644
--- a/llvm/test/CodeGen/RISCV/split-offsets.ll
+++ b/llvm/test/CodeGen/RISCV/split-offsets.ll
@@ -83,8 +83,7 @@ define void @test2([65536 x i32]** %sp, [65536 x i32]* %t, i32 %n) {
 ; RV64I-NEXT:    add a0, a1, a5
 ; RV64I-NEXT:    add a1, a4, a5
 ; RV64I-NEXT:    sext.w a2, a2
-; RV64I-NEXT:    sext.w a4, a3
-; RV64I-NEXT:    bge a4, a2, .LBB1_2
+; RV64I-NEXT:    bge a3, a2, .LBB1_2
 ; RV64I-NEXT:  .LBB1_1: # %while_body
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64I-NEXT:    addiw a4, a3, 1
@@ -93,8 +92,7 @@ define void @test2([65536 x i32]** %sp, [65536 x i32]* %t, i32 %n) {
 ; RV64I-NEXT:    sw a4, 0(a0)
 ; RV64I-NEXT:    sw a3, 4(a0)
 ; RV64I-NEXT:    mv a3, a4
-; RV64I-NEXT:    sext.w a4, a3
-; RV64I-NEXT:    blt a4, a2, .LBB1_1
+; RV64I-NEXT:    blt a3, a2, .LBB1_1
 ; RV64I-NEXT:  .LBB1_2: # %while_end
 ; RV64I-NEXT:    ret
 entry:


        


More information about the llvm-commits mailing list