[llvm] [RISCV] Add short forward branch support for `lb`, `lbu`, `lh`, `lhu`, `lw`, `lwu` and `ld` (PR #170829)

via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 9 20:21:11 PST 2025


https://github.com/hchandel updated https://github.com/llvm/llvm-project/pull/170829

From 842d44e2876d2f47cc618c2bf1d0db70a5c71885 Mon Sep 17 00:00:00 2001
From: Harsh Chandel <hchandel at qti.qualcomm.com>
Date: Tue, 18 Nov 2025 16:35:10 +0530
Subject: [PATCH 01/11] [RISCV] Add short forward branch support for lb, lh,
 lhu, lbu, and lw

Change-Id: Id95f1887590cccce6e0884703e9c46ca08864efa
---
 .../Target/RISCV/RISCVExpandPseudoInsts.cpp   |  10 +
 llvm/lib/Target/RISCV/RISCVFeatures.td        |   6 +
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |  50 +-
 llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td    |  16 +
 llvm/test/CodeGen/RISCV/features-info.ll      |   1 +
 .../RISCV/short-forward-branch-opt-load.ll    | 755 ++++++++++++++++++
 6 files changed, 835 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll

diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 60e0afdd99912..04394c37b6bf6 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -133,6 +133,11 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
   case RISCV::PseudoCCMINU:
   case RISCV::PseudoCCMUL:
   case RISCV::PseudoCCLUI:
+  case RISCV::PseudoCCLB:
+  case RISCV::PseudoCCLH:
+  case RISCV::PseudoCCLW:
+  case RISCV::PseudoCCLHU:
+  case RISCV::PseudoCCLBU:
   case RISCV::PseudoCCQC_LI:
   case RISCV::PseudoCCQC_E_LI:
   case RISCV::PseudoCCADDW:
@@ -243,6 +248,11 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
     case RISCV::PseudoCCMINU:  NewOpc = RISCV::MINU;  break;
     case RISCV::PseudoCCMUL:   NewOpc = RISCV::MUL;   break;
     case RISCV::PseudoCCLUI:   NewOpc = RISCV::LUI;   break;
+    case RISCV::PseudoCCLB:    NewOpc = RISCV::LB;    break;
+    case RISCV::PseudoCCLH:    NewOpc = RISCV::LH;    break;
+    case RISCV::PseudoCCLW:    NewOpc = RISCV::LW;    break;
+    case RISCV::PseudoCCLHU:   NewOpc = RISCV::LHU;   break;
+    case RISCV::PseudoCCLBU:   NewOpc = RISCV::LBU;   break;
     case RISCV::PseudoCCQC_LI:  NewOpc = RISCV::QC_LI;   break;
     case RISCV::PseudoCCQC_E_LI: NewOpc = RISCV::QC_E_LI;   break;
     case RISCV::PseudoCCADDI:  NewOpc = RISCV::ADDI;  break;
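
For context: expandCCOp lowers each of these pseudos after register
allocation into a short forward branch on the inverted condition over the
real instruction, so the load only executes on the taken path. A minimal
self-contained C++ model of the resulting semantics for PseudoCCLW, with
the condition fixed to NE (illustrative names, not LLVM API):

#include <cstdint>

// dst = (Lhs != Rhs) ? *Addr : FalseV, i.e. a branch over the load.
int32_t ccLoadWord(int32_t Lhs, int32_t Rhs, const int32_t *Addr,
                   int32_t FalseV) {
  if (Lhs == Rhs)   // beq lhs, rhs, .skip  (inverted condition)
    return FalseV;  // $dst keeps $falsev (the pseudo ties dst to falsev)
  return *Addr;     // lw dst, imm(rs1)
}
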
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 0b964c4808d8a..7b21f6e1cefe0 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1864,6 +1864,11 @@ def TuneShortForwardBranchIMul
                        "true", "Enable short forward branch optimization for mul instruction",
                        [TuneShortForwardBranchOpt]>;
 
+def TuneShortForwardBranchILoad
+    : SubtargetFeature<"short-forward-branch-i-load", "HasShortForwardBranchILoad",
+                       "true", "Enable short forward branch optimization for load instructions",
+                       [TuneShortForwardBranchOpt]>;
+
 // Some subtargets require a S2V transfer buffer to move scalars into vectors.
 // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure.
 def TuneNoSinkSplatOperands
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index fb914e97e2229..d6953c24a8955 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1708,6 +1708,11 @@ unsigned getPredicatedOpcode(unsigned Opcode) {
   case RISCV::MINU:  return RISCV::PseudoCCMINU;
   case RISCV::MUL:   return RISCV::PseudoCCMUL;
   case RISCV::LUI:   return RISCV::PseudoCCLUI;
+  case RISCV::LB:    return RISCV::PseudoCCLB;
+  case RISCV::LBU:   return RISCV::PseudoCCLBU;
+  case RISCV::LH:    return RISCV::PseudoCCLH;
+  case RISCV::LHU:   return RISCV::PseudoCCLHU;
+  case RISCV::LW:    return RISCV::PseudoCCLW;
   case RISCV::QC_LI:   return RISCV::PseudoCCQC_LI;
   case RISCV::QC_E_LI:   return RISCV::PseudoCCQC_E_LI;
 
@@ -1747,7 +1752,8 @@ unsigned getPredicatedOpcode(unsigned Opcode) {
 static MachineInstr *canFoldAsPredicatedOp(Register Reg,
                                            const MachineRegisterInfo &MRI,
                                            const TargetInstrInfo *TII,
-                                           const RISCVSubtarget &STI) {
+                                           const RISCVSubtarget &STI,
+                                           const MachineInstr *UseMI) {
   if (!Reg.isVirtual())
     return nullptr;
   if (!MRI.hasOneNonDBGUse(Reg))
@@ -1761,6 +1767,12 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg,
        MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU))
     return nullptr;
 
+  if (!STI.hasShortForwardBranchILoad() &&
+      (MI->getOpcode() == RISCV::LB || MI->getOpcode() == RISCV::LBU ||
+       MI->getOpcode() == RISCV::LW || MI->getOpcode() == RISCV::LH ||
+       MI->getOpcode() == RISCV::LHU))
+    return nullptr;
+
   if (!STI.hasShortForwardBranchIMul() && MI->getOpcode() == RISCV::MUL)
     return nullptr;
 
@@ -1788,6 +1800,37 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg,
       return nullptr;
   }
   bool DontMoveAcrossStores = true;
+
+  if (MI->getOpcode() == RISCV::LB || MI->getOpcode() == RISCV::LBU ||
+      MI->getOpcode() == RISCV::LW || MI->getOpcode() == RISCV::LH ||
+      MI->getOpcode() == RISCV::LHU) {
+    if (UseMI && MI->getParent() == UseMI->getParent()) {
+      // Handle the simple case: when both the def and the use of the load
+      // are in the same basic block, scan the instructions between them
+      // linearly to check whether any store intervenes.
+      auto &MBB = *MI->getParent();
+      DontMoveAcrossStores = false;
+
+      auto DefIt = MBB.begin();
+      auto UseIt = MBB.begin();
+
+      for (auto It = MBB.begin(); It != MBB.end(); ++It) {
+        if (&*It == MI)
+          DefIt = It;
+        if (&*It == UseMI)
+          UseIt = It;
+      }
+      if (DefIt != MBB.end() && UseIt != MBB.end() && DefIt != UseIt) {
+        for (auto I = std::next(DefIt); I != UseIt; ++I) {
+          if (I->mayStore()) {
+            DontMoveAcrossStores = true;
+            LLVM_DEBUG(dbgs() << "Store found between def and use\n");
+          }
+        }
+      }
+    }
+  }
+
   if (!MI->isSafeToMove(DontMoveAcrossStores))
     return nullptr;
   return MI;
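
One note on the scan above: once a store is found the answer cannot
change, so the loop could break out early instead of walking the rest of
the range. A sketch of the same check with an early exit (assuming, as
the code above already does, that the def and the use sit in the same
basic block):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include <iterator>
using namespace llvm;

// Returns true if any instruction strictly between Def and Use may store.
static bool hasStoreBetween(MachineBasicBlock::iterator Def,
                            MachineBasicBlock::iterator Use) {
  for (auto I = std::next(Def); I != Use; ++I)
    if (I->mayStore())
      return true; // a store here may alias the load; do not fold
  return false;
}
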
@@ -1827,10 +1870,11 @@ RISCVInstrInfo::optimizeSelect(MachineInstr &MI,
 
   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   MachineInstr *DefMI =
-      canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this, STI);
+      canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this, STI, &MI);
   bool Invert = !DefMI;
   if (!DefMI)
-    DefMI = canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this, STI);
+    DefMI =
+        canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this, STI, &MI);
   if (!DefMI)
     return nullptr;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
index 5b1c13493bbf2..e7fca38cf5dbe 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
@@ -80,6 +80,17 @@ class SFBLUI
   let Constraints = "$dst = $falsev";
 }
 
+class SFBLoad
+    : Pseudo<(outs GPR:$dst),
+             (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1,
+                  simm12_lo:$imm), []> {
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+  let mayStore = 0;
+  let Size = 8;
+  let Constraints = "$dst = $falsev";
+}
+
 class SFBShift_ri
     : Pseudo<(outs GPR:$dst),
              (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1,
@@ -122,6 +133,11 @@ def PseudoCCMIN : SFBALU_rr;
 def PseudoCCMAXU : SFBALU_rr;
 def PseudoCCMINU : SFBALU_rr;
 def PseudoCCMUL : SFBALU_rr;
+def PseudoCCLB : SFBLoad;
+def PseudoCCLH : SFBLoad;
+def PseudoCCLW : SFBLoad;
+def PseudoCCLHU : SFBLoad;
+def PseudoCCLBU : SFBLoad;
 
 def PseudoCCADDI : SFBALU_ri;
 def PseudoCCANDI : SFBALU_ri;
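
For reference, the operand order these pseudos share (and which the
expansion keys off) is dst, lhs, rhs, cc, falsev, rs1, imm, with $dst
tied to $falsev; Size = 8 covers the 4-byte branch plus the 4-byte load
the pseudo expands into. A hypothetical enum, not part of the patch, just
to make the indices explicit:

// Hypothetical operand indices for the SFBLoad pseudos (illustration only).
enum SFBLoadOperand : unsigned {
  Dst = 0,    // result; tied to FalseV by the "$dst = $falsev" constraint
  Lhs = 1,    // branch comparison LHS
  Rhs = 2,    // branch comparison RHS
  CC = 3,     // condition-code immediate
  FalseV = 4, // value kept when the condition does not hold
  Rs1 = 5,    // load base register
  Imm = 6     // simm12 load offset
};
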
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index 3d9906fdcbeb3..9e6be44272821 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -137,6 +137,7 @@
 ; CHECK-NEXT:   shgatpa                          - 'Shgatpa' (SvNNx4 mode supported for all modes supported by satp, as well as Bare).
 ; CHECK-NEXT:   shifted-zextw-fusion             - Enable SLLI+SRLI to be fused when computing (shifted) word zero extension.
 ; CHECK-NEXT:   shlcofideleg                     - 'Shlcofideleg' (Delegating LCOFI Interrupts to VS-mode).
+; CHECK-NEXT:   short-forward-branch-i-load      - Enable short forward branch optimization for load instructions.
 ; CHECK-NEXT:   short-forward-branch-i-minmax    - Enable short forward branch optimization for min,max instructions in Zbb.
 ; CHECK-NEXT:   short-forward-branch-i-mul       - Enable short forward branch optimization for mul instruction.
 ; CHECK-NEXT:   short-forward-branch-opt         - Enable short forward branch optimization.
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
new file mode 100644
index 0000000000000..4f0a0861721f6
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
@@ -0,0 +1,755 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-opt | \
+; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-opt | \
+; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-i-load | \
+; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-i-load | \
+; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
+
+define i32 @test_i8_s(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i8_s:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB0_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lb a2, 4(a0)
+; RV32I-NEXT:  .LBB0_2: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB0_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lb a2, 4(a0)
+; RV64I-NEXT:  .LBB0_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB0_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB0_2: # %entry
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB0_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB0_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB0_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lb a2, 4(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB0_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB0_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lb a2, 4(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB0_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i8_z(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i8_z:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB1_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lbu a2, 4(a0)
+; RV32I-NEXT:  .LBB1_2: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB1_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lbu a2, 4(a0)
+; RV64I-NEXT:  .LBB1_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB1_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB1_2: # %entry
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB1_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB1_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB1_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lbu a2, 4(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB1_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB1_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lbu a2, 4(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB1_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_s(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i16_s:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB2_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lh a2, 8(a0)
+; RV32I-NEXT:  .LBB2_2: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB2_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lh a2, 8(a0)
+; RV64I-NEXT:  .LBB2_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB2_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB2_2: # %entry
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB2_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB2_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB2_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lh a2, 8(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB2_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB2_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lh a2, 8(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB2_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_z(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i16_z:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB3_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lhu a2, 8(a0)
+; RV32I-NEXT:  .LBB3_2: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB3_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lhu a2, 8(a0)
+; RV64I-NEXT:  .LBB3_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB3_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB3_2: # %entry
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB3_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB3_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB3_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lhu a2, 8(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB3_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB3_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lhu a2, 8(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB3_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i32(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i32:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB4_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lw a2, 16(a0)
+; RV32I-NEXT:  .LBB4_2: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB4_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lw a2, 16(a0)
+; RV64I-NEXT:  .LBB4_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB4_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB4_2: # %entry
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB4_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB4_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB4_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a2, 16(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB4_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB4_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lw a2, 16(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB4_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4
+  %val = load i32, ptr %addr          ; load 32-bit value
+  %res = select i1 %x, i32 %val, i32 %b
+  ret i32 %res
+}
+
+define i64 @test_i8_s_1(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i8_s_1:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB5_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lb a2, 4(a0)
+; RV32I-NEXT:    srai a3, a2, 31
+; RV32I-NEXT:  .LBB5_2: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_1:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB5_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lb a2, 4(a0)
+; RV64I-NEXT:  .LBB5_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_1:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB5_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB5_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB5_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB5_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_1:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB5_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB5_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_1:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB5_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB5_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB5_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_1:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB5_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lb a2, 4(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i8_z_1(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i8_z_1:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB6_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lbu a2, 4(a0)
+; RV32I-NEXT:  .LBB6_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_1:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB6_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lbu a2, 4(a0)
+; RV64I-NEXT:  .LBB6_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_1:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB6_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB6_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB6_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:  .LBB6_4: # %entry
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_1:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB6_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB6_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_1:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a4, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB6_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lbu a2, 4(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB6_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:  .LBB6_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_1:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB6_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lbu a2, 4(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_s_1(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i16_s_1:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lh a2, 8(a0)
+; RV32I-NEXT:    srai a3, a2, 31
+; RV32I-NEXT:  .LBB7_2: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_1:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB7_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lh a2, 8(a0)
+; RV64I-NEXT:  .LBB7_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_1:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB7_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB7_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB7_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB7_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_1:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB7_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB7_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_1:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB7_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB7_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB7_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_1:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB7_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lh a2, 8(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_z_1(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i16_z_1:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB8_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lhu a2, 8(a0)
+; RV32I-NEXT:  .LBB8_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_1:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB8_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lhu a2, 8(a0)
+; RV64I-NEXT:  .LBB8_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_1:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB8_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB8_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB8_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:  .LBB8_4: # %entry
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_1:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB8_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB8_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_1:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a4, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB8_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lhu a2, 8(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB8_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:  .LBB8_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_1:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB8_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lhu a2, 8(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i64_1:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB9_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lw a2, 32(a0)
+; RV32I-NEXT:    lw a3, 36(a0)
+; RV32I-NEXT:  .LBB9_2: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_1:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB9_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    ld a2, 32(a0)
+; RV64I-NEXT:  .LBB9_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_1:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lw a4, 32(a0)
+; RV32I-SFB-NEXT:    lw a5, 36(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB9_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a4, a2
+; RV32I-SFB-NEXT:  .LBB9_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB9_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a5, a3
+; RV32I-SFB-NEXT:  .LBB9_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, a4
+; RV32I-SFB-NEXT:    mv a1, a5
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_1:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB9_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB9_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_1:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB9_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a2, 32(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB9_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a3, 36(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB9_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_1:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4
+  %val = load i64, ptr %addr          ; load 64-bit value
+  %res = select i1 %x, i64 %val, i64 %b
+  ret i64 %res
+}

From 713b119a46ae98103eb9f369e95003939bf4593e Mon Sep 17 00:00:00 2001
From: Harsh Chandel <hchandel at qti.qualcomm.com>
Date: Fri, 21 Nov 2025 16:48:09 +0530
Subject: [PATCH 02/11] fixup! Rework to reuse the existing support for
 load folding

Change-Id: Iba30a2d81f79b0b99bf718252a8ad4c4e331c03c
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 108 ++++++++++++++---------
 llvm/lib/Target/RISCV/RISCVInstrInfo.h   |   5 ++
 2 files changed, 71 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index d6953c24a8955..3a3e6f39c9df4 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -888,6 +888,72 @@ MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
       .addImm(0);
 }
 
+unsigned getLoadPredicatedOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  case RISCV::LB:
+    return RISCV::PseudoCCLB;
+  case RISCV::LBU:
+    return RISCV::PseudoCCLBU;
+  case RISCV::LH:
+    return RISCV::PseudoCCLH;
+  case RISCV::LHU:
+    return RISCV::PseudoCCLHU;
+  case RISCV::LW:
+    return RISCV::PseudoCCLW;
+  default:
+    return 0;
+  }
+}
+
+MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+    LiveIntervals *LIS) const {
+  assert(MI.getOpcode() == RISCV::PseudoCCMOVGPR &&
+         "Unknown select instruction");
+  if (!STI.hasShortForwardBranchILoad() ||
+      (LoadMI.getOpcode() != RISCV::LB && LoadMI.getOpcode() != RISCV::LBU &&
+       LoadMI.getOpcode() != RISCV::LH && LoadMI.getOpcode() != RISCV::LHU &&
+       LoadMI.getOpcode() != RISCV::LW))
+    return nullptr;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool Invert =
+      MRI.getVRegDef(MI.getOperand(4).getReg()) == &LoadMI;
+  MachineOperand FalseReg = MI.getOperand(Invert ? 5 : 4);
+  Register DestReg = MI.getOperand(0).getReg();
+  const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
+  if (!MRI.constrainRegClass(DestReg, PreviousClass))
+    return nullptr;
+
+  unsigned PredOpc = getLoadPredicatedOpcode(LoadMI.getOpcode());
+  assert(PredOpc != 0 && "Unexpected opcode!");
+
+  // Create a new predicated version of LoadMI.
+  MachineInstrBuilder NewMI = BuildMI(*MI.getParent(), InsertPt,
+                                      MI.getDebugLoc(), get(PredOpc), DestReg);
+
+  // Copy the condition portion.
+  NewMI.add(MI.getOperand(1));
+  NewMI.add(MI.getOperand(2));
+
+  // Add condition code, inverting if necessary.
+  auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
+  if (Invert)
+    CC = RISCVCC::getInverseBranchCondition(CC);
+  NewMI.addImm(CC);
+
+  // Copy the false register.
+  NewMI.add(FalseReg);
+
+  // Copy the remaining LoadMI operands (skipping its def).
+  const MCInstrDesc &LoadDesc = LoadMI.getDesc();
+  for (unsigned i = 1, e = LoadDesc.getNumOperands(); i != e; ++i)
+    NewMI.add(LoadMI.getOperand(i));
+
+  return NewMI;
+}
+
 void RISCVInstrInfo::movImm(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MBBI,
                             const DebugLoc &DL, Register DstReg, uint64_t Val,
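
A note on the Invert computation above: PseudoCCMOVGPR selects its
operand 5 when the condition holds and operand 4 otherwise, while the
predicated load only executes on the taken side of the short forward
branch. So when the load feeds operand 4, the condition must be flipped
and operand 5 becomes $falsev. A self-contained model of that decision
(illustrative names, not LLVM API):

// Which select operand survives as $falsev, and whether CC is inverted.
struct FoldDecision {
  bool InvertCC;
  unsigned FalseOperandIdx;
};

static FoldDecision decideLoadFold(bool LoadFeedsOperand4) {
  return LoadFeedsOperand4 ? FoldDecision{true, 5u}
                           : FoldDecision{false, 4u};
}
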
@@ -1708,11 +1774,6 @@ unsigned getPredicatedOpcode(unsigned Opcode) {
   case RISCV::MINU:  return RISCV::PseudoCCMINU;
   case RISCV::MUL:   return RISCV::PseudoCCMUL;
   case RISCV::LUI:   return RISCV::PseudoCCLUI;
-  case RISCV::LB:    return RISCV::PseudoCCLB;
-  case RISCV::LBU:   return RISCV::PseudoCCLBU;
-  case RISCV::LH:    return RISCV::PseudoCCLH;
-  case RISCV::LHU:   return RISCV::PseudoCCLHU;
-  case RISCV::LW:    return RISCV::PseudoCCLW;
   case RISCV::QC_LI:   return RISCV::PseudoCCQC_LI;
   case RISCV::QC_E_LI:   return RISCV::PseudoCCQC_E_LI;
 
@@ -1767,12 +1828,6 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg,
        MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU))
     return nullptr;
 
-  if (!STI.hasShortForwardBranchILoad() &&
-      (MI->getOpcode() == RISCV::LB || MI->getOpcode() == RISCV::LBU ||
-       MI->getOpcode() == RISCV::LW || MI->getOpcode() == RISCV::LH ||
-       MI->getOpcode() == RISCV::LHU))
-    return nullptr;
-
   if (!STI.hasShortForwardBranchIMul() && MI->getOpcode() == RISCV::MUL)
     return nullptr;
 
@@ -1800,37 +1855,6 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg,
       return nullptr;
   }
   bool DontMoveAcrossStores = true;
-
-  if (MI->getOpcode() == RISCV::LB || MI->getOpcode() == RISCV::LBU ||
-      MI->getOpcode() == RISCV::LW || MI->getOpcode() == RISCV::LH ||
-      MI->getOpcode() == RISCV::LHU) {
-    if (UseMI && MI->getParent() == UseMI->getParent()) {
-      // Handle the simple case: when both the def and the use of the load
-      // are in the same basic block, scan the instructions between them
-      // linearly to check whether any store intervenes.
-      auto &MBB = *MI->getParent();
-      DontMoveAcrossStores = false;
-
-      auto DefIt = MBB.begin();
-      auto UseIt = MBB.begin();
-
-      for (auto It = MBB.begin(); It != MBB.end(); ++It) {
-        if (&*It == MI)
-          DefIt = It;
-        if (&*It == UseMI)
-          UseIt = It;
-      }
-      if (DefIt != MBB.end() && UseIt != MBB.end() && DefIt != UseIt) {
-        for (auto I = std::next(DefIt); I != UseIt; ++I) {
-          if (I->mayStore()) {
-            DontMoveAcrossStores = true;
-            LLVM_DEBUG(dbgs() << "Store found between def and use\n");
-          }
-        }
-      }
-    }
-  }
-
   if (!MI->isSafeToMove(DontMoveAcrossStores))
     return nullptr;
   return MI;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 0ffe015b9fac8..908da393535fe 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -133,6 +133,11 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
                                       LiveIntervals *LIS = nullptr,
                                       VirtRegMap *VRM = nullptr) const override;
 
+  MachineInstr *foldMemoryOperandImpl(
+      MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+      MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+      LiveIntervals *LIS = nullptr) const override;
+
   // Materializes the given integer Val into DstReg.
   void movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
               const DebugLoc &DL, Register DstReg, uint64_t Val,

From 43e941eacd5b38982a1e48715b3db388b8770c91 Mon Sep 17 00:00:00 2001
From: Harsh Chandel <hchandel at qti.qualcomm.com>
Date: Fri, 21 Nov 2025 16:52:09 +0530
Subject: [PATCH 03/11] fixup! Remove redundant parameter

Change-Id: Ib3259359f6c1cd5dc0e81503c9895393a48cef88
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 3a3e6f39c9df4..0bd42a2633888 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1813,8 +1813,7 @@ unsigned getPredicatedOpcode(unsigned Opcode) {
 static MachineInstr *canFoldAsPredicatedOp(Register Reg,
                                            const MachineRegisterInfo &MRI,
                                            const TargetInstrInfo *TII,
-                                           const RISCVSubtarget &STI,
-                                           const MachineInstr *UseMI) {
+                                           const RISCVSubtarget &STI) {
   if (!Reg.isVirtual())
     return nullptr;
   if (!MRI.hasOneNonDBGUse(Reg))
@@ -1894,11 +1893,10 @@ RISCVInstrInfo::optimizeSelect(MachineInstr &MI,
 
   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   MachineInstr *DefMI =
-      canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this, STI, &MI);
+      canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this, STI);
   bool Invert = !DefMI;
   if (!DefMI)
-    DefMI =
-        canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this, STI, &MI);
+    DefMI = canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this, STI);
   if (!DefMI)
     return nullptr;
 

From ac5c60117b6bf811fecbcba10eea9928d57a4be2 Mon Sep 17 00:00:00 2001
From: Harsh Chandel <hchandel at qti.qualcomm.com>
Date: Tue, 25 Nov 2025 18:08:29 +0530
Subject: [PATCH 04/11] fixup! Remove assert and add test cases

Change-Id: I38375cba5c5897a2f3e0a3e8ba7909865e466f74
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |    6 +-
 .../RISCV/short-forward-branch-opt-load.ll    | 1162 ++++++++++++++---
 2 files changed, 1003 insertions(+), 165 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 0bd42a2633888..f5c65410a3eba 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -909,8 +909,10 @@ MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
     LiveIntervals *LIS) const {
-  assert(MI.getOpcode() == RISCV::PseudoCCMOVGPR &&
-         "Unknown select instruction");
+  // For now, only handle RISCV::PseudoCCMOVGPR.
+  if (MI.getOpcode() != RISCV::PseudoCCMOVGPR)
+    return nullptr;
+
   if (!STI.hasShortForwardBranchILoad() ||
       (LoadMI.getOpcode() != RISCV::LB && LoadMI.getOpcode() != RISCV::LBU &&
        LoadMI.getOpcode() != RISCV::LH && LoadMI.getOpcode() != RISCV::LHU &&
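
The switch from an assert to an early bail-out matters because
foldMemoryOperandImpl is a generic TargetInstrInfo hook: callers may
probe it with instructions other than PseudoCCMOVGPR, and returning
nullptr is the supported way to decline a fold. A tiny sketch of the
guard pattern with hypothetical types (not LLVM API):

struct Instr { unsigned Opcode; };
constexpr unsigned SupportedOpc = 1; // stand-in for PseudoCCMOVGPR

// Decline anything unexpected; the caller then keeps the original code.
Instr *tryFold(Instr &MI) {
  return MI.Opcode == SupportedOpc ? &MI /* placeholder for folded MI */
                                   : nullptr;
}
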
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
index 4f0a0861721f6..9ed1218cf7fb5 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
@@ -349,407 +349,1243 @@ entry:
   ret i32 %res
 }
 
+define i32 @test_i8_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i8_s_store:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB5_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB5_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_store:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB5_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB5_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_store:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB5_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB5_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_store:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB5_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB5_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_store:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_store:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i8_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i8_z_store:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB6_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB6_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_store:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB6_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB6_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_store:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB6_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB6_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_store:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB6_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB6_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_store:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_store:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i16_s_store:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB7_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB7_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_store:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB7_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB7_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_store:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB7_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB7_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB7_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB7_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_store:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_store:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i16_z_store:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB8_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB8_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_store:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB8_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB8_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_store:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB8_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB8_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_store:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB8_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB8_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_store:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_store:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i32_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i32_store:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB9_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB9_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_store:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lw a0, 16(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB9_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB9_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_store:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB9_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB9_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_store:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB9_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB9_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_store:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_store:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4 elements (byte offset 16)
+  %val = load i32, ptr %addr          ; load 32-bit value
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %val, i32 %b
+  ret i32 %res
+}
+
 define i64 @test_i8_s_1(ptr %base, i1 %x, i64 %b) {
 ; RV32I-LABEL: test_i8_s_1:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    beqz a1, .LBB5_2
+; RV32I-NEXT:    beqz a1, .LBB10_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lb a2, 4(a0)
+; RV32I-NEXT:    srai a3, a2, 31
+; RV32I-NEXT:  .LBB10_2: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_1:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB10_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lb a2, 4(a0)
+; RV64I-NEXT:  .LBB10_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_1:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB10_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB10_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB10_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB10_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_1:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB10_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB10_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_1:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB10_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB10_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_1:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lb a2, 4(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB10_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i8_z_1(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i8_z_1:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB11_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lbu a2, 4(a0)
+; RV32I-NEXT:  .LBB11_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_1:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB11_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lbu a2, 4(a0)
+; RV64I-NEXT:  .LBB11_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_1:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB11_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB11_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB11_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:  .LBB11_4: # %entry
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_1:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB11_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB11_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_1:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a4, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB11_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lbu a2, 4(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB11_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB11_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:  .LBB11_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_1:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB11_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lbu a2, 4(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB11_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_s_1(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i16_s_1:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB12_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lh a2, 8(a0)
+; RV32I-NEXT:    srai a3, a2, 31
+; RV32I-NEXT:  .LBB12_2: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_1:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB12_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lh a2, 8(a0)
+; RV64I-NEXT:  .LBB12_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_1:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB12_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB12_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB12_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB12_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_1:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB12_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB12_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_1:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB12_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB12_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_1:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB12_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lh a2, 8(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB12_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_z_1(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i16_z_1:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB13_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lhu a2, 8(a0)
+; RV32I-NEXT:  .LBB13_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_1:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB13_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lhu a2, 8(a0)
+; RV64I-NEXT:  .LBB13_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_1:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB13_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB13_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB13_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:  .LBB13_4: # %entry
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_1:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB13_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB13_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_1:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a4, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB13_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lhu a2, 8(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB13_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB13_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:  .LBB13_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_1:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB13_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lhu a2, 8(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB13_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i64_1:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB14_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    lb a2, 4(a0)
-; RV32I-NEXT:    srai a3, a2, 31
-; RV32I-NEXT:  .LBB5_2: # %entry
+; RV32I-NEXT:    lw a2, 32(a0)
+; RV32I-NEXT:    lw a3, 36(a0)
+; RV32I-NEXT:  .LBB14_2: # %entry
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
-; RV64I-LABEL: test_i8_s_1:
+; RV64I-LABEL: test_i64_1:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    beqz a1, .LBB5_2
+; RV64I-NEXT:    beqz a1, .LBB14_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    lb a2, 4(a0)
-; RV64I-NEXT:  .LBB5_2: # %entry
+; RV64I-NEXT:    ld a2, 32(a0)
+; RV64I-NEXT:  .LBB14_2: # %entry
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
-; RV32I-SFB-LABEL: test_i8_s_1:
+; RV32I-SFB-LABEL: test_i64_1:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lw a4, 32(a0)
+; RV32I-SFB-NEXT:    lw a5, 36(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB14_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a4, a2
+; RV32I-SFB-NEXT:  .LBB14_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB14_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a5, a3
+; RV32I-SFB-NEXT:  .LBB14_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, a4
+; RV32I-SFB-NEXT:    mv a1, a5
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_1:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB14_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB14_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_1:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB14_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a2, 32(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB14_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB14_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a3, 36(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB14_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_1:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB14_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB14_2: # %entry
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4 elements (byte offset 32)
+  %val = load i64, ptr %addr          ; load 64-bit value
+  %res = select i1 %x, i64 %val, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i8_s_store_64:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB15_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB15_2:
+; RV32I-NEXT:    srai a1, a0, 31
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_store_64:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB15_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB15_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_store_64:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lb a0, 4(a0)
 ; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB5_2
+; RV32I-SFB-NEXT:    beqz a1, .LBB15_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB5_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB5_4
+; RV32I-SFB-NEXT:  .LBB15_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB15_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
 ; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB5_4: # %entry
+; RV32I-SFB-NEXT:  .LBB15_4: # %entry
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
 ; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
-; RV64I-SFB-LABEL: test_i8_s_1:
+; RV64I-SFB-LABEL: test_i8_s_store_64:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lb a0, 4(a0)
 ; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB5_2
+; RV64I-SFB-NEXT:    bnez a1, .LBB15_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB5_2: # %entry
+; RV64I-SFB-NEXT:  .LBB15_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
-; RV32I-SFBILOAD-LABEL: test_i8_s_1:
+; RV32I-SFBILOAD-LABEL: test_i8_s_store_64:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
 ; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB5_2
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB15_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB5_4
+; RV32I-SFBILOAD-NEXT:  .LBB15_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB15_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
 ; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB5_4: # %entry
+; RV32I-SFBILOAD-NEXT:  .LBB15_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
-; RV64I-SFBILOAD-LABEL: test_i8_s_1:
+; RV64I-SFBILOAD-LABEL: test_i8_s_store_64:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
 ; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB5_2
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB15_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lb a2, 4(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB15_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
   %val = load i8, ptr %addr          ; load 8-bit value
   %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  store i64 %c, ptr %base1
   %res = select i1 %x, i64 %ext, i64 %b
   ret i64 %res
 }
 
-define i64 @test_i8_z_1(ptr %base, i1 %x, i64 %b) {
-; RV32I-LABEL: test_i8_z_1:
+define i64 @test_i8_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i8_z_store_64:
 ; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lbu a0, 4(a0)
 ; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    beqz a1, .LBB6_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    lbu a2, 4(a0)
-; RV32I-NEXT:  .LBB6_2: # %entry
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB16_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB16_2: # %entry
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
 ;
-; RV64I-LABEL: test_i8_z_1:
+; RV64I-LABEL: test_i8_z_store_64:
 ; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lbu a0, 4(a0)
 ; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    beqz a1, .LBB6_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    lbu a2, 4(a0)
-; RV64I-NEXT:  .LBB6_2: # %entry
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB16_2
+; RV64I-NEXT:  # %bb.1: # %entry
 ; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB16_2: # %entry
 ; RV64I-NEXT:    ret
 ;
-; RV32I-SFB-LABEL: test_i8_z_1:
+; RV32I-SFB-LABEL: test_i8_z_store_64:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lbu a0, 4(a0)
 ; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB6_2
+; RV32I-SFB-NEXT:    beqz a1, .LBB16_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB6_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB6_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
 ; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB6_4: # %entry
+; RV32I-SFB-NEXT:  .LBB16_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB16_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB16_4: # %entry
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
 ; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
-; RV64I-SFB-LABEL: test_i8_z_1:
+; RV64I-SFB-LABEL: test_i8_z_store_64:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lbu a0, 4(a0)
 ; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB6_2
+; RV64I-SFB-NEXT:    bnez a1, .LBB16_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB6_2: # %entry
+; RV64I-SFB-NEXT:  .LBB16_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
-; RV32I-SFBILOAD-LABEL: test_i8_z_1:
+; RV32I-SFBILOAD-LABEL: test_i8_z_store_64:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a4, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB6_2
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lbu a2, 4(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB6_4
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB16_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB16_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:  .LBB6_4: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB16_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
-; RV64I-SFBILOAD-LABEL: test_i8_z_1:
+; RV64I-SFBILOAD-LABEL: test_i8_z_store_64:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
 ; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB6_2
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB16_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lbu a2, 4(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB16_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
   %val = load i8, ptr %addr          ; load 8-bit value
   %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
   %res = select i1 %x, i64 %ext, i64 %b
   ret i64 %res
 }
 
-define i64 @test_i16_s_1(ptr %base, i1 %x, i64 %b) {
-; RV32I-LABEL: test_i16_s_1:
+define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i16_s_store_64:
 ; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lh a0, 8(a0)
 ; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    beqz a1, .LBB7_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    lh a2, 8(a0)
-; RV32I-NEXT:    srai a3, a2, 31
-; RV32I-NEXT:  .LBB7_2: # %entry
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB17_2
+; RV32I-NEXT:  # %bb.1: # %entry
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB17_2:
+; RV32I-NEXT:    srai a1, a0, 31
+; RV32I-NEXT:    ret
 ;
-; RV64I-LABEL: test_i16_s_1:
+; RV64I-LABEL: test_i16_s_store_64:
 ; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lh a0, 8(a0)
 ; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    beqz a1, .LBB7_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    lh a2, 8(a0)
-; RV64I-NEXT:  .LBB7_2: # %entry
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB17_2
+; RV64I-NEXT:  # %bb.1: # %entry
 ; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB17_2: # %entry
 ; RV64I-NEXT:    ret
 ;
-; RV32I-SFB-LABEL: test_i16_s_1:
+; RV32I-SFB-LABEL: test_i16_s_store_64:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lh a0, 8(a0)
 ; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB7_2
+; RV32I-SFB-NEXT:    beqz a1, .LBB17_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB7_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB7_4
+; RV32I-SFB-NEXT:  .LBB17_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB17_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
 ; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB7_4: # %entry
+; RV32I-SFB-NEXT:  .LBB17_4: # %entry
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
 ; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
-; RV64I-SFB-LABEL: test_i16_s_1:
+; RV64I-SFB-LABEL: test_i16_s_store_64:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lh a0, 8(a0)
 ; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB7_2
+; RV64I-SFB-NEXT:    bnez a1, .LBB17_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB7_2: # %entry
+; RV64I-SFB-NEXT:  .LBB17_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
-; RV32I-SFBILOAD-LABEL: test_i16_s_1:
+; RV32I-SFBILOAD-LABEL: test_i16_s_store_64:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
 ; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB7_2
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB17_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB7_4
+; RV32I-SFBILOAD-NEXT:  .LBB17_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB17_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
 ; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB7_4: # %entry
+; RV32I-SFBILOAD-NEXT:  .LBB17_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
-; RV64I-SFBILOAD-LABEL: test_i16_s_1:
+; RV64I-SFBILOAD-LABEL: test_i16_s_store_64:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
 ; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB7_2
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB17_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lh a2, 8(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB17_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
   %val = load i16, ptr %addr          ; load 16-bit value
   %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  store i64 %c, ptr %base1
   %res = select i1 %x, i64 %ext, i64 %b
   ret i64 %res
 }
 
-define i64 @test_i16_z_1(ptr %base, i1 %x, i64 %b) {
-; RV32I-LABEL: test_i16_z_1:
+define i64 @test_i16_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i16_z_store_64:
 ; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lhu a0, 8(a0)
 ; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    beqz a1, .LBB8_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    lhu a2, 8(a0)
-; RV32I-NEXT:  .LBB8_2: # %entry
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB18_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB18_2: # %entry
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
 ;
-; RV64I-LABEL: test_i16_z_1:
+; RV64I-LABEL: test_i16_z_store_64:
 ; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lhu a0, 8(a0)
 ; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    beqz a1, .LBB8_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    lhu a2, 8(a0)
-; RV64I-NEXT:  .LBB8_2: # %entry
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB18_2
+; RV64I-NEXT:  # %bb.1: # %entry
 ; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB18_2: # %entry
 ; RV64I-NEXT:    ret
 ;
-; RV32I-SFB-LABEL: test_i16_z_1:
+; RV32I-SFB-LABEL: test_i16_z_store_64:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lhu a0, 8(a0)
 ; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB8_2
+; RV32I-SFB-NEXT:    beqz a1, .LBB18_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB8_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB8_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
 ; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB8_4: # %entry
+; RV32I-SFB-NEXT:  .LBB18_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB18_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB18_4: # %entry
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
 ; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
-; RV64I-SFB-LABEL: test_i16_z_1:
+; RV64I-SFB-LABEL: test_i16_z_store_64:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lhu a0, 8(a0)
 ; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB8_2
+; RV64I-SFB-NEXT:    bnez a1, .LBB18_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB8_2: # %entry
+; RV64I-SFB-NEXT:  .LBB18_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
-; RV32I-SFBILOAD-LABEL: test_i16_z_1:
+; RV32I-SFBILOAD-LABEL: test_i16_z_store_64:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a4, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB8_2
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lhu a2, 8(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB8_4
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB18_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB18_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:  .LBB8_4: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB18_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
-; RV64I-SFBILOAD-LABEL: test_i16_z_1:
+; RV64I-SFBILOAD-LABEL: test_i16_z_store_64:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
 ; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB8_2
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB18_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lhu a2, 8(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB18_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
   %val = load i16, ptr %addr          ; load 16-bit value
   %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
   %res = select i1 %x, i64 %ext, i64 %b
   ret i64 %res
 }
 
-define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) {
-; RV32I-LABEL: test_i64_1:
+define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i64_store_64:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    beqz a1, .LBB9_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    lw a2, 32(a0)
-; RV32I-NEXT:    lw a3, 36(a0)
-; RV32I-NEXT:  .LBB9_2: # %entry
+; RV32I-NEXT:    mv a7, a1
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 32(a0)
+; RV32I-NEXT:    lw a1, 36(a1)
+; RV32I-NEXT:    andi a7, a7, 1
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a7, .LBB19_2
+; RV32I-NEXT:  # %bb.1: # %entry
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:  .LBB19_2: # %entry
 ; RV32I-NEXT:    ret
 ;
-; RV64I-LABEL: test_i64_1:
+; RV64I-LABEL: test_i64_store_64:
 ; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    ld a0, 32(a0)
 ; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    beqz a1, .LBB9_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    ld a2, 32(a0)
-; RV64I-NEXT:  .LBB9_2: # %entry
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB19_2
+; RV64I-NEXT:  # %bb.1: # %entry
 ; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB19_2: # %entry
 ; RV64I-NEXT:    ret
 ;
-; RV32I-SFB-LABEL: test_i64_1:
+; RV32I-SFB-LABEL: test_i64_store_64:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lw a4, 32(a0)
-; RV32I-SFB-NEXT:    lw a5, 36(a0)
+; RV32I-SFB-NEXT:    lw a7, 32(a0)
+; RV32I-SFB-NEXT:    lw t0, 36(a0)
 ; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB9_2
+; RV32I-SFB-NEXT:    bnez a1, .LBB19_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a4, a2
-; RV32I-SFB-NEXT:  .LBB9_2: # %entry
-; RV32I-SFB-NEXT:    bnez a1, .LBB9_4
+; RV32I-SFB-NEXT:    mv a7, a2
+; RV32I-SFB-NEXT:  .LBB19_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB19_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a5, a3
-; RV32I-SFB-NEXT:  .LBB9_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, a4
-; RV32I-SFB-NEXT:    mv a1, a5
+; RV32I-SFB-NEXT:    mv t0, a3
+; RV32I-SFB-NEXT:  .LBB19_4: # %entry
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a0, a7
+; RV32I-SFB-NEXT:    mv a1, t0
 ; RV32I-SFB-NEXT:    ret
 ;
-; RV64I-SFB-LABEL: test_i64_1:
+; RV64I-SFB-LABEL: test_i64_store_64:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    ld a0, 32(a0)
 ; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB9_2
+; RV64I-SFB-NEXT:    bnez a1, .LBB19_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB9_2: # %entry
+; RV64I-SFB-NEXT:  .LBB19_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
-; RV32I-SFBILOAD-LABEL: test_i64_1:
+; RV32I-SFBILOAD-LABEL: test_i64_store_64:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a7, 32(a0)
+; RV32I-SFBILOAD-NEXT:    lw t0, 36(a0)
 ; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB9_2
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB19_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lw a2, 32(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB9_4
+; RV32I-SFBILOAD-NEXT:    mv a7, a2
+; RV32I-SFBILOAD-NEXT:  .LBB19_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB19_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    lw a3, 36(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB9_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    mv t0, a3
+; RV32I-SFBILOAD-NEXT:  .LBB19_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a0, a7
+; RV32I-SFBILOAD-NEXT:    mv a1, t0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
-; RV64I-SFBILOAD-LABEL: test_i64_1:
+; RV64I-SFBILOAD-LABEL: test_i64_store_64:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
 ; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB19_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV64I-SFBILOAD-NEXT:  .LBB19_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4 elements (byte offset 32)
   %val = load i64, ptr %addr          ; load 64-bit value
+  store i64 %c, ptr %base1
   %res = select i1 %x, i64 %val, i64 %b
   ret i64 %res
 }

>From b6b2619ec3ec8d0f8af7d9aab92280c346f193c6 Mon Sep 17 00:00:00 2001
From: Harsh Chandel <hchandel at qti.qualcomm.com>
Date: Thu, 4 Dec 2025 14:20:44 +0530
Subject: [PATCH 05/11] fixup! Add support for LD and LWU and other test cases

Change-Id: Ie893377a347c6be3f39868cced6c099950b17bdd
---
 .../Target/RISCV/RISCVExpandPseudoInsts.cpp   |    7 +-
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |    7 +-
 llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td    |    2 +
 ...-forward-branch-opt-load-atomic-acquire.ll | 5379 +++++++++++++++++
 ...orward-branch-opt-load-atomic-monotonic.ll | 5379 +++++++++++++++++
 ...-forward-branch-opt-load-atomic-seq_cst.ll | 5379 +++++++++++++++++
 .../short-forward-branch-opt-load-volatile.ll | 1022 ++++
 .../RISCV/short-forward-branch-opt-load.ll    |  364 +-
 8 files changed, 17440 insertions(+), 99 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll
 create mode 100644 llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll
 create mode 100644 llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll
 create mode 100644 llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll

diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 04394c37b6bf6..a18aad25ae745 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -138,6 +138,8 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
   case RISCV::PseudoCCLW:
   case RISCV::PseudoCCLHU:
   case RISCV::PseudoCCLBU:
+  case RISCV::PseudoCCLWU:
+  case RISCV::PseudoCCLD:
   case RISCV::PseudoCCQC_LI:
   case RISCV::PseudoCCQC_E_LI:
   case RISCV::PseudoCCADDW:
@@ -253,6 +255,8 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
     case RISCV::PseudoCCLW:    NewOpc = RISCV::LW;    break;
     case RISCV::PseudoCCLHU:   NewOpc = RISCV::LHU;   break;
     case RISCV::PseudoCCLBU:   NewOpc = RISCV::LBU;   break;
+    case RISCV::PseudoCCLWU:   NewOpc = RISCV::LWU;   break;
+    case RISCV::PseudoCCLD:    NewOpc = RISCV::LD;    break;
     case RISCV::PseudoCCQC_LI:  NewOpc = RISCV::QC_LI;   break;
     case RISCV::PseudoCCQC_E_LI: NewOpc = RISCV::QC_E_LI;   break;
     case RISCV::PseudoCCADDI:  NewOpc = RISCV::ADDI;  break;
@@ -290,7 +294,8 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
     } else {
       BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg)
           .add(MI.getOperand(5))
-          .add(MI.getOperand(6));
+          .add(MI.getOperand(6))
+          .cloneMemRefs(MI);
     }
   }
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index f5c65410a3eba..b1c2e7e991c3e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -900,6 +900,10 @@ unsigned getLoadPredicatedOpcode(unsigned Opcode) {
     return RISCV::PseudoCCLHU;
   case RISCV::LW:
     return RISCV::PseudoCCLW;
+  case RISCV::LWU:
+    return RISCV::PseudoCCLWU;
+  case RISCV::LD:
+    return RISCV::PseudoCCLD;
   default:
     return 0;
   }
@@ -916,7 +920,8 @@ MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
   if (!STI.hasShortForwardBranchILoad() ||
       (LoadMI.getOpcode() != RISCV::LB && LoadMI.getOpcode() != RISCV::LBU &&
        LoadMI.getOpcode() != RISCV::LH && LoadMI.getOpcode() != RISCV::LHU &&
-       LoadMI.getOpcode() != RISCV::LW))
+       LoadMI.getOpcode() != RISCV::LW && LoadMI.getOpcode() != RISCV::LWU &&
+       LoadMI.getOpcode() != RISCV::LD))
     return nullptr;
 
   MachineRegisterInfo &MRI = MF.getRegInfo();
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
index e7fca38cf5dbe..bcb81d14ed36f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
@@ -138,6 +138,8 @@ def PseudoCCLH : SFBLoad;
 def PseudoCCLW : SFBLoad;
 def PseudoCCLHU : SFBLoad;
 def PseudoCCLBU : SFBLoad;
+def PseudoCCLWU : SFBLoad;
+def PseudoCCLD : SFBLoad;
 
 def PseudoCCADDI : SFBALU_ri;
 def PseudoCCANDI : SFBALU_ri;
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll
new file mode 100644
index 0000000000000..51f2643c94191
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll
@@ -0,0 +1,5379 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-opt | \
+; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-opt | \
+; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-i-load | \
+; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-i-load | \
+; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
+
+define i32 @test_i8_s_3(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i8_s_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s1, .LBB0_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:  .LBB0_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB0_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB0_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s1, .LBB0_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 24
+; RV32I-SFB-NEXT:  .LBB0_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s1, .LBB0_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:  .LBB0_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB0_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB0_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB0_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB0_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i8_z_3(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i8_z_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s1, .LBB1_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:  .LBB1_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB1_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB1_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s1, .LBB1_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    zext.b s0, a0
+; RV32I-SFB-NEXT:  .LBB1_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s1, .LBB1_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:  .LBB1_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB1_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV32I-SFBILOAD-NEXT:  .LBB1_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB1_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:  .LBB1_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4 ; element 4, i.e. base + 4 bytes
+  %val = load atomic i8, ptr %addr acquire, align 1 ; load 8-bit value
+  %ext = zext i8 %val to i32 ; zero-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
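+; Sign-extended atomic i16 load feeding a select. The acquire load lowers to an
+; __atomic_load_2 libcall, so the SFB variants hoist the slli and predicate only
+; the srai that completes the sign-extension.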
+define i32 @test_i16_s_3(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i16_s_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s1, .LBB2_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:  .LBB2_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB2_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB2_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s1, .LBB2_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 16
+; RV32I-SFB-NEXT:  .LBB2_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB2_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB2_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB2_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB2_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB2_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB2_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4 ; element 4, i.e. base + 8 bytes
+  %val = load atomic i16, ptr %addr acquire, align 2 ; load 16-bit value
+  %ext = sext i16 %val to i32 ; sign-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
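+; Same as test_i16_s_3 but zero-extended: only the srli that completes the
+; zero-extension ends up inside the short forward branch.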
+define i32 @test_i16_z_3(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i16_z_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s1, .LBB3_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:  .LBB3_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB3_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB3_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s1, .LBB3_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srli s0, a0, 16
+; RV32I-SFB-NEXT:  .LBB3_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB3_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB3_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB3_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s0, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB3_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB3_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB3_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4 ; element 4, i.e. base + 8 bytes
+  %val = load atomic i16, ptr %addr acquire, align 2 ; load 16-bit value
+  %ext = zext i16 %val to i32 ; zero-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
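+; Atomic i32 load feeding the select directly (no extension). The load lowers
+; to an __atomic_load_4 libcall, so every configuration predicates just the mv
+; of the other select operand.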
+define i32 @test_i32_3(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i32_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_4
+; RV32I-NEXT:    bnez s1, .LBB4_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:  .LBB4_2: # %entry
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 16
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_4
+; RV64I-NEXT:    bnez s1, .LBB4_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB4_2: # %entry
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 16
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_4
+; RV32I-SFB-NEXT:    bnez s1, .LBB4_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:  .LBB4_2: # %entry
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 16
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_4
+; RV64I-SFB-NEXT:    bnez s1, .LBB4_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:  .LBB4_2: # %entry
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV32I-SFBILOAD-NEXT:    bnez s1, .LBB4_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:  .LBB4_2: # %entry
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB4_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:  .LBB4_2: # %entry
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4 ; element 4, i.e. base + 16 bytes
+  %val = load atomic i32, ptr %addr acquire, align 4 ; load 32-bit value
+  %res = select i1 %x, i32 %val, i32 %b
+  ret i32 %res
+}
+
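+; Adds an independent store to %base1 between the atomic i8 load and the
+; select; checks that the store is scheduled correctly around the predicated
+; sign-extension.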
+define i32 @test_i8_s_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i8_s_store_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB5_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:  .LBB5_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_store_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB5_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB5_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_store_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s3, .LBB5_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s2, a0, 24
+; RV32I-SFB-NEXT:  .LBB5_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_store_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s3, .LBB5_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:  .LBB5_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_store_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB5_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s2, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_store_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB5_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4 ; element 4, i.e. base + 4 bytes
+  %val = load atomic i8, ptr %addr acquire, align 1 ; load 8-bit value
+  %ext = sext i8 %val to i32 ; sign-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
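+; Zero-extending variant of test_i8_s_store_3: an independent store to %base1
+; sits between the atomic i8 load and the select.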
+define i32 @test_i8_z_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i8_z_store_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB6_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:  .LBB6_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_store_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB6_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB6_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_store_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s3, .LBB6_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    zext.b s2, a0
+; RV32I-SFB-NEXT:  .LBB6_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_store_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s3, .LBB6_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:  .LBB6_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_store_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB6_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_store_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB6_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4 ; element 4, i.e. base + 4 bytes
+  %val = load atomic i8, ptr %addr acquire, align 1 ; load 8-bit value
+  %ext = zext i8 %val to i32 ; zero-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
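+; i16 sign-extending variant with an intervening store to %base1; the SFB
+; variants sink the store past the predicated srai.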
+define i32 @test_i16_s_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i16_s_store_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:  .LBB7_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_store_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB7_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB7_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_store_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s3, .LBB7_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s2, a0, 16
+; RV32I-SFB-NEXT:  .LBB7_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB7_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB7_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_store_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB7_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s2, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_store_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB7_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4 ; element 4, i.e. base + 8 bytes
+  %val = load atomic i16, ptr %addr acquire, align 2 ; load 16-bit value
+  %ext = sext i16 %val to i32 ; sign-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
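+; i16 zero-extending variant with an intervening store to %base1.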
+define i32 @test_i16_z_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i16_z_store_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB8_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:  .LBB8_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_store_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB8_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB8_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_store_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s3, .LBB8_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srli s2, a0, 16
+; RV32I-SFB-NEXT:  .LBB8_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_store_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB8_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB8_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_store_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB8_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s2, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_store_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB8_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
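+; A 32-bit load needs no extension, so the SFB/SFBILOAD outputs below simply
+; predicate the mv that picks the select operand and sink the store past the
+; short forward branch.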
+define i32 @test_i32_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i32_store_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_4
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    bnez s3, .LBB9_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:  .LBB9_2: # %entry
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_store_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 16
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_4
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    bnez s3, .LBB9_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB9_2: # %entry
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_store_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 16
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_4
+; RV32I-SFB-NEXT:    bnez s3, .LBB9_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:  .LBB9_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_store_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 16
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_4
+; RV64I-SFB-NEXT:    bnez s3, .LBB9_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:  .LBB9_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_store_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV32I-SFBILOAD-NEXT:    bnez s3, .LBB9_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_store_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB9_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4   ; compute address of element 4 (base + 16 bytes)
+  %val = load atomic i32, ptr %addr acquire, align 4          ; load 32-bit value
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %val, i32 %b
+  ret i32 %res
+}
+
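+; The *_1_3 tests return i64. On RV32 the sign extension produces both a low
+; and a high word, so the SFB/SFBILOAD outputs below use two short forward
+; branches, one predicated shift per result half.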
+define i64 @test_i8_s_1_3(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i8_s_1_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s2, .LBB10_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 24
+; RV32I-NEXT:  .LBB10_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_1_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB10_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB10_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_1_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s2, .LBB10_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s1, a0, 24
+; RV32I-SFB-NEXT:  .LBB10_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB10_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:  .LBB10_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_1_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s1, .LBB10_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:  .LBB10_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_1_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB10_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s1, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB10_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB10_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB10_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_1_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB10_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB10_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
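+; Zero extension to i64: the high word is constant zero, so on RV32 one short
+; forward branch predicates "li s0, 0" for the high half while the other
+; predicates the zext.b of the low half.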
+define i64 @test_i8_z_1_3(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i8_z_1_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s2, .LBB11_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:  .LBB11_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_1_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB11_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB11_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_1_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s2, .LBB11_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:  .LBB11_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB11_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    zext.b s1, a0
+; RV32I-SFB-NEXT:  .LBB11_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_1_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s1, .LBB11_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:  .LBB11_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_1_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB11_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:  .LBB11_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB11_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s1, a0
+; RV32I-SFBILOAD-NEXT:  .LBB11_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_1_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB11_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:  .LBB11_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
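+; i16 analogue of test_i8_s_1_3: the predicated sign-extending shifts use
+; amounts of 16 (RV32) and 48 (RV64) instead of 24 and 56.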
+define i64 @test_i16_s_1_3(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i16_s_1_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s2, .LBB12_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 16
+; RV32I-NEXT:  .LBB12_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_1_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB12_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB12_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_1_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s2, .LBB12_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s1, a0, 16
+; RV32I-SFB-NEXT:  .LBB12_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB12_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:  .LBB12_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_1_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB12_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB12_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_1_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB12_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s1, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB12_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB12_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB12_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_1_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB12_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB12_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
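+; i16 analogue of test_i8_z_1_3: the low half is zero-extended with slli+srli
+; pairs instead of zext.b.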
+define i64 @test_i16_z_1_3(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i16_z_1_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s2, .LBB13_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:  .LBB13_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_1_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB13_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB13_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_1_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s2, .LBB13_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:  .LBB13_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB13_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srli s1, a0, 16
+; RV32I-SFB-NEXT:  .LBB13_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_1_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB13_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB13_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_1_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB13_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:  .LBB13_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB13_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s1, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB13_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_1_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB13_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB13_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
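+
+; Note: with only the base I extension available, zero-extending the 16-bit
+; libcall result takes a shift pair, as the check lines above show, e.g. on
+; RV32:
+;
+;   slli a0, a0, 16        ; move the halfword into the top bits
+;   srli a0, a0, 16        ; shift back down, filling with zeros
+;
+; (RV64 uses 48-bit shifts; an 8-bit zext can use the zext.b alias instead.)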
+
+define i64 @test_i32_z_1_3(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i32_z_1_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a1, a0, 16
+; RV32I-NEXT:    li a0, 4
+; RV32I-NEXT:    addi a2, sp, 12
+; RV32I-NEXT:    li a3, 2
+; RV32I-NEXT:    call __atomic_load
+; RV32I-NEXT:    beqz s2, .LBB14_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lw s1, 12(sp)
+; RV32I-NEXT:  .LBB14_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_1_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a1, a0, 16
+; RV64I-NEXT:    li a0, 4
+; RV64I-NEXT:    addi a2, sp, 4
+; RV64I-NEXT:    li a3, 2
+; RV64I-NEXT:    call __atomic_load
+; RV64I-NEXT:    beqz s1, .LBB14_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lwu s0, 4(sp)
+; RV64I-NEXT:  .LBB14_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_1_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a1, a0, 16
+; RV32I-SFB-NEXT:    li a0, 4
+; RV32I-SFB-NEXT:    addi a2, sp, 12
+; RV32I-SFB-NEXT:    li a3, 2
+; RV32I-SFB-NEXT:    call __atomic_load
+; RV32I-SFB-NEXT:    lw a0, 12(sp)
+; RV32I-SFB-NEXT:    bnez s2, .LBB14_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:  .LBB14_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB14_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:  .LBB14_4: # %entry
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_1_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a1, a0, 16
+; RV64I-SFB-NEXT:    li a0, 4
+; RV64I-SFB-NEXT:    addi a2, sp, 4
+; RV64I-SFB-NEXT:    li a3, 2
+; RV64I-SFB-NEXT:    call __atomic_load
+; RV64I-SFB-NEXT:    lwu a0, 4(sp)
+; RV64I-SFB-NEXT:    bnez s1, .LBB14_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:  .LBB14_2: # %entry
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_1_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a0, 4
+; RV32I-SFBILOAD-NEXT:    addi a2, sp, 12
+; RV32I-SFBILOAD-NEXT:    li a3, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw s1, 12(sp)
+; RV32I-SFBILOAD-NEXT:  .LBB14_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:  .LBB14_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_1_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a0, 4
+; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
+; RV64I-SFBILOAD-NEXT:    li a3, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB14_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lwu s0, 4(sp)
+; RV64I-SFBILOAD-NEXT:  .LBB14_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4 i32 elements (byte offset 16)
+  %val = load atomic i32, ptr %addr acquire, align 2          ; under-aligned 32-bit atomic load (becomes a libcall)
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
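+
+; Note: the i32 case above uses align 2, which is under-aligned for an atomic
+; i32 access, so it is lowered to the generic __atomic_load libcall through a
+; stack temporary. Plain SFB reloads that slot unconditionally and predicates
+; only the register moves; SFBILOAD moves the lw/lwu reload itself into the
+; branch shadow, as the check lines above show. A minimal, hypothetical sketch
+; (not itself a test in this file) of the basic pattern that load predication
+; targets is a plain load feeding a select:
+;
+;   define i32 @sfb_load_sketch(ptr %p, i1 %c, i32 %b) {
+;     %v = load i32, ptr %p
+;     %r = select i1 %c, i32 %v, i32 %b
+;     ret i32 %r
+;   }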
+
+define i64 @test_i64_1_3(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i64_1_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_8
+; RV32I-NEXT:    bnez s2, .LBB15_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:  .LBB15_2: # %entry
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_1_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 32
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_8
+; RV64I-NEXT:    bnez s1, .LBB15_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB15_2: # %entry
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_1_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 32
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_8
+; RV32I-SFB-NEXT:    bnez s2, .LBB15_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:  .LBB15_2: # %entry
+; RV32I-SFB-NEXT:    bnez s2, .LBB15_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:  .LBB15_4: # %entry
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_1_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 32
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_8
+; RV64I-SFB-NEXT:    bnez s1, .LBB15_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:  .LBB15_2: # %entry
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_1_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB15_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:  .LBB15_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB15_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:  .LBB15_4: # %entry
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_1_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB15_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:  .LBB15_2: # %entry
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4 i64 elements (byte offset 32)
+  %val = load atomic i64, ptr %addr acquire, align 8          ; load 64-bit value
+  %res = select i1 %x, i64 %val, i64 %b
+  ret i64 %res
+}
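+
+; Note: the i64 case is naturally aligned, so it lowers to __atomic_load_8 and
+; the result is already in registers (a0/a1 on RV32); with no reload left to
+; predicate, the SFBILOAD output matches plain SFB. To try one configuration
+; by hand, an invocation along these lines should work (the feature name comes
+; from this patch; the authoritative RUN lines are at the top of this file):
+;
+;   llc -mtriple=riscv64 -mattr=+short-forward-branch-i-load < test.ll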
+
+define i64 @test_i8_s_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i8_s_store_64_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    beqz s5, .LBB16_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 24
+; RV32I-NEXT:  .LBB16_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_store_64_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB16_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB16_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_store_64_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s5, .LBB16_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s4, a0, 24
+; RV32I-SFB-NEXT:  .LBB16_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB16_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s3, a0, 31
+; RV32I-SFB-NEXT:  .LBB16_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_store_64_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s3, .LBB16_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:  .LBB16_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_store_64_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB16_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s4, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB16_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB16_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB16_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_store_64_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB16_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB16_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4 i8 elements (byte offset 4)
+  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
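+
+; Note: in the SFB outputs above, the store to %base1 is sunk below the
+; predicated blocks, so each branch shadow holds exactly one instruction.
+; The predicated-load pseudos added by this patch presumably expand to the
+; usual branch-over form, sketched here with arbitrary registers:
+;
+;   beqz s3, .Lmerge        ; branch over the shadow on the inverted condition
+;   lb   s2, 0(a0)          ; the single predicated load
+; .Lmerge: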
+
+define i64 @test_i8_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i8_z_store_64_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    beqz s5, .LBB17_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:  .LBB17_2: # %entry
+; RV32I-NEXT:    addi a1, s5, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_store_64_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB17_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB17_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_store_64_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s5, .LBB17_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:  .LBB17_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB17_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    zext.b s4, a0
+; RV32I-SFB-NEXT:  .LBB17_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_store_64_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s3, .LBB17_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:  .LBB17_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_store_64_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB17_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB17_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB17_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s4, a0
+; RV32I-SFBILOAD-NEXT:  .LBB17_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_store_64_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB17_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:  .LBB17_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4 i8 elements (byte offset 4)
+  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_s_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i16_s_store_64_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    beqz s5, .LBB18_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 16
+; RV32I-NEXT:  .LBB18_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_store_64_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB18_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB18_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_store_64_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s5, .LBB18_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s4, a0, 16
+; RV32I-SFB-NEXT:  .LBB18_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB18_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s3, a0, 31
+; RV32I-SFB-NEXT:  .LBB18_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_64_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB18_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB18_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_store_64_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB18_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s4, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB18_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB18_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB18_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_store_64_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB18_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB18_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 i16 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i16_z_store_64_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    beqz s5, .LBB19_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:  .LBB19_2: # %entry
+; RV32I-NEXT:    addi a1, s5, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_store_64_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB19_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB19_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_store_64_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s5, .LBB19_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:  .LBB19_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB19_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srli s4, a0, 16
+; RV32I-SFB-NEXT:  .LBB19_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_store_64_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB19_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB19_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_store_64_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB19_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB19_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB19_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s4, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB19_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_store_64_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB19_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB19_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
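The zero-extending variant differs only in the extension; a hedged C++ sketch (names invented):

#include <atomic>
#include <cstdint>

// Illustrative sketch of test_i16_z_store_64_3: as above, but the
// half-word is zero-extended, which is the slli/srli-by-16 pair in
// the RV32 checks.
uint64_t sketch_i16_z_store(std::atomic<uint16_t> *base, bool x,
                            uint64_t b, int64_t *base1, int64_t c) {
  uint16_t v = base[4].load(std::memory_order_acquire);
  *base1 = c;
  return x ? static_cast<uint64_t>(v) : b;
}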
+define i64 @test_i32_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i32_z_store_64_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a1, a0, 16
+; RV32I-NEXT:    li a0, 4
+; RV32I-NEXT:    mv a2, sp
+; RV32I-NEXT:    li a3, 2
+; RV32I-NEXT:    call __atomic_load
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    bnez s5, .LBB20_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:  .LBB20_2: # %entry
+; RV32I-NEXT:    addi a1, s5, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_store_64_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a1, a0, 16
+; RV64I-NEXT:    li a0, 4
+; RV64I-NEXT:    addi a2, sp, 4
+; RV64I-NEXT:    li a3, 2
+; RV64I-NEXT:    call __atomic_load
+; RV64I-NEXT:    lwu a0, 4(sp)
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    bnez s3, .LBB20_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB20_2: # %entry
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_store_64_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a1, a0, 16
+; RV32I-SFB-NEXT:    li a0, 4
+; RV32I-SFB-NEXT:    mv a2, sp
+; RV32I-SFB-NEXT:    li a3, 2
+; RV32I-SFB-NEXT:    call __atomic_load
+; RV32I-SFB-NEXT:    lw a0, 0(sp)
+; RV32I-SFB-NEXT:    beqz s5, .LBB20_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:  .LBB20_2: # %entry
+; RV32I-SFB-NEXT:    bnez s5, .LBB20_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:  .LBB20_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_store_64_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a1, a0, 16
+; RV64I-SFB-NEXT:    li a0, 4
+; RV64I-SFB-NEXT:    addi a2, sp, 4
+; RV64I-SFB-NEXT:    li a3, 2
+; RV64I-SFB-NEXT:    call __atomic_load
+; RV64I-SFB-NEXT:    lwu a0, 4(sp)
+; RV64I-SFB-NEXT:    bnez s3, .LBB20_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:  .LBB20_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a0, 4
+; RV32I-SFBILOAD-NEXT:    mv a2, sp
+; RV32I-SFBILOAD-NEXT:    li a3, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load
+; RV32I-SFBILOAD-NEXT:    lw a0, 0(sp)
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB20_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB20_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB20_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:  .LBB20_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a0, 4
+; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
+; RV64I-SFBILOAD-NEXT:    li a3, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load
+; RV64I-SFBILOAD-NEXT:    lwu a0, 4(sp)
+; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB20_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:  .LBB20_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; address of element 4 (base + 16 bytes)
+  %val = load atomic i32, ptr %addr acquire, align 2   ; under-aligned 32-bit load; forces the generic __atomic_load libcall
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
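C++ has no portable spelling for an under-aligned atomic, so this sketch uses the Clang/GCC __atomic_load builtin to mirror the libcall the checks expect (names invented, illustrative only):

#include <cstdint>

// Illustrative sketch of test_i32_z_store_64_3: the IR asks for an
// atomic i32 load with only 2-byte alignment, which cannot be inlined,
// so the backend emits the generic __atomic_load libcall (size 4)
// visible in all six check prefixes above.
uint64_t sketch_i32_z_store(int32_t *base, bool x, uint64_t b,
                            int64_t *base1, int64_t c) {
  int32_t v;
  __atomic_load(base + 4, &v, __ATOMIC_ACQUIRE);  // generic, sized libcall
  *base1 = c;
  return x ? static_cast<uint32_t>(v) : b;        // zero-extends to 64 bits
}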
+define i64 @test_i64_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i64_store_64_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_8
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    bnez s5, .LBB21_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:  .LBB21_2: # %entry
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_store_64_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 32
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    call __atomic_load_8
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    bnez s3, .LBB21_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB21_2: # %entry
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_store_64_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 32
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_8
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    bnez s5, .LBB21_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:  .LBB21_2: # %entry
+; RV32I-SFB-NEXT:    bnez s5, .LBB21_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:  .LBB21_4: # %entry
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_store_64_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 32
+; RV64I-SFB-NEXT:    li a1, 2
+; RV64I-SFB-NEXT:    call __atomic_load_8
+; RV64I-SFB-NEXT:    bnez s3, .LBB21_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:  .LBB21_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_store_64_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB21_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:  .LBB21_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB21_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:  .LBB21_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_store_64_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV64I-SFBILOAD-NEXT:    li a1, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB21_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:  .LBB21_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; address of element 4 (base + 32 bytes)
+  %val = load atomic i64, ptr %addr acquire, align 8          ; load 64-bit value
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %val, i64 %b
+  ret i64 %res
+}
+
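For the 64-bit case, a C++ sketch (illustrative only):

#include <atomic>
#include <cstdint>

// Illustrative sketch of test_i64_store_64_3: a 64-bit acquire load.
// On RV32 it becomes the __atomic_load_8 libcall and the result lives
// in a register pair, so the SFB lowering needs two branch-over moves
// (one per half), as the RV32I-SFB checks show.
int64_t sketch_i64_store(std::atomic<int64_t> *base, bool x,
                         int64_t b, int64_t *base1, int64_t c) {
  int64_t v = base[4].load(std::memory_order_acquire);
  *base1 = c;
  return x ? v : b;
}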
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll
new file mode 100644
index 0000000000000..c2564e6ac654f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll
@@ -0,0 +1,5379 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-opt | \
+; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-opt | \
+; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-i-load | \
+; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-i-load | \
+; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
+
+define i32 @test_i8_s_2(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i8_s_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s1, .LBB0_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:  .LBB0_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB0_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB0_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s1, .LBB0_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 24
+; RV32I-SFB-NEXT:  .LBB0_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s1, .LBB0_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:  .LBB0_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB0_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB0_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB0_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB0_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; address of element 4 (base + 4 bytes)
+  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
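A C++ sketch of the monotonic byte test (names invented, illustrative only):

#include <atomic>
#include <cstdint>

// Illustrative sketch of test_i8_s_2: a relaxed (monotonic) byte load;
// the `li a1, 0` before __atomic_load_1 in the checks passes
// __ATOMIC_RELAXED as the memory-order argument.
int32_t sketch_i8_s(std::atomic<int8_t> *base, bool x, int32_t b) {
  int8_t v = base[4].load(std::memory_order_relaxed);
  return x ? static_cast<int32_t>(v) : b;  // sign-extend, then select
}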
+define i32 @test_i8_z_2(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i8_z_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s1, .LBB1_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:  .LBB1_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB1_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB1_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s1, .LBB1_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    zext.b s0, a0
+; RV32I-SFB-NEXT:  .LBB1_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s1, .LBB1_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:  .LBB1_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB1_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV32I-SFBILOAD-NEXT:  .LBB1_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB1_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:  .LBB1_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; address of element 4 (base + 4 bytes)
+  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
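And its zero-extending twin (illustrative only):

#include <atomic>
#include <cstdint>

// Illustrative sketch of test_i8_z_2: the same relaxed load, but
// zero-extended; the zext.b in the checks is the i8 -> i32 extension.
uint32_t sketch_i8_z(std::atomic<uint8_t> *base, bool x, uint32_t b) {
  uint8_t v = base[4].load(std::memory_order_relaxed);
  return x ? static_cast<uint32_t>(v) : b;
}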
+define i32 @test_i16_s_2(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i16_s_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s1, .LBB2_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:  .LBB2_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB2_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB2_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s1, .LBB2_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 16
+; RV32I-SFB-NEXT:  .LBB2_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB2_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB2_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB2_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB2_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB2_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB2_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4           ; compute base + 4 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr monotonic, align 2  ; load 16-bit value
+  %ext = sext i16 %val to i32                           ; sign-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
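+; Zero-extending variant of test_i16_s_2: srli rather than srai sits in the
+; branch shadow.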
+define i32 @test_i16_z_2(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i16_z_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s1, .LBB3_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:  .LBB3_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB3_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB3_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s1, .LBB3_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srli s0, a0, 16
+; RV32I-SFB-NEXT:  .LBB3_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB3_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB3_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB3_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s0, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB3_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB3_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB3_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4           ; compute base + 4 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr monotonic, align 2  ; load 16-bit value
+  %ext = zext i16 %val to i32                           ; zero-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
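+; i32 atomic load via an __atomic_load_4 libcall; no extension is needed, so
+; only the mv of the select operand is predicated in the SFB configurations.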
+define i32 @test_i32_2(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i32_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_4
+; RV32I-NEXT:    bnez s1, .LBB4_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:  .LBB4_2: # %entry
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 16
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_4
+; RV64I-NEXT:    bnez s1, .LBB4_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB4_2: # %entry
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 16
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_4
+; RV32I-SFB-NEXT:    bnez s1, .LBB4_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:  .LBB4_2: # %entry
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 16
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_4
+; RV64I-SFB-NEXT:    bnez s1, .LBB4_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:  .LBB4_2: # %entry
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV32I-SFBILOAD-NEXT:    bnez s1, .LBB4_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:  .LBB4_2: # %entry
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB4_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:  .LBB4_2: # %entry
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4           ; compute base + 4 elements (byte offset 16)
+  %val = load atomic i32, ptr %addr monotonic, align 4  ; load 32-bit value
+  %res = select i1 %x, i32 %val, i32 %b
+  ret i32 %res
+}
+
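+; Like the earlier i8 tests, but with an intervening i32 store to %base1. In
+; the SFB runs the store is scheduled after the short forward branch rather
+; than before it.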
+define i32 @test_i8_s_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i8_s_store_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB5_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:  .LBB5_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_store_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB5_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB5_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_store_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s3, .LBB5_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s2, a0, 24
+; RV32I-SFB-NEXT:  .LBB5_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_store_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s3, .LBB5_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:  .LBB5_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_store_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB5_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s2, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_store_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB5_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4            ; compute base + 4 bytes
+  %val = load atomic i8, ptr %addr monotonic, align 1   ; load 8-bit value
+  %ext = sext i8 %val to i32                            ; sign-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
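+; Zero-extending variant of test_i8_s_store_2; zext.b is the lone predicated
+; instruction.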
+define i32 @test_i8_z_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i8_z_store_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB6_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:  .LBB6_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_store_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB6_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB6_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_store_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s3, .LBB6_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    zext.b s2, a0
+; RV32I-SFB-NEXT:  .LBB6_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_store_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s3, .LBB6_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:  .LBB6_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_store_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB6_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_store_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB6_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4            ; compute base + 4 bytes
+  %val = load atomic i8, ptr %addr monotonic, align 1   ; load 8-bit value
+  %ext = zext i8 %val to i32                            ; zero-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
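+; i16 sign-extending variant with the intervening store (byte offset 8 via
+; __atomic_load_2).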
+define i32 @test_i16_s_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i16_s_store_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:  .LBB7_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_store_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB7_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB7_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_store_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s3, .LBB7_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s2, a0, 16
+; RV32I-SFB-NEXT:  .LBB7_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB7_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB7_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_store_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB7_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s2, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_store_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB7_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4           ; compute base + 4 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr monotonic, align 2  ; load 16-bit value
+  %ext = sext i16 %val to i32                           ; sign-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
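+; i16 zero-extending variant with the intervening store; srli in the branch
+; shadow.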
+define i32 @test_i16_z_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i16_z_store_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB8_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:  .LBB8_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_store_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB8_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB8_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_store_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s3, .LBB8_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srli s2, a0, 16
+; RV32I-SFB-NEXT:  .LBB8_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_store_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB8_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB8_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_store_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB8_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s2, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_store_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB8_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4  ; element 4, i.e. base + 8 bytes
+  %val = load atomic i16, ptr %addr monotonic, align 2  ; 16-bit atomic load
+  %ext = zext i16 %val to i32  ; zero-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
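+; Unextended i32 monotonic atomic load with an unrelated store between the
+; load and the select; lowered to an __atomic_load_4 libcall.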
+define i32 @test_i32_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i32_store_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_4
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    bnez s3, .LBB9_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:  .LBB9_2: # %entry
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_store_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 16
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_4
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    bnez s3, .LBB9_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB9_2: # %entry
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_store_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 16
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_4
+; RV32I-SFB-NEXT:    bnez s3, .LBB9_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:  .LBB9_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_store_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 16
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_4
+; RV64I-SFB-NEXT:    bnez s3, .LBB9_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:  .LBB9_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_store_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV32I-SFBILOAD-NEXT:    bnez s3, .LBB9_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_store_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB9_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4  ; element 4, i.e. base + 16 bytes
+  %val = load atomic i32, ptr %addr monotonic, align 4  ; 32-bit atomic load
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %val, i32 %b
+  ret i32 %res
+}
+
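+; i8 monotonic atomic load sign-extended to i64. On RV32 the i64 result lives
+; in a register pair, so SFB materializes each half under its own branch.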
+define i64 @test_i8_s_1_2(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i8_s_1_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s2, .LBB10_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 24
+; RV32I-NEXT:  .LBB10_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_1_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB10_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB10_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_1_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s2, .LBB10_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s1, a0, 24
+; RV32I-SFB-NEXT:  .LBB10_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB10_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:  .LBB10_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_1_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s1, .LBB10_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:  .LBB10_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_1_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB10_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s1, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB10_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB10_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB10_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_1_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB10_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB10_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4  ; base + 4 bytes
+  %val = load atomic i8, ptr %addr monotonic, align 1  ; 8-bit atomic load
+  %ext = sext i8 %val to i64  ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
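+; i8 monotonic atomic load zero-extended to i64; on RV32 the high word of the
+; result is conditionally zeroed.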
+define i64 @test_i8_z_1_2(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i8_z_1_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s2, .LBB11_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:  .LBB11_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_1_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB11_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB11_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_1_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s2, .LBB11_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:  .LBB11_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB11_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    zext.b s1, a0
+; RV32I-SFB-NEXT:  .LBB11_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_1_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s1, .LBB11_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:  .LBB11_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_1_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB11_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:  .LBB11_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB11_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s1, a0
+; RV32I-SFBILOAD-NEXT:  .LBB11_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_1_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB11_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:  .LBB11_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4  ; base + 4 bytes
+  %val = load atomic i8, ptr %addr monotonic, align 1  ; 8-bit atomic load
+  %ext = zext i8 %val to i64  ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
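+; i16 monotonic atomic load sign-extended to i64.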
+define i64 @test_i16_s_1_2(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i16_s_1_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s2, .LBB12_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 16
+; RV32I-NEXT:  .LBB12_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_1_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB12_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB12_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_1_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s2, .LBB12_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s1, a0, 16
+; RV32I-SFB-NEXT:  .LBB12_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB12_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:  .LBB12_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_1_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB12_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB12_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_1_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB12_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s1, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB12_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB12_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB12_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_1_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB12_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB12_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4  ; element 4, i.e. base + 8 bytes
+  %val = load atomic i16, ptr %addr monotonic, align 2  ; 16-bit atomic load
+  %ext = sext i16 %val to i64  ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
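+; i16 monotonic atomic load zero-extended to i64.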
+define i64 @test_i16_z_1_2(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i16_z_1_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s2, .LBB13_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:  .LBB13_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_1_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB13_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB13_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_1_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s2, .LBB13_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:  .LBB13_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB13_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srli s1, a0, 16
+; RV32I-SFB-NEXT:  .LBB13_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_1_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB13_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB13_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_1_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB13_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:  .LBB13_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB13_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s1, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB13_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_1_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB13_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB13_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4  ; element 4, i.e. base + 8 bytes
+  %val = load atomic i16, ptr %addr monotonic, align 2  ; 16-bit atomic load
+  %ext = zext i16 %val to i64  ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
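+; i32 monotonic atomic load zero-extended to i64; this one is lowered to the
+; generic __atomic_load libcall and the value is read back from a stack slot.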
+define i64 @test_i32_z_1_2(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i32_z_1_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a1, a0, 16
+; RV32I-NEXT:    li a0, 4
+; RV32I-NEXT:    addi a2, sp, 12
+; RV32I-NEXT:    li a3, 0
+; RV32I-NEXT:    call __atomic_load
+; RV32I-NEXT:    beqz s2, .LBB14_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lw s1, 12(sp)
+; RV32I-NEXT:  .LBB14_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_1_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a1, a0, 16
+; RV64I-NEXT:    li a0, 4
+; RV64I-NEXT:    addi a2, sp, 4
+; RV64I-NEXT:    li a3, 0
+; RV64I-NEXT:    call __atomic_load
+; RV64I-NEXT:    beqz s1, .LBB14_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lwu s0, 4(sp)
+; RV64I-NEXT:  .LBB14_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_1_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a1, a0, 16
+; RV32I-SFB-NEXT:    li a0, 4
+; RV32I-SFB-NEXT:    addi a2, sp, 12
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:    call __atomic_load
+; RV32I-SFB-NEXT:    lw a0, 12(sp)
+; RV32I-SFB-NEXT:    bnez s2, .LBB14_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:  .LBB14_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB14_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:  .LBB14_4: # %entry
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_1_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a1, a0, 16
+; RV64I-SFB-NEXT:    li a0, 4
+; RV64I-SFB-NEXT:    addi a2, sp, 4
+; RV64I-SFB-NEXT:    li a3, 0
+; RV64I-SFB-NEXT:    call __atomic_load
+; RV64I-SFB-NEXT:    lwu a0, 4(sp)
+; RV64I-SFB-NEXT:    bnez s1, .LBB14_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:  .LBB14_2: # %entry
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_1_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a0, 4
+; RV32I-SFBILOAD-NEXT:    addi a2, sp, 12
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw s1, 12(sp)
+; RV32I-SFBILOAD-NEXT:  .LBB14_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:  .LBB14_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_1_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a0, 4
+; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
+; RV64I-SFBILOAD-NEXT:    li a3, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB14_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lwu s0, 4(sp)
+; RV64I-SFBILOAD-NEXT:  .LBB14_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4 i32 elements (byte offset 16)
+  %val = load atomic i32, ptr %addr monotonic, align 2          ; under-aligned 32-bit atomic load (forces the generic __atomic_load libcall)
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
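
Worth flagging here (illustration only, not part of the patch): the align 2 on the i32 atomic load above is deliberately below natural alignment, so it lowers to the generic __atomic_load libcall (size 4, result returned through a stack slot) rather than __atomic_load_4; the SFB/SFBILOAD runs then fold the follow-up lw/lwu from that slot into the branch shadow. A naturally aligned counterpart, sketched in the conventions of the surrounding tests (hypothetical name):

  define i64 @test_i32_z_aligned_sketch(ptr %base, i1 %x, i64 %b) {
  entry:
    %addr = getelementptr i32, ptr %base, i64 4
    %val = load atomic i32, ptr %addr monotonic, align 4   ; natural alignment: lowers to __atomic_load_4
    %ext = zext i32 %val to i64
    %res = select i1 %x, i64 %ext, i64 %b
    ret i64 %res
  }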
+
+define i64 @test_i64_1_2(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i64_1_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_8
+; RV32I-NEXT:    bnez s2, .LBB15_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:  .LBB15_2: # %entry
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_1_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 32
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_8
+; RV64I-NEXT:    bnez s1, .LBB15_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB15_2: # %entry
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_1_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 32
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_8
+; RV32I-SFB-NEXT:    bnez s2, .LBB15_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:  .LBB15_2: # %entry
+; RV32I-SFB-NEXT:    bnez s2, .LBB15_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:  .LBB15_4: # %entry
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_1_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 32
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_8
+; RV64I-SFB-NEXT:    bnez s1, .LBB15_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:  .LBB15_2: # %entry
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_1_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB15_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:  .LBB15_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB15_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:  .LBB15_4: # %entry
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_1_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB15_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:  .LBB15_2: # %entry
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4 i64 elements (byte offset 32)
+  %val = load atomic i64, ptr %addr monotonic, align 8          ; load 64-bit value
+  %res = select i1 %x, i64 %val, i64 %b
+  ret i64 %res
+}
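
One detail worth calling out (illustration only, not part of the patch): on RV32 the i64 select above is legalized into two 32-bit selects, one per register half, which is why the RV32I-SFB and RV32I-SFBILOAD outputs carry two branch shadows (.LBB15_2 and .LBB15_4) guarded by the same condition. A hand-split equivalent (hypothetical names):

  define i64 @i64_select_split_sketch(i1 %x, i64 %v, i64 %b) {
  entry:
    %v.hi64 = lshr i64 %v, 32
    %b.hi64 = lshr i64 %b, 32
    %v.lo = trunc i64 %v to i32
    %v.hi = trunc i64 %v.hi64 to i32
    %b.lo = trunc i64 %b to i32
    %b.hi = trunc i64 %b.hi64 to i32
    %lo = select i1 %x, i32 %v.lo, i32 %b.lo   ; first branch shadow: low half
    %hi = select i1 %x, i32 %v.hi, i32 %b.hi   ; second branch shadow: high half
    %lo.z = zext i32 %lo to i64
    %hi.z = zext i32 %hi to i64
    %hi.s = shl i64 %hi.z, 32
    %res = or i64 %hi.s, %lo.z
    ret i64 %res
  }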
+
+define i64 @test_i8_s_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i8_s_store_64_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    beqz s5, .LBB16_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 24
+; RV32I-NEXT:  .LBB16_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_store_64_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB16_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB16_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_store_64_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s5, .LBB16_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s4, a0, 24
+; RV32I-SFB-NEXT:  .LBB16_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB16_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s3, a0, 31
+; RV32I-SFB-NEXT:  .LBB16_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_store_64_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s3, .LBB16_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:  .LBB16_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_store_64_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB16_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s4, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB16_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB16_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB16_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_store_64_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB16_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB16_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
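
A note on the codegen above (illustration only, not part of the patch): the sign extension is materialized as an slli/srai pair; the SFB forms keep the slli unconditional and predicate only the srai (plus, on RV32, a second srai by 31 producing the high word), so each shadow holds exactly one instruction. The shift-pair equivalence as a minimal sketch (hypothetical name):

  define i64 @sext_lo8_sketch(i64 %v) {
  entry:
    %sh  = shl i64 %v, 56     ; slli a0, a0, 56
    %res = ashr i64 %sh, 56   ; srai rd, a0, 56 -- replicates bit 7 into bits 63:8
    ret i64 %res
  }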
+
+define i64 @test_i8_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i8_z_store_64_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    beqz s5, .LBB17_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:  .LBB17_2: # %entry
+; RV32I-NEXT:    addi a1, s5, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_store_64_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB17_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB17_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_store_64_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s5, .LBB17_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:  .LBB17_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB17_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    zext.b s4, a0
+; RV32I-SFB-NEXT:  .LBB17_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_store_64_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s3, .LBB17_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:  .LBB17_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_store_64_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB17_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB17_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB17_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s4, a0
+; RV32I-SFBILOAD-NEXT:  .LBB17_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_store_64_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB17_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:  .LBB17_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
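
For completeness (illustration only, not part of the patch): zext.b in the checks above is the base-ISA assembler alias for andi rd, rs, 255, so the 8-bit zero extension needs no pre-shift and fits a single predicated instruction. Minimal sketch (hypothetical name):

  define i64 @zext_i8_sketch(i8 %v) {
  entry:
    %res = zext i8 %v to i64   ; selected as zext.b, i.e. andi a0, a0, 255
    ret i64 %res
  }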
+
+define i64 @test_i16_s_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i16_s_store_64_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    beqz s5, .LBB18_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 16
+; RV32I-NEXT:  .LBB18_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_store_64_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB18_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB18_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_store_64_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s5, .LBB18_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s4, a0, 16
+; RV32I-SFB-NEXT:  .LBB18_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB18_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s3, a0, 31
+; RV32I-SFB-NEXT:  .LBB18_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_64_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB18_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB18_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_store_64_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB18_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s4, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB18_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB18_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB18_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_store_64_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB18_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB18_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 i16 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i16_z_store_64_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    beqz s5, .LBB19_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:  .LBB19_2: # %entry
+; RV32I-NEXT:    addi a1, s5, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_store_64_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB19_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB19_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_store_64_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s5, .LBB19_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:  .LBB19_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB19_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srli s4, a0, 16
+; RV32I-SFB-NEXT:  .LBB19_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_store_64_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB19_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB19_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_store_64_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB19_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB19_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB19_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s4, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB19_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_store_64_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB19_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB19_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4  ; element 4 of i16 = byte offset 8
+  %val = load atomic i16, ptr %addr monotonic, align 2  ; 16-bit atomic load
+  %ext = zext i16 %val to i64  ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
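+; The next load is only 2-byte aligned, so the i32 atomic load is lowered to
+; the generic __atomic_load libcall with a stack temporary (see the checks:
+; size 4 in a0, a buffer on sp, then an lw/lwu from the stack). There is no
+; inline load for SFBILOAD to predicate; only the select of the reloaded
+; value is.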
+define i64 @test_i32_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i32_z_store_64_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a1, a0, 16
+; RV32I-NEXT:    li a0, 4
+; RV32I-NEXT:    mv a2, sp
+; RV32I-NEXT:    li a3, 0
+; RV32I-NEXT:    call __atomic_load
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    bnez s5, .LBB20_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:  .LBB20_2: # %entry
+; RV32I-NEXT:    addi a1, s5, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_store_64_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a1, a0, 16
+; RV64I-NEXT:    li a0, 4
+; RV64I-NEXT:    addi a2, sp, 4
+; RV64I-NEXT:    li a3, 0
+; RV64I-NEXT:    call __atomic_load
+; RV64I-NEXT:    lwu a0, 4(sp)
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    bnez s3, .LBB20_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB20_2: # %entry
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_store_64_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a1, a0, 16
+; RV32I-SFB-NEXT:    li a0, 4
+; RV32I-SFB-NEXT:    mv a2, sp
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:    call __atomic_load
+; RV32I-SFB-NEXT:    lw a0, 0(sp)
+; RV32I-SFB-NEXT:    beqz s5, .LBB20_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:  .LBB20_2: # %entry
+; RV32I-SFB-NEXT:    bnez s5, .LBB20_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:  .LBB20_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_store_64_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a1, a0, 16
+; RV64I-SFB-NEXT:    li a0, 4
+; RV64I-SFB-NEXT:    addi a2, sp, 4
+; RV64I-SFB-NEXT:    li a3, 0
+; RV64I-SFB-NEXT:    call __atomic_load
+; RV64I-SFB-NEXT:    lwu a0, 4(sp)
+; RV64I-SFB-NEXT:    bnez s3, .LBB20_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:  .LBB20_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a0, 4
+; RV32I-SFBILOAD-NEXT:    mv a2, sp
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load
+; RV32I-SFBILOAD-NEXT:    lw a0, 0(sp)
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB20_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB20_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB20_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:  .LBB20_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a0, 4
+; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
+; RV64I-SFBILOAD-NEXT:    li a3, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load
+; RV64I-SFBILOAD-NEXT:    lwu a0, 4(sp)
+; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB20_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:  .LBB20_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4  ; element 4 of i32 = byte offset 16
+  %val = load atomic i32, ptr %addr monotonic, align 2  ; under-aligned 32-bit atomic load (libcall)
+  %ext = zext i32 %val to i64  ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
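+; The i64 case is naturally aligned, so it uses the __atomic_load_8 libcall.
+; On RV32 each half of the select gets its own short forward branch; on RV64
+; a single branch suffices.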
+define i64 @test_i64_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i64_store_64_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_8
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    bnez s5, .LBB21_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:  .LBB21_2: # %entry
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_store_64_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 32
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_8
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    bnez s3, .LBB21_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB21_2: # %entry
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_store_64_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 32
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_8
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    bnez s5, .LBB21_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:  .LBB21_2: # %entry
+; RV32I-SFB-NEXT:    bnez s5, .LBB21_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:  .LBB21_4: # %entry
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_store_64_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 32
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_8
+; RV64I-SFB-NEXT:    bnez s3, .LBB21_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:  .LBB21_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_store_64_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB21_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:  .LBB21_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB21_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:  .LBB21_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_store_64_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB21_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:  .LBB21_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4  ; element 4 of i64 = byte offset 32
+  %val = load atomic i64, ptr %addr monotonic, align 8  ; 64-bit atomic load
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %val, i64 %b
+  ret i64 %res
+}
+
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll
new file mode 100644
index 0000000000000..9308fa38d95bd
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll
@@ -0,0 +1,5379 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-opt | \
+; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-opt | \
+; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-i-load | \
+; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-i-load | \
+; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
+
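+; These tests mirror the monotonic ones but with seq_cst ordering: the
+; libcalls are passed memory order 5 (__ATOMIC_SEQ_CST) instead of 0
+; (relaxed), visible as `li a1, 5` in the checks. Since every atomic load
+; here becomes a libcall, short-forward-branch-i-load has no load to
+; predicate and its output matches plain short-forward-branch-opt.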
+define i32 @test_i8_s_4(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i8_s_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s1, .LBB0_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:  .LBB0_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB0_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB0_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s1, .LBB0_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 24
+; RV32I-SFB-NEXT:  .LBB0_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s1, .LBB0_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:  .LBB0_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB0_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB0_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB0_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB0_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4  ; element 4 of i8 = byte offset 4
+  %val = load atomic i8, ptr %addr seq_cst, align 1  ; seq_cst 8-bit atomic load
+  %ext = sext i8 %val to i32  ; sign-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
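+; For the zext i8 case the extension is a single zext.b (an alias for
+; andi rd, rs1, 255), so every configuration predicates just that one
+; instruction under the branch.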
+define i32 @test_i8_z_4(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i8_z_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s1, .LBB1_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:  .LBB1_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB1_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB1_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s1, .LBB1_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    zext.b s0, a0
+; RV32I-SFB-NEXT:  .LBB1_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s1, .LBB1_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:  .LBB1_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB1_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV32I-SFBILOAD-NEXT:  .LBB1_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB1_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:  .LBB1_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4  ; element 4 of i8 = byte offset 4
+  %val = load atomic i8, ptr %addr seq_cst, align 1  ; seq_cst 8-bit atomic load
+  %ext = zext i8 %val to i32  ; zero-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
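+; In the SFB variants below the i16 sign extension is split: the slli is
+; hoisted above the branch and only the srai is predicated, keeping the
+; guarded block to a single instruction.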
+define i32 @test_i16_s_4(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i16_s_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s1, .LBB2_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:  .LBB2_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB2_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB2_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s1, .LBB2_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 16
+; RV32I-SFB-NEXT:  .LBB2_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB2_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB2_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB2_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB2_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB2_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB2_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4 ; address of element 4 (byte offset 8)
+  %val = load atomic i16, ptr %addr seq_cst, align 2 ; seq_cst 16-bit atomic load
+  %ext = sext i16 %val to i32 ; sign-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
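+
+; Rough C analogue of the pattern above (illustrative only, not part of the
+; generated checks):
+;   int test_i16_s_4(short *base, _Bool x, int b) {
+;     short v = __atomic_load_n(&base[4], __ATOMIC_SEQ_CST);
+;     return x ? (int)v : b; // sign-extension happens only in the taken arm
+;   }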
+
+define i32 @test_i16_z_4(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i16_z_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s1, .LBB3_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:  .LBB3_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB3_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB3_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s1, .LBB3_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srli s0, a0, 16
+; RV32I-SFB-NEXT:  .LBB3_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB3_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB3_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB3_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s0, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB3_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB3_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB3_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4 ; address of element 4 (byte offset 8)
+  %val = load atomic i16, ptr %addr seq_cst, align 2 ; seq_cst 16-bit atomic load
+  %ext = zext i16 %val to i32 ; zero-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i32_4(ptr %base, i1 %x, i32 %b) {
+; RV32I-LABEL: test_i32_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s1, a1, 1
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_4
+; RV32I-NEXT:    bnez s1, .LBB4_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:  .LBB4_2: # %entry
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 16
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_4
+; RV64I-NEXT:    bnez s1, .LBB4_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB4_2: # %entry
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    andi s1, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 16
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_4
+; RV32I-SFB-NEXT:    bnez s1, .LBB4_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:  .LBB4_2: # %entry
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 16
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_4
+; RV64I-SFB-NEXT:    bnez s1, .LBB4_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:  .LBB4_2: # %entry
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV32I-SFBILOAD-NEXT:    bnez s1, .LBB4_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:  .LBB4_2: # %entry
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB4_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:  .LBB4_2: # %entry
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4 ; address of element 4 (byte offset 16)
+  %val = load atomic i32, ptr %addr seq_cst, align 4 ; seq_cst 32-bit atomic load
+  %res = select i1 %x, i32 %val, i32 %b
+  ret i32 %res
+}
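+
+; Rough C analogue (illustrative): with no extension step, the select reduces
+; to a conditional move of the loaded value:
+;   int test_i32_4(int *base, _Bool x, int b) {
+;     int v = __atomic_load_n(&base[4], __ATOMIC_SEQ_CST);
+;     return x ? v : b;
+;   }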
+
+define i32 @test_i8_s_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i8_s_store_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB5_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:  .LBB5_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_store_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB5_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB5_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_store_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s3, .LBB5_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s2, a0, 24
+; RV32I-SFB-NEXT:  .LBB5_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_store_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s3, .LBB5_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:  .LBB5_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_store_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB5_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s2, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_store_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB5_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4 ; address of element 4 (byte offset 4)
+  %val = load atomic i8, ptr %addr seq_cst, align 1 ; seq_cst 8-bit atomic load
+  %ext = sext i8 %val to i32 ; sign-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
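+
+; Rough C analogue (illustrative): the intervening store checks that an
+; unrelated side effect between the load and the select stays outside the
+; short forward branch region:
+;   int test_i8_s_store_4(signed char *base, _Bool x, int b, int *base1, int c) {
+;     signed char v = __atomic_load_n(&base[4], __ATOMIC_SEQ_CST);
+;     *base1 = c;
+;     return x ? (int)v : b;
+;   }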
+
+define i32 @test_i8_z_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i8_z_store_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB6_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:  .LBB6_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_store_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB6_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB6_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_store_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s3, .LBB6_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    zext.b s2, a0
+; RV32I-SFB-NEXT:  .LBB6_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_store_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s3, .LBB6_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:  .LBB6_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_store_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB6_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_store_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB6_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4 ; address of element 4 (byte offset 4)
+  %val = load atomic i8, ptr %addr seq_cst, align 1 ; seq_cst 8-bit atomic load
+  %ext = zext i8 %val to i32 ; zero-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_s_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i16_s_store_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:  .LBB7_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_store_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB7_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB7_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_store_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s3, .LBB7_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s2, a0, 16
+; RV32I-SFB-NEXT:  .LBB7_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB7_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB7_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_store_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB7_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s2, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_store_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB7_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4 ; address of element 4 (byte offset 8)
+  %val = load atomic i16, ptr %addr seq_cst, align 2 ; seq_cst 16-bit atomic load
+  %ext = sext i16 %val to i32 ; sign-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_z_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i16_z_store_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB8_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:  .LBB8_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_store_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB8_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB8_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_store_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s3, .LBB8_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srli s2, a0, 16
+; RV32I-SFB-NEXT:  .LBB8_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_store_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB8_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB8_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_store_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB8_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s2, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_store_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB8_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i32_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+; RV32I-LABEL: test_i32_store_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    andi s3, a1, 1
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_4
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    bnez s3, .LBB9_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:  .LBB9_2: # %entry
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_store_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 16
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_4
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    bnez s3, .LBB9_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB9_2: # %entry
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_store_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    andi s3, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 16
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_4
+; RV32I-SFB-NEXT:    bnez s3, .LBB9_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:  .LBB9_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_store_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 16
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_4
+; RV64I-SFB-NEXT:    bnez s3, .LBB9_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:  .LBB9_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_store_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV32I-SFBILOAD-NEXT:    bnez s3, .LBB9_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_store_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB9_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4   ; address of element 4 (base + 16 bytes)
+  %val = load atomic i32, ptr %addr seq_cst, align 4          ; load 32-bit value
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %val, i32 %b
+  ret i32 %res
+}
+
+define i64 @test_i8_s_1_4(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i8_s_1_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s2, .LBB10_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 24
+; RV32I-NEXT:  .LBB10_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_1_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB10_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB10_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_1_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s2, .LBB10_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s1, a0, 24
+; RV32I-SFB-NEXT:  .LBB10_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB10_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:  .LBB10_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_1_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s1, .LBB10_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:  .LBB10_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_1_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB10_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s1, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB10_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB10_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB10_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_1_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB10_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB10_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; address of element 4 (base + 4 bytes)
+  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i8_z_1_4(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i8_z_1_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s2, .LBB11_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:  .LBB11_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_1_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB11_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB11_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_1_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s2, .LBB11_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:  .LBB11_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB11_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    zext.b s1, a0
+; RV32I-SFB-NEXT:  .LBB11_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_1_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s1, .LBB11_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:  .LBB11_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_1_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB11_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:  .LBB11_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB11_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s1, a0
+; RV32I-SFBILOAD-NEXT:  .LBB11_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_1_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB11_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:  .LBB11_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; address of element 4 (base + 4 bytes)
+  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_s_1_4(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i16_s_1_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s2, .LBB12_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 16
+; RV32I-NEXT:  .LBB12_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_1_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB12_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB12_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_1_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s2, .LBB12_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s1, a0, 16
+; RV32I-SFB-NEXT:  .LBB12_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB12_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:  .LBB12_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_1_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB12_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB12_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_1_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB12_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s1, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB12_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB12_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB12_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_1_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB12_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB12_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_z_1_4(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i16_z_1_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s2, .LBB13_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:  .LBB13_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_1_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB13_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB13_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_1_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s2, .LBB13_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:  .LBB13_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB13_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srli s1, a0, 16
+; RV32I-SFB-NEXT:  .LBB13_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_1_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB13_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB13_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_1_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB13_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:  .LBB13_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB13_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s1, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB13_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_1_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB13_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB13_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i32_z_1_4(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i32_z_1_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a1, a0, 16
+; RV32I-NEXT:    li a0, 4
+; RV32I-NEXT:    addi a2, sp, 12
+; RV32I-NEXT:    li a3, 5
+; RV32I-NEXT:    call __atomic_load
+; RV32I-NEXT:    beqz s2, .LBB14_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lw s1, 12(sp)
+; RV32I-NEXT:  .LBB14_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_1_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a1, a0, 16
+; RV64I-NEXT:    li a0, 4
+; RV64I-NEXT:    addi a2, sp, 4
+; RV64I-NEXT:    li a3, 5
+; RV64I-NEXT:    call __atomic_load
+; RV64I-NEXT:    beqz s1, .LBB14_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lwu s0, 4(sp)
+; RV64I-NEXT:  .LBB14_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_1_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a1, a0, 16
+; RV32I-SFB-NEXT:    li a0, 4
+; RV32I-SFB-NEXT:    addi a2, sp, 12
+; RV32I-SFB-NEXT:    li a3, 5
+; RV32I-SFB-NEXT:    call __atomic_load
+; RV32I-SFB-NEXT:    lw a0, 12(sp)
+; RV32I-SFB-NEXT:    bnez s2, .LBB14_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:  .LBB14_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB14_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:  .LBB14_4: # %entry
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_1_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a1, a0, 16
+; RV64I-SFB-NEXT:    li a0, 4
+; RV64I-SFB-NEXT:    addi a2, sp, 4
+; RV64I-SFB-NEXT:    li a3, 5
+; RV64I-SFB-NEXT:    call __atomic_load
+; RV64I-SFB-NEXT:    lwu a0, 4(sp)
+; RV64I-SFB-NEXT:    bnez s1, .LBB14_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:  .LBB14_2: # %entry
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_1_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a0, 4
+; RV32I-SFBILOAD-NEXT:    addi a2, sp, 12
+; RV32I-SFBILOAD-NEXT:    li a3, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw s1, 12(sp)
+; RV32I-SFBILOAD-NEXT:  .LBB14_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:  .LBB14_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_1_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a0, 4
+; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
+; RV64I-SFBILOAD-NEXT:    li a3, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB14_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lwu s0, 4(sp)
+; RV64I-SFBILOAD-NEXT:  .LBB14_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4 i32s (byte offset 16)
+  %val = load atomic i32, ptr %addr seq_cst, align 2          ; under-aligned i32 load, lowered to the generic __atomic_load libcall
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
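+  ; With short-forward-branch-i-load, the lw/lwu of the __atomic_load result is
+  ; itself predicated on %x, rather than an unconditional load followed by a
+  ; conditional mv as in the plain SFB output above.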
+  ret i64 %res
+}
+
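+; __atomic_load_8 returns the i64 result in registers, so there is no GPR load
+; left to predicate and the SFBILOAD output matches the plain SFB output.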
+define i64 @test_i64_1_4(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i64_1_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s2, a1, 1
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_8
+; RV32I-NEXT:    bnez s2, .LBB15_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:  .LBB15_2: # %entry
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_1_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s1, a1, 1
+; RV64I-NEXT:    addi a0, a0, 32
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_8
+; RV64I-NEXT:    bnez s1, .LBB15_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB15_2: # %entry
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_1_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    andi s2, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 32
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_8
+; RV32I-SFB-NEXT:    bnez s2, .LBB15_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:  .LBB15_2: # %entry
+; RV32I-SFB-NEXT:    bnez s2, .LBB15_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:  .LBB15_4: # %entry
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_1_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    andi s1, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 32
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_8
+; RV64I-SFB-NEXT:    bnez s1, .LBB15_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:  .LBB15_2: # %entry
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_1_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB15_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:  .LBB15_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB15_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:  .LBB15_4: # %entry
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_1_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB15_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:  .LBB15_2: # %entry
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4 i64s (byte offset 32)
+  %val = load atomic i64, ptr %addr seq_cst, align 8          ; load 64-bit value
+  %res = select i1 %x, i64 %val, i64 %b
+  ret i64 %res
+}
+
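+; Here the predicated operation is the srai sign-extension of the libcall
+; result; the independent i64 store to %base1 is sunk below the conditional
+; blocks in the SFB variants.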
+define i64 @test_i8_s_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i8_s_store_64_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    beqz s5, .LBB16_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 24
+; RV32I-NEXT:  .LBB16_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_store_64_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB16_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB16_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_store_64_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s5, .LBB16_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s4, a0, 24
+; RV32I-SFB-NEXT:  .LBB16_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB16_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s3, a0, 31
+; RV32I-SFB-NEXT:  .LBB16_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_store_64_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s3, .LBB16_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:  .LBB16_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_store_64_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB16_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s4, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB16_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB16_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB16_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_store_64_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB16_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB16_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
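+; Zero-extending variant of the test above: the zext.b is predicated, and the
+; i64 store is again sunk below the conditional blocks.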
+define i64 @test_i8_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i8_z_store_64_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    beqz s5, .LBB17_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:  .LBB17_2: # %entry
+; RV32I-NEXT:    addi a1, s5, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_store_64_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB17_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB17_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_store_64_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s5, .LBB17_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:  .LBB17_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB17_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    zext.b s4, a0
+; RV32I-SFB-NEXT:  .LBB17_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_store_64_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s3, .LBB17_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:  .LBB17_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_store_64_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB17_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB17_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB17_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s4, a0
+; RV32I-SFBILOAD-NEXT:  .LBB17_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_store_64_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB17_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:  .LBB17_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
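+; i16 variant: the slli stays unconditional and the srai sign-extension is
+; predicated; the i64 store is sunk below the conditional blocks.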
+define i64 @test_i16_s_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i16_s_store_64_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    beqz s5, .LBB18_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 16
+; RV32I-NEXT:  .LBB18_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_store_64_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB18_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB18_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_store_64_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s5, .LBB18_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s4, a0, 16
+; RV32I-SFB-NEXT:  .LBB18_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB18_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s3, a0, 31
+; RV32I-SFB-NEXT:  .LBB18_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_64_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB18_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB18_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_store_64_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB18_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s4, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB18_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB18_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB18_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_store_64_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB18_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB18_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 i16s (byte offset 8)
+  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
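+; i16 zero-extending variant: the srli is the predicated operation in the
+; short forward branches.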
+define i64 @test_i16_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i16_z_store_64_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    beqz s5, .LBB19_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:  .LBB19_2: # %entry
+; RV32I-NEXT:    addi a1, s5, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_store_64_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB19_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB19_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_store_64_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s5, .LBB19_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:  .LBB19_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB19_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srli s4, a0, 16
+; RV32I-SFB-NEXT:  .LBB19_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_store_64_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB19_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB19_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_store_64_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB19_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB19_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB19_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s4, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB19_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_store_64_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB19_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB19_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i32_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i32_z_store_64_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a1, a0, 16
+; RV32I-NEXT:    li a0, 4
+; RV32I-NEXT:    mv a2, sp
+; RV32I-NEXT:    li a3, 5
+; RV32I-NEXT:    call __atomic_load
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    bnez s5, .LBB20_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:  .LBB20_2: # %entry
+; RV32I-NEXT:    addi a1, s5, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_store_64_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a1, a0, 16
+; RV64I-NEXT:    li a0, 4
+; RV64I-NEXT:    addi a2, sp, 4
+; RV64I-NEXT:    li a3, 5
+; RV64I-NEXT:    call __atomic_load
+; RV64I-NEXT:    lwu a0, 4(sp)
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    bnez s3, .LBB20_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB20_2: # %entry
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_store_64_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a1, a0, 16
+; RV32I-SFB-NEXT:    li a0, 4
+; RV32I-SFB-NEXT:    mv a2, sp
+; RV32I-SFB-NEXT:    li a3, 5
+; RV32I-SFB-NEXT:    call __atomic_load
+; RV32I-SFB-NEXT:    lw a0, 0(sp)
+; RV32I-SFB-NEXT:    beqz s5, .LBB20_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:  .LBB20_2: # %entry
+; RV32I-SFB-NEXT:    bnez s5, .LBB20_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:  .LBB20_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_store_64_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a1, a0, 16
+; RV64I-SFB-NEXT:    li a0, 4
+; RV64I-SFB-NEXT:    addi a2, sp, 4
+; RV64I-SFB-NEXT:    li a3, 5
+; RV64I-SFB-NEXT:    call __atomic_load
+; RV64I-SFB-NEXT:    lwu a0, 4(sp)
+; RV64I-SFB-NEXT:    bnez s3, .LBB20_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:  .LBB20_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a0, 4
+; RV32I-SFBILOAD-NEXT:    mv a2, sp
+; RV32I-SFBILOAD-NEXT:    li a3, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load
+; RV32I-SFBILOAD-NEXT:    lw a0, 0(sp)
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB20_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB20_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB20_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:  .LBB20_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a0, 4
+; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
+; RV64I-SFBILOAD-NEXT:    li a3, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load
+; RV64I-SFBILOAD-NEXT:    lwu a0, 4(sp)
+; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB20_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:  .LBB20_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4 elements (byte offset 16)
+  %val = load atomic i32, ptr %addr seq_cst, align 2          ; underaligned 32-bit load, lowered to the generic __atomic_load libcall
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i64_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i64_store_64_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    andi s5, a1, 1
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_8
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    bnez s5, .LBB21_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:  .LBB21_2: # %entry
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_store_64_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    .cfi_offset s2, -32
+; RV64I-NEXT:    .cfi_offset s3, -40
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    andi s3, a1, 1
+; RV64I-NEXT:    addi a0, a0, 32
+; RV64I-NEXT:    li a1, 5
+; RV64I-NEXT:    call __atomic_load_8
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    bnez s3, .LBB21_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB21_2: # %entry
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    .cfi_restore s2
+; RV64I-NEXT:    .cfi_restore s3
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_store_64_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    .cfi_offset ra, -4
+; RV32I-SFB-NEXT:    .cfi_offset s0, -8
+; RV32I-SFB-NEXT:    .cfi_offset s1, -12
+; RV32I-SFB-NEXT:    .cfi_offset s2, -16
+; RV32I-SFB-NEXT:    .cfi_offset s3, -20
+; RV32I-SFB-NEXT:    .cfi_offset s4, -24
+; RV32I-SFB-NEXT:    .cfi_offset s5, -28
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    andi s5, a1, 1
+; RV32I-SFB-NEXT:    addi a0, a0, 32
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_8
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    bnez s5, .LBB21_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:  .LBB21_2: # %entry
+; RV32I-SFB-NEXT:    bnez s5, .LBB21_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:  .LBB21_4: # %entry
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    .cfi_restore ra
+; RV32I-SFB-NEXT:    .cfi_restore s0
+; RV32I-SFB-NEXT:    .cfi_restore s1
+; RV32I-SFB-NEXT:    .cfi_restore s2
+; RV32I-SFB-NEXT:    .cfi_restore s3
+; RV32I-SFB-NEXT:    .cfi_restore s4
+; RV32I-SFB-NEXT:    .cfi_restore s5
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_store_64_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    .cfi_offset ra, -8
+; RV64I-SFB-NEXT:    .cfi_offset s0, -16
+; RV64I-SFB-NEXT:    .cfi_offset s1, -24
+; RV64I-SFB-NEXT:    .cfi_offset s2, -32
+; RV64I-SFB-NEXT:    .cfi_offset s3, -40
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    andi s3, a1, 1
+; RV64I-SFB-NEXT:    addi a0, a0, 32
+; RV64I-SFB-NEXT:    li a1, 5
+; RV64I-SFB-NEXT:    call __atomic_load_8
+; RV64I-SFB-NEXT:    bnez s3, .LBB21_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:  .LBB21_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    .cfi_restore ra
+; RV64I-SFB-NEXT:    .cfi_restore s0
+; RV64I-SFB-NEXT:    .cfi_restore s1
+; RV64I-SFB-NEXT:    .cfi_restore s2
+; RV64I-SFB-NEXT:    .cfi_restore s3
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_store_64_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
+; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB21_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:  .LBB21_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB21_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:  .LBB21_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
+; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_store_64_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
+; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV64I-SFBILOAD-NEXT:    li a1, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB21_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:  .LBB21_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
+; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4 elements (byte offset 32)
+  %val = load atomic i64, ptr %addr seq_cst, align 8          ; load 64-bit value
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %val, i64 %b
+  ret i64 %res
+}
+
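The volatile test file added next pins down the legality rule for the new tune feature: a volatile access must never be moved into the branch shadow, so in each function only the plain load is a predication candidate. This is visible in the SFBILOAD check lines above and below, where the non-volatile lb/lbu/lh/lhu/lw is emitted conditionally after the branch while the volatile load stays unconditional. A minimal sketch of the pattern these functions share, with a hypothetical function name:

define i32 @sfb_volatile_sketch(ptr %p, ptr %q, i1 %c, i32 %b) {
entry:
  %v = load i8, ptr %p               ; plain load: eligible for SFB predication
  %ext = sext i8 %v to i32
  %vol = load volatile i32, ptr %q   ; volatile: must execute unconditionally
  %sel = select i1 %c, i32 %ext, i32 %b
  %r = add i32 %sel, %vol
  ret i32 %r
}
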
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll
new file mode 100644
index 0000000000000..ebdf25c66fd77
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll
@@ -0,0 +1,1022 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-opt | \
+; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-opt | \
+; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-i-load | \
+; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-i-load | \
+; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
+
+define i32 @test_i8_s_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) {
+; RV32I-LABEL: test_i8_s_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lb a4, 4(a0)
+; RV32I-NEXT:    lw a0, 0(a3)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB0_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a4, a2
+; RV32I-NEXT:  .LBB0_2: # %entry
+; RV32I-NEXT:    add a0, a4, a0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lb a4, 4(a0)
+; RV64I-NEXT:    lw a0, 0(a3)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB0_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB0_2: # %entry
+; RV64I-NEXT:    addw a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    lw a3, 0(a3)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB0_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB0_2: # %entry
+; RV32I-SFB-NEXT:    add a0, a0, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    lw a3, 0(a3)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB0_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB0_2: # %entry
+; RV64I-SFB-NEXT:    addw a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB0_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lb a2, 4(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB0_2: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB0_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lb a2, 4(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB0_2: # %entry
+; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
+  %val1 = load volatile i32, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  %res1 = add i32 %res, %val1
+  ret i32 %res1
+}
+
+define i32 @test_i8_z_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) {
+; RV32I-LABEL: test_i8_z_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lw a0, 0(a3)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB1_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a4, a2
+; RV32I-NEXT:  .LBB1_2: # %entry
+; RV32I-NEXT:    add a0, a4, a0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lbu a4, 4(a0)
+; RV64I-NEXT:    lw a0, 0(a3)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB1_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB1_2: # %entry
+; RV64I-NEXT:    addw a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    lw a3, 0(a3)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB1_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB1_2: # %entry
+; RV32I-SFB-NEXT:    add a0, a0, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    lw a3, 0(a3)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB1_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB1_2: # %entry
+; RV64I-SFB-NEXT:    addw a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB1_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lbu a2, 4(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB1_2: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB1_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lbu a2, 4(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB1_2: # %entry
+; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
+  %val1 = load volatile i32, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  %res1 = add i32 %res, %val1
+  ret i32 %res1
+}
+
+define i32 @test_i16_s_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) {
+; RV32I-LABEL: test_i16_s_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lh a4, 8(a0)
+; RV32I-NEXT:    lw a0, 0(a3)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB2_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a4, a2
+; RV32I-NEXT:  .LBB2_2: # %entry
+; RV32I-NEXT:    add a0, a4, a0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lh a4, 8(a0)
+; RV64I-NEXT:    lw a0, 0(a3)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB2_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB2_2: # %entry
+; RV64I-NEXT:    addw a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    lw a3, 0(a3)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB2_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB2_2: # %entry
+; RV32I-SFB-NEXT:    add a0, a0, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    lw a3, 0(a3)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB2_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB2_2: # %entry
+; RV64I-SFB-NEXT:    addw a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB2_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lh a2, 8(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB2_2: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB2_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lh a2, 8(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB2_2: # %entry
+; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 elements (byte offset 8)
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
+  %val1 = load volatile i32, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  %res1 = add i32 %res, %val1
+  ret i32 %res1
+}
+
+define i32 @test_i16_z_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) {
+; RV32I-LABEL: test_i16_z_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lhu a4, 8(a0)
+; RV32I-NEXT:    lw a0, 0(a3)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB3_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a4, a2
+; RV32I-NEXT:  .LBB3_2: # %entry
+; RV32I-NEXT:    add a0, a4, a0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lhu a4, 8(a0)
+; RV64I-NEXT:    lw a0, 0(a3)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB3_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB3_2: # %entry
+; RV64I-NEXT:    addw a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    lw a3, 0(a3)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB3_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB3_2: # %entry
+; RV32I-SFB-NEXT:    add a0, a0, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    lw a3, 0(a3)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB3_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB3_2: # %entry
+; RV64I-SFB-NEXT:    addw a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB3_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lhu a2, 8(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB3_2: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB3_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lhu a2, 8(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB3_2: # %entry
+; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 elements (byte offset 8)
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
+  %val1 = load volatile i32, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  %res1 = add i32 %res, %val1
+  ret i32 %res1
+}
+
+define i32 @test_i32_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) {
+; RV32I-LABEL: test_i32_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lw a4, 16(a0)
+; RV32I-NEXT:    lw a0, 0(a3)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB4_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a4, a2
+; RV32I-NEXT:  .LBB4_2: # %entry
+; RV32I-NEXT:    add a0, a4, a0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lw a4, 16(a0)
+; RV64I-NEXT:    lw a0, 0(a3)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB4_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB4_2: # %entry
+; RV64I-NEXT:    addw a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    lw a3, 0(a3)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB4_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB4_2: # %entry
+; RV32I-SFB-NEXT:    add a0, a0, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    lw a3, 0(a3)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB4_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB4_2: # %entry
+; RV64I-SFB-NEXT:    addw a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB4_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a2, 16(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB4_2: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB4_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lw a2, 16(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB4_2: # %entry
+; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4 elements (byte offset 16)
+  %val = load i32, ptr %addr          ; load 32-bit value
+  %val1 = load volatile i32, ptr %base1
+  %res = select i1 %x, i32 %val, i32 %b
+  %res1 = add i32 %res, %val1
+  ret i32 %res1
+}
+
+
+define i64 @test_i8_s_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) {
+; RV32I-LABEL: test_i8_s_1_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lb a6, 4(a0)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a0, 0(a4)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB5_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a6, a2
+; RV32I-NEXT:    j .LBB5_3
+; RV32I-NEXT:  .LBB5_2:
+; RV32I-NEXT:    srai a3, a6, 31
+; RV32I-NEXT:  .LBB5_3: # %entry
+; RV32I-NEXT:    add a0, a6, a0
+; RV32I-NEXT:    sltu a1, a0, a6
+; RV32I-NEXT:    add a3, a3, a5
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_1_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lb a4, 4(a0)
+; RV64I-NEXT:    ld a0, 0(a3)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB5_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB5_2: # %entry
+; RV64I-NEXT:    add a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_1_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    lw a5, 4(a4)
+; RV32I-SFB-NEXT:    lw a4, 0(a4)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB5_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB5_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB5_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB5_4: # %entry
+; RV32I-SFB-NEXT:    add a0, a2, a4
+; RV32I-SFB-NEXT:    sltu a1, a0, a2
+; RV32I-SFB-NEXT:    add a3, a3, a5
+; RV32I-SFB-NEXT:    add a1, a3, a1
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_1_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    ld a3, 0(a3)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB5_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB5_2: # %entry
+; RV64I-SFB-NEXT:    add a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_1_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
+; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB5_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB5_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB5_4: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
+; RV32I-SFBILOAD-NEXT:    sltu a1, a0, a2
+; RV32I-SFBILOAD-NEXT:    add a3, a3, a5
+; RV32I-SFBILOAD-NEXT:    add a1, a3, a1
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_1_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB5_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lb a2, 4(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
+; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %val1 = load volatile i64, ptr %base1
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  %res1 = add i64 %res, %val1
+  ret i64 %res1
+}
+
+define i64 @test_i8_z_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) {
+; RV32I-LABEL: test_i8_z_1_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a0, 0(a4)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB6_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a6, a2
+; RV32I-NEXT:  .LBB6_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    add a0, a6, a0
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    sltu a2, a0, a6
+; RV32I-NEXT:    add a1, a1, a5
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_1_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lbu a4, 4(a0)
+; RV64I-NEXT:    ld a0, 0(a3)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB6_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB6_2: # %entry
+; RV64I-NEXT:    add a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_1_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lbu a5, 4(a0)
+; RV32I-SFB-NEXT:    lw a6, 4(a4)
+; RV32I-SFB-NEXT:    lw a0, 0(a4)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB6_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a5, a2
+; RV32I-SFB-NEXT:  .LBB6_2: # %entry
+; RV32I-SFB-NEXT:    add a0, a5, a0
+; RV32I-SFB-NEXT:    sltu a2, a0, a5
+; RV32I-SFB-NEXT:    bnez a1, .LBB6_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    add a6, a6, a3
+; RV32I-SFB-NEXT:  .LBB6_4: # %entry
+; RV32I-SFB-NEXT:    add a1, a6, a2
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_1_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    ld a3, 0(a3)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB6_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB6_2: # %entry
+; RV64I-SFB-NEXT:    add a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_1_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
+; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB6_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lbu a2, 4(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
+; RV32I-SFBILOAD-NEXT:    sltu a2, a0, a2
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB6_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    add a5, a5, a3
+; RV32I-SFBILOAD-NEXT:  .LBB6_4: # %entry
+; RV32I-SFBILOAD-NEXT:    add a1, a5, a2
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_1_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB6_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lbu a2, 4(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
+; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %val1 = load volatile i64, ptr %base1
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  %res1 = add i64 %res, %val1
+  ret i64 %res1
+}
+
+define i64 @test_i16_s_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) {
+; RV32I-LABEL: test_i16_s_1_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lh a6, 8(a0)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a0, 0(a4)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB7_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a6, a2
+; RV32I-NEXT:    j .LBB7_3
+; RV32I-NEXT:  .LBB7_2:
+; RV32I-NEXT:    srai a3, a6, 31
+; RV32I-NEXT:  .LBB7_3: # %entry
+; RV32I-NEXT:    add a0, a6, a0
+; RV32I-NEXT:    sltu a1, a0, a6
+; RV32I-NEXT:    add a3, a3, a5
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_1_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lh a4, 8(a0)
+; RV64I-NEXT:    ld a0, 0(a3)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB7_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB7_2: # %entry
+; RV64I-NEXT:    add a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_1_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    lw a5, 4(a4)
+; RV32I-SFB-NEXT:    lw a4, 0(a4)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB7_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB7_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB7_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB7_4: # %entry
+; RV32I-SFB-NEXT:    add a0, a2, a4
+; RV32I-SFB-NEXT:    sltu a1, a0, a2
+; RV32I-SFB-NEXT:    add a3, a3, a5
+; RV32I-SFB-NEXT:    add a1, a3, a1
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_1_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    ld a3, 0(a3)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB7_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB7_2: # %entry
+; RV64I-SFB-NEXT:    add a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_1_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
+; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB7_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB7_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB7_4: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
+; RV32I-SFBILOAD-NEXT:    sltu a1, a0, a2
+; RV32I-SFBILOAD-NEXT:    add a3, a3, a5
+; RV32I-SFBILOAD-NEXT:    add a1, a3, a1
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_1_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB7_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lh a2, 8(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
+; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; address of element 4 (byte offset 8)
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %val1 = load volatile i64, ptr %base1
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  %res1 = add i64 %res, %val1
+  ret i64 %res1
+}
+
+define i64 @test_i16_z_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) {
+; RV32I-LABEL: test_i16_z_1_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lhu a6, 8(a0)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a0, 0(a4)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB8_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a6, a2
+; RV32I-NEXT:  .LBB8_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    add a0, a6, a0
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    sltu a2, a0, a6
+; RV32I-NEXT:    add a1, a1, a5
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_1_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lhu a4, 8(a0)
+; RV64I-NEXT:    ld a0, 0(a3)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB8_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB8_2: # %entry
+; RV64I-NEXT:    add a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_1_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lhu a5, 8(a0)
+; RV32I-SFB-NEXT:    lw a6, 4(a4)
+; RV32I-SFB-NEXT:    lw a0, 0(a4)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB8_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a5, a2
+; RV32I-SFB-NEXT:  .LBB8_2: # %entry
+; RV32I-SFB-NEXT:    add a0, a5, a0
+; RV32I-SFB-NEXT:    sltu a2, a0, a5
+; RV32I-SFB-NEXT:    bnez a1, .LBB8_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    add a6, a6, a3
+; RV32I-SFB-NEXT:  .LBB8_4: # %entry
+; RV32I-SFB-NEXT:    add a1, a6, a2
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_1_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    ld a3, 0(a3)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB8_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB8_2: # %entry
+; RV64I-SFB-NEXT:    add a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_1_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
+; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB8_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lhu a2, 8(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
+; RV32I-SFBILOAD-NEXT:    sltu a2, a0, a2
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB8_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    add a5, a5, a3
+; RV32I-SFBILOAD-NEXT:  .LBB8_4: # %entry
+; RV32I-SFBILOAD-NEXT:    add a1, a5, a2
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_1_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB8_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lhu a2, 8(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
+; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; address of element 4 (byte offset 8)
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %val1 = load volatile i64, ptr %base1
+  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  %res1 = add i64 %res, %val1
+  ret i64 %res1
+}
+
+define i64 @test_i32_z_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) {
+; RV32I-LABEL: test_i32_z_1_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lw a6, 16(a0)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a0, 0(a4)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB9_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a6, a2
+; RV32I-NEXT:  .LBB9_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    add a0, a6, a0
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    sltu a2, a0, a6
+; RV32I-NEXT:    add a1, a1, a5
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_1_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lwu a4, 16(a0)
+; RV64I-NEXT:    ld a0, 0(a3)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB9_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB9_2: # %entry
+; RV64I-NEXT:    add a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_1_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lw a5, 16(a0)
+; RV32I-SFB-NEXT:    lw a6, 4(a4)
+; RV32I-SFB-NEXT:    lw a0, 0(a4)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB9_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a5, a2
+; RV32I-SFB-NEXT:  .LBB9_2: # %entry
+; RV32I-SFB-NEXT:    add a0, a5, a0
+; RV32I-SFB-NEXT:    sltu a2, a0, a5
+; RV32I-SFB-NEXT:    bnez a1, .LBB9_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    add a6, a6, a3
+; RV32I-SFB-NEXT:  .LBB9_4: # %entry
+; RV32I-SFB-NEXT:    add a1, a6, a2
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_1_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lwu a0, 16(a0)
+; RV64I-SFB-NEXT:    ld a3, 0(a3)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB9_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB9_2: # %entry
+; RV64I-SFB-NEXT:    add a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_1_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
+; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB9_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a2, 16(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
+; RV32I-SFBILOAD-NEXT:    sltu a2, a0, a2
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB9_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    add a5, a5, a3
+; RV32I-SFBILOAD-NEXT:  .LBB9_4: # %entry
+; RV32I-SFBILOAD-NEXT:    add a1, a5, a2
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_1_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB9_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lwu a2, 16(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; address of element 4 (byte offset 16)
+  %val = load i32, ptr %addr          ; load 32-bit value
+  %val1 = load volatile i64, ptr %base1
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  %res1 = add i64 %res, %val1
+  ret i64 %res1
+}
+
+define i64 @test_i64_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) {
+; RV32I-LABEL: test_i64_1_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lw a7, 32(a0)
+; RV32I-NEXT:    lw a6, 36(a0)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a0, 0(a4)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB10_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a6, a3
+; RV32I-NEXT:    mv a7, a2
+; RV32I-NEXT:  .LBB10_2: # %entry
+; RV32I-NEXT:    add a0, a7, a0
+; RV32I-NEXT:    sltu a1, a0, a7
+; RV32I-NEXT:    add a5, a6, a5
+; RV32I-NEXT:    add a1, a5, a1
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_1_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    ld a4, 32(a0)
+; RV64I-NEXT:    ld a0, 0(a3)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB10_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB10_2: # %entry
+; RV64I-NEXT:    add a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_1_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lw a5, 32(a0)
+; RV32I-SFB-NEXT:    lw a6, 36(a0)
+; RV32I-SFB-NEXT:    lw a7, 4(a4)
+; RV32I-SFB-NEXT:    lw a0, 0(a4)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB10_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a6, a3
+; RV32I-SFB-NEXT:  .LBB10_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB10_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a5, a2
+; RV32I-SFB-NEXT:  .LBB10_4: # %entry
+; RV32I-SFB-NEXT:    add a0, a5, a0
+; RV32I-SFB-NEXT:    sltu a1, a0, a5
+; RV32I-SFB-NEXT:    add a6, a6, a7
+; RV32I-SFB-NEXT:    add a1, a6, a1
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_1_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    ld a3, 0(a3)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB10_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB10_2: # %entry
+; RV64I-SFB-NEXT:    add a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_1_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
+; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a2, 32(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB10_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a3, 36(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB10_4: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
+; RV32I-SFBILOAD-NEXT:    sltu a1, a0, a2
+; RV32I-SFBILOAD-NEXT:    add a3, a3, a5
+; RV32I-SFBILOAD-NEXT:    add a1, a3, a1
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_1_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    ld a2, 32(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB10_2: # %entry
+; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; address of element 4 (byte offset 32)
+  %val = load i64, ptr %addr          ; load 64-bit value
+  %val1 = load volatile i64, ptr %base1
+  %res = select i1 %x, i64 %val, i64 %b
+  %res1 = add i64 %res, %val1
+  ret i64 %res1
+}
+
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
index 9ed1218cf7fb5..5fc3433458d50 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
@@ -1047,15 +1047,95 @@ entry:
   ret i64 %res
 }
 
+define i64 @test_i32_z_1(ptr %base, i1 %x, i64 %b) {
+; RV32I-LABEL: test_i32_z_1:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    beqz a1, .LBB14_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lw a2, 16(a0)
+; RV32I-NEXT:  .LBB14_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_1:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    beqz a1, .LBB14_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lwu a2, 16(a0)
+; RV64I-NEXT:  .LBB14_2: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_1:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB14_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB14_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB14_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:  .LBB14_4: # %entry
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_1:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lwu a0, 16(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB14_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB14_2: # %entry
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_1:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    andi a4, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB14_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a2, 16(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB14_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB14_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:  .LBB14_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_1:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB14_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lwu a2, 16(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB14_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; address of element 4 (byte offset 16)
+  %val = load i32, ptr %addr          ; load 32-bit value
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
 define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) {
 ; RV32I-LABEL: test_i64_1:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    beqz a1, .LBB14_2
+; RV32I-NEXT:    beqz a1, .LBB15_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    lw a2, 32(a0)
 ; RV32I-NEXT:    lw a3, 36(a0)
-; RV32I-NEXT:  .LBB14_2: # %entry
+; RV32I-NEXT:  .LBB15_2: # %entry
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
@@ -1063,10 +1143,10 @@ define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) {
 ; RV64I-LABEL: test_i64_1:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    beqz a1, .LBB14_2
+; RV64I-NEXT:    beqz a1, .LBB15_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    ld a2, 32(a0)
-; RV64I-NEXT:  .LBB14_2: # %entry
+; RV64I-NEXT:  .LBB15_2: # %entry
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
@@ -1075,14 +1155,14 @@ define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) {
 ; RV32I-SFB-NEXT:    lw a4, 32(a0)
 ; RV32I-SFB-NEXT:    lw a5, 36(a0)
 ; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB14_2
+; RV32I-SFB-NEXT:    bnez a1, .LBB15_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a4, a2
-; RV32I-SFB-NEXT:  .LBB14_2: # %entry
-; RV32I-SFB-NEXT:    bnez a1, .LBB14_4
+; RV32I-SFB-NEXT:  .LBB15_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB15_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
 ; RV32I-SFB-NEXT:    mv a5, a3
-; RV32I-SFB-NEXT:  .LBB14_4: # %entry
+; RV32I-SFB-NEXT:  .LBB15_4: # %entry
 ; RV32I-SFB-NEXT:    mv a0, a4
 ; RV32I-SFB-NEXT:    mv a1, a5
 ; RV32I-SFB-NEXT:    ret
@@ -1091,35 +1171,35 @@ define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) {
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    ld a0, 32(a0)
 ; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB14_2
+; RV64I-SFB-NEXT:    bnez a1, .LBB15_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB14_2: # %entry
+; RV64I-SFB-NEXT:  .LBB15_2: # %entry
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i64_1:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB14_2
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB15_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    lw a2, 32(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB14_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB14_4
+; RV32I-SFBILOAD-NEXT:  .LBB15_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB15_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
 ; RV32I-SFBILOAD-NEXT:    lw a3, 36(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB14_4: # %entry
+; RV32I-SFBILOAD-NEXT:  .LBB15_4: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i64_1:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
 ; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB14_2
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB15_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    ld a2, 32(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB15_2: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB14_2: # %entry
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4
@@ -1135,12 +1215,12 @@ define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    sw a5, 0(a4)
 ; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB15_2
+; RV32I-NEXT:    bnez a1, .LBB16_2
 ; RV32I-NEXT:  # %bb.1: # %entry
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB15_2:
+; RV32I-NEXT:  .LBB16_2:
 ; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
@@ -1149,24 +1229,24 @@ define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-NEXT:    lb a0, 4(a0)
 ; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB15_2
+; RV64I-NEXT:    bnez a1, .LBB16_2
 ; RV64I-NEXT:  # %bb.1: # %entry
 ; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB15_2: # %entry
+; RV64I-NEXT:  .LBB16_2: # %entry
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_store_64:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lb a0, 4(a0)
 ; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB15_2
+; RV32I-SFB-NEXT:    beqz a1, .LBB16_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB15_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB15_4
+; RV32I-SFB-NEXT:  .LBB16_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB16_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
 ; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB15_4: # %entry
+; RV32I-SFB-NEXT:  .LBB16_4: # %entry
 ; RV32I-SFB-NEXT:    sw a5, 0(a4)
 ; RV32I-SFB-NEXT:    sw a6, 4(a4)
 ; RV32I-SFB-NEXT:    mv a0, a2
@@ -1177,10 +1257,10 @@ define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lb a0, 4(a0)
 ; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB15_2
+; RV64I-SFB-NEXT:    bnez a1, .LBB16_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB15_2: # %entry
+; RV64I-SFB-NEXT:  .LBB16_2: # %entry
 ; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
@@ -1188,14 +1268,14 @@ define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
 ; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB15_2
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB15_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB15_4
+; RV32I-SFBILOAD-NEXT:  .LBB16_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
 ; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB15_4: # %entry
+; RV32I-SFBILOAD-NEXT:  .LBB16_4: # %entry
 ; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
 ; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
@@ -1206,10 +1286,10 @@ define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
 ; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB15_2
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB16_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB15_2: # %entry
+; RV64I-SFBILOAD-NEXT:  .LBB16_2: # %entry
 ; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
@@ -1228,10 +1308,10 @@ define i64 @test_i8_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    sw a5, 0(a4)
 ; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB16_2
+; RV32I-NEXT:    bnez a1, .LBB17_2
 ; RV32I-NEXT:  # %bb.1: # %entry
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB16_2: # %entry
+; RV32I-NEXT:  .LBB17_2: # %entry
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
@@ -1241,24 +1321,24 @@ define i64 @test_i8_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-NEXT:    lbu a0, 4(a0)
 ; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB16_2
+; RV64I-NEXT:    bnez a1, .LBB17_2
 ; RV64I-NEXT:  # %bb.1: # %entry
 ; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB16_2: # %entry
+; RV64I-NEXT:  .LBB17_2: # %entry
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_store_64:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lbu a0, 4(a0)
 ; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB16_2
+; RV32I-SFB-NEXT:    beqz a1, .LBB17_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB16_2: # %entry
-; RV32I-SFB-NEXT:    bnez a1, .LBB16_4
+; RV32I-SFB-NEXT:  .LBB17_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB17_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
 ; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB16_4: # %entry
+; RV32I-SFB-NEXT:  .LBB17_4: # %entry
 ; RV32I-SFB-NEXT:    sw a5, 0(a4)
 ; RV32I-SFB-NEXT:    sw a6, 4(a4)
 ; RV32I-SFB-NEXT:    mv a1, a3
@@ -1268,10 +1348,10 @@ define i64 @test_i8_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lbu a0, 4(a0)
 ; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB16_2
+; RV64I-SFB-NEXT:    bnez a1, .LBB17_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB16_2: # %entry
+; RV64I-SFB-NEXT:  .LBB17_2: # %entry
 ; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
@@ -1279,14 +1359,14 @@ define i64 @test_i8_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
 ; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_2
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB17_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB16_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB16_4
+; RV32I-SFBILOAD-NEXT:  .LBB17_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB17_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB16_4: # %entry
+; RV32I-SFBILOAD-NEXT:  .LBB17_4: # %entry
 ; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
 ; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
 ; RV32I-SFBILOAD-NEXT:    mv a1, a3
@@ -1296,10 +1376,10 @@ define i64 @test_i8_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
 ; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB16_2
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB17_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB16_2: # %entry
+; RV64I-SFBILOAD-NEXT:  .LBB17_2: # %entry
 ; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
@@ -1318,12 +1398,12 @@ define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    sw a5, 0(a4)
 ; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB17_2
+; RV32I-NEXT:    bnez a1, .LBB18_2
 ; RV32I-NEXT:  # %bb.1: # %entry
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB17_2:
+; RV32I-NEXT:  .LBB18_2:
 ; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
@@ -1332,24 +1412,24 @@ define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-NEXT:    lh a0, 8(a0)
 ; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB17_2
+; RV64I-NEXT:    bnez a1, .LBB18_2
 ; RV64I-NEXT:  # %bb.1: # %entry
 ; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB17_2: # %entry
+; RV64I-NEXT:  .LBB18_2: # %entry
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_store_64:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lh a0, 8(a0)
 ; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB17_2
+; RV32I-SFB-NEXT:    beqz a1, .LBB18_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB17_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB17_4
+; RV32I-SFB-NEXT:  .LBB18_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB18_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
 ; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB17_4: # %entry
+; RV32I-SFB-NEXT:  .LBB18_4: # %entry
 ; RV32I-SFB-NEXT:    sw a5, 0(a4)
 ; RV32I-SFB-NEXT:    sw a6, 4(a4)
 ; RV32I-SFB-NEXT:    mv a0, a2
@@ -1360,10 +1440,10 @@ define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lh a0, 8(a0)
 ; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB17_2
+; RV64I-SFB-NEXT:    bnez a1, .LBB18_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB17_2: # %entry
+; RV64I-SFB-NEXT:  .LBB18_2: # %entry
 ; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
@@ -1371,14 +1451,14 @@ define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
 ; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB17_2
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB17_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB17_4
+; RV32I-SFBILOAD-NEXT:  .LBB18_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
 ; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB17_4: # %entry
+; RV32I-SFBILOAD-NEXT:  .LBB18_4: # %entry
 ; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
 ; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
@@ -1389,10 +1469,10 @@ define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
 ; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB17_2
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB18_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB17_2: # %entry
+; RV64I-SFBILOAD-NEXT:  .LBB18_2: # %entry
 ; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
@@ -1411,10 +1491,10 @@ define i64 @test_i16_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    sw a5, 0(a4)
 ; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB18_2
+; RV32I-NEXT:    bnez a1, .LBB19_2
 ; RV32I-NEXT:  # %bb.1: # %entry
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB18_2: # %entry
+; RV32I-NEXT:  .LBB19_2: # %entry
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
@@ -1424,24 +1504,24 @@ define i64 @test_i16_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-NEXT:    lhu a0, 8(a0)
 ; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB18_2
+; RV64I-NEXT:    bnez a1, .LBB19_2
 ; RV64I-NEXT:  # %bb.1: # %entry
 ; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB18_2: # %entry
+; RV64I-NEXT:  .LBB19_2: # %entry
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_store_64:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lhu a0, 8(a0)
 ; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB18_2
+; RV32I-SFB-NEXT:    beqz a1, .LBB19_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB18_2: # %entry
-; RV32I-SFB-NEXT:    bnez a1, .LBB18_4
+; RV32I-SFB-NEXT:  .LBB19_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB19_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
 ; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB18_4: # %entry
+; RV32I-SFB-NEXT:  .LBB19_4: # %entry
 ; RV32I-SFB-NEXT:    sw a5, 0(a4)
 ; RV32I-SFB-NEXT:    sw a6, 4(a4)
 ; RV32I-SFB-NEXT:    mv a1, a3
@@ -1451,10 +1531,10 @@ define i64 @test_i16_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lhu a0, 8(a0)
 ; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB18_2
+; RV64I-SFB-NEXT:    bnez a1, .LBB19_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB18_2: # %entry
+; RV64I-SFB-NEXT:  .LBB19_2: # %entry
 ; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
@@ -1462,14 +1542,14 @@ define i64 @test_i16_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
 ; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_2
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB19_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB18_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB18_4
+; RV32I-SFBILOAD-NEXT:  .LBB19_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB19_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB18_4: # %entry
+; RV32I-SFBILOAD-NEXT:  .LBB19_4: # %entry
 ; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
 ; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
 ; RV32I-SFBILOAD-NEXT:    mv a1, a3
@@ -1479,10 +1559,10 @@ define i64 @test_i16_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
 ; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB18_2
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB19_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB18_2: # %entry
+; RV64I-SFBILOAD-NEXT:  .LBB19_2: # %entry
 ; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
@@ -1494,6 +1574,96 @@ entry:
   ret i64 %res
 }
 
+define i64 @test_i32_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+; RV32I-LABEL: test_i32_z_store_64:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB20_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB20_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_store_64:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lwu a0, 16(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB20_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB20_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_store_64:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB20_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:  .LBB20_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB20_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB20_4: # %entry
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_store_64:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lwu a0, 16(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB20_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB20_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_store_64:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB20_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB20_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB20_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB20_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_store_64:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lwu a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB20_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB20_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; address of element 4 (byte offset 16)
+  %val = load i32, ptr %addr          ; load 32-bit value
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
 define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-LABEL: test_i64_store_64:
 ; RV32I:       # %bb.0: # %entry
@@ -1504,11 +1674,11 @@ define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-NEXT:    andi a7, a7, 1
 ; RV32I-NEXT:    sw a5, 0(a4)
 ; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a7, .LBB19_2
+; RV32I-NEXT:    bnez a7, .LBB21_2
 ; RV32I-NEXT:  # %bb.1: # %entry
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
-; RV32I-NEXT:  .LBB19_2: # %entry
+; RV32I-NEXT:  .LBB21_2: # %entry
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i64_store_64:
@@ -1516,10 +1686,10 @@ define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-NEXT:    ld a0, 32(a0)
 ; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB19_2
+; RV64I-NEXT:    bnez a1, .LBB21_2
 ; RV64I-NEXT:  # %bb.1: # %entry
 ; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB19_2: # %entry
+; RV64I-NEXT:  .LBB21_2: # %entry
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i64_store_64:
@@ -1527,14 +1697,14 @@ define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFB-NEXT:    lw a7, 32(a0)
 ; RV32I-SFB-NEXT:    lw t0, 36(a0)
 ; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB19_2
+; RV32I-SFB-NEXT:    bnez a1, .LBB21_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a7, a2
-; RV32I-SFB-NEXT:  .LBB19_2: # %entry
-; RV32I-SFB-NEXT:    bnez a1, .LBB19_4
+; RV32I-SFB-NEXT:  .LBB21_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB21_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
 ; RV32I-SFB-NEXT:    mv t0, a3
-; RV32I-SFB-NEXT:  .LBB19_4: # %entry
+; RV32I-SFB-NEXT:  .LBB21_4: # %entry
 ; RV32I-SFB-NEXT:    sw a5, 0(a4)
 ; RV32I-SFB-NEXT:    sw a6, 4(a4)
 ; RV32I-SFB-NEXT:    mv a0, a7
@@ -1545,10 +1715,10 @@ define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    ld a0, 32(a0)
 ; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB19_2
+; RV64I-SFB-NEXT:    bnez a1, .LBB21_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB19_2: # %entry
+; RV64I-SFB-NEXT:  .LBB21_2: # %entry
 ; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
@@ -1557,14 +1727,14 @@ define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFBILOAD-NEXT:    lw a7, 32(a0)
 ; RV32I-SFBILOAD-NEXT:    lw t0, 36(a0)
 ; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB19_2
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB21_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a7, a2
-; RV32I-SFBILOAD-NEXT:  .LBB19_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB19_4
+; RV32I-SFBILOAD-NEXT:  .LBB21_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB21_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv t0, a3
-; RV32I-SFBILOAD-NEXT:  .LBB19_4: # %entry
+; RV32I-SFBILOAD-NEXT:  .LBB21_4: # %entry
 ; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
 ; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
 ; RV32I-SFBILOAD-NEXT:    mv a0, a7
@@ -1575,10 +1745,10 @@ define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
 ; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB19_2
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB21_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB19_2: # %entry
+; RV64I-SFBILOAD-NEXT:  .LBB21_2: # %entry
 ; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:

>From a26924a1dc673fbb1a9809c203a067fa8665e292 Mon Sep 17 00:00:00 2001
From: Harsh Chandel <hchandel at qti.qualcomm.com>
Date: Thu, 4 Dec 2025 16:47:56 +0530
Subject: [PATCH 06/11] fixup! Change i-load to iload in accordance with other
 features

Change-Id: I55317e35262890ee9fe1d814f986a3764e4ec675
---
 llvm/lib/Target/RISCV/RISCVFeatures.td                    | 6 ++++--
 llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td                | 2 +-
 .../RISCV/short-forward-branch-opt-load-atomic-acquire.ll | 8 ++++----
 .../short-forward-branch-opt-load-atomic-monotonic.ll     | 8 ++++----
 .../RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll | 8 ++++----
 .../RISCV/short-forward-branch-opt-load-volatile.ll       | 8 ++++----
 llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll  | 8 ++++----
 7 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 5c8a617932e34..dd7f36136ea4e 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1898,6 +1898,7 @@ def TuneNoDefaultUnroll
 // - IALU: RVI Integer instructions, plus ANDN/ORN/XNOR (Zbb/Zbkb)
 // - IMinMax: Zbb MIN(U)/MAX(U)
 // - IMul: MUL
+// - ILoad: LB(U)/LH(U)/LW(U)/LD
 //
 // We make the simplifying assumption that any microarches that implement
 // any "short forward branches" can do the IALU fusions, and can opt into
@@ -1930,9 +1931,10 @@ def HasShortForwardBranchIMul : Predicate<"Subtarget->hasShortForwardBranchIMul(
 
 
 def TuneShortForwardBranchILoad
-    : SubtargetFeature<"short-forward-branch-i-load", "HasShortForwardBranchILoad",
+    : SubtargetFeature<"short-forward-branch-iload", "HasShortForwardBranchILoad",
                        "true", "Enable short forward branch optimization for load instructions",
-                       [TuneShortForwardBranchOpt]>;
+                       [TuneShortForwardBranchIALU]>;
+def HasShortForwardBranchILoad : Predicate<"Subtarget->hasShortForwardBranchILoad()">;
 
 // Some subtargets require a S2V transfer buffer to move scalars into vectors.
 // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure.
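
For reference, a minimal sketch of how a backend check would gate on this
tune feature. hasShortForwardBranchILoad() is the accessor generated from
the SubtargetFeature definition above; the helper itself and its name are
illustrative only, not code from this patch:

  #include "RISCVSubtarget.h"
  #include "llvm/CodeGen/MachineInstr.h"
  using namespace llvm;

  // Illustrative helper: a load qualifies for SFB folding only when the
  // core opted into the iload fusions and the access is a plain load.
  static bool isSFBLoadCandidate(const RISCVSubtarget &STI,
                                 const MachineInstr &MI) {
    if (!STI.hasShortForwardBranchILoad())
      return false;
    // Volatile and atomic loads carry ordered memory refs and must not be
    // speculated into the predicated slot (see the *-volatile and
    // *-atomic-* tests below, which keep such loads unconditional).
    return MI.mayLoad() && !MI.hasOrderedMemoryRef();
  }
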
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
index c97b60452148e..e83246a82b28e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
@@ -178,7 +178,7 @@ def PseudoCCMINU : SFBALU_rr;
 let Predicates = [HasShortForwardBranchIMul] in
 def PseudoCCMUL : SFBALU_rr;
 
-let Predicates = [HasShortForwardBranchIMul] in {
+let Predicates = [HasShortForwardBranchILoad] in {
 def PseudoCCLB : SFBLoad;
 def PseudoCCLH : SFBLoad;
 def PseudoCCLW : SFBLoad;
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll
index 51f2643c94191..2c4a542553889 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
 ; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-opt | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-opt | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-i-load | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-i-load | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
 
 define i32 @test_i8_s_3(ptr %base, i1 %x, i32 %b) {
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll
index c2564e6ac654f..781ae15b3f20a 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
 ; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-opt | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-opt | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-i-load | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-i-load | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
 
 define i32 @test_i8_s_2(ptr %base, i1 %x, i32 %b) {
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll
index 9308fa38d95bd..c558931eb5a48 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
 ; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-opt | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-opt | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-i-load | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-i-load | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
 
 define i32 @test_i8_s_4(ptr %base, i1 %x, i32 %b) {
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll
index ebdf25c66fd77..37f7a3020b820 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
 ; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-opt | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-opt | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-i-load | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-i-load | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
 
 define i32 @test_i8_s_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) {
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
index 5fc3433458d50..6c500468bb187 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
 ; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-opt | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-opt | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-i-load | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-i-load | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
 
 define i32 @test_i8_s(ptr %base, i1 %x, i32 %b) {

>From 5f769e99376266dc7506dea5eb495c9e3c322285 Mon Sep 17 00:00:00 2001
From: Harsh Chandel <hchandel at qti.qualcomm.com>
Date: Fri, 5 Dec 2025 11:46:36 +0530
Subject: [PATCH 07/11] fixup! Use cloneMemRefs correctly and improve tests

Change-Id: I498d6cbdfb6f59aab399d3ee73e2eb0bc0258a55
---
 .../Target/RISCV/RISCVExpandPseudoInsts.cpp   |    3 +-
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |    1 +
 ...-forward-branch-opt-load-atomic-acquire.ll | 4746 +++-------------
 ...orward-branch-opt-load-atomic-monotonic.ll | 4628 +++-------------
 ...-forward-branch-opt-load-atomic-seq_cst.ll | 4860 +++--------------
 .../short-forward-branch-opt-load-volatile.ll |   22 +-
 .../RISCV/short-forward-branch-opt-load.ll    |   44 +-
 7 files changed, 2391 insertions(+), 11913 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index a18aad25ae745..55efead1ad887 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -294,8 +294,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
     } else {
       BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg)
           .add(MI.getOperand(5))
-          .add(MI.getOperand(6))
-          .cloneMemRefs(MI);
+          .add(MI.getOperand(6));
     }
   }
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 68d4c31626ee2..1940d36af4dcd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -967,6 +967,7 @@ MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
   for (unsigned i = 1, e = DefDesc.getNumOperands(); i != e; ++i)
     NewMI.add(LoadMI.getOperand(i));
 
+  NewMI.cloneMemRefs(LoadMI);
   return NewMI;
 }
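
The cloneMemRefs() call is the substance of the fix: a folded load that
carries no MachineMemOperands is treated conservatively by later passes
(MachineInstr::hasOrderedMemoryRef() returns true for a load whose
memoperand list is empty). A minimal sketch of the BuildMI pattern, with
illustrative function and variable names; only the BuildMI, add, and
cloneMemRefs APIs are taken as given:

  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  using namespace llvm;

  // Illustrative: rebuild LoadMI under opcode Opc, copying its address
  // operands and, crucially, its memory operands.
  static MachineInstr *foldLoadSketch(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator InsertPt,
                                      MachineInstr &LoadMI, unsigned Opc,
                                      Register DstReg,
                                      const TargetInstrInfo *TII) {
    MachineInstrBuilder MIB =
        BuildMI(MBB, InsertPt, LoadMI.getDebugLoc(), TII->get(Opc), DstReg);
    // Operand 0 of the load is its def; copy the remaining (address)
    // operands verbatim.
    for (unsigned I = 1, E = LoadMI.getNumOperands(); I != E; ++I)
      MIB.add(LoadMI.getOperand(I));
    // Transfer the MachineMemOperands so alias analysis and the scheduler
    // still see the original memory access.
    MIB.cloneMemRefs(LoadMI);
    return MIB;
  }
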
 
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll
index 2c4a542553889..1ba01ac5225d3 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll
@@ -1,200 +1,80 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-ialu | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a | FileCheck %s --check-prefixes=RV32I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a | FileCheck %s --check-prefixes=RV64I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-ialu | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-iload | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-iload | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
 
-define i32 @test_i8_s_3(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i8_s_3(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i8_s_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s1, .LBB0_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB0_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB0_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB0_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB0_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB0_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s1, .LBB0_2
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB0_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 24
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB0_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s1, .LBB0_2
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB0_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB0_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB0_2
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB0_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB0_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB0_2
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB0_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB0_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -204,185 +84,71 @@ entry:
   ret i32 %res
 }
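
; Editorial sketch (illustrative, not the verbatim test body, which the hunk
; elides): each *_3 test selects between the loaded value and %b, e.g. for the
; sign-extending i8 case:
;   %v   = load atomic i8, ptr %addr acquire, align 1
;   %s   = sext i8 %v to i32
;   %res = select i1 %x, i32 %s, i32 %b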
 
-define i32 @test_i8_z_3(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i8_z_3(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i8_z_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s1, .LBB1_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB1_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB1_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB1_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB1_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB1_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s1, .LBB1_2
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB1_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    zext.b s0, a0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB1_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s1, .LBB1_2
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB1_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB1_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB1_2
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB1_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB1_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB1_2
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB1_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB1_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -392,191 +158,71 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_s_3(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i16_s_3(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i16_s_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s1, .LBB2_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB2_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB2_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB2_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB2_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB2_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s1, .LBB2_2
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB2_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB2_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_s_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB2_2
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB2_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB2_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB2_2
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB2_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB2_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB2_2
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB2_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB2_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 i16 elements (byte offset 8)
@@ -586,191 +232,71 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_z_3(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i16_z_3(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i16_z_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s1, .LBB3_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB3_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB3_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB3_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB3_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB3_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s1, .LBB3_2
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB3_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srli s0, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB3_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB3_2
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB3_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB3_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB3_2
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB3_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s0, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB3_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB3_2
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB3_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB3_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 i16 elements (byte offset 8)
@@ -780,179 +306,71 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i32_3(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i32_3(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i32_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 16
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_4
-; RV32I-NEXT:    bnez s1, .LBB4_2
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB4_2
 ; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB4_2: # %entry
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 16
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_4
-; RV64I-NEXT:    bnez s1, .LBB4_2
+; RV64I-NEXT:    lw a0, 16(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB4_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB4_2: # %entry
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 16
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_4
-; RV32I-SFB-NEXT:    bnez s1, .LBB4_2
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB4_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB4_2: # %entry
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 16
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_4
-; RV64I-SFB-NEXT:    bnez s1, .LBB4_2
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB4_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB4_2: # %entry
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV32I-SFBILOAD-NEXT:    bnez s1, .LBB4_2
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB4_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB4_2: # %entry
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB4_2
+; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB4_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB4_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4 i32 elements (byte offset 16)
@@ -961,257 +379,77 @@ entry:
   ret i32 %res
 }
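
; Editorial sketch (illustrative, not the verbatim test body): the *_store_3
; variants below additionally store %c through %base1 between the atomic load
; and the select, e.g.:
;   %v   = load atomic i8, ptr %addr acquire, align 1
;   store i32 %c, ptr %base1
;   %s   = sext i8 %v to i32
;   %res = select i1 %x, i32 %s, i32 %b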
 
-define i32 @test_i8_s_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i8_s_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i8_s_store_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB5_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB5_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB5_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_store_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB5_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB5_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB5_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_store_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s3, .LBB5_2
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB5_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s2, a0, 24
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB5_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_store_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s3, .LBB5_2
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB5_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB5_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_store_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB5_2
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s2, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_store_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB5_2
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -1222,251 +460,77 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i8_z_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i8_z_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i8_z_store_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB6_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB6_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB6_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_store_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB6_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB6_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB6_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_store_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s3, .LBB6_2
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB6_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    zext.b s2, a0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB6_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_store_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s3, .LBB6_2
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB6_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB6_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_store_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB6_2
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_store_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB6_2
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -1477,257 +541,77 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_s_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i16_s_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i16_s_store_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB7_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB7_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB7_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_store_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB7_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB7_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB7_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_store_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s3, .LBB7_2
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB7_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s2, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB7_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_s_store_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB7_2
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB7_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB7_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_store_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB7_2
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s2, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_store_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB7_2
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 elements (byte offset 8)
@@ -1738,257 +622,77 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_z_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i16_z_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i16_z_store_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB8_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB8_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB8_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_store_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB8_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB8_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB8_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_store_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s3, .LBB8_2
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB8_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srli s2, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB8_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_store_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB8_2
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB8_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB8_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_store_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB8_2
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s2, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_store_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB8_2
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 elements (byte offset 8)
@@ -1999,245 +703,77 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i32_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i32_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i32_store_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 16
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_4
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    bnez s3, .LBB9_2
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB9_2
 ; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB9_2: # %entry
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_store_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 16
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_4
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    bnez s3, .LBB9_2
+; RV64I-NEXT:    lw a0, 16(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB9_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB9_2: # %entry
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_store_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 16
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_4
-; RV32I-SFB-NEXT:    bnez s3, .LBB9_2
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB9_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB9_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_store_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 16
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_4
-; RV64I-SFB-NEXT:    bnez s3, .LBB9_2
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB9_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB9_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_store_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV32I-SFBILOAD-NEXT:    bnez s3, .LBB9_2
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_store_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB9_2
+; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4 elements (byte offset 16)
@@ -2247,218 +783,86 @@ entry:
   ret i32 %res
 }
 
-define i64 @test_i8_s_1_3(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i8_s_1_3(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i8_s_1_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s2, .LBB10_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 24
-; RV32I-NEXT:  .LBB10_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB10_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB10_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_1_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB10_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB10_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB10_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_1_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s2, .LBB10_2
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB10_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s1, a0, 24
+; RV32I-SFB-NEXT:    mv a2, a0
 ; RV32I-SFB-NEXT:  .LBB10_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB10_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB10_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB10_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_1_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s1, .LBB10_2
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB10_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB10_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_1_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB10_2
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s1, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB10_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB10_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB10_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_1_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB10_2
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB10_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB10_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -2468,212 +872,83 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i8_z_1_3(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i8_z_1_3(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i8_z_1_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s2, .LBB11_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB11_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB11_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_1_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB11_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB11_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB11_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_1_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s2, .LBB11_2
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB11_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB11_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB11_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB11_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    zext.b s1, a0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB11_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_1_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s1, .LBB11_2
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB11_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB11_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_1_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB11_2
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB11_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB11_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB11_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB11_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s1, a0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB11_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_1_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB11_2
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB11_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB11_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -2683,218 +958,86 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_s_1_3(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i16_s_1_3(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i16_s_1_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s2, .LBB12_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 16
-; RV32I-NEXT:  .LBB12_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB12_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB12_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_1_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB12_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB12_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB12_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_1_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s2, .LBB12_2
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB12_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s1, a0, 16
+; RV32I-SFB-NEXT:    mv a2, a0
 ; RV32I-SFB-NEXT:  .LBB12_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB12_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB12_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB12_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_s_1_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB12_2
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB12_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB12_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_1_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB12_2
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s1, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB12_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB12_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB12_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_1_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB12_2
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB12_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB12_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
@@ -2904,218 +1047,83 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_z_1_3(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i16_z_1_3(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i16_z_1_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s2, .LBB13_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB13_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB13_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_1_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB13_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB13_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB13_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_1_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s2, .LBB13_2
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB13_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB13_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB13_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB13_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srli s1, a0, 16
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB13_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_1_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB13_2
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB13_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB13_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_1_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB13_2
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB13_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB13_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB13_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB13_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB13_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_1_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB13_2
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB13_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB13_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
@@ -3125,19 +1133,14 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i32_z_1_3(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i32_z_1_3(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i32_z_1_3:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a3
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    andi s2, a1, 1
@@ -3157,24 +1160,15 @@ define i64 @test_i32_z_1_3(ptr %base, i1 %x, i64 %b) {
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
 ; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_z_1_3:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
 ; RV64I-NEXT:    mv s0, a2
 ; RV64I-NEXT:    andi s1, a1, 1
 ; RV64I-NEXT:    addi a1, a0, 16
@@ -3190,25 +1184,16 @@ define i64 @test_i32_z_1_3(ptr %base, i1 %x, i64 %b) {
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
 ; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_z_1_3:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
 ; RV32I-SFB-NEXT:    mv s0, a3
 ; RV32I-SFB-NEXT:    mv s1, a2
 ; RV32I-SFB-NEXT:    andi s2, a1, 1
@@ -3231,24 +1216,15 @@ define i64 @test_i32_z_1_3(ptr %base, i1 %x, i64 %b) {
 ; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
 ; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_z_1_3:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
 ; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
 ; RV64I-SFB-NEXT:    mv s0, a2
 ; RV64I-SFB-NEXT:    andi s1, a1, 1
 ; RV64I-SFB-NEXT:    addi a1, a0, 16
@@ -3264,25 +1240,16 @@ define i64 @test_i32_z_1_3(ptr %base, i1 %x, i64 %b) {
 ; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
 ; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_z_1_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
 ; RV32I-SFBILOAD-NEXT:    mv s0, a3
 ; RV32I-SFBILOAD-NEXT:    mv s1, a2
 ; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
@@ -3305,24 +1272,15 @@ define i64 @test_i32_z_1_3(ptr %base, i1 %x, i64 %b) {
 ; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_z_1_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
 ; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
 ; RV64I-SFBILOAD-NEXT:    mv s0, a2
 ; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
@@ -3338,11 +1296,7 @@ define i64 @test_i32_z_1_3(ptr %base, i1 %x, i64 %b) {
 ; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
 ; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4
@@ -3352,19 +1306,14 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i64_1_3(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i64_1_3(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i64_1_3:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a3
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    andi s2, a1, 1
@@ -3380,55 +1329,27 @@ define i64 @test_i64_1_3(ptr %base, i1 %x, i64 %b) {
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
 ; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i64_1_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 32
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_8
-; RV64I-NEXT:    bnez s1, .LBB15_2
+; RV64I-NEXT:    ld a0, 32(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB15_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB15_2: # %entry
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i64_1_3:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
 ; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
 ; RV32I-SFB-NEXT:    mv s0, a3
 ; RV32I-SFB-NEXT:    mv s1, a2
 ; RV32I-SFB-NEXT:    andi s2, a1, 1
@@ -3447,55 +1368,27 @@ define i64 @test_i64_1_3(ptr %base, i1 %x, i64 %b) {
 ; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
 ; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i64_1_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 32
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_8
-; RV64I-SFB-NEXT:    bnez s1, .LBB15_2
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB15_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB15_2: # %entry
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i64_1_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
 ; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
 ; RV32I-SFBILOAD-NEXT:    mv s0, a3
 ; RV32I-SFBILOAD-NEXT:    mv s1, a2
 ; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
@@ -3514,41 +1407,18 @@ define i64 @test_i64_1_3(ptr %base, i1 %x, i64 %b) {
 ; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i64_1_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
-; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB15_2
+; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB15_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB15_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4
@@ -3557,302 +1427,95 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i8_s_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i8_s_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i8_s_store_64_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    beqz s5, .LBB16_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 24
-; RV32I-NEXT:  .LBB16_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB16_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB16_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_store_64_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB16_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB16_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB16_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_store_64_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s5, .LBB16_2
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    beqz a1, .LBB16_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s4, a0, 24
+; RV32I-SFB-NEXT:    mv a2, a0
 ; RV32I-SFB-NEXT:  .LBB16_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB16_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB16_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s3, a0, 31
+; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB16_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_store_64_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s3, .LBB16_2
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB16_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB16_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_store_64_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB16_2
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s4, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB16_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB16_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB16_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_store_64_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB16_2
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB16_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB16_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -3863,296 +1526,92 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i8_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i8_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i8_z_store_64_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    beqz s5, .LBB17_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB17_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB17_2: # %entry
-; RV32I-NEXT:    addi a1, s5, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_store_64_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB17_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB17_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB17_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_store_64_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s5, .LBB17_2
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    beqz a1, .LBB17_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB17_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB17_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB17_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    zext.b s4, a0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB17_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_store_64_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s3, .LBB17_2
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB17_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB17_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_store_64_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB17_2
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB17_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB17_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB17_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB17_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s4, a0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB17_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_store_64_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB17_2
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB17_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB17_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -4163,302 +1622,95 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_s_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i16_s_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i16_s_store_64_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    beqz s5, .LBB18_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 16
-; RV32I-NEXT:  .LBB18_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB18_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB18_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_store_64_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB18_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB18_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB18_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_store_64_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s5, .LBB18_2
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    beqz a1, .LBB18_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s4, a0, 16
-; RV32I-SFB-NEXT:  .LBB18_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB18_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s3, a0, 31
-; RV32I-SFB-NEXT:  .LBB18_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_store_64_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB18_2
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB18_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB18_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB18_4: # %entry
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_64_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB18_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB18_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_store_64_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB18_2
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s4, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB18_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB18_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB18_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_store_64_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB18_2
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB18_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB18_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
@@ -4469,302 +1721,92 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i16_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i16_z_store_64_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    beqz s5, .LBB19_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB19_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB19_2: # %entry
-; RV32I-NEXT:    addi a1, s5, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_store_64_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB19_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB19_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB19_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_store_64_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s5, .LBB19_2
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    beqz a1, .LBB19_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB19_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB19_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB19_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srli s4, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB19_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_store_64_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB19_2
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB19_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB19_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_store_64_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB19_2
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB19_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB19_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB19_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB19_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s4, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB19_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_store_64_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB19_2
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB19_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB19_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
@@ -4775,11 +1817,10 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i32_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i32_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i32_z_store_64_3:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -4787,13 +1828,6 @@ define i64 @test_i32_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
 ; RV32I-NEXT:    mv s2, a6
 ; RV32I-NEXT:    mv s3, a5
 ; RV32I-NEXT:    mv s4, a4
@@ -4821,31 +1855,17 @@ define i64 @test_i32_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
 ; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_z_store_64_3:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
 ; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
 ; RV64I-NEXT:    mv s1, a4
 ; RV64I-NEXT:    mv s2, a3
 ; RV64I-NEXT:    mv s0, a2
@@ -4866,19 +1886,12 @@ define i64 @test_i32_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
 ; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_z_store_64_3:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -4886,13 +1899,6 @@ define i64 @test_i32_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
 ; RV32I-SFB-NEXT:    mv s0, a6
 ; RV32I-SFB-NEXT:    mv s1, a5
 ; RV32I-SFB-NEXT:    mv s2, a4
@@ -4923,31 +1929,17 @@ define i64 @test_i32_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
 ; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_z_store_64_3:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
 ; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
 ; RV64I-SFB-NEXT:    mv s0, a4
 ; RV64I-SFB-NEXT:    mv s1, a3
 ; RV64I-SFB-NEXT:    mv s2, a2
@@ -4968,19 +1960,12 @@ define i64 @test_i32_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
 ; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
 ; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
 ; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -4988,13 +1973,6 @@ define i64 @test_i32_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
 ; RV32I-SFBILOAD-NEXT:    mv s0, a6
 ; RV32I-SFBILOAD-NEXT:    mv s1, a5
 ; RV32I-SFBILOAD-NEXT:    mv s2, a4
@@ -5025,31 +2003,17 @@ define i64 @test_i32_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
 ; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
 ; RV64I-SFBILOAD-NEXT:    mv s0, a4
 ; RV64I-SFBILOAD-NEXT:    mv s1, a3
 ; RV64I-SFBILOAD-NEXT:    mv s2, a2
@@ -5070,13 +2034,7 @@ define i64 @test_i32_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
 ; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
 ; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
 ; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4 elements (byte offset 16)
@@ -5087,11 +2045,10 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i64_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i64_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i64_store_64_3:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -5099,13 +2056,6 @@ define i64 @test_i64_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
 ; RV32I-NEXT:    mv s2, a6
 ; RV32I-NEXT:    mv s3, a5
 ; RV32I-NEXT:    mv s4, a4
@@ -5129,61 +2079,24 @@ define i64 @test_i64_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
 ; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i64_store_64_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 32
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    call __atomic_load_8
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    bnez s3, .LBB21_2
+; RV64I-NEXT:    ld a0, 32(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB21_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB21_2: # %entry
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i64_store_64_3:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -5191,13 +2104,6 @@ define i64 @test_i64_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
 ; RV32I-SFB-NEXT:    mv s0, a6
 ; RV32I-SFB-NEXT:    mv s1, a5
 ; RV32I-SFB-NEXT:    mv s2, a4
@@ -5224,61 +2130,24 @@ define i64 @test_i64_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
 ; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i64_store_64_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 32
-; RV64I-SFB-NEXT:    li a1, 2
-; RV64I-SFB-NEXT:    call __atomic_load_8
-; RV64I-SFB-NEXT:    bnez s3, .LBB21_2
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB21_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB21_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i64_store_64_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -5286,13 +2155,6 @@ define i64 @test_i64_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
 ; RV32I-SFBILOAD-NEXT:    mv s0, a6
 ; RV32I-SFBILOAD-NEXT:    mv s1, a5
 ; RV32I-SFBILOAD-NEXT:    mv s2, a4
@@ -5319,55 +2181,19 @@ define i64 @test_i64_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i64_store_64_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
-; RV64I-SFBILOAD-NEXT:    li a1, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
-; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB21_2
+; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB21_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB21_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4 i64s (byte offset 32)
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll
index 781ae15b3f20a..9d3606dca49a8 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll
@@ -1,200 +1,74 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-ialu | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a | FileCheck %s --check-prefixes=RV32I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a | FileCheck %s --check-prefixes=RV64I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-ialu | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-iload | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-iload | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
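
The RUN-line change here is load-bearing: without the A extension these
monotonic atomic loads become __atomic_load_N libcalls (the old checks),
leaving nothing for the short-forward-branch load patterns to match, since
there is no load instruction, only a call. With +a a monotonic load is an
ordinary lb/lh/lw with no fence. Roughly the IR shape each test below
uses, with illustrative names:

define i32 @sketch_monotonic(ptr %p, i1 %x, i32 %b) nounwind {
entry:
  ; Monotonic (relaxed) ordering needs no fence; with +a this is one lw.
  %v = load atomic i32, ptr %p monotonic, align 4
  %res = select i1 %x, i32 %v, i32 %b
  ret i32 %res
}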
 
-define i32 @test_i8_s_2(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i8_s_2(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i8_s_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s1, .LBB0_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB0_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB0_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB0_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB0_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB0_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s1, .LBB0_2
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB0_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 24
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB0_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s1, .LBB0_2
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB0_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB0_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB0_2
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB0_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB0_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB0_2
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB0_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB0_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -204,185 +78,65 @@ entry:
   ret i32 %res
 }
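
One detail worth calling out in the new RV32I/RV64I bodies: __atomic_load_1
returned the byte unextended in a0, so the old code sign-extended it by
hand with an slli/srai pair inside the taken block. lb is a sign-extending
load, so that pair folds into the load and the conditional block collapses
to a single mv of the select's false operand. The zero-extension tests
below get the same effect from lbu/lhu replacing zext.b and the slli/srli
pairs. A sketch, with illustrative names:

define i32 @sketch_sext(ptr %p, i1 %x, i32 %b) nounwind {
entry:
  %v = load atomic i8, ptr %p monotonic, align 1
  ; The sext folds into the load: lb already sign-extends to XLEN.
  %s = sext i8 %v to i32
  %res = select i1 %x, i32 %s, i32 %b
  ret i32 %res
}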
 
-define i32 @test_i8_z_2(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i8_z_2(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i8_z_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s1, .LBB1_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB1_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB1_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB1_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB1_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB1_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s1, .LBB1_2
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB1_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    zext.b s0, a0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB1_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s1, .LBB1_2
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB1_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB1_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB1_2
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB1_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB1_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB1_2
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB1_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB1_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -392,191 +146,65 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_s_2(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i16_s_2(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i16_s_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s1, .LBB2_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB2_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB2_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB2_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB2_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB2_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s1, .LBB2_2
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB2_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB2_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_s_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB2_2
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB2_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB2_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB2_2
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB2_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB2_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB2_2
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB2_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB2_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 i16s (byte offset 8)
@@ -586,191 +214,65 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_z_2(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i16_z_2(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i16_z_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s1, .LBB3_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB3_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB3_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB3_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB3_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB3_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s1, .LBB3_2
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB3_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srli s0, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB3_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB3_2
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB3_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB3_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB3_2
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB3_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s0, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB3_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB3_2
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB3_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB3_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 i16s (byte offset 8)
@@ -780,179 +282,65 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i32_2(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i32_2(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i32_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 16
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_4
-; RV32I-NEXT:    bnez s1, .LBB4_2
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB4_2
 ; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB4_2: # %entry
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 16
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_4
-; RV64I-NEXT:    bnez s1, .LBB4_2
+; RV64I-NEXT:    lw a0, 16(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB4_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB4_2: # %entry
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 16
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_4
-; RV32I-SFB-NEXT:    bnez s1, .LBB4_2
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB4_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB4_2: # %entry
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 16
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_4
-; RV64I-SFB-NEXT:    bnez s1, .LBB4_2
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB4_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB4_2: # %entry
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV32I-SFBILOAD-NEXT:    bnez s1, .LBB4_2
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB4_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB4_2: # %entry
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB4_2
+; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB4_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB4_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4 i32s (byte offset 16)
@@ -961,257 +349,71 @@ entry:
   ret i32 %res
 }
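
The _store_ variants that follow interpose an unrelated store between the
load and the select. Previously its operands had to be staged through
callee-saved registers across the libcall; now it is a single sw of
incoming arguments, emitted before the branch in the baseline output and
kept outside the predicated block in the SFB outputs, since the store must
execute on both paths. A sketch, with illustrative names:

define i32 @sketch_store(ptr %p, i1 %x, i32 %b, ptr %q, i32 %c) nounwind {
entry:
  %v = load atomic i8, ptr %p monotonic, align 1
  ; Unconditional store; it must stay outside the predicated block.
  store i32 %c, ptr %q
  %s = sext i8 %v to i32
  %res = select i1 %x, i32 %s, i32 %b
  ret i32 %res
}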
 
-define i32 @test_i8_s_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i8_s_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i8_s_store_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB5_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB5_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB5_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_store_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB5_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB5_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB5_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_store_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s3, .LBB5_2
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB5_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s2, a0, 24
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB5_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_store_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s3, .LBB5_2
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB5_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB5_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_store_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB5_2
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s2, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_store_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB5_2
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -1222,251 +424,71 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i8_z_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i8_z_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i8_z_store_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB6_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB6_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB6_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_store_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB6_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB6_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB6_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_store_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s3, .LBB6_2
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB6_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    zext.b s2, a0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB6_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_store_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s3, .LBB6_2
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB6_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB6_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_store_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB6_2
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_store_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB6_2
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -1477,257 +499,71 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_s_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i16_s_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i16_s_store_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB7_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB7_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB7_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_store_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB7_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB7_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB7_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_store_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s3, .LBB7_2
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB7_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s2, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB7_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_s_store_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB7_2
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB7_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB7_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_store_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB7_2
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s2, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_store_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB7_2
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
@@ -1738,257 +574,71 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_z_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i16_z_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i16_z_store_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB8_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB8_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB8_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_store_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB8_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB8_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB8_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_store_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s3, .LBB8_2
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB8_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srli s2, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB8_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_store_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB8_2
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB8_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB8_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_store_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB8_2
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s2, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_store_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB8_2
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
@@ -1999,245 +649,71 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i32_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i32_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i32_store_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 16
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_4
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    bnez s3, .LBB9_2
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB9_2
 ; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB9_2: # %entry
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_store_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 16
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_4
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    bnez s3, .LBB9_2
+; RV64I-NEXT:    lw a0, 16(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB9_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB9_2: # %entry
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_store_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 16
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_4
-; RV32I-SFB-NEXT:    bnez s3, .LBB9_2
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB9_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB9_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_store_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 16
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_4
-; RV64I-SFB-NEXT:    bnez s3, .LBB9_2
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB9_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB9_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_store_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV32I-SFBILOAD-NEXT:    bnez s3, .LBB9_2
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_store_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB9_2
+; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4
@@ -2247,218 +723,80 @@ entry:
   ret i32 %res
 }
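
A note for readers of this hunk: the function bodies are elided by the diff context, so the IR below is a sketch reconstructed from the CHECK lines, not copied from the test. The parameter names after %x, the monotonic ordering (inferred from the `li a1, 0` ordering argument in the removed `__atomic_load_4` sequence), and the assumption that the updated RUN lines enable the A extension are all guesses.

define i32 @test_i32_store_2(ptr %base, i1 %x, i32 %a, ptr %base1, i32 %c) nounwind {
entry:
  %addr = getelementptr i32, ptr %base, i32 4          ; byte offset 16, matching lw a0, 16(a0)
  %val = load atomic i32, ptr %addr monotonic, align 4 ; monotonic + naturally aligned: plain lw, no fence
  %res = select i1 %x, i32 %val, i32 %a                ; the lw feeds the bnez/mv short forward branch
  store i32 %c, ptr %base1                             ; the unconditional sw a4, 0(a3) after the join
  ret i32 %res
}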
 
-define i64 @test_i8_s_1_2(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i8_s_1_2(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i8_s_1_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s2, .LBB10_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 24
-; RV32I-NEXT:  .LBB10_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB10_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB10_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_1_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB10_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB10_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB10_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_1_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s2, .LBB10_2
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB10_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s1, a0, 24
+; RV32I-SFB-NEXT:    mv a2, a0
 ; RV32I-SFB-NEXT:  .LBB10_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB10_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB10_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB10_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_1_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s1, .LBB10_2
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB10_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB10_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_1_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB10_2
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s1, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB10_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB10_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB10_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_1_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB10_2
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB10_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB10_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -2468,212 +806,77 @@ entry:
   ret i64 %res
 }
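
Same caveat as above (a reconstruction, not the test source): this case loads a sign-extended i8 into an i64. The removed RV32 checks materialize the sext with an slli/srai-by-24 pair plus srai by 31 for the high word; the new checks rely on lb sign-extending, so on RV32 only the two conditional moves for the word pair remain (the paired .LBB10_2/.LBB10_4 blocks).

define i64 @test_i8_s_1_2(ptr %base, i1 %x, i64 %b) nounwind {
entry:
  %addr = getelementptr i8, ptr %base, i64 4   ; byte offset 4, matching lb a0, 4(a0)
  %val = load atomic i8, ptr %addr monotonic, align 1
  %sext = sext i8 %val to i64                  ; lb already sign-extends; srai a0, 31 yields the high word
  %res = select i1 %x, i64 %sext, i64 %b
  ret i64 %res
}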
 
-define i64 @test_i8_z_1_2(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i8_z_1_2(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i8_z_1_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s2, .LBB11_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB11_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB11_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_1_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB11_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB11_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB11_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_1_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s2, .LBB11_2
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB11_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB11_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB11_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB11_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    zext.b s1, a0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB11_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_1_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s1, .LBB11_2
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB11_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB11_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_1_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB11_2
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB11_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB11_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB11_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB11_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s1, a0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB11_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_1_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB11_2
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB11_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB11_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -2683,218 +886,80 @@ entry:
   ret i64 %res
 }
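
The zero-extending twin of the previous case (again a reconstructed sketch): lbu makes the zext free, and the i64 high word is simply zero, which is why the plain RV32I path can use the addi/and masking trick and the SFB paths a conditional `li a3, 0`.

define i64 @test_i8_z_1_2(ptr %base, i1 %x, i64 %b) nounwind {
entry:
  %addr = getelementptr i8, ptr %base, i64 4
  %val = load atomic i8, ptr %addr monotonic, align 1   ; lbu zero-extends directly
  %zext = zext i8 %val to i64                           ; high word is constant zero
  %res = select i1 %x, i64 %zext, i64 %b
  ret i64 %res
}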
 
-define i64 @test_i16_s_1_2(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i16_s_1_2(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i16_s_1_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s2, .LBB12_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 16
-; RV32I-NEXT:  .LBB12_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB12_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB12_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_1_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB12_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB12_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB12_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_1_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s2, .LBB12_2
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB12_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s1, a0, 16
+; RV32I-SFB-NEXT:    mv a2, a0
 ; RV32I-SFB-NEXT:  .LBB12_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB12_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB12_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB12_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_s_1_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB12_2
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB12_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB12_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_1_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB12_2
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s1, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB12_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB12_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB12_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_1_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB12_2
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB12_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB12_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
@@ -2904,218 +969,77 @@ entry:
   ret i64 %res
 }
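
The i16 analogue, sketched under the same assumptions: lh replaces the slli/srai-by-16 pair from the removed checks.

define i64 @test_i16_s_1_2(ptr %base, i1 %x, i64 %b) nounwind {
entry:
  %addr = getelementptr i16, ptr %base, i64 4   ; byte offset 8, matching lh a0, 8(a0)
  %val = load atomic i16, ptr %addr monotonic, align 2
  %sext = sext i16 %val to i64                  ; lh sign-extends; srai a0, 31 gives the RV32 high word
  %res = select i1 %x, i64 %sext, i64 %b
  ret i64 %res
}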
 
-define i64 @test_i16_z_1_2(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i16_z_1_2(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i16_z_1_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s2, .LBB13_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    bnez a1, .LBB13_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB13_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_1_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB13_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB13_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB13_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_1_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s2, .LBB13_2
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB13_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB13_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB13_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB13_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srli s1, a0, 16
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB13_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_1_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB13_2
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB13_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB13_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_1_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB13_2
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB13_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB13_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB13_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB13_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB13_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_1_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB13_2
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB13_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB13_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
@@ -3125,19 +1049,14 @@ entry:
   ret i64 %res
 }
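
And the zero-extending i16 variant (same reconstruction caveats), where lhu absorbs the srli-by-48/srli-by-16 from the removed checks:

define i64 @test_i16_z_1_2(ptr %base, i1 %x, i64 %b) nounwind {
entry:
  %addr = getelementptr i16, ptr %base, i64 4   ; byte offset 8, matching lhu a0, 8(a0)
  %val = load atomic i16, ptr %addr monotonic, align 2   ; lhu zero-extends directly
  %zext = zext i16 %val to i64
  %res = select i1 %x, i64 %zext, i64 %b
  ret i64 %res
}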
 
-define i64 @test_i32_z_1_2(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i32_z_1_2(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i32_z_1_2:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a3
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    andi s2, a1, 1
@@ -3157,24 +1076,15 @@ define i64 @test_i32_z_1_2(ptr %base, i1 %x, i64 %b) {
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
 ; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_z_1_2:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
 ; RV64I-NEXT:    mv s0, a2
 ; RV64I-NEXT:    andi s1, a1, 1
 ; RV64I-NEXT:    addi a1, a0, 16
@@ -3190,25 +1100,16 @@ define i64 @test_i32_z_1_2(ptr %base, i1 %x, i64 %b) {
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
 ; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_z_1_2:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
 ; RV32I-SFB-NEXT:    mv s0, a3
 ; RV32I-SFB-NEXT:    mv s1, a2
 ; RV32I-SFB-NEXT:    andi s2, a1, 1
@@ -3231,24 +1132,15 @@ define i64 @test_i32_z_1_2(ptr %base, i1 %x, i64 %b) {
 ; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
 ; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_z_1_2:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
 ; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
 ; RV64I-SFB-NEXT:    mv s0, a2
 ; RV64I-SFB-NEXT:    andi s1, a1, 1
 ; RV64I-SFB-NEXT:    addi a1, a0, 16
@@ -3264,25 +1156,16 @@ define i64 @test_i32_z_1_2(ptr %base, i1 %x, i64 %b) {
 ; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
 ; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_z_1_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
 ; RV32I-SFBILOAD-NEXT:    mv s0, a3
 ; RV32I-SFBILOAD-NEXT:    mv s1, a2
 ; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
@@ -3305,24 +1188,15 @@ define i64 @test_i32_z_1_2(ptr %base, i1 %x, i64 %b) {
 ; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_z_1_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
 ; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
 ; RV64I-SFBILOAD-NEXT:    mv s0, a2
 ; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
@@ -3338,11 +1212,7 @@ define i64 @test_i32_z_1_2(ptr %base, i1 %x, i64 %b) {
 ; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
 ; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4
@@ -3352,19 +1222,14 @@ entry:
   ret i64 %res
 }
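
This function is the outlier in the updated checks: the `__atomic` libcall (note `addi a1, a0, 16` feeding the call) and the surrounding spills survive, and only the CFI directives disappear with nounwind. A plausible reading, though the elided body makes this a guess, is that the load here is an atomic the backend cannot lower inline, for example an underaligned one, so there is no plain lw for the short-forward-branch pass to fold:

define i64 @test_i32_z_1_2(ptr %base, i1 %x, i64 %b) nounwind {
entry:
  %addr = getelementptr i32, ptr %base, i64 4
  ; ordering and alignment below are guesses; an underaligned atomic load
  ; is expanded to a generic __atomic_load libcall rather than a plain lw
  %val = load atomic i32, ptr %addr monotonic, align 2
  %zext = zext i32 %val to i64
  %res = select i1 %x, i64 %zext, i64 %b
  ret i64 %res
}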
 
-define i64 @test_i64_1_2(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i64_1_2(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i64_1_2:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a3
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    andi s2, a1, 1
@@ -3380,55 +1245,26 @@ define i64 @test_i64_1_2(ptr %base, i1 %x, i64 %b) {
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
 ; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i64_1_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 32
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_8
-; RV64I-NEXT:    bnez s1, .LBB15_2
+; RV64I-NEXT:    ld a0, 32(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    bnez a1, .LBB15_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB15_2: # %entry
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i64_1_2:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
 ; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
 ; RV32I-SFB-NEXT:    mv s0, a3
 ; RV32I-SFB-NEXT:    mv s1, a2
 ; RV32I-SFB-NEXT:    andi s2, a1, 1
@@ -3447,55 +1283,26 @@ define i64 @test_i64_1_2(ptr %base, i1 %x, i64 %b) {
 ; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
 ; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i64_1_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 32
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_8
-; RV64I-SFB-NEXT:    bnez s1, .LBB15_2
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB15_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB15_2: # %entry
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i64_1_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
 ; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
 ; RV32I-SFBILOAD-NEXT:    mv s0, a3
 ; RV32I-SFBILOAD-NEXT:    mv s1, a2
 ; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
@@ -3514,41 +1321,17 @@ define i64 @test_i64_1_2(ptr %base, i1 %x, i64 %b) {
 ; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i64_1_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
-; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB15_2
+; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB15_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB15_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4
@@ -3557,302 +1340,89 @@ entry:
   ret i64 %res
 }
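
For the i64 case (sketch, same caveats), only RV64 can fold the load: a naturally aligned monotonic i64 load is a plain ld there, while RV32 has no inline lowering for an 8-byte atomic load and keeps the `__atomic_load_8` libcall, which is why the RV32 hunks above only lose their CFI directives.

define i64 @test_i64_1_2(ptr %base, i1 %x, i64 %b) nounwind {
entry:
  %addr = getelementptr i64, ptr %base, i64 4   ; byte offset 32, matching ld a0, 32(a0)
  %val = load atomic i64, ptr %addr monotonic, align 8
  %res = select i1 %x, i64 %val, i64 %b
  ret i64 %res
}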
 
-define i64 @test_i8_s_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i8_s_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i8_s_store_64_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    beqz s5, .LBB16_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 24
-; RV32I-NEXT:  .LBB16_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB16_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB16_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_store_64_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB16_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB16_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB16_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_store_64_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s5, .LBB16_2
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB16_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s4, a0, 24
+; RV32I-SFB-NEXT:    mv a2, a0
 ; RV32I-SFB-NEXT:  .LBB16_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB16_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB16_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s3, a0, 31
+; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB16_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_store_64_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s3, .LBB16_2
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB16_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB16_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_store_64_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB16_2
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s4, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB16_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB16_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB16_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_store_64_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB16_2
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB16_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB16_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -3863,296 +1433,86 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i8_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i8_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i8_z_store_64_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    beqz s5, .LBB17_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB17_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB17_2: # %entry
-; RV32I-NEXT:    addi a1, s5, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_store_64_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB17_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB17_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB17_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_store_64_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s5, .LBB17_2
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB17_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB17_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB17_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB17_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    zext.b s4, a0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB17_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_store_64_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s3, .LBB17_2
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB17_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB17_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_store_64_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB17_2
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB17_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB17_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB17_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB17_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s4, a0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB17_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_store_64_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB17_2
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB17_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB17_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -4163,302 +1523,89 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_s_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i16_s_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i16_s_store_64_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    beqz s5, .LBB18_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 16
-; RV32I-NEXT:  .LBB18_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB18_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB18_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_store_64_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB18_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB18_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB18_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_store_64_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s5, .LBB18_2
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB18_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s4, a0, 16
+; RV32I-SFB-NEXT:    mv a2, a0
 ; RV32I-SFB-NEXT:  .LBB18_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB18_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB18_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s3, a0, 31
-; RV32I-SFB-NEXT:  .LBB18_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_store_64_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB18_2
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB18_4: # %entry
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_64_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB18_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB18_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_store_64_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB18_2
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s4, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB18_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB18_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB18_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_store_64_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB18_2
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB18_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB18_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
@@ -4469,302 +1616,86 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i16_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i16_z_store_64_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    beqz s5, .LBB19_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB19_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB19_2: # %entry
-; RV32I-NEXT:    addi a1, s5, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_store_64_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB19_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB19_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB19_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_store_64_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s5, .LBB19_2
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB19_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB19_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB19_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB19_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srli s4, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB19_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_store_64_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB19_2
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB19_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB19_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_store_64_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB19_2
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB19_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB19_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB19_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB19_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s4, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB19_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_store_64_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB19_2
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB19_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB19_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
@@ -4775,11 +1706,10 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i32_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i32_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i32_z_store_64_2:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -4787,13 +1717,6 @@ define i64 @test_i32_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
 ; RV32I-NEXT:    mv s2, a6
 ; RV32I-NEXT:    mv s3, a5
 ; RV32I-NEXT:    mv s4, a4
@@ -4821,31 +1744,17 @@ define i64 @test_i32_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
 ; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_z_store_64_2:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
 ; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
 ; RV64I-NEXT:    mv s1, a4
 ; RV64I-NEXT:    mv s2, a3
 ; RV64I-NEXT:    mv s0, a2
@@ -4866,19 +1775,12 @@ define i64 @test_i32_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
 ; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_z_store_64_2:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -4886,13 +1788,6 @@ define i64 @test_i32_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
 ; RV32I-SFB-NEXT:    mv s0, a6
 ; RV32I-SFB-NEXT:    mv s1, a5
 ; RV32I-SFB-NEXT:    mv s2, a4
@@ -4923,31 +1818,17 @@ define i64 @test_i32_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
 ; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_z_store_64_2:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
 ; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
 ; RV64I-SFB-NEXT:    mv s0, a4
 ; RV64I-SFB-NEXT:    mv s1, a3
 ; RV64I-SFB-NEXT:    mv s2, a2
@@ -4968,19 +1849,12 @@ define i64 @test_i32_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
 ; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
 ; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
 ; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -4988,13 +1862,6 @@ define i64 @test_i32_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
 ; RV32I-SFBILOAD-NEXT:    mv s0, a6
 ; RV32I-SFBILOAD-NEXT:    mv s1, a5
 ; RV32I-SFBILOAD-NEXT:    mv s2, a4
@@ -5025,31 +1892,17 @@ define i64 @test_i32_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
 ; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
 ; RV64I-SFBILOAD-NEXT:    mv s0, a4
 ; RV64I-SFBILOAD-NEXT:    mv s1, a3
 ; RV64I-SFBILOAD-NEXT:    mv s2, a2
@@ -5070,13 +1923,7 @@ define i64 @test_i32_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
 ; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
 ; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
 ; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4 elements (byte offset 16)
@@ -5087,11 +1934,10 @@ entry:
   ret i64 %res
 }
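; Note (editorial sketch, not part of the autogenerated checks): the
; short-forward-branch-iload feature matches a select-of-load like the one in
; this test to a PseudoCC load pseudo (e.g. PseudoCCLW), which the pseudo
; expander later rewrites into a short forward branch over the bare load,
; roughly:
;   beqz a1, 1f          # a1 assumed to hold the already-masked condition
;   lw   a0, 16(a0)      # load executes only on the taken path
; 1:
; In the SFBILOAD outputs above the load is speculatable, so it is instead
; hoisted unconditionally and only the mv is branched over.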
 
-define i64 @test_i64_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i64_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i64_store_64_2:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -5099,13 +1945,6 @@ define i64 @test_i64_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
 ; RV32I-NEXT:    mv s2, a6
 ; RV32I-NEXT:    mv s3, a5
 ; RV32I-NEXT:    mv s4, a4
@@ -5129,61 +1968,23 @@ define i64 @test_i64_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
 ; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i64_store_64_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 32
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_8
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    bnez s3, .LBB21_2
+; RV64I-NEXT:    ld a0, 32(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB21_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB21_2: # %entry
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i64_store_64_2:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -5191,13 +1992,6 @@ define i64 @test_i64_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
 ; RV32I-SFB-NEXT:    mv s0, a6
 ; RV32I-SFB-NEXT:    mv s1, a5
 ; RV32I-SFB-NEXT:    mv s2, a4
@@ -5224,61 +2018,23 @@ define i64 @test_i64_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
 ; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i64_store_64_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 32
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_8
-; RV64I-SFB-NEXT:    bnez s3, .LBB21_2
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB21_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB21_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i64_store_64_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -5286,13 +2042,6 @@ define i64 @test_i64_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
 ; RV32I-SFBILOAD-NEXT:    mv s0, a6
 ; RV32I-SFBILOAD-NEXT:    mv s1, a5
 ; RV32I-SFBILOAD-NEXT:    mv s2, a4
@@ -5319,55 +2068,18 @@ define i64 @test_i64_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i64_store_64_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
-; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB21_2
+; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB21_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB21_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4 elements (byte offset 32)
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll
index c558931eb5a48..f4aa40185ed9c 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll
@@ -1,200 +1,86 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-ialu | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a | FileCheck %s --check-prefixes=RV32I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a | FileCheck %s --check-prefixes=RV64I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-ialu | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-iload | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-iload | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
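
; Note (editorial): every RUN line above now passes +a so that the seq_cst
; atomic loads in this file lower inline to fence+load+fence sequences rather
; than __atomic_load_N libcalls; a plain lb/lh/lw is what the
; short-forward-branch load folding is meant to see.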
 
-define i32 @test_i8_s_4(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i8_s_4(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i8_s_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s1, .LBB0_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB0_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB0_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB0_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB0_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB0_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s1, .LBB0_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB0_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 24
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB0_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s1, .LBB0_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB0_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB0_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB0_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB0_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB0_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB0_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB0_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB0_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -204,185 +90,77 @@ entry:
   ret i32 %res
 }
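; Note (editorial): with +a, a seq_cst atomic load lowers to
;   fence rw, rw
;   lb    a0, 4(a0)
;   fence r, rw
; In the SFB outputs above the trailing "fence r, rw" is sunk past the short
; forward branch; this is still correct because the fence only has to follow
; the load, and the branched-over mv does not access memory.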
 
-define i32 @test_i8_z_4(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i8_z_4(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i8_z_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s1, .LBB1_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB1_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB1_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB1_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB1_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB1_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s1, .LBB1_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB1_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    zext.b s0, a0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB1_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s1, .LBB1_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB1_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB1_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB1_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB1_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB1_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB1_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB1_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB1_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -392,191 +170,77 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_s_4(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i16_s_4(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i16_s_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s1, .LBB2_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB2_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB2_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB2_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB2_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB2_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s1, .LBB2_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB2_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB2_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_s_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB2_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB2_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB2_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB2_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB2_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB2_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB2_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB2_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB2_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 elements (byte offset 8)
@@ -586,191 +250,77 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_z_4(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i16_z_4(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i16_z_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s1, .LBB3_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB3_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB3_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB3_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB3_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB3_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s1, .LBB3_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB3_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srli s0, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB3_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB3_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB3_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB3_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB3_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB3_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s0, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB3_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB3_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB3_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB3_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
@@ -780,179 +330,77 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i32_4(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i32_4(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i32_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s1, a1, 1
-; RV32I-NEXT:    addi a0, a0, 16
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_4
-; RV32I-NEXT:    bnez s1, .LBB4_2
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB4_2
 ; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB4_2: # %entry
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 16
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_4
-; RV64I-NEXT:    bnez s1, .LBB4_2
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lw a0, 16(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB4_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB4_2: # %entry
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    andi s1, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 16
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_4
-; RV32I-SFB-NEXT:    bnez s1, .LBB4_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB4_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB4_2: # %entry
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 16
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_4
-; RV64I-SFB-NEXT:    bnez s1, .LBB4_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB4_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB4_2: # %entry
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV32I-SFBILOAD-NEXT:    bnez s1, .LBB4_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB4_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB4_2: # %entry
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB4_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB4_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB4_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4
@@ -961,257 +409,83 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i8_s_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i8_s_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i8_s_store_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB5_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB5_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB5_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_store_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB5_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB5_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB5_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_store_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s3, .LBB5_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB5_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s2, a0, 24
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB5_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_store_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s3, .LBB5_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB5_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB5_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_store_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB5_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s2, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_store_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB5_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -1222,251 +496,83 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i8_z_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i8_z_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i8_z_store_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB6_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB6_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB6_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_store_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB6_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB6_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB6_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_store_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s3, .LBB6_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB6_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    zext.b s2, a0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB6_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_store_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s3, .LBB6_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB6_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB6_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_store_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB6_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_store_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB6_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -1477,257 +583,83 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_s_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i16_s_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i16_s_store_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB7_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB7_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB7_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_store_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB7_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB7_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB7_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_store_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s3, .LBB7_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB7_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s2, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB7_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_s_store_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB7_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB7_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB7_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_store_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB7_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s2, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_store_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB7_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
@@ -1738,257 +670,83 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_z_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i16_z_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i16_z_store_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB8_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB8_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB8_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_store_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB8_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB8_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB8_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_store_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s3, .LBB8_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB8_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srli s2, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB8_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_store_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB8_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB8_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB8_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_store_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB8_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s2, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_store_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB8_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 elements (byte offset 8)
@@ -1999,245 +757,83 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i32_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i32_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i32_store_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    andi s3, a1, 1
-; RV32I-NEXT:    addi a0, a0, 16
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_4
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    bnez s3, .LBB9_2
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB9_2
 ; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB9_2: # %entry
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_store_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 16
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_4
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    bnez s3, .LBB9_2
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lw a0, 16(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB9_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB9_2: # %entry
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_store_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    andi s3, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 16
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_4
-; RV32I-SFB-NEXT:    bnez s3, .LBB9_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB9_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB9_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_store_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 16
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_4
-; RV64I-SFB-NEXT:    bnez s3, .LBB9_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB9_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB9_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_store_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV32I-SFBILOAD-NEXT:    bnez s3, .LBB9_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_store_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB9_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4 elements (byte offset 16)
@@ -2247,218 +843,92 @@ entry:
   ret i32 %res
 }
 
-define i64 @test_i8_s_1_4(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i8_s_1_4(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i8_s_1_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s2, .LBB10_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 24
-; RV32I-NEXT:  .LBB10_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB10_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB10_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_1_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB10_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB10_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB10_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_1_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s2, .LBB10_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB10_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s1, a0, 24
+; RV32I-SFB-NEXT:    mv a2, a0
 ; RV32I-SFB-NEXT:  .LBB10_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB10_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB10_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB10_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_1_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s1, .LBB10_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB10_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB10_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_1_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB10_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s1, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB10_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB10_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB10_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_1_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB10_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB10_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB10_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -2468,212 +938,89 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i8_z_1_4(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i8_z_1_4(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i8_z_1_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s2, .LBB11_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB11_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB11_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_1_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB11_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB11_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB11_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_1_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s2, .LBB11_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB11_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB11_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB11_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB11_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    zext.b s1, a0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB11_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_1_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s1, .LBB11_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB11_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB11_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_1_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB11_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB11_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB11_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB11_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB11_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s1, a0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB11_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_1_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB11_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB11_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB11_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -2683,218 +1030,92 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_s_1_4(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i16_s_1_4(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i16_s_1_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s2, .LBB12_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 16
-; RV32I-NEXT:  .LBB12_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB12_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB12_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_1_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB12_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB12_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB12_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_1_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s2, .LBB12_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    beqz a1, .LBB12_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s1, a0, 16
+; RV32I-SFB-NEXT:    mv a2, a0
 ; RV32I-SFB-NEXT:  .LBB12_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB12_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB12_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB12_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_s_1_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB12_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB12_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB12_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_1_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB12_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s1, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB12_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB12_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB12_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_1_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB12_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB12_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB12_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
@@ -2904,218 +1125,89 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_z_1_4(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i16_z_1_4(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i16_z_1_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s2, .LBB13_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB13_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB13_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_1_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB13_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB13_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB13_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_1_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s2, .LBB13_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    bnez a1, .LBB13_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB13_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB13_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB13_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srli s1, a0, 16
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB13_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_1_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB13_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB13_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB13_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_1_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB13_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB13_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB13_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB13_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB13_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB13_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_1_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB13_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB13_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB13_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
@@ -3125,19 +1217,14 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i32_z_1_4(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i32_z_1_4(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i32_z_1_4:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a3
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    andi s2, a1, 1
@@ -3157,24 +1244,15 @@ define i64 @test_i32_z_1_4(ptr %base, i1 %x, i64 %b) {
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
 ; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_z_1_4:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
 ; RV64I-NEXT:    mv s0, a2
 ; RV64I-NEXT:    andi s1, a1, 1
 ; RV64I-NEXT:    addi a1, a0, 16
@@ -3190,25 +1268,16 @@ define i64 @test_i32_z_1_4(ptr %base, i1 %x, i64 %b) {
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
 ; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_z_1_4:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
 ; RV32I-SFB-NEXT:    mv s0, a3
 ; RV32I-SFB-NEXT:    mv s1, a2
 ; RV32I-SFB-NEXT:    andi s2, a1, 1
@@ -3231,24 +1300,15 @@ define i64 @test_i32_z_1_4(ptr %base, i1 %x, i64 %b) {
 ; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
 ; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_z_1_4:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
 ; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
 ; RV64I-SFB-NEXT:    mv s0, a2
 ; RV64I-SFB-NEXT:    andi s1, a1, 1
 ; RV64I-SFB-NEXT:    addi a1, a0, 16
@@ -3264,25 +1324,16 @@ define i64 @test_i32_z_1_4(ptr %base, i1 %x, i64 %b) {
 ; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
 ; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_z_1_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
 ; RV32I-SFBILOAD-NEXT:    mv s0, a3
 ; RV32I-SFBILOAD-NEXT:    mv s1, a2
 ; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
@@ -3305,24 +1356,15 @@ define i64 @test_i32_z_1_4(ptr %base, i1 %x, i64 %b) {
 ; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_z_1_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
 ; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
 ; RV64I-SFBILOAD-NEXT:    mv s0, a2
 ; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
@@ -3338,11 +1380,7 @@ define i64 @test_i32_z_1_4(ptr %base, i1 %x, i64 %b) {
 ; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
 ; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4
@@ -3352,19 +1390,14 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i64_1_4(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i64_1_4(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i64_1_4:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    .cfi_def_cfa_offset 16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a3
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    andi s2, a1, 1
@@ -3380,55 +1413,28 @@ define i64 @test_i64_1_4(ptr %base, i1 %x, i64 %b) {
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
 ; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i64_1_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a0, a0, 32
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_8
-; RV64I-NEXT:    bnez s1, .LBB15_2
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    ld a0, 32(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB15_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB15_2: # %entry
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i64_1_4:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 16
 ; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
 ; RV32I-SFB-NEXT:    mv s0, a3
 ; RV32I-SFB-NEXT:    mv s1, a2
 ; RV32I-SFB-NEXT:    andi s2, a1, 1
@@ -3447,55 +1453,28 @@ define i64 @test_i64_1_4(ptr %base, i1 %x, i64 %b) {
 ; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
 ; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i64_1_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 32
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_8
-; RV64I-SFB-NEXT:    bnez s1, .LBB15_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    bnez a1, .LBB15_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB15_2: # %entry
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i64_1_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 16
 ; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
 ; RV32I-SFBILOAD-NEXT:    mv s0, a3
 ; RV32I-SFBILOAD-NEXT:    mv s1, a2
 ; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
@@ -3514,41 +1493,19 @@ define i64 @test_i64_1_4(ptr %base, i1 %x, i64 %b) {
 ; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i64_1_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
-; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB15_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB15_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB15_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4
@@ -3557,302 +1514,101 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i8_s_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i8_s_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i8_s_store_64_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    beqz s5, .LBB16_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 24
-; RV32I-NEXT:  .LBB16_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB16_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB16_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_store_64_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB16_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB16_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB16_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_store_64_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s5, .LBB16_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    beqz a1, .LBB16_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s4, a0, 24
+; RV32I-SFB-NEXT:    mv a2, a0
 ; RV32I-SFB-NEXT:  .LBB16_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB16_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB16_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s3, a0, 31
+; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB16_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_store_64_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s3, .LBB16_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB16_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB16_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_store_64_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB16_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s4, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB16_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB16_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB16_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_store_64_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB16_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB16_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB16_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -3863,296 +1619,98 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i8_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i8_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i8_z_store_64_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    beqz s5, .LBB17_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB17_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB17_2: # %entry
-; RV32I-NEXT:    addi a1, s5, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_store_64_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB17_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB17_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB17_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_store_64_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s5, .LBB17_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    beqz a1, .LBB17_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB17_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB17_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB17_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    zext.b s4, a0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB17_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_store_64_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s3, .LBB17_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB17_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB17_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_store_64_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB17_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB17_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB17_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB17_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB17_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s4, a0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB17_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_store_64_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB17_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB17_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB17_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -4163,302 +1721,101 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_s_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i16_s_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i16_s_store_64_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    beqz s5, .LBB18_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 16
-; RV32I-NEXT:  .LBB18_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB18_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB18_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_store_64_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB18_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB18_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB18_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_store_64_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s5, .LBB18_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    beqz a1, .LBB18_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s4, a0, 16
-; RV32I-SFB-NEXT:  .LBB18_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB18_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s3, a0, 31
-; RV32I-SFB-NEXT:  .LBB18_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_store_64_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB18_2
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB18_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB18_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB18_4: # %entry
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_64_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB18_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB18_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_store_64_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB18_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s4, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB18_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB18_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB18_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_store_64_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB18_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB18_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB18_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
@@ -4469,302 +1826,98 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i16_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i16_z_store_64_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    beqz s5, .LBB19_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB19_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB19_2: # %entry
-; RV32I-NEXT:    addi a1, s5, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_store_64_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB19_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB19_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB19_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_store_64_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s5, .LBB19_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    beqz a1, .LBB19_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB19_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB19_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB19_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srli s4, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB19_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_store_64_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB19_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB19_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB19_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_store_64_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB19_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB19_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB19_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB19_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB19_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s4, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB19_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_store_64_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB19_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB19_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB19_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
@@ -4775,11 +1928,10 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i32_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i32_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i32_z_store_64_4:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -4787,13 +1939,6 @@ define i64 @test_i32_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
 ; RV32I-NEXT:    mv s2, a6
 ; RV32I-NEXT:    mv s3, a5
 ; RV32I-NEXT:    mv s4, a4
@@ -4821,31 +1966,17 @@ define i64 @test_i32_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
 ; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_z_store_64_4:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
 ; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
 ; RV64I-NEXT:    mv s1, a4
 ; RV64I-NEXT:    mv s2, a3
 ; RV64I-NEXT:    mv s0, a2
@@ -4866,19 +1997,12 @@ define i64 @test_i32_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
 ; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_z_store_64_4:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -4886,13 +2010,6 @@ define i64 @test_i32_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
 ; RV32I-SFB-NEXT:    mv s0, a6
 ; RV32I-SFB-NEXT:    mv s1, a5
 ; RV32I-SFB-NEXT:    mv s2, a4
@@ -4923,31 +2040,17 @@ define i64 @test_i32_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
 ; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_z_store_64_4:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
 ; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
 ; RV64I-SFB-NEXT:    mv s0, a4
 ; RV64I-SFB-NEXT:    mv s1, a3
 ; RV64I-SFB-NEXT:    mv s2, a2
@@ -4968,19 +2071,12 @@ define i64 @test_i32_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
 ; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
 ; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
 ; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -4988,13 +2084,6 @@ define i64 @test_i32_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
 ; RV32I-SFBILOAD-NEXT:    mv s0, a6
 ; RV32I-SFBILOAD-NEXT:    mv s1, a5
 ; RV32I-SFBILOAD-NEXT:    mv s2, a4
@@ -5025,31 +2114,17 @@ define i64 @test_i32_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
 ; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
 ; RV64I-SFBILOAD-NEXT:    mv s0, a4
 ; RV64I-SFBILOAD-NEXT:    mv s1, a3
 ; RV64I-SFBILOAD-NEXT:    mv s2, a2
@@ -5070,13 +2145,7 @@ define i64 @test_i32_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c)
 ; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
 ; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
 ; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
 ; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4
@@ -5087,11 +2156,10 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i64_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i64_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i64_store_64_4:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -5099,13 +2167,6 @@ define i64 @test_i64_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    .cfi_offset ra, -4
-; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    .cfi_offset s3, -20
-; RV32I-NEXT:    .cfi_offset s4, -24
-; RV32I-NEXT:    .cfi_offset s5, -28
 ; RV32I-NEXT:    mv s2, a6
 ; RV32I-NEXT:    mv s3, a5
 ; RV32I-NEXT:    mv s4, a4
@@ -5129,61 +2190,25 @@ define i64 @test_i64_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    .cfi_restore ra
-; RV32I-NEXT:    .cfi_restore s0
-; RV32I-NEXT:    .cfi_restore s1
-; RV32I-NEXT:    .cfi_restore s2
-; RV32I-NEXT:    .cfi_restore s3
-; RV32I-NEXT:    .cfi_restore s4
-; RV32I-NEXT:    .cfi_restore s5
 ; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i64_store_64_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    .cfi_offset ra, -8
-; RV64I-NEXT:    .cfi_offset s0, -16
-; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    .cfi_offset s3, -40
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a0, a0, 32
-; RV64I-NEXT:    li a1, 5
-; RV64I-NEXT:    call __atomic_load_8
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    bnez s3, .LBB21_2
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    ld a0, 32(a0)
+; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB21_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB21_2: # %entry
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    .cfi_restore ra
-; RV64I-NEXT:    .cfi_restore s0
-; RV64I-NEXT:    .cfi_restore s1
-; RV64I-NEXT:    .cfi_restore s2
-; RV64I-NEXT:    .cfi_restore s3
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i64_store_64_4:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -5191,13 +2216,6 @@ define i64 @test_i64_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    .cfi_offset ra, -4
-; RV32I-SFB-NEXT:    .cfi_offset s0, -8
-; RV32I-SFB-NEXT:    .cfi_offset s1, -12
-; RV32I-SFB-NEXT:    .cfi_offset s2, -16
-; RV32I-SFB-NEXT:    .cfi_offset s3, -20
-; RV32I-SFB-NEXT:    .cfi_offset s4, -24
-; RV32I-SFB-NEXT:    .cfi_offset s5, -28
 ; RV32I-SFB-NEXT:    mv s0, a6
 ; RV32I-SFB-NEXT:    mv s1, a5
 ; RV32I-SFB-NEXT:    mv s2, a4
@@ -5224,61 +2242,25 @@ define i64 @test_i64_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    .cfi_restore ra
-; RV32I-SFB-NEXT:    .cfi_restore s0
-; RV32I-SFB-NEXT:    .cfi_restore s1
-; RV32I-SFB-NEXT:    .cfi_restore s2
-; RV32I-SFB-NEXT:    .cfi_restore s3
-; RV32I-SFB-NEXT:    .cfi_restore s4
-; RV32I-SFB-NEXT:    .cfi_restore s5
 ; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i64_store_64_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    .cfi_offset ra, -8
-; RV64I-SFB-NEXT:    .cfi_offset s0, -16
-; RV64I-SFB-NEXT:    .cfi_offset s1, -24
-; RV64I-SFB-NEXT:    .cfi_offset s2, -32
-; RV64I-SFB-NEXT:    .cfi_offset s3, -40
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a0, a0, 32
-; RV64I-SFB-NEXT:    li a1, 5
-; RV64I-SFB-NEXT:    call __atomic_load_8
-; RV64I-SFB-NEXT:    bnez s3, .LBB21_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    andi a1, a1, 1
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB21_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB21_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    .cfi_restore ra
-; RV64I-SFB-NEXT:    .cfi_restore s0
-; RV64I-SFB-NEXT:    .cfi_restore s1
-; RV64I-SFB-NEXT:    .cfi_restore s2
-; RV64I-SFB-NEXT:    .cfi_restore s3
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i64_store_64_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 32
 ; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
@@ -5286,13 +2268,6 @@ define i64 @test_i64_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    .cfi_offset ra, -4
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s0, -8
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s1, -12
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s2, -16
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s3, -20
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s4, -24
-; RV32I-SFBILOAD-NEXT:    .cfi_offset s5, -28
 ; RV32I-SFBILOAD-NEXT:    mv s0, a6
 ; RV32I-SFBILOAD-NEXT:    mv s1, a5
 ; RV32I-SFBILOAD-NEXT:    mv s2, a4
@@ -5319,55 +2294,20 @@ define i64 @test_i64_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
 ; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s4
-; RV32I-SFBILOAD-NEXT:    .cfi_restore s5
 ; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i64_store_64_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    .cfi_offset ra, -8
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s0, -16
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s1, -24
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s2, -32
-; RV64I-SFBILOAD-NEXT:    .cfi_offset s3, -40
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
-; RV64I-SFBILOAD-NEXT:    li a1, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
-; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB21_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
+; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB21_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB21_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    .cfi_restore ra
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s0
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s1
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s2
-; RV64I-SFBILOAD-NEXT:    .cfi_restore s3
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll
index 37f7a3020b820..90899c690516a 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll
@@ -10,7 +10,7 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
 
-define i32 @test_i8_s_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) {
+define i32 @test_i8_s_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) nounwind {
 ; RV32I-LABEL: test_i8_s_volatile:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lb a4, 4(a0)
@@ -90,7 +90,7 @@ entry:
   ret i32 %res1
 }
 
-define i32 @test_i8_z_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) {
+define i32 @test_i8_z_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) nounwind {
 ; RV32I-LABEL: test_i8_z_volatile:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lbu a4, 4(a0)
@@ -170,7 +170,7 @@ entry:
   ret i32 %res1
 }
 
-define i32 @test_i16_s_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) {
+define i32 @test_i16_s_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) nounwind {
 ; RV32I-LABEL: test_i16_s_volatile:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lh a4, 8(a0)
@@ -250,7 +250,7 @@ entry:
   ret i32 %res1
 }
 
-define i32 @test_i16_z_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) {
+define i32 @test_i16_z_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) nounwind {
 ; RV32I-LABEL: test_i16_z_volatile:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lhu a4, 8(a0)
@@ -330,7 +330,7 @@ entry:
   ret i32 %res1
 }
 
-define i32 @test_i32_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) {
+define i32 @test_i32_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) nounwind {
 ; RV32I-LABEL: test_i32_volatile:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lw a4, 16(a0)
@@ -410,7 +410,7 @@ entry:
 }
 
 
-define i64 @test_i8_s_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) {
+define i64 @test_i8_s_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) nounwind {
 ; RV32I-LABEL: test_i8_s_1_volatile:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lb a6, 4(a0)
@@ -514,7 +514,7 @@ entry:
   ret i64 %res1
 }
 
-define i64 @test_i8_z_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) {
+define i64 @test_i8_z_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) nounwind {
 ; RV32I-LABEL: test_i8_z_1_volatile:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lbu a6, 4(a0)
@@ -614,7 +614,7 @@ entry:
   ret i64 %res1
 }
 
-define i64 @test_i16_s_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) {
+define i64 @test_i16_s_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) nounwind {
 ; RV32I-LABEL: test_i16_s_1_volatile:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lh a6, 8(a0)
@@ -718,7 +718,7 @@ entry:
   ret i64 %res1
 }
 
-define i64 @test_i16_z_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) {
+define i64 @test_i16_z_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) nounwind {
 ; RV32I-LABEL: test_i16_z_1_volatile:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lhu a6, 8(a0)
@@ -818,7 +818,7 @@ entry:
   ret i64 %res1
 }
 
-define i64 @test_i32_z_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) {
+define i64 @test_i32_z_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) nounwind {
 ; RV32I-LABEL: test_i32_z_1_volatile:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lw a6, 16(a0)
@@ -918,7 +918,7 @@ entry:
   ret i64 %res1
 }
 
-define i64 @test_i64_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) {
+define i64 @test_i64_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) nounwind {
 ; RV32I-LABEL: test_i64_1_volatile:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lw a7, 32(a0)
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
index 6c500468bb187..984e101e8a937 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
@@ -10,7 +10,7 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
 
-define i32 @test_i8_s(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i8_s(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i8_s:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    andi a1, a1, 1
@@ -78,7 +78,7 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i8_z(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i8_z(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i8_z:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    andi a1, a1, 1
@@ -146,7 +146,7 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_s(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i16_s(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i16_s:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    andi a1, a1, 1
@@ -214,7 +214,7 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_z(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i16_z(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i16_z:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    andi a1, a1, 1
@@ -282,7 +282,7 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i32(ptr %base, i1 %x, i32 %b) {
+define i32 @test_i32(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i32:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    andi a1, a1, 1
@@ -349,7 +349,7 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i8_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i8_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i8_s_store:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lb a0, 4(a0)
@@ -424,7 +424,7 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i8_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i8_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i8_z_store:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lbu a0, 4(a0)
@@ -499,7 +499,7 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i16_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i16_s_store:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lh a0, 8(a0)
@@ -574,7 +574,7 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i16_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i16_z_store:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lhu a0, 8(a0)
@@ -649,7 +649,7 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i32_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) {
+define i32 @test_i32_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i32_store:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lw a0, 16(a0)
@@ -723,7 +723,7 @@ entry:
   ret i32 %res
 }
 
-define i64 @test_i8_s_1(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i8_s_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i8_s_1:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    andi a1, a1, 1
@@ -805,7 +805,7 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i8_z_1(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i8_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i8_z_1:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    andi a1, a1, 1
@@ -885,7 +885,7 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_s_1(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i16_s_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i16_s_1:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    andi a1, a1, 1
@@ -967,7 +967,7 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_z_1(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i16_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i16_z_1:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    andi a1, a1, 1
@@ -1047,7 +1047,7 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i32_z_1(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i32_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i32_z_1:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    andi a1, a1, 1
@@ -1127,7 +1127,7 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) {
+define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i64_1:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    andi a1, a1, 1
@@ -1208,7 +1208,7 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i8_s_store_64:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lb a0, 4(a0)
@@ -1301,7 +1301,7 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i8_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i8_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i8_z_store_64:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lbu a0, 4(a0)
@@ -1391,7 +1391,7 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i16_s_store_64:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lh a0, 8(a0)
@@ -1484,7 +1484,7 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i16_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i16_z_store_64:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lhu a0, 8(a0)
@@ -1574,7 +1574,7 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i32_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i32_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i32_z_store_64:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lw a0, 16(a0)
@@ -1664,7 +1664,7 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) {
+define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i64_store_64:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    mv a7, a1

From f959b128506a1c6ef0d04c7e993ce6720a47f4eb Mon Sep 17 00:00:00 2001
From: Harsh Chandel <hchandel at qti.qualcomm.com>
Date: Mon, 8 Dec 2025 11:20:54 +0530
Subject: [PATCH 08/11] fixup! Address comments

Change-Id: I80edab151862240167562e535385b207b811e546
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |    3 +-
 ...-branch-opt-load-atomic-acquire-seq_cst.ll | 4284 ++++++++++++++
 ...-forward-branch-opt-load-atomic-acquire.ll | 2205 -------
 ...orward-branch-opt-load-atomic-monotonic.ll | 2091 -------
 ...-forward-branch-opt-load-atomic-seq_cst.ll | 2319 --------
 .../short-forward-branch-opt-load-volatile.ll | 1022 ----
 .../RISCV/short-forward-branch-opt-load.ll    | 5115 ++++++++++++++++-
 7 files changed, 9218 insertions(+), 7821 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire-seq_cst.ll
 delete mode 100644 llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll
 delete mode 100644 llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll
 delete mode 100644 llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll
 delete mode 100644 llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 1940d36af4dcd..11688476a2554 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -934,8 +934,7 @@ MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
     return nullptr;
 
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  bool Invert =
-      (MRI.getVRegDef(MI.getOperand(4).getReg()) == &LoadMI) ? true : false;
+  bool Invert = MRI.getVRegDef(MI.getOperand(4).getReg()) == &LoadMI;
   MachineOperand FalseReg = MI.getOperand(Invert ? 5 : 4);
   Register DestReg = MI.getOperand(0).getReg();
   const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire-seq_cst.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire-seq_cst.ll
new file mode 100644
index 0000000000000..d4e418ebb8fd3
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire-seq_cst.ll
@@ -0,0 +1,4284 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a | FileCheck %s --check-prefixes=RV32I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a | FileCheck %s --check-prefixes=RV64I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-ialu | \
+; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-ialu | \
+; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-iload | \
+; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-iload | \
+; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
+
+define i32 @test_i8_s_3(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i8_s_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB0_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB0_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB0_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB0_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB0_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB0_2: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB0_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB0_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB0_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB0_2: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB0_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB0_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i8_z_3(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i8_z_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB1_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB1_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB1_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB1_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB1_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB1_2: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB1_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB1_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB1_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB1_2: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB1_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB1_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_s_3(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i16_s_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB2_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB2_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB2_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB2_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB2_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB2_2: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB2_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB2_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB2_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB2_2: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB2_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB2_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_z_3(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i16_z_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB3_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB3_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB3_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB3_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB3_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB3_2: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB3_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB3_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB3_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB3_2: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB3_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB3_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i32_3(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i32_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB4_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB4_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lw a0, 16(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB4_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB4_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB4_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB4_2: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB4_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB4_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB4_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB4_2: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB4_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB4_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4 elements (byte offset 16)
+  %val = load atomic i32, ptr %addr acquire, align 4          ; load 32-bit value
+  %res = select i1 %x, i32 %val, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i8_s_store_3(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i8_s_store_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB5_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB5_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_store_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB5_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB5_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_store_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB5_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB5_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_store_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB5_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB5_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_store_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_store_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
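+; With an unrelated store between the load and the select, the SFB variants
+; keep the acquire fence next to the load and sink the `sw` below the
+; conditional-move sequence instead.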
+
+define i32 @test_i8_z_store_3(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i8_z_store_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB6_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB6_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_store_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB6_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB6_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_store_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB6_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB6_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_store_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB6_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB6_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_store_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_store_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_s_store_3(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i16_s_store_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB7_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB7_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_store_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB7_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB7_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_store_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB7_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB7_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB7_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB7_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_store_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_store_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_z_store_3(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i16_z_store_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB8_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB8_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_store_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB8_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB8_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_store_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB8_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB8_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_store_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB8_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB8_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_store_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_store_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i32_store_3(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i32_store_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB9_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB9_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_store_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lw a0, 16(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB9_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB9_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_store_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB9_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB9_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_store_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB9_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB9_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_store_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_store_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4 elements (byte offset 16)
+  %val = load atomic i32, ptr %addr acquire, align 4          ; load 32-bit value
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %val, i32 %b
+  ret i32 %res
+}
+
+define i64 @test_i8_s_1_3(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i8_s_1_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB10_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB10_2:
+; RV32I-NEXT:    srai a1, a0, 31
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_1_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB10_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB10_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_1_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB10_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB10_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB10_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB10_4: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_1_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB10_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB10_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_1_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB10_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB10_4: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_1_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB10_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB10_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
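+; A 64-bit result on RV32 takes two conditional moves: one for the low word
+; and one for the high word, which is the sign extension produced by
+; `srai a3, a0, 31`.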
+
+define i64 @test_i8_z_1_3(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i8_z_1_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB11_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB11_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_1_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB11_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB11_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_1_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB11_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:  .LBB11_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB11_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB11_4: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_1_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB11_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB11_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_1_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB11_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB11_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB11_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB11_4: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_1_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB11_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB11_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
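+; The zero-extending i64 variant on RV32 instead clears the high word with
+; `li a3, 0` when the loaded value is selected.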
+
+define i64 @test_i16_s_1_3(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i16_s_1_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB12_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB12_2:
+; RV32I-NEXT:    srai a1, a0, 31
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_1_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB12_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB12_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_1_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB12_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB12_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB12_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB12_4: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_1_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB12_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB12_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_1_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB12_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB12_4: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_1_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB12_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB12_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_z_1_3(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i16_z_1_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB13_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB13_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_1_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB13_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB13_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_1_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB13_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:  .LBB13_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB13_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB13_4: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_1_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB13_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB13_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_1_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB13_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB13_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB13_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB13_4: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_1_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB13_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB13_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i32_z_1_3(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i32_z_1_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    addi a1, a0, 16
+; RV32I-NEXT:    li a0, 4
+; RV32I-NEXT:    addi a2, sp, 12
+; RV32I-NEXT:    li a3, 2
+; RV32I-NEXT:    call __atomic_load
+; RV32I-NEXT:    beqz s2, .LBB14_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lw s1, 12(sp)
+; RV32I-NEXT:  .LBB14_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_1_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    addi a1, a0, 16
+; RV64I-NEXT:    li a0, 4
+; RV64I-NEXT:    addi a2, sp, 4
+; RV64I-NEXT:    li a3, 2
+; RV64I-NEXT:    call __atomic_load
+; RV64I-NEXT:    beqz s1, .LBB14_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lwu s0, 4(sp)
+; RV64I-NEXT:  .LBB14_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_1_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    mv s2, a1
+; RV32I-SFB-NEXT:    addi a1, a0, 16
+; RV32I-SFB-NEXT:    li a0, 4
+; RV32I-SFB-NEXT:    addi a2, sp, 12
+; RV32I-SFB-NEXT:    li a3, 2
+; RV32I-SFB-NEXT:    call __atomic_load
+; RV32I-SFB-NEXT:    lw a0, 12(sp)
+; RV32I-SFB-NEXT:    bnez s2, .LBB14_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:  .LBB14_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB14_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:  .LBB14_4: # %entry
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_1_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    mv s1, a1
+; RV64I-SFB-NEXT:    addi a1, a0, 16
+; RV64I-SFB-NEXT:    li a0, 4
+; RV64I-SFB-NEXT:    addi a2, sp, 4
+; RV64I-SFB-NEXT:    li a3, 2
+; RV64I-SFB-NEXT:    call __atomic_load
+; RV64I-SFB-NEXT:    lwu a0, 4(sp)
+; RV64I-SFB-NEXT:    bnez s1, .LBB14_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:  .LBB14_2: # %entry
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_1_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    mv s2, a1
+; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a0, 4
+; RV32I-SFBILOAD-NEXT:    addi a2, sp, 12
+; RV32I-SFBILOAD-NEXT:    li a3, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw s1, 12(sp)
+; RV32I-SFBILOAD-NEXT:  .LBB14_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:  .LBB14_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_1_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    mv s1, a1
+; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a0, 4
+; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
+; RV64I-SFBILOAD-NEXT:    li a3, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB14_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lwu s0, 4(sp)
+; RV64I-SFBILOAD-NEXT:  .LBB14_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4 elements (byte offset 16)
+  %val = load atomic i32, ptr %addr acquire, align 2          ; under-aligned 32-bit atomic load
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
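+; The atomic i32 load above is under-aligned (align 2), so it is expanded to
+; a `__atomic_load` libcall: a0 = size (4), a1 = source address, a2 = stack
+; buffer for the result, a3 = memory order (2 = acquire). Only the plain
+; reload from the stack buffer remains a short-forward-branch candidate.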
+
+define i64 @test_i64_1_3(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i64_1_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_8
+; RV32I-NEXT:    bnez s2, .LBB15_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:  .LBB15_2: # %entry
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_1_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    ld a0, 32(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB15_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB15_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_1_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    mv s2, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 32
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_8
+; RV32I-SFB-NEXT:    bnez s2, .LBB15_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:  .LBB15_2: # %entry
+; RV32I-SFB-NEXT:    bnez s2, .LBB15_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:  .LBB15_4: # %entry
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_1_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB15_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB15_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_1_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    mv s2, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB15_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:  .LBB15_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB15_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:  .LBB15_4: # %entry
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_1_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB15_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB15_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4 elements (byte offset 32)
+  %val = load atomic i64, ptr %addr acquire, align 8          ; load 64-bit value
+  %res = select i1 %x, i64 %val, i64 %b
+  ret i64 %res
+}
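+; A naturally aligned atomic i64 load requires the `__atomic_load_8` libcall
+; on RV32 (result returned in a0/a1), while RV64 uses a plain `ld` followed
+; by the acquire fence.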
+
+define i64 @test_i8_s_store_64_3(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i8_s_store_64_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB16_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB16_2:
+; RV32I-NEXT:    srai a1, a0, 31
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_store_64_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB16_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB16_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_store_64_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    beqz a1, .LBB16_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB16_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB16_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB16_4: # %entry
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_store_64_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB16_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB16_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_store_64_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB16_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB16_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_store_64_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB16_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB16_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
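+; On RV32 the two `sw` halves of the i64 store straddle the conditional-move
+; sequence in the SFB outputs: the low word is stored before the branches and
+; the high word after them.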
+
+define i64 @test_i8_z_store_64_3(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i8_z_store_64_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB17_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB17_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_store_64_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB17_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB17_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_store_64_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    beqz a1, .LBB17_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:  .LBB17_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB17_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB17_4: # %entry
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_store_64_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB17_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB17_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_store_64_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB17_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB17_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB17_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB17_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_store_64_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB17_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB17_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_s_store_64_3(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i16_s_store_64_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB18_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB18_2:
+; RV32I-NEXT:    srai a1, a0, 31
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_store_64_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB18_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB18_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_store_64_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    beqz a1, .LBB18_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB18_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB18_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB18_4: # %entry
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_64_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB18_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB18_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_store_64_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB18_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB18_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_store_64_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB18_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB18_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_z_store_64_3(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i16_z_store_64_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB19_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB19_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_store_64_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB19_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB19_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_store_64_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    beqz a1, .LBB19_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:  .LBB19_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB19_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB19_4: # %entry
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_store_64_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB19_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB19_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_store_64_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB19_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB19_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB19_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB19_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_store_64_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB19_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB19_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i32_z_store_64_3(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i32_z_store_64_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s3, a6
+; RV32I-NEXT:    mv s4, a5
+; RV32I-NEXT:    mv s5, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s2, a2
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    addi a1, a0, 16
+; RV32I-NEXT:    li a0, 4
+; RV32I-NEXT:    mv a2, sp
+; RV32I-NEXT:    li a3, 2
+; RV32I-NEXT:    call __atomic_load
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    sw s4, 0(s5)
+; RV32I-NEXT:    sw s3, 4(s5)
+; RV32I-NEXT:    bnez s1, .LBB20_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:  .LBB20_2: # %entry
+; RV32I-NEXT:    addi a1, s1, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_store_64_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s3, a1
+; RV64I-NEXT:    addi a1, a0, 16
+; RV64I-NEXT:    li a0, 4
+; RV64I-NEXT:    addi a2, sp, 4
+; RV64I-NEXT:    li a3, 2
+; RV64I-NEXT:    call __atomic_load
+; RV64I-NEXT:    lwu a0, 4(sp)
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    bnez s3, .LBB20_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB20_2: # %entry
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_store_64_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    mv s5, a1
+; RV32I-SFB-NEXT:    addi a1, a0, 16
+; RV32I-SFB-NEXT:    li a0, 4
+; RV32I-SFB-NEXT:    mv a2, sp
+; RV32I-SFB-NEXT:    li a3, 2
+; RV32I-SFB-NEXT:    call __atomic_load
+; RV32I-SFB-NEXT:    lw a0, 0(sp)
+; RV32I-SFB-NEXT:    beqz s5, .LBB20_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:  .LBB20_2: # %entry
+; RV32I-SFB-NEXT:    bnez s5, .LBB20_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:  .LBB20_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_store_64_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    mv s3, a1
+; RV64I-SFB-NEXT:    addi a1, a0, 16
+; RV64I-SFB-NEXT:    li a0, 4
+; RV64I-SFB-NEXT:    addi a2, sp, 4
+; RV64I-SFB-NEXT:    li a3, 2
+; RV64I-SFB-NEXT:    call __atomic_load
+; RV64I-SFB-NEXT:    lwu a0, 4(sp)
+; RV64I-SFB-NEXT:    bnez s3, .LBB20_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:  .LBB20_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    mv s5, a1
+; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a0, 4
+; RV32I-SFBILOAD-NEXT:    mv a2, sp
+; RV32I-SFBILOAD-NEXT:    li a3, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load
+; RV32I-SFBILOAD-NEXT:    lw a0, 0(sp)
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB20_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB20_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB20_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:  .LBB20_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    mv s3, a1
+; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a0, 4
+; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
+; RV64I-SFBILOAD-NEXT:    li a3, 2
+; RV64I-SFBILOAD-NEXT:    call __atomic_load
+; RV64I-SFBILOAD-NEXT:    lwu a0, 4(sp)
+; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB20_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:  .LBB20_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; address of element 4 (base + 16 bytes)
+  %val = load atomic i32, ptr %addr acquire, align 2          ; under-aligned 32-bit atomic load; expands to an __atomic_load libcall
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i64_store_64_3(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i64_store_64_3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s5, a1
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 2
+; RV32I-NEXT:    call __atomic_load_8
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    bnez s5, .LBB21_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:  .LBB21_2: # %entry
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_store_64_3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    ld a0, 32(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB21_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB21_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_store_64_3:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    mv s5, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 32
+; RV32I-SFB-NEXT:    li a1, 2
+; RV32I-SFB-NEXT:    call __atomic_load_8
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    bnez s5, .LBB21_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:  .LBB21_2: # %entry
+; RV32I-SFB-NEXT:    bnez s5, .LBB21_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:  .LBB21_4: # %entry
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_store_64_3:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB21_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB21_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_store_64_3:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    mv s5, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV32I-SFBILOAD-NEXT:    li a1, 2
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB21_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:  .LBB21_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB21_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:  .LBB21_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_store_64_3:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB21_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB21_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; address of element 4 (base + 32 bytes)
+  %val = load atomic i64, ptr %addr acquire, align 8          ; load 64-bit value; RV32 expands this to an __atomic_load_8 libcall
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %val, i64 %b
+  ret i64 %res
+}
+
+define i32 @test_i8_s_4(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i8_s_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB22_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB22_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB22_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB22_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB22_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB22_2: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB22_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB22_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB22_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB22_2: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB22_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB22_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; address of element 4 (base + 4 bytes)
+  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i8_z_4(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i8_z_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB23_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB23_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB23_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB23_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB23_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB23_2: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB23_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB23_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB23_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB23_2: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB23_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB23_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; address of element 4 (base + 4 bytes)
+  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_s_4(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i16_s_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB24_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB24_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB24_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB24_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB24_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB24_2: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB24_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB24_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB24_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB24_2: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB24_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB24_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_z_4(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i16_z_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB25_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB25_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB25_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB25_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB25_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB25_2: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB25_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB25_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB25_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB25_2: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB25_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB25_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i32_4(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i32_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB26_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB26_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lw a0, 16(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB26_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB26_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB26_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB26_2: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB26_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB26_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB26_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB26_2: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB26_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB26_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4   ; address of element 4 (base + 16 bytes)
+  %val = load atomic i32, ptr %addr seq_cst, align 4          ; load 32-bit value
+  %res = select i1 %x, i32 %val, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i8_s_store_4(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i8_s_store_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB27_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB27_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_store_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB27_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB27_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_store_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB27_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB27_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_store_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB27_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB27_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_store_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB27_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB27_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_store_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB27_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB27_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; address of element 4 (base + 4 bytes)
+  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i8_z_store_4(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i8_z_store_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB28_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB28_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_store_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB28_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB28_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_store_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB28_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB28_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_store_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB28_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB28_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_store_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB28_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB28_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_store_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB28_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB28_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; address of element 4 (base + 4 bytes)
+  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_s_store_4(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i16_s_store_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB29_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB29_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_store_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB29_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB29_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_store_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB29_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB29_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB29_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB29_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_store_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB29_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB29_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_store_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB29_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB29_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_z_store_4(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i16_z_store_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB30_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB30_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_store_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB30_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB30_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_store_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB30_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB30_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_store_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB30_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB30_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_store_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB30_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB30_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_store_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB30_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB30_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i32_store_4(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i32_store_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB31_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB31_2: # %entry
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_store_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lw a0, 16(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB31_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB31_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_store_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    bnez a1, .LBB31_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB31_2: # %entry
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_store_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB31_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB31_2: # %entry
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_store_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB31_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB31_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_store_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB31_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB31_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4   ; address of element 4 (base + 16 bytes)
+  %val = load atomic i32, ptr %addr seq_cst, align 4          ; load 32-bit value
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %val, i32 %b
+  ret i32 %res
+}
+
+define i64 @test_i8_s_1_4(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i8_s_1_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB32_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB32_2:
+; RV32I-NEXT:    srai a1, a0, 31
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_1_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB32_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB32_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_1_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB32_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB32_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB32_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB32_4: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_1_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB32_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB32_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_1_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB32_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB32_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB32_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB32_4: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_1_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB32_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB32_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; address of element 4 (base + 4 bytes)
+  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i8_z_1_4(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i8_z_1_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB33_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB33_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_1_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB33_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB33_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_1_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB33_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB33_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB33_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:  .LBB33_4: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_1_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB33_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB33_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_1_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB33_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB33_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB33_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB33_4: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_1_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB33_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB33_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; address of element 4 (base + 4 bytes)
+  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_s_1_4(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i16_s_1_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB34_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB34_2:
+; RV32I-NEXT:    srai a1, a0, 31
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_1_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB34_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB34_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_1_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB34_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB34_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB34_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB34_4: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_1_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB34_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB34_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_1_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB34_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB34_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB34_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB34_4: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_1_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB34_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB34_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_z_1_4(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i16_z_1_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB35_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB35_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_1_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB35_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB35_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_1_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB35_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB35_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB35_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:  .LBB35_4: # %entry
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_1_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB35_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB35_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_1_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB35_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB35_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB35_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB35_4: # %entry
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_1_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB35_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB35_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i32_z_1_4(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i32_z_1_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    addi a1, a0, 16
+; RV32I-NEXT:    li a0, 4
+; RV32I-NEXT:    addi a2, sp, 12
+; RV32I-NEXT:    li a3, 5
+; RV32I-NEXT:    call __atomic_load
+; RV32I-NEXT:    beqz s2, .LBB36_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lw s1, 12(sp)
+; RV32I-NEXT:  .LBB36_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_1_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    addi a1, a0, 16
+; RV64I-NEXT:    li a0, 4
+; RV64I-NEXT:    addi a2, sp, 4
+; RV64I-NEXT:    li a3, 5
+; RV64I-NEXT:    call __atomic_load
+; RV64I-NEXT:    beqz s1, .LBB36_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lwu s0, 4(sp)
+; RV64I-NEXT:  .LBB36_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_1_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    mv s2, a1
+; RV32I-SFB-NEXT:    addi a1, a0, 16
+; RV32I-SFB-NEXT:    li a0, 4
+; RV32I-SFB-NEXT:    addi a2, sp, 12
+; RV32I-SFB-NEXT:    li a3, 5
+; RV32I-SFB-NEXT:    call __atomic_load
+; RV32I-SFB-NEXT:    lw a0, 12(sp)
+; RV32I-SFB-NEXT:    bnez s2, .LBB36_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:  .LBB36_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB36_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:  .LBB36_4: # %entry
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_1_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    mv s1, a1
+; RV64I-SFB-NEXT:    addi a1, a0, 16
+; RV64I-SFB-NEXT:    li a0, 4
+; RV64I-SFB-NEXT:    addi a2, sp, 4
+; RV64I-SFB-NEXT:    li a3, 5
+; RV64I-SFB-NEXT:    call __atomic_load
+; RV64I-SFB-NEXT:    lwu a0, 4(sp)
+; RV64I-SFB-NEXT:    bnez s1, .LBB36_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:  .LBB36_2: # %entry
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_1_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    mv s2, a1
+; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a0, 4
+; RV32I-SFBILOAD-NEXT:    addi a2, sp, 12
+; RV32I-SFBILOAD-NEXT:    li a3, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB36_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw s1, 12(sp)
+; RV32I-SFBILOAD-NEXT:  .LBB36_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB36_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:  .LBB36_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_1_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    mv s1, a1
+; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a0, 4
+; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
+; RV64I-SFBILOAD-NEXT:    li a3, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB36_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lwu s0, 4(sp)
+; RV64I-SFBILOAD-NEXT:  .LBB36_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 16 (element 4)
+  %val = load atomic i32, ptr %addr seq_cst, align 2          ; under-aligned 32-bit atomic load; lowers to an __atomic_load libcall
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i64_1_4(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i64_1_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_8
+; RV32I-NEXT:    bnez s2, .LBB37_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:  .LBB37_2: # %entry
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_1_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    ld a0, 32(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB37_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB37_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_1_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    mv s2, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 32
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_8
+; RV32I-SFB-NEXT:    bnez s2, .LBB37_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:  .LBB37_2: # %entry
+; RV32I-SFB-NEXT:    bnez s2, .LBB37_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:  .LBB37_4: # %entry
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_1_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB37_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB37_2: # %entry
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_1_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    mv s2, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB37_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:  .LBB37_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB37_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:  .LBB37_4: # %entry
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_1_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB37_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB37_2: # %entry
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 32 (element 4)
+  %val = load atomic i64, ptr %addr seq_cst, align 8          ; load 64-bit value
+  %res = select i1 %x, i64 %val, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i8_s_store_64_4(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i8_s_store_64_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB38_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB38_2:
+; RV32I-NEXT:    srai a1, a0, 31
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_store_64_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB38_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB38_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_store_64_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    beqz a1, .LBB38_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB38_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB38_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB38_4: # %entry
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_store_64_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB38_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB38_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_store_64_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB38_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB38_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB38_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB38_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_store_64_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB38_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB38_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4 (element 4)
+  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i8_z_store_64_4(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i8_z_store_64_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB39_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB39_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_store_64_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB39_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB39_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_store_64_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    beqz a1, .LBB39_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:  .LBB39_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB39_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB39_4: # %entry
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_store_64_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB39_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB39_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_store_64_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB39_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB39_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB39_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB39_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_store_64_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB39_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB39_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4 (element 4)
+  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_s_store_64_4(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i16_s_store_64_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB40_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB40_2:
+; RV32I-NEXT:    srai a1, a0, 31
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_store_64_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB40_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB40_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_store_64_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    beqz a1, .LBB40_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB40_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB40_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB40_4: # %entry
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_64_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB40_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB40_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_store_64_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB40_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB40_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB40_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB40_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_store_64_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB40_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB40_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 8 (element 4)
+  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_z_store_64_4(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i16_z_store_64_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB41_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:  .LBB41_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_store_64_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB41_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB41_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_store_64_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    beqz a1, .LBB41_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:  .LBB41_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB41_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB41_4: # %entry
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_store_64_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB41_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB41_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_store_64_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB41_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB41_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB41_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:  .LBB41_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_store_64_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB41_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB41_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 8 (element 4)
+  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i32_z_store_64_4(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i32_z_store_64_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s3, a6
+; RV32I-NEXT:    mv s4, a5
+; RV32I-NEXT:    mv s5, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s2, a2
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    addi a1, a0, 16
+; RV32I-NEXT:    li a0, 4
+; RV32I-NEXT:    mv a2, sp
+; RV32I-NEXT:    li a3, 5
+; RV32I-NEXT:    call __atomic_load
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    sw s4, 0(s5)
+; RV32I-NEXT:    sw s3, 4(s5)
+; RV32I-NEXT:    bnez s1, .LBB42_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:  .LBB42_2: # %entry
+; RV32I-NEXT:    addi a1, s1, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_store_64_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s3, a1
+; RV64I-NEXT:    addi a1, a0, 16
+; RV64I-NEXT:    li a0, 4
+; RV64I-NEXT:    addi a2, sp, 4
+; RV64I-NEXT:    li a3, 5
+; RV64I-NEXT:    call __atomic_load
+; RV64I-NEXT:    lwu a0, 4(sp)
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    bnez s3, .LBB42_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB42_2: # %entry
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_store_64_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    mv s5, a1
+; RV32I-SFB-NEXT:    addi a1, a0, 16
+; RV32I-SFB-NEXT:    li a0, 4
+; RV32I-SFB-NEXT:    mv a2, sp
+; RV32I-SFB-NEXT:    li a3, 5
+; RV32I-SFB-NEXT:    call __atomic_load
+; RV32I-SFB-NEXT:    lw a0, 0(sp)
+; RV32I-SFB-NEXT:    beqz s5, .LBB42_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:  .LBB42_2: # %entry
+; RV32I-SFB-NEXT:    bnez s5, .LBB42_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:  .LBB42_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_store_64_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    mv s3, a1
+; RV64I-SFB-NEXT:    addi a1, a0, 16
+; RV64I-SFB-NEXT:    li a0, 4
+; RV64I-SFB-NEXT:    addi a2, sp, 4
+; RV64I-SFB-NEXT:    li a3, 5
+; RV64I-SFB-NEXT:    call __atomic_load
+; RV64I-SFB-NEXT:    lwu a0, 4(sp)
+; RV64I-SFB-NEXT:    bnez s3, .LBB42_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:  .LBB42_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    mv s5, a1
+; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a0, 4
+; RV32I-SFBILOAD-NEXT:    mv a2, sp
+; RV32I-SFBILOAD-NEXT:    li a3, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load
+; RV32I-SFBILOAD-NEXT:    lw a0, 0(sp)
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB42_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB42_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB42_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:  .LBB42_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    mv s3, a1
+; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a0, 4
+; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
+; RV64I-SFBILOAD-NEXT:    li a3, 5
+; RV64I-SFBILOAD-NEXT:    call __atomic_load
+; RV64I-SFBILOAD-NEXT:    lwu a0, 4(sp)
+; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB42_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:  .LBB42_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 16 (element 4)
+  %val = load atomic i32, ptr %addr seq_cst, align 2          ; under-aligned 32-bit atomic load; lowers to an __atomic_load libcall
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i64_store_64_4(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i64_store_64_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s5, a1
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 5
+; RV32I-NEXT:    call __atomic_load_8
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    bnez s5, .LBB43_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:  .LBB43_2: # %entry
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_store_64_4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    ld a0, 32(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB43_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:  .LBB43_2: # %entry
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_store_64_4:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    mv s5, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 32
+; RV32I-SFB-NEXT:    li a1, 5
+; RV32I-SFB-NEXT:    call __atomic_load_8
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    bnez s5, .LBB43_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:  .LBB43_2: # %entry
+; RV32I-SFB-NEXT:    bnez s5, .LBB43_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:  .LBB43_4: # %entry
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_store_64_4:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB43_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB43_2: # %entry
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_store_64_4:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    mv s5, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV32I-SFBILOAD-NEXT:    li a1, 5
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB43_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:  .LBB43_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB43_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:  .LBB43_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_store_64_4:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB43_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
+; RV64I-SFBILOAD-NEXT:  .LBB43_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 32 (element 4)
+  %val = load atomic i64, ptr %addr seq_cst, align 8          ; load 64-bit value
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %val, i64 %b
+  ret i64 %res
+}
+
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll
deleted file mode 100644
index 1ba01ac5225d3..0000000000000
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire.ll
+++ /dev/null
@@ -1,2205 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a | FileCheck %s --check-prefixes=RV32I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a | FileCheck %s --check-prefixes=RV64I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-ialu | \
-; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-ialu | \
-; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-iload | \
-; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-iload | \
-; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
-
-define i32 @test_i8_s_3(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i8_s_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lb a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB0_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB0_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_s_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lb a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB0_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB0_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_s_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB0_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB0_2: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_s_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB0_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB0_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_s_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB0_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB0_2: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_s_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB0_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB0_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
-  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i8_z_3(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i8_z_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lbu a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB1_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB1_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_z_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lbu a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB1_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB1_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_z_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB1_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB1_2: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_z_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB1_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB1_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_z_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB1_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB1_2: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_z_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB1_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB1_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
-  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i16_s_3(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i16_s_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lh a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB2_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB2_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_s_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lh a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB2_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB2_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_s_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB2_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB2_2: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB2_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB2_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_s_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB2_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB2_2: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_s_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB2_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB2_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
-  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i16_z_3(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i16_z_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lhu a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB3_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB3_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_z_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lhu a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB3_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB3_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_z_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB3_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB3_2: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_z_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB3_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB3_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_z_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB3_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB3_2: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_z_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB3_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB3_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
-  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i32_3(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i32_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lw a0, 16(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB4_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB4_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i32_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lw a0, 16(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB4_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB4_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i32_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lw a0, 16(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB4_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB4_2: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i32_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lw a0, 16(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB4_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB4_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i32_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB4_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB4_2: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i32_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB4_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB4_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i32, ptr %addr acquire, align 4          ; load 32-bit value
-  %res = select i1 %x, i32 %val, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i8_s_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i8_s_store_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lb a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB5_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB5_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_s_store_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lb a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB5_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB5_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_s_store_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    bnez a1, .LBB5_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB5_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_s_store_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB5_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB5_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_s_store_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_s_store_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
-  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i8_z_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i8_z_store_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lbu a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB6_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB6_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_z_store_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lbu a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB6_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB6_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_z_store_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    bnez a1, .LBB6_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB6_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_z_store_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB6_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB6_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_z_store_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_z_store_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
-  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i16_s_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i16_s_store_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lh a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB7_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB7_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_s_store_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lh a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB7_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB7_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_s_store_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    bnez a1, .LBB7_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB7_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_store_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB7_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB7_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_s_store_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_s_store_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 halfwords (byte offset 8)
-  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
-  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i16_z_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i16_z_store_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lhu a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB8_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB8_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_z_store_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lhu a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB8_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB8_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_z_store_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    bnez a1, .LBB8_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB8_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_z_store_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB8_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB8_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_z_store_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_z_store_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 halfwords (byte offset 8)
-  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
-  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i32_store_3(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i32_store_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lw a0, 16(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB9_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB9_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i32_store_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lw a0, 16(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB9_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB9_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i32_store_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lw a0, 16(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    bnez a1, .LBB9_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB9_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i32_store_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lw a0, 16(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB9_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB9_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i32_store_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i32_store_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4 words (byte offset 16)
-  %val = load atomic i32, ptr %addr acquire, align 4          ; load 32-bit value
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %val, i32 %b
-  ret i32 %res
-}
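
Note: the aligned acquire loads in these tests lower to a plain load
followed by fence r, rw, the usual RISC-V mapping for an acquire load,
which is why a fence r, rw follows the load in every run. A minimal
sketch, outside the test file:

define i32 @acquire_load_sketch(ptr %p) nounwind {
entry:
  ; Expected lowering: lw a0, 0(a0) followed by fence r, rw.
  %v = load atomic i32, ptr %p acquire, align 4
  ret i32 %v
}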
-
-define i64 @test_i8_s_1_3(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i8_s_1_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lb a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB10_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    mv a1, a3
-; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB10_2:
-; RV32I-NEXT:    srai a1, a0, 31
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_s_1_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lb a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB10_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB10_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_s_1_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB10_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB10_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB10_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB10_4: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_s_1_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB10_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB10_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_s_1_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB10_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB10_4: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_s_1_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB10_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB10_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
-  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i8_z_1_3(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i8_z_1_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lbu a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB11_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB11_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_z_1_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lbu a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB11_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB11_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_z_1_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB11_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB11_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB11_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB11_4: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_z_1_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB11_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB11_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_z_1_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB11_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB11_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB11_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB11_4: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_z_1_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB11_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB11_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
-  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
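
Note: on RV32 an i64-valued select is split into two 32-bit selects. For
the zero-extended cases the high half selects between 0 and the upper
word of %b, which the plain RV32I run folds branchlessly (addi a1, a1, -1
then and a1, a1, a3) and the SFB runs express as a second short branch
over a li ..., 0. A sketch of the pattern, with illustrative names:

define i64 @zext_select_sketch(i8 %v, i1 %c, i64 %b) nounwind {
entry:
  %ext = zext i8 %v to i64
  ; RV32 selects the low and high words independently; the high word of
  ; %ext is the constant 0.
  %r = select i1 %c, i64 %ext, i64 %b
  ret i64 %r
}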
-
-define i64 @test_i16_s_1_3(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i16_s_1_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lh a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB12_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    mv a1, a3
-; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB12_2:
-; RV32I-NEXT:    srai a1, a0, 31
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_s_1_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lh a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB12_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB12_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_s_1_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB12_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB12_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB12_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB12_4: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_1_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB12_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB12_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_s_1_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB12_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB12_4: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_s_1_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB12_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB12_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 halfwords (byte offset 8)
-  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
-  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i16_z_1_3(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i16_z_1_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lhu a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB13_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB13_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_z_1_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lhu a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB13_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB13_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_z_1_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB13_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB13_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB13_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB13_4: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_z_1_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB13_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB13_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_z_1_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB13_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB13_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB13_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB13_4: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_z_1_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB13_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB13_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 halfwords (byte offset 8)
-  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
-  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i32_z_1_3(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i32_z_1_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a1, a0, 16
-; RV32I-NEXT:    li a0, 4
-; RV32I-NEXT:    addi a2, sp, 12
-; RV32I-NEXT:    li a3, 2
-; RV32I-NEXT:    call __atomic_load
-; RV32I-NEXT:    beqz s2, .LBB14_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    lw s1, 12(sp)
-; RV32I-NEXT:  .LBB14_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i32_z_1_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a1, a0, 16
-; RV64I-NEXT:    li a0, 4
-; RV64I-NEXT:    addi a2, sp, 4
-; RV64I-NEXT:    li a3, 2
-; RV64I-NEXT:    call __atomic_load
-; RV64I-NEXT:    beqz s1, .LBB14_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    lwu s0, 4(sp)
-; RV64I-NEXT:  .LBB14_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i32_z_1_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a1, a0, 16
-; RV32I-SFB-NEXT:    li a0, 4
-; RV32I-SFB-NEXT:    addi a2, sp, 12
-; RV32I-SFB-NEXT:    li a3, 2
-; RV32I-SFB-NEXT:    call __atomic_load
-; RV32I-SFB-NEXT:    lw a0, 12(sp)
-; RV32I-SFB-NEXT:    bnez s2, .LBB14_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:  .LBB14_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB14_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li s0, 0
-; RV32I-SFB-NEXT:  .LBB14_4: # %entry
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i32_z_1_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a1, a0, 16
-; RV64I-SFB-NEXT:    li a0, 4
-; RV64I-SFB-NEXT:    addi a2, sp, 4
-; RV64I-SFB-NEXT:    li a3, 2
-; RV64I-SFB-NEXT:    call __atomic_load
-; RV64I-SFB-NEXT:    lwu a0, 4(sp)
-; RV64I-SFB-NEXT:    bnez s1, .LBB14_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:  .LBB14_2: # %entry
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i32_z_1_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a0, 4
-; RV32I-SFBILOAD-NEXT:    addi a2, sp, 12
-; RV32I-SFBILOAD-NEXT:    li a3, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lw s1, 12(sp)
-; RV32I-SFBILOAD-NEXT:  .LBB14_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li s0, 0
-; RV32I-SFBILOAD-NEXT:  .LBB14_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i32_z_1_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a0, 4
-; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
-; RV64I-SFBILOAD-NEXT:    li a3, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB14_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lwu s0, 4(sp)
-; RV64I-SFBILOAD-NEXT:  .LBB14_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4 words (byte offset 16)
-  %val = load atomic i32, ptr %addr acquire, align 2          ; under-aligned 32-bit acquire load (lowered to the __atomic_load libcall)
-  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
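
Note: the align 2 in this test is below the natural alignment of i32, so
the atomic load cannot be inlined and is lowered to the generic
__atomic_load libcall (size 4, result written to a stack temporary,
ordering argument 2 = acquire). The SFBILOAD run then guards the reload
from the stack slot instead of the original load. A contrasting sketch,
outside the test file:

define i32 @aligned_vs_underaligned(ptr %p) nounwind {
entry:
  ; align 4 (natural): inlined as lw plus fence r, rw.
  %a = load atomic i32, ptr %p acquire, align 4
  ; align 2 (under-aligned): becomes a call to __atomic_load.
  %b = load atomic i32, ptr %p acquire, align 2
  %r = add i32 %a, %b
  ret i32 %r
}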
-
-define i64 @test_i64_1_3(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i64_1_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 32
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_8
-; RV32I-NEXT:    bnez s2, .LBB15_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:  .LBB15_2: # %entry
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i64_1_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    ld a0, 32(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB15_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB15_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i64_1_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 32
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_8
-; RV32I-SFB-NEXT:    bnez s2, .LBB15_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:  .LBB15_2: # %entry
-; RV32I-SFB-NEXT:    bnez s2, .LBB15_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:  .LBB15_4: # %entry
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i64_1_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    ld a0, 32(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB15_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB15_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i64_1_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
-; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB15_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:  .LBB15_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB15_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:  .LBB15_4: # %entry
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i64_1_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB15_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB15_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4 doublewords (byte offset 32)
-  %val = load atomic i64, ptr %addr acquire, align 8          ; load 64-bit value
-  %res = select i1 %x, i64 %val, i64 %b
-  ret i64 %res
-}
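
Note: RV32 has no 64-bit scalar load, so the i64 acquire load above is
lowered to the size-specific __atomic_load_8 libcall (pointer in a0,
ordering 2 = acquire in a1) and the select is expanded on both halves
after the call; RV64 inlines it as ld plus fence r, rw, and SFB can fold
the select as usual. Sketch, outside the test file:

define i64 @atomic_i64_sketch(ptr %p) nounwind {
entry:
  ; RV64: ld + fence r, rw. RV32: call __atomic_load_8(%p, 2).
  %v = load atomic i64, ptr %p acquire, align 8
  ret i64 %v
}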
-
-define i64 @test_i8_s_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i8_s_store_64_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    lb a0, 4(a0)
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a5, 0(a4)
-; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB16_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    mv a1, a3
-; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB16_2:
-; RV32I-NEXT:    srai a1, a0, 31
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_s_store_64_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lb a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB16_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB16_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_s_store_64_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    beqz a1, .LBB16_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB16_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB16_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB16_4: # %entry
-; RV32I-SFB-NEXT:    sw a5, 0(a4)
-; RV32I-SFB-NEXT:    sw a6, 4(a4)
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_s_store_64_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB16_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB16_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_s_store_64_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB16_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB16_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
-; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_s_store_64_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB16_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB16_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
-  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i8_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i8_z_store_64_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    lbu a0, 4(a0)
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a5, 0(a4)
-; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB17_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB17_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_z_store_64_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lbu a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB17_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB17_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_z_store_64_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    beqz a1, .LBB17_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB17_2: # %entry
-; RV32I-SFB-NEXT:    bnez a1, .LBB17_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB17_4: # %entry
-; RV32I-SFB-NEXT:    sw a5, 0(a4)
-; RV32I-SFB-NEXT:    sw a6, 4(a4)
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_z_store_64_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB17_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB17_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_z_store_64_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB17_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB17_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB17_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB17_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
-; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_z_store_64_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB17_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB17_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr acquire, align 1          ; load 8-bit value
-  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i16_s_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i16_s_store_64_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    lh a0, 8(a0)
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a5, 0(a4)
-; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB18_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    mv a1, a3
-; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB18_2:
-; RV32I-NEXT:    srai a1, a0, 31
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_s_store_64_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lh a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB18_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB18_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_s_store_64_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    beqz a1, .LBB18_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB18_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB18_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB18_4: # %entry
-; RV32I-SFB-NEXT:    sw a5, 0(a4)
-; RV32I-SFB-NEXT:    sw a6, 4(a4)
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_store_64_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB18_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB18_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_s_store_64_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB18_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB18_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
-; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_s_store_64_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB18_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB18_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 halfwords (byte offset 8)
-  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
-  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i16_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i16_z_store_64_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    lhu a0, 8(a0)
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a5, 0(a4)
-; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB19_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB19_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_z_store_64_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lhu a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB19_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB19_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_z_store_64_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    beqz a1, .LBB19_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB19_2: # %entry
-; RV32I-SFB-NEXT:    bnez a1, .LBB19_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB19_4: # %entry
-; RV32I-SFB-NEXT:    sw a5, 0(a4)
-; RV32I-SFB-NEXT:    sw a6, 4(a4)
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_z_store_64_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB19_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB19_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_z_store_64_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB19_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB19_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB19_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB19_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
-; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_z_store_64_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB19_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB19_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 halfwords (byte offset 8)
-  %val = load atomic i16, ptr %addr acquire, align 2          ; load 16-bit value
-  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i32_z_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i32_z_store_64_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a1, a0, 16
-; RV32I-NEXT:    li a0, 4
-; RV32I-NEXT:    mv a2, sp
-; RV32I-NEXT:    li a3, 2
-; RV32I-NEXT:    call __atomic_load
-; RV32I-NEXT:    lw a0, 0(sp)
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    bnez s5, .LBB20_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:  .LBB20_2: # %entry
-; RV32I-NEXT:    addi a1, s5, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i32_z_store_64_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a1, a0, 16
-; RV64I-NEXT:    li a0, 4
-; RV64I-NEXT:    addi a2, sp, 4
-; RV64I-NEXT:    li a3, 2
-; RV64I-NEXT:    call __atomic_load
-; RV64I-NEXT:    lwu a0, 4(sp)
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    bnez s3, .LBB20_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:  .LBB20_2: # %entry
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i32_z_store_64_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a1, a0, 16
-; RV32I-SFB-NEXT:    li a0, 4
-; RV32I-SFB-NEXT:    mv a2, sp
-; RV32I-SFB-NEXT:    li a3, 2
-; RV32I-SFB-NEXT:    call __atomic_load
-; RV32I-SFB-NEXT:    lw a0, 0(sp)
-; RV32I-SFB-NEXT:    beqz s5, .LBB20_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s3, 0
-; RV32I-SFB-NEXT:  .LBB20_2: # %entry
-; RV32I-SFB-NEXT:    bnez s5, .LBB20_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:  .LBB20_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i32_z_store_64_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a1, a0, 16
-; RV64I-SFB-NEXT:    li a0, 4
-; RV64I-SFB-NEXT:    addi a2, sp, 4
-; RV64I-SFB-NEXT:    li a3, 2
-; RV64I-SFB-NEXT:    call __atomic_load
-; RV64I-SFB-NEXT:    lwu a0, 4(sp)
-; RV64I-SFB-NEXT:    bnez s3, .LBB20_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:  .LBB20_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a0, 4
-; RV32I-SFBILOAD-NEXT:    mv a2, sp
-; RV32I-SFBILOAD-NEXT:    li a3, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load
-; RV32I-SFBILOAD-NEXT:    lw a0, 0(sp)
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB20_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB20_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB20_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:  .LBB20_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a0, 4
-; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
-; RV64I-SFBILOAD-NEXT:    li a3, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load
-; RV64I-SFBILOAD-NEXT:    lwu a0, 4(sp)
-; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB20_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:  .LBB20_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4 words (byte offset 16)
-  %val = load atomic i32, ptr %addr acquire, align 2          ; under-aligned 32-bit acquire load (lowered to the __atomic_load libcall)
-  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i64_store_64_3(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i64_store_64_3:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 32
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    call __atomic_load_8
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    bnez s5, .LBB21_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:  .LBB21_2: # %entry
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i64_store_64_3:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    ld a0, 32(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB21_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB21_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i64_store_64_3:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 32
-; RV32I-SFB-NEXT:    li a1, 2
-; RV32I-SFB-NEXT:    call __atomic_load_8
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    bnez s5, .LBB21_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:  .LBB21_2: # %entry
-; RV32I-SFB-NEXT:    bnez s5, .LBB21_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:  .LBB21_4: # %entry
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i64_store_64_3:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    ld a0, 32(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB21_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB21_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i64_store_64_3:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
-; RV32I-SFBILOAD-NEXT:    li a1, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB21_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:  .LBB21_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB21_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:  .LBB21_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i64_store_64_3:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB21_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB21_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i64, ptr %addr acquire, align 8          ; load 64-bit value
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %val, i64 %b
-  ret i64 %res
-}
-
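(Illustrative aside, not part of the patch: both of these deleted files exercise the same IR shape, a load whose result feeds a select, under the base (+a), +short-forward-branch-ialu, and +short-forward-branch-iload configurations. A minimal non-atomic sketch of that shape, with made-up names:

  define i32 @sfb_load_sketch(ptr %p, i1 %c, i32 %b) nounwind {
  entry:
    %v = load i32, ptr %p, align 4      ; candidate for a predicated lw
    %r = select i1 %c, i32 %v, i32 %b   ; becomes a short forward branch
    ret i32 %r
  }

With the iload tuning, such a select can lower through PseudoCCLW, i.e. an lw placed in the shadow of the forward branch. In the atomic checks below, note that the atomic loads themselves stay ahead of the branch in the SFBILOAD output; only plain reloads, e.g. of an __atomic_load result from the stack, appear inside the predicated block.)
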
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll
deleted file mode 100644
index 9d3606dca49a8..0000000000000
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-monotonic.ll
+++ /dev/null
@@ -1,2091 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a | FileCheck %s --check-prefixes=RV32I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a | FileCheck %s --check-prefixes=RV64I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-ialu | \
-; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-ialu | \
-; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-iload | \
-; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-iload | \
-; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
-
-define i32 @test_i8_s_2(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i8_s_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lb a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB0_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB0_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_s_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lb a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB0_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB0_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_s_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB0_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB0_2: # %entry
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_s_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB0_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB0_2: # %entry
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_s_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB0_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB0_2: # %entry
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_s_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB0_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB0_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
-  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i8_z_2(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i8_z_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lbu a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB1_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB1_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_z_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lbu a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB1_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB1_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_z_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB1_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB1_2: # %entry
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_z_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB1_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB1_2: # %entry
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_z_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB1_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB1_2: # %entry
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_z_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB1_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB1_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
-  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i16_s_2(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i16_s_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lh a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB2_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB2_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_s_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lh a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB2_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB2_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_s_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB2_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB2_2: # %entry
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB2_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB2_2: # %entry
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_s_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB2_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB2_2: # %entry
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_s_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB2_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB2_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
-  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i16_z_2(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i16_z_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lhu a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB3_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB3_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_z_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lhu a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB3_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB3_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_z_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB3_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB3_2: # %entry
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_z_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB3_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB3_2: # %entry
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_z_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB3_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB3_2: # %entry
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_z_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB3_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB3_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
-  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i32_2(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i32_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lw a0, 16(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB4_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB4_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i32_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lw a0, 16(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB4_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB4_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i32_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lw a0, 16(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB4_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB4_2: # %entry
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i32_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lw a0, 16(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB4_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB4_2: # %entry
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i32_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB4_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB4_2: # %entry
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i32_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB4_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB4_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i32, ptr %addr monotonic, align 4          ; load 32-bit value
-  %res = select i1 %x, i32 %val, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i8_s_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i8_s_store_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lb a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB5_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB5_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_s_store_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lb a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB5_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB5_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_s_store_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB5_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB5_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_s_store_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB5_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB5_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_s_store_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_s_store_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
-  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i8_z_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i8_z_store_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lbu a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB6_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB6_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_z_store_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lbu a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB6_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB6_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_z_store_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB6_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB6_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_z_store_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB6_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB6_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_z_store_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_z_store_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
-  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i16_s_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i16_s_store_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lh a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB7_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB7_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_s_store_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lh a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB7_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB7_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_s_store_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB7_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB7_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_store_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB7_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB7_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_s_store_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_s_store_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
-  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i16_z_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i16_z_store_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lhu a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB8_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB8_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_z_store_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lhu a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB8_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB8_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_z_store_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB8_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB8_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_z_store_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB8_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB8_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_z_store_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_z_store_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
-  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i32_store_2(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i32_store_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lw a0, 16(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB9_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB9_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i32_store_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lw a0, 16(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB9_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB9_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i32_store_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lw a0, 16(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB9_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB9_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i32_store_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lw a0, 16(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB9_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB9_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i32_store_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i32_store_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i32, ptr %addr monotonic, align 4          ; load 32-bit value
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %val, i32 %b
-  ret i32 %res
-}
-
-define i64 @test_i8_s_1_2(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i8_s_1_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lb a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB10_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    mv a1, a3
-; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB10_2:
-; RV32I-NEXT:    srai a1, a0, 31
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_s_1_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lb a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB10_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB10_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_s_1_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB10_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB10_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB10_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB10_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_s_1_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB10_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB10_2: # %entry
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_s_1_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB10_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB10_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_s_1_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB10_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB10_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
-  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i8_z_1_2(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i8_z_1_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lbu a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB11_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB11_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_z_1_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lbu a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB11_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB11_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_z_1_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB11_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB11_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB11_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB11_4: # %entry
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_z_1_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB11_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB11_2: # %entry
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_z_1_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB11_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB11_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB11_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB11_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_z_1_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB11_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB11_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
-  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i16_s_1_2(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i16_s_1_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lh a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB12_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    mv a1, a3
-; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB12_2:
-; RV32I-NEXT:    srai a1, a0, 31
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_s_1_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lh a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB12_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB12_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_s_1_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB12_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB12_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB12_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB12_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_1_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB12_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB12_2: # %entry
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_s_1_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB12_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB12_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_s_1_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB12_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB12_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
-  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i16_z_1_2(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i16_z_1_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lhu a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB13_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB13_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_z_1_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lhu a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB13_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB13_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_z_1_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB13_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB13_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB13_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB13_4: # %entry
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_z_1_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB13_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB13_2: # %entry
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_z_1_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB13_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB13_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB13_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB13_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_z_1_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB13_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB13_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
-  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i32_z_1_2(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i32_z_1_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a1, a0, 16
-; RV32I-NEXT:    li a0, 4
-; RV32I-NEXT:    addi a2, sp, 12
-; RV32I-NEXT:    li a3, 0
-; RV32I-NEXT:    call __atomic_load
-; RV32I-NEXT:    beqz s2, .LBB14_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    lw s1, 12(sp)
-; RV32I-NEXT:  .LBB14_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i32_z_1_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a1, a0, 16
-; RV64I-NEXT:    li a0, 4
-; RV64I-NEXT:    addi a2, sp, 4
-; RV64I-NEXT:    li a3, 0
-; RV64I-NEXT:    call __atomic_load
-; RV64I-NEXT:    beqz s1, .LBB14_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    lwu s0, 4(sp)
-; RV64I-NEXT:  .LBB14_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i32_z_1_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a1, a0, 16
-; RV32I-SFB-NEXT:    li a0, 4
-; RV32I-SFB-NEXT:    addi a2, sp, 12
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:    call __atomic_load
-; RV32I-SFB-NEXT:    lw a0, 12(sp)
-; RV32I-SFB-NEXT:    bnez s2, .LBB14_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:  .LBB14_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB14_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li s0, 0
-; RV32I-SFB-NEXT:  .LBB14_4: # %entry
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i32_z_1_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a1, a0, 16
-; RV64I-SFB-NEXT:    li a0, 4
-; RV64I-SFB-NEXT:    addi a2, sp, 4
-; RV64I-SFB-NEXT:    li a3, 0
-; RV64I-SFB-NEXT:    call __atomic_load
-; RV64I-SFB-NEXT:    lwu a0, 4(sp)
-; RV64I-SFB-NEXT:    bnez s1, .LBB14_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:  .LBB14_2: # %entry
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i32_z_1_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a0, 4
-; RV32I-SFBILOAD-NEXT:    addi a2, sp, 12
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lw s1, 12(sp)
-; RV32I-SFBILOAD-NEXT:  .LBB14_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li s0, 0
-; RV32I-SFBILOAD-NEXT:  .LBB14_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i32_z_1_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a0, 4
-; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
-; RV64I-SFBILOAD-NEXT:    li a3, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB14_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lwu s0, 4(sp)
-; RV64I-SFBILOAD-NEXT:  .LBB14_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i32, ptr %addr monotonic, align 2          ; load 32-bit value
-  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i64_1_2(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i64_1_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 32
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_8
-; RV32I-NEXT:    bnez s2, .LBB15_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:  .LBB15_2: # %entry
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i64_1_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    ld a0, 32(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB15_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB15_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i64_1_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 32
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_8
-; RV32I-SFB-NEXT:    bnez s2, .LBB15_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:  .LBB15_2: # %entry
-; RV32I-SFB-NEXT:    bnez s2, .LBB15_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:  .LBB15_4: # %entry
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i64_1_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    ld a0, 32(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB15_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB15_2: # %entry
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i64_1_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
-; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB15_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:  .LBB15_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB15_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:  .LBB15_4: # %entry
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i64_1_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB15_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB15_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i64, ptr %addr monotonic, align 8          ; load 64-bit value
-  %res = select i1 %x, i64 %val, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i8_s_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i8_s_store_64_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lb a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    sw a5, 0(a4)
-; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB16_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    mv a1, a3
-; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB16_2:
-; RV32I-NEXT:    srai a1, a0, 31
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_s_store_64_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lb a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB16_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB16_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_s_store_64_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB16_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB16_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB16_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB16_4: # %entry
-; RV32I-SFB-NEXT:    sw a5, 0(a4)
-; RV32I-SFB-NEXT:    sw a6, 4(a4)
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_s_store_64_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB16_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB16_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_s_store_64_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB16_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB16_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
-; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_s_store_64_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB16_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB16_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
-  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i8_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i8_z_store_64_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lbu a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    sw a5, 0(a4)
-; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB17_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB17_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_z_store_64_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lbu a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB17_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB17_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_z_store_64_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB17_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB17_2: # %entry
-; RV32I-SFB-NEXT:    bnez a1, .LBB17_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB17_4: # %entry
-; RV32I-SFB-NEXT:    sw a5, 0(a4)
-; RV32I-SFB-NEXT:    sw a6, 4(a4)
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_z_store_64_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB17_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB17_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_z_store_64_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB17_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB17_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB17_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB17_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
-; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_z_store_64_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB17_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB17_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
-  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i16_s_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i16_s_store_64_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lh a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    sw a5, 0(a4)
-; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB18_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    mv a1, a3
-; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB18_2:
-; RV32I-NEXT:    srai a1, a0, 31
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_s_store_64_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lh a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB18_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB18_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_s_store_64_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB18_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB18_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB18_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB18_4: # %entry
-; RV32I-SFB-NEXT:    sw a5, 0(a4)
-; RV32I-SFB-NEXT:    sw a6, 4(a4)
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_store_64_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB18_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB18_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_s_store_64_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB18_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB18_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
-; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_s_store_64_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB18_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB18_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 8 (element 4)
-  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
-  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i16_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i16_z_store_64_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lhu a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    sw a5, 0(a4)
-; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB19_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB19_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_z_store_64_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lhu a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB19_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB19_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_z_store_64_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB19_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB19_2: # %entry
-; RV32I-SFB-NEXT:    bnez a1, .LBB19_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB19_4: # %entry
-; RV32I-SFB-NEXT:    sw a5, 0(a4)
-; RV32I-SFB-NEXT:    sw a6, 4(a4)
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_z_store_64_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB19_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB19_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_z_store_64_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB19_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB19_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB19_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB19_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
-; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_z_store_64_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB19_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB19_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 8 (element 4)
-  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
-  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i32_z_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i32_z_store_64_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a1, a0, 16
-; RV32I-NEXT:    li a0, 4
-; RV32I-NEXT:    mv a2, sp
-; RV32I-NEXT:    li a3, 0
-; RV32I-NEXT:    call __atomic_load
-; RV32I-NEXT:    lw a0, 0(sp)
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    bnez s5, .LBB20_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:  .LBB20_2: # %entry
-; RV32I-NEXT:    addi a1, s5, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i32_z_store_64_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a1, a0, 16
-; RV64I-NEXT:    li a0, 4
-; RV64I-NEXT:    addi a2, sp, 4
-; RV64I-NEXT:    li a3, 0
-; RV64I-NEXT:    call __atomic_load
-; RV64I-NEXT:    lwu a0, 4(sp)
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    bnez s3, .LBB20_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:  .LBB20_2: # %entry
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i32_z_store_64_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a1, a0, 16
-; RV32I-SFB-NEXT:    li a0, 4
-; RV32I-SFB-NEXT:    mv a2, sp
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:    call __atomic_load
-; RV32I-SFB-NEXT:    lw a0, 0(sp)
-; RV32I-SFB-NEXT:    beqz s5, .LBB20_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s3, 0
-; RV32I-SFB-NEXT:  .LBB20_2: # %entry
-; RV32I-SFB-NEXT:    bnez s5, .LBB20_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:  .LBB20_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i32_z_store_64_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a1, a0, 16
-; RV64I-SFB-NEXT:    li a0, 4
-; RV64I-SFB-NEXT:    addi a2, sp, 4
-; RV64I-SFB-NEXT:    li a3, 0
-; RV64I-SFB-NEXT:    call __atomic_load
-; RV64I-SFB-NEXT:    lwu a0, 4(sp)
-; RV64I-SFB-NEXT:    bnez s3, .LBB20_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:  .LBB20_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a0, 4
-; RV32I-SFBILOAD-NEXT:    mv a2, sp
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load
-; RV32I-SFBILOAD-NEXT:    lw a0, 0(sp)
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB20_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB20_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB20_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:  .LBB20_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a0, 4
-; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
-; RV64I-SFBILOAD-NEXT:    li a3, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load
-; RV64I-SFBILOAD-NEXT:    lwu a0, 4(sp)
-; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB20_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:  .LBB20_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 16 (element 4)
-  %val = load atomic i32, ptr %addr monotonic, align 2          ; under-aligned 32-bit load; lowers to an __atomic_load libcall
-  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i64_store_64_2(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i64_store_64_2:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 32
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_8
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    bnez s5, .LBB21_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:  .LBB21_2: # %entry
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i64_store_64_2:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    ld a0, 32(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB21_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB21_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i64_store_64_2:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 32
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_8
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    bnez s5, .LBB21_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:  .LBB21_2: # %entry
-; RV32I-SFB-NEXT:    bnez s5, .LBB21_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:  .LBB21_4: # %entry
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i64_store_64_2:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    ld a0, 32(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB21_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB21_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i64_store_64_2:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB21_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:  .LBB21_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB21_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:  .LBB21_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i64_store_64_2:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB21_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB21_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 32 (element 4)
-  %val = load atomic i64, ptr %addr monotonic, align 8          ; load 64-bit value
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %val, i64 %b
-  ret i64 %res
-}
-
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll
deleted file mode 100644
index f4aa40185ed9c..0000000000000
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-seq_cst.ll
+++ /dev/null
@@ -1,2319 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a | FileCheck %s --check-prefixes=RV32I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a | FileCheck %s --check-prefixes=RV64I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-ialu | \
-; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-ialu | \
-; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-iload | \
-; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-iload | \
-; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
-
-define i32 @test_i8_s_4(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i8_s_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lb a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB0_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB0_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_s_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lb a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB0_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB0_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_s_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB0_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB0_2: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_s_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB0_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB0_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_s_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB0_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB0_2: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_s_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB0_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB0_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
-  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i8_z_4(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i8_z_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lbu a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB1_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB1_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_z_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lbu a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB1_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB1_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_z_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB1_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB1_2: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_z_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB1_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB1_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_z_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB1_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB1_2: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_z_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB1_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB1_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
-  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i16_s_4(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i16_s_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lh a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB2_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB2_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_s_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lh a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB2_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB2_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_s_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB2_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB2_2: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB2_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB2_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_s_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB2_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB2_2: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_s_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB2_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB2_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 8 (element 4)
-  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
-  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i16_z_4(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i16_z_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lhu a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB3_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB3_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_z_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lhu a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB3_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB3_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_z_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB3_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB3_2: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_z_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB3_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB3_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_z_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB3_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB3_2: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_z_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB3_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB3_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 8 (element 4)
-  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
-  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i32_4(ptr %base, i1 %x, i32 %b) nounwind {
-; RV32I-LABEL: test_i32_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lw a0, 16(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB4_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB4_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i32_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lw a0, 16(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB4_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB4_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i32_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lw a0, 16(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB4_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB4_2: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i32_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lw a0, 16(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB4_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB4_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i32_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB4_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB4_2: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i32_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB4_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB4_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 16 (element 4)
-  %val = load atomic i32, ptr %addr seq_cst, align 4          ; load 32-bit value
-  %res = select i1 %x, i32 %val, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i8_s_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i8_s_store_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lb a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB5_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB5_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_s_store_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lb a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB5_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB5_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_s_store_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    bnez a1, .LBB5_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB5_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_s_store_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB5_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB5_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_s_store_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_s_store_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
-  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i8_z_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i8_z_store_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lbu a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB6_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB6_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_z_store_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lbu a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB6_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB6_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_z_store_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    bnez a1, .LBB6_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB6_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_z_store_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB6_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB6_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_z_store_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_z_store_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
-  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i16_s_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i16_s_store_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lh a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB7_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB7_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_s_store_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lh a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB7_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB7_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_s_store_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    bnez a1, .LBB7_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB7_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_store_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB7_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB7_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_s_store_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_s_store_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 8 (element 4)
-  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
-  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i16_z_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i16_z_store_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lhu a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB8_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB8_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_z_store_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lhu a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB8_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB8_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_z_store_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    bnez a1, .LBB8_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB8_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_z_store_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB8_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB8_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_z_store_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_z_store_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 8 (element 4)
-  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
-  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  ret i32 %res
-}
-
-define i32 @test_i32_store_4(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
-; RV32I-LABEL: test_i32_store_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lw a0, 16(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a4, 0(a3)
-; RV32I-NEXT:    bnez a1, .LBB9_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB9_2: # %entry
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i32_store_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lw a0, 16(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sw a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB9_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB9_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i32_store_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lw a0, 16(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    bnez a1, .LBB9_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB9_2: # %entry
-; RV32I-SFB-NEXT:    sw a4, 0(a3)
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i32_store_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lw a0, 16(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB9_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB9_2: # %entry
-; RV64I-SFB-NEXT:    sw a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i32_store_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i32_store_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 16 (element 4)
-  %val = load atomic i32, ptr %addr seq_cst, align 4          ; load 32-bit value
-  store i32 %c, ptr %base1
-  %res = select i1 %x, i32 %val, i32 %b
-  ret i32 %res
-}
-
-define i64 @test_i8_s_1_4(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i8_s_1_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lb a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB10_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    mv a1, a3
-; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB10_2:
-; RV32I-NEXT:    srai a1, a0, 31
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_s_1_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lb a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB10_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB10_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_s_1_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB10_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB10_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB10_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB10_4: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_s_1_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB10_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB10_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_s_1_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB10_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB10_4: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_s_1_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB10_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB10_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
-  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i8_z_1_4(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i8_z_1_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lbu a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB11_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB11_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_z_1_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lbu a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB11_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB11_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_z_1_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB11_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB11_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB11_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB11_4: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_z_1_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB11_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB11_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_z_1_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB11_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB11_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB11_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB11_4: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_z_1_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB11_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB11_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
-  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i16_s_1_4(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i16_s_1_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lh a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB12_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    mv a1, a3
-; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB12_2:
-; RV32I-NEXT:    srai a1, a0, 31
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_s_1_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lh a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB12_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB12_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_s_1_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB12_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB12_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB12_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB12_4: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_1_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB12_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB12_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_s_1_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB12_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB12_4: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_s_1_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB12_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB12_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i64 4   ; address of element 4 (byte offset 8)
-  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
-  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i16_z_1_4(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i16_z_1_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lhu a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    bnez a1, .LBB13_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB13_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_z_1_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lhu a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB13_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB13_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_z_1_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB13_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB13_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB13_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB13_4: # %entry
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_z_1_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB13_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB13_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_z_1_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB13_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB13_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB13_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB13_4: # %entry
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_z_1_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB13_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB13_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i64 4   ; address of element 4 (byte offset 8)
-  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
-  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i32_z_1_4(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i32_z_1_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a1, a0, 16
-; RV32I-NEXT:    li a0, 4
-; RV32I-NEXT:    addi a2, sp, 12
-; RV32I-NEXT:    li a3, 5
-; RV32I-NEXT:    call __atomic_load
-; RV32I-NEXT:    beqz s2, .LBB14_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    lw s1, 12(sp)
-; RV32I-NEXT:  .LBB14_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i32_z_1_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s1, a1, 1
-; RV64I-NEXT:    addi a1, a0, 16
-; RV64I-NEXT:    li a0, 4
-; RV64I-NEXT:    addi a2, sp, 4
-; RV64I-NEXT:    li a3, 5
-; RV64I-NEXT:    call __atomic_load
-; RV64I-NEXT:    beqz s1, .LBB14_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    lwu s0, 4(sp)
-; RV64I-NEXT:  .LBB14_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i32_z_1_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a1, a0, 16
-; RV32I-SFB-NEXT:    li a0, 4
-; RV32I-SFB-NEXT:    addi a2, sp, 12
-; RV32I-SFB-NEXT:    li a3, 5
-; RV32I-SFB-NEXT:    call __atomic_load
-; RV32I-SFB-NEXT:    lw a0, 12(sp)
-; RV32I-SFB-NEXT:    bnez s2, .LBB14_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:  .LBB14_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB14_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li s0, 0
-; RV32I-SFB-NEXT:  .LBB14_4: # %entry
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i32_z_1_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    andi s1, a1, 1
-; RV64I-SFB-NEXT:    addi a1, a0, 16
-; RV64I-SFB-NEXT:    li a0, 4
-; RV64I-SFB-NEXT:    addi a2, sp, 4
-; RV64I-SFB-NEXT:    li a3, 5
-; RV64I-SFB-NEXT:    call __atomic_load
-; RV64I-SFB-NEXT:    lwu a0, 4(sp)
-; RV64I-SFB-NEXT:    bnez s1, .LBB14_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:  .LBB14_2: # %entry
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i32_z_1_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a0, 4
-; RV32I-SFBILOAD-NEXT:    addi a2, sp, 12
-; RV32I-SFBILOAD-NEXT:    li a3, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lw s1, 12(sp)
-; RV32I-SFBILOAD-NEXT:  .LBB14_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li s0, 0
-; RV32I-SFBILOAD-NEXT:  .LBB14_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i32_z_1_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    andi s1, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a0, 4
-; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
-; RV64I-SFBILOAD-NEXT:    li a3, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB14_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lwu s0, 4(sp)
-; RV64I-SFBILOAD-NEXT:  .LBB14_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i32, ptr %base, i64 4   ; address of element 4 (byte offset 16)
-  %val = load atomic i32, ptr %addr seq_cst, align 2          ; under-aligned 32-bit atomic load (align 2 < 4), lowered to an __atomic_load libcall
-  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i64_1_4(ptr %base, i1 %x, i64 %b) nounwind {
-; RV32I-LABEL: test_i64_1_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s2, a1, 1
-; RV32I-NEXT:    addi a0, a0, 32
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_8
-; RV32I-NEXT:    bnez s2, .LBB15_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:  .LBB15_2: # %entry
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i64_1_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    ld a0, 32(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    bnez a1, .LBB15_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB15_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i64_1_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    andi s2, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 32
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_8
-; RV32I-SFB-NEXT:    bnez s2, .LBB15_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:  .LBB15_2: # %entry
-; RV32I-SFB-NEXT:    bnez s2, .LBB15_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:  .LBB15_4: # %entry
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 16
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i64_1_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    ld a0, 32(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB15_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB15_2: # %entry
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i64_1_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    andi s2, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
-; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB15_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:  .LBB15_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB15_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:  .LBB15_4: # %entry
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i64_1_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB15_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB15_2: # %entry
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i64, ptr %base, i64 4   ; address of element 4 (byte offset 32)
-  %val = load atomic i64, ptr %addr seq_cst, align 8          ; load 64-bit value
-  %res = select i1 %x, i64 %val, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i8_s_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i8_s_store_64_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lb a0, 4(a0)
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a5, 0(a4)
-; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB16_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    mv a1, a3
-; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB16_2:
-; RV32I-NEXT:    srai a1, a0, 31
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_s_store_64_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lb a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB16_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB16_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_s_store_64_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    beqz a1, .LBB16_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB16_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB16_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB16_4: # %entry
-; RV32I-SFB-NEXT:    sw a5, 0(a4)
-; RV32I-SFB-NEXT:    sw a6, 4(a4)
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_s_store_64_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB16_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB16_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_s_store_64_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB16_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB16_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
-; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_s_store_64_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB16_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB16_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
-  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i8_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i8_z_store_64_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lbu a0, 4(a0)
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a5, 0(a4)
-; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB17_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB17_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_z_store_64_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lbu a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB17_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB17_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_z_store_64_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    beqz a1, .LBB17_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB17_2: # %entry
-; RV32I-SFB-NEXT:    bnez a1, .LBB17_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB17_4: # %entry
-; RV32I-SFB-NEXT:    sw a5, 0(a4)
-; RV32I-SFB-NEXT:    sw a6, 4(a4)
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_z_store_64_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB17_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB17_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_z_store_64_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB17_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB17_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB17_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB17_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
-; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_z_store_64_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB17_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB17_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i8, ptr %addr seq_cst, align 1          ; load 8-bit value
-  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i16_s_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i16_s_store_64_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lh a0, 8(a0)
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a5, 0(a4)
-; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB18_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    mv a1, a3
-; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB18_2:
-; RV32I-NEXT:    srai a1, a0, 31
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_s_store_64_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lh a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB18_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB18_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_s_store_64_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    beqz a1, .LBB18_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB18_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB18_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB18_4: # %entry
-; RV32I-SFB-NEXT:    sw a5, 0(a4)
-; RV32I-SFB-NEXT:    sw a6, 4(a4)
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_store_64_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB18_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB18_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_s_store_64_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB18_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB18_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
-; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_s_store_64_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB18_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB18_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i64 4   ; address of element 4 (byte offset 8)
-  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
-  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i16_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i16_z_store_64_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    fence rw, rw
-; RV32I-NEXT:    lhu a0, 8(a0)
-; RV32I-NEXT:    fence r, rw
-; RV32I-NEXT:    sw a5, 0(a4)
-; RV32I-NEXT:    sw a6, 4(a4)
-; RV32I-NEXT:    bnez a1, .LBB19_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:  .LBB19_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_z_store_64_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    lhu a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB19_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB19_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_z_store_64_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    fence rw, rw
-; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    fence r, rw
-; RV32I-SFB-NEXT:    beqz a1, .LBB19_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:  .LBB19_2: # %entry
-; RV32I-SFB-NEXT:    bnez a1, .LBB19_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB19_4: # %entry
-; RV32I-SFB-NEXT:    sw a5, 0(a4)
-; RV32I-SFB-NEXT:    sw a6, 4(a4)
-; RV32I-SFB-NEXT:    mv a1, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_z_store_64_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB19_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB19_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_z_store_64_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    fence rw, rw
-; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    fence r, rw
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB19_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB19_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB19_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, a2
-; RV32I-SFBILOAD-NEXT:  .LBB19_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
-; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_z_store_64_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB19_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB19_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i64 4   ; address of element 4 (byte offset 8)
-  %val = load atomic i16, ptr %addr seq_cst, align 2          ; load 16-bit value
-  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i32_z_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i32_z_store_64_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a1, a0, 16
-; RV32I-NEXT:    li a0, 4
-; RV32I-NEXT:    mv a2, sp
-; RV32I-NEXT:    li a3, 5
-; RV32I-NEXT:    call __atomic_load
-; RV32I-NEXT:    lw a0, 0(sp)
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    bnez s5, .LBB20_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:  .LBB20_2: # %entry
-; RV32I-NEXT:    addi a1, s5, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i32_z_store_64_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    andi s3, a1, 1
-; RV64I-NEXT:    addi a1, a0, 16
-; RV64I-NEXT:    li a0, 4
-; RV64I-NEXT:    addi a2, sp, 4
-; RV64I-NEXT:    li a3, 5
-; RV64I-NEXT:    call __atomic_load
-; RV64I-NEXT:    lwu a0, 4(sp)
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    bnez s3, .LBB20_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:  .LBB20_2: # %entry
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i32_z_store_64_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a1, a0, 16
-; RV32I-SFB-NEXT:    li a0, 4
-; RV32I-SFB-NEXT:    mv a2, sp
-; RV32I-SFB-NEXT:    li a3, 5
-; RV32I-SFB-NEXT:    call __atomic_load
-; RV32I-SFB-NEXT:    lw a0, 0(sp)
-; RV32I-SFB-NEXT:    beqz s5, .LBB20_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s3, 0
-; RV32I-SFB-NEXT:  .LBB20_2: # %entry
-; RV32I-SFB-NEXT:    bnez s5, .LBB20_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:  .LBB20_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i32_z_store_64_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    andi s3, a1, 1
-; RV64I-SFB-NEXT:    addi a1, a0, 16
-; RV64I-SFB-NEXT:    li a0, 4
-; RV64I-SFB-NEXT:    addi a2, sp, 4
-; RV64I-SFB-NEXT:    li a3, 5
-; RV64I-SFB-NEXT:    call __atomic_load
-; RV64I-SFB-NEXT:    lwu a0, 4(sp)
-; RV64I-SFB-NEXT:    bnez s3, .LBB20_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:  .LBB20_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a0, 4
-; RV32I-SFBILOAD-NEXT:    mv a2, sp
-; RV32I-SFBILOAD-NEXT:    li a3, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load
-; RV32I-SFBILOAD-NEXT:    lw a0, 0(sp)
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB20_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s3, 0
-; RV32I-SFBILOAD-NEXT:  .LBB20_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB20_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:  .LBB20_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    andi s3, a1, 1
-; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a0, 4
-; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
-; RV64I-SFBILOAD-NEXT:    li a3, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load
-; RV64I-SFBILOAD-NEXT:    lwu a0, 4(sp)
-; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB20_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:  .LBB20_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i32, ptr %base, i64 4   ; address of element 4 (byte offset 16)
-  %val = load atomic i32, ptr %addr seq_cst, align 2          ; under-aligned 32-bit atomic load (align 2 < 4), lowered to an __atomic_load libcall
-  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %ext, i64 %b
-  ret i64 %res
-}
-
-define i64 @test_i64_store_64_4(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
-; RV32I-LABEL: test_i64_store_64_4:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    andi s5, a1, 1
-; RV32I-NEXT:    addi a0, a0, 32
-; RV32I-NEXT:    li a1, 5
-; RV32I-NEXT:    call __atomic_load_8
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    bnez s5, .LBB21_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:  .LBB21_2: # %entry
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i64_store_64_4:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    fence rw, rw
-; RV64I-NEXT:    ld a0, 32(a0)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    fence r, rw
-; RV64I-NEXT:    sd a4, 0(a3)
-; RV64I-NEXT:    bnez a1, .LBB21_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:  .LBB21_2: # %entry
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i64_store_64_4:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    andi s5, a1, 1
-; RV32I-SFB-NEXT:    addi a0, a0, 32
-; RV32I-SFB-NEXT:    li a1, 5
-; RV32I-SFB-NEXT:    call __atomic_load_8
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    bnez s5, .LBB21_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:  .LBB21_2: # %entry
-; RV32I-SFB-NEXT:    bnez s5, .LBB21_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:  .LBB21_4: # %entry
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i64_store_64_4:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    fence rw, rw
-; RV64I-SFB-NEXT:    ld a0, 32(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    fence r, rw
-; RV64I-SFB-NEXT:    bnez a1, .LBB21_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB21_2: # %entry
-; RV64I-SFB-NEXT:    sd a4, 0(a3)
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i64_store_64_4:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    andi s5, a1, 1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
-; RV32I-SFBILOAD-NEXT:    li a1, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB21_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:  .LBB21_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB21_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:  .LBB21_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i64_store_64_4:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    fence rw, rw
-; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    fence r, rw
-; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB21_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, a2
-; RV64I-SFBILOAD-NEXT:  .LBB21_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i64, ptr %base, i64 4   ; address of element 4 (byte offset 32)
-  %val = load atomic i64, ptr %addr seq_cst, align 8          ; load 64-bit value
-  store i64 %c, ptr %base1
-  %res = select i1 %x, i64 %val, i64 %b
-  ret i64 %res
-}
-
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll
deleted file mode 100644
index 90899c690516a..0000000000000
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-volatile.ll
+++ /dev/null
@@ -1,1022 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-ialu | \
-; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-ialu | \
-; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-iload | \
-; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-iload | \
-; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
-
-define i32 @test_i8_s_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) nounwind {
-; RV32I-LABEL: test_i8_s_volatile:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lb a4, 4(a0)
-; RV32I-NEXT:    lw a0, 0(a3)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB0_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a4, a2
-; RV32I-NEXT:  .LBB0_2: # %entry
-; RV32I-NEXT:    add a0, a4, a0
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_s_volatile:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lb a4, 4(a0)
-; RV64I-NEXT:    lw a0, 0(a3)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB0_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a4, a2
-; RV64I-NEXT:  .LBB0_2: # %entry
-; RV64I-NEXT:    addw a0, a4, a0
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_s_volatile:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    lw a3, 0(a3)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB0_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB0_2: # %entry
-; RV32I-SFB-NEXT:    add a0, a0, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_s_volatile:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    lw a3, 0(a3)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB0_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB0_2: # %entry
-; RV64I-SFB-NEXT:    addw a0, a0, a3
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_s_volatile:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB0_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lb a2, 4(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB0_2: # %entry
-; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_s_volatile:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB0_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lb a2, 4(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB0_2: # %entry
-; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
-  %val = load i8, ptr %addr          ; load 8-bit value
-  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
-  %val1 = load volatile i32, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  %res1 = add i32 %res, %val1
-  ret i32 %res1
-}
-
-define i32 @test_i8_z_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) nounwind {
-; RV32I-LABEL: test_i8_z_volatile:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lbu a4, 4(a0)
-; RV32I-NEXT:    lw a0, 0(a3)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB1_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a4, a2
-; RV32I-NEXT:  .LBB1_2: # %entry
-; RV32I-NEXT:    add a0, a4, a0
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_z_volatile:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lbu a4, 4(a0)
-; RV64I-NEXT:    lw a0, 0(a3)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB1_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a4, a2
-; RV64I-NEXT:  .LBB1_2: # %entry
-; RV64I-NEXT:    addw a0, a4, a0
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_z_volatile:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    lw a3, 0(a3)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB1_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB1_2: # %entry
-; RV32I-SFB-NEXT:    add a0, a0, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_z_volatile:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    lw a3, 0(a3)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB1_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB1_2: # %entry
-; RV64I-SFB-NEXT:    addw a0, a0, a3
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_z_volatile:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB1_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lbu a2, 4(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB1_2: # %entry
-; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_z_volatile:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB1_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lbu a2, 4(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB1_2: # %entry
-; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
-  %val = load i8, ptr %addr          ; load 8-bit value
-  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
-  %val1 = load volatile i32, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  %res1 = add i32 %res, %val1
-  ret i32 %res1
-}
-
-define i32 @test_i16_s_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) nounwind {
-; RV32I-LABEL: test_i16_s_volatile:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lh a4, 8(a0)
-; RV32I-NEXT:    lw a0, 0(a3)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB2_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a4, a2
-; RV32I-NEXT:  .LBB2_2: # %entry
-; RV32I-NEXT:    add a0, a4, a0
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_s_volatile:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lh a4, 8(a0)
-; RV64I-NEXT:    lw a0, 0(a3)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB2_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a4, a2
-; RV64I-NEXT:  .LBB2_2: # %entry
-; RV64I-NEXT:    addw a0, a4, a0
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_s_volatile:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    lw a3, 0(a3)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB2_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB2_2: # %entry
-; RV32I-SFB-NEXT:    add a0, a0, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_volatile:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    lw a3, 0(a3)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB2_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB2_2: # %entry
-; RV64I-SFB-NEXT:    addw a0, a0, a3
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_s_volatile:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB2_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lh a2, 8(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB2_2: # %entry
-; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_s_volatile:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB2_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lh a2, 8(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB2_2: # %entry
-; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
-  %val = load i16, ptr %addr          ; load 16-bit value
-  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
-  %val1 = load volatile i32, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  %res1 = add i32 %res, %val1
-  ret i32 %res1
-}
-
-define i32 @test_i16_z_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) nounwind {
-; RV32I-LABEL: test_i16_z_volatile:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lhu a4, 8(a0)
-; RV32I-NEXT:    lw a0, 0(a3)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB3_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a4, a2
-; RV32I-NEXT:  .LBB3_2: # %entry
-; RV32I-NEXT:    add a0, a4, a0
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_z_volatile:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lhu a4, 8(a0)
-; RV64I-NEXT:    lw a0, 0(a3)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB3_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a4, a2
-; RV64I-NEXT:  .LBB3_2: # %entry
-; RV64I-NEXT:    addw a0, a4, a0
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_z_volatile:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    lw a3, 0(a3)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB3_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB3_2: # %entry
-; RV32I-SFB-NEXT:    add a0, a0, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_z_volatile:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    lw a3, 0(a3)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB3_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB3_2: # %entry
-; RV64I-SFB-NEXT:    addw a0, a0, a3
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_z_volatile:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB3_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lhu a2, 8(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB3_2: # %entry
-; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_z_volatile:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB3_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lhu a2, 8(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB3_2: # %entry
-; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
-  %val = load i16, ptr %addr          ; load 16-bit value
-  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
-  %val1 = load volatile i32, ptr %base1
-  %res = select i1 %x, i32 %ext, i32 %b
-  %res1 = add i32 %res, %val1
-  ret i32 %res1
-}
-
-define i32 @test_i32_volatile(ptr %base, i1 %x, i32 %b, ptr %base1) nounwind {
-; RV32I-LABEL: test_i32_volatile:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lw a4, 16(a0)
-; RV32I-NEXT:    lw a0, 0(a3)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB4_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a4, a2
-; RV32I-NEXT:  .LBB4_2: # %entry
-; RV32I-NEXT:    add a0, a4, a0
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i32_volatile:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lw a4, 16(a0)
-; RV64I-NEXT:    lw a0, 0(a3)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB4_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a4, a2
-; RV64I-NEXT:  .LBB4_2: # %entry
-; RV64I-NEXT:    addw a0, a4, a0
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i32_volatile:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lw a0, 16(a0)
-; RV32I-SFB-NEXT:    lw a3, 0(a3)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB4_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
-; RV32I-SFB-NEXT:  .LBB4_2: # %entry
-; RV32I-SFB-NEXT:    add a0, a0, a3
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i32_volatile:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lw a0, 16(a0)
-; RV64I-SFB-NEXT:    lw a3, 0(a3)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB4_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB4_2: # %entry
-; RV64I-SFB-NEXT:    addw a0, a0, a3
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i32_volatile:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB4_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lw a2, 16(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB4_2: # %entry
-; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i32_volatile:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB4_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lw a2, 16(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB4_2: # %entry
-; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4
-  %val = load i32, ptr %addr          ; load 32-bit value
-  %val1 = load volatile i32, ptr %base1
-  %res = select i1 %x, i32 %val, i32 %b
-  %res1 = add i32 %res, %val1
-  ret i32 %res1
-}
-
-
-define i64 @test_i8_s_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) nounwind {
-; RV32I-LABEL: test_i8_s_1_volatile:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lb a6, 4(a0)
-; RV32I-NEXT:    lw a5, 4(a4)
-; RV32I-NEXT:    lw a0, 0(a4)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB5_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a6, a2
-; RV32I-NEXT:    j .LBB5_3
-; RV32I-NEXT:  .LBB5_2:
-; RV32I-NEXT:    srai a3, a6, 31
-; RV32I-NEXT:  .LBB5_3: # %entry
-; RV32I-NEXT:    add a0, a6, a0
-; RV32I-NEXT:    sltu a1, a0, a6
-; RV32I-NEXT:    add a3, a3, a5
-; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_s_1_volatile:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lb a4, 4(a0)
-; RV64I-NEXT:    ld a0, 0(a3)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB5_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a4, a2
-; RV64I-NEXT:  .LBB5_2: # %entry
-; RV64I-NEXT:    add a0, a4, a0
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_s_1_volatile:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    lw a5, 4(a4)
-; RV32I-SFB-NEXT:    lw a4, 0(a4)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB5_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB5_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB5_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB5_4: # %entry
-; RV32I-SFB-NEXT:    add a0, a2, a4
-; RV32I-SFB-NEXT:    sltu a1, a0, a2
-; RV32I-SFB-NEXT:    add a3, a3, a5
-; RV32I-SFB-NEXT:    add a1, a3, a1
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_s_1_volatile:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    ld a3, 0(a3)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB5_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB5_2: # %entry
-; RV64I-SFB-NEXT:    add a0, a0, a3
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_s_1_volatile:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
-; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB5_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB5_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB5_4: # %entry
-; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
-; RV32I-SFBILOAD-NEXT:    sltu a1, a0, a2
-; RV32I-SFBILOAD-NEXT:    add a3, a3, a5
-; RV32I-SFBILOAD-NEXT:    add a1, a3, a1
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_s_1_volatile:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB5_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lb a2, 4(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB5_2: # %entry
-; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
-  %val = load i8, ptr %addr          ; load 8-bit value
-  %val1 = load volatile i64, ptr %base1
-  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  %res1 = add i64 %res, %val1
-  ret i64 %res1
-}
-
-define i64 @test_i8_z_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) nounwind {
-; RV32I-LABEL: test_i8_z_1_volatile:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lbu a6, 4(a0)
-; RV32I-NEXT:    lw a5, 4(a4)
-; RV32I-NEXT:    lw a0, 0(a4)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB6_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a6, a2
-; RV32I-NEXT:  .LBB6_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    add a0, a6, a0
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    sltu a2, a0, a6
-; RV32I-NEXT:    add a1, a1, a5
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i8_z_1_volatile:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lbu a4, 4(a0)
-; RV64I-NEXT:    ld a0, 0(a3)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB6_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a4, a2
-; RV64I-NEXT:  .LBB6_2: # %entry
-; RV64I-NEXT:    add a0, a4, a0
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i8_z_1_volatile:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lbu a5, 4(a0)
-; RV32I-SFB-NEXT:    lw a6, 4(a4)
-; RV32I-SFB-NEXT:    lw a0, 0(a4)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB6_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a5, a2
-; RV32I-SFB-NEXT:  .LBB6_2: # %entry
-; RV32I-SFB-NEXT:    add a0, a5, a0
-; RV32I-SFB-NEXT:    sltu a2, a0, a5
-; RV32I-SFB-NEXT:    bnez a1, .LBB6_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    add a6, a6, a3
-; RV32I-SFB-NEXT:  .LBB6_4: # %entry
-; RV32I-SFB-NEXT:    add a1, a6, a2
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i8_z_1_volatile:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    ld a3, 0(a3)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB6_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB6_2: # %entry
-; RV64I-SFB-NEXT:    add a0, a0, a3
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i8_z_1_volatile:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
-; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB6_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lbu a2, 4(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
-; RV32I-SFBILOAD-NEXT:    sltu a2, a0, a2
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB6_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    add a5, a5, a3
-; RV32I-SFBILOAD-NEXT:  .LBB6_4: # %entry
-; RV32I-SFBILOAD-NEXT:    add a1, a5, a2
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i8_z_1_volatile:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB6_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lbu a2, 4(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB6_2: # %entry
-; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
-  %val = load i8, ptr %addr          ; load 8-bit value
-  %val1 = load volatile i64, ptr %base1
-  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  %res1 = add i64 %res, %val1
-  ret i64 %res1
-}
-
-define i64 @test_i16_s_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) nounwind {
-; RV32I-LABEL: test_i16_s_1_volatile:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lh a6, 8(a0)
-; RV32I-NEXT:    lw a5, 4(a4)
-; RV32I-NEXT:    lw a0, 0(a4)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB7_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a6, a2
-; RV32I-NEXT:    j .LBB7_3
-; RV32I-NEXT:  .LBB7_2:
-; RV32I-NEXT:    srai a3, a6, 31
-; RV32I-NEXT:  .LBB7_3: # %entry
-; RV32I-NEXT:    add a0, a6, a0
-; RV32I-NEXT:    sltu a1, a0, a6
-; RV32I-NEXT:    add a3, a3, a5
-; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_s_1_volatile:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lh a4, 8(a0)
-; RV64I-NEXT:    ld a0, 0(a3)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB7_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a4, a2
-; RV64I-NEXT:  .LBB7_2: # %entry
-; RV64I-NEXT:    add a0, a4, a0
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_s_1_volatile:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    lw a5, 4(a4)
-; RV32I-SFB-NEXT:    lw a4, 0(a4)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    beqz a1, .LBB7_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai a3, a0, 31
-; RV32I-SFB-NEXT:  .LBB7_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB7_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a2, a0
-; RV32I-SFB-NEXT:  .LBB7_4: # %entry
-; RV32I-SFB-NEXT:    add a0, a2, a4
-; RV32I-SFB-NEXT:    sltu a1, a0, a2
-; RV32I-SFB-NEXT:    add a3, a3, a5
-; RV32I-SFB-NEXT:    add a1, a3, a1
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_s_1_volatile:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    ld a3, 0(a3)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB7_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB7_2: # %entry
-; RV64I-SFB-NEXT:    add a0, a0, a3
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_s_1_volatile:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
-; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB7_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
-; RV32I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB7_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a2, a0
-; RV32I-SFBILOAD-NEXT:  .LBB7_4: # %entry
-; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
-; RV32I-SFBILOAD-NEXT:    sltu a1, a0, a2
-; RV32I-SFBILOAD-NEXT:    add a3, a3, a5
-; RV32I-SFBILOAD-NEXT:    add a1, a3, a1
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_s_1_volatile:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB7_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lh a2, 8(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB7_2: # %entry
-; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
-  %val = load i16, ptr %addr          ; load 16-bit value
-  %val1 = load volatile i64, ptr %base1
-  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  %res1 = add i64 %res, %val1
-  ret i64 %res1
-}
-
-define i64 @test_i16_z_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) nounwind {
-; RV32I-LABEL: test_i16_z_1_volatile:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lhu a6, 8(a0)
-; RV32I-NEXT:    lw a5, 4(a4)
-; RV32I-NEXT:    lw a0, 0(a4)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB8_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a6, a2
-; RV32I-NEXT:  .LBB8_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    add a0, a6, a0
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    sltu a2, a0, a6
-; RV32I-NEXT:    add a1, a1, a5
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i16_z_1_volatile:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lhu a4, 8(a0)
-; RV64I-NEXT:    ld a0, 0(a3)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB8_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a4, a2
-; RV64I-NEXT:  .LBB8_2: # %entry
-; RV64I-NEXT:    add a0, a4, a0
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i16_z_1_volatile:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lhu a5, 8(a0)
-; RV32I-SFB-NEXT:    lw a6, 4(a4)
-; RV32I-SFB-NEXT:    lw a0, 0(a4)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB8_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a5, a2
-; RV32I-SFB-NEXT:  .LBB8_2: # %entry
-; RV32I-SFB-NEXT:    add a0, a5, a0
-; RV32I-SFB-NEXT:    sltu a2, a0, a5
-; RV32I-SFB-NEXT:    bnez a1, .LBB8_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    add a6, a6, a3
-; RV32I-SFB-NEXT:  .LBB8_4: # %entry
-; RV32I-SFB-NEXT:    add a1, a6, a2
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i16_z_1_volatile:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    ld a3, 0(a3)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB8_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB8_2: # %entry
-; RV64I-SFB-NEXT:    add a0, a0, a3
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i16_z_1_volatile:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
-; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB8_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lhu a2, 8(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
-; RV32I-SFBILOAD-NEXT:    sltu a2, a0, a2
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB8_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    add a5, a5, a3
-; RV32I-SFBILOAD-NEXT:  .LBB8_4: # %entry
-; RV32I-SFBILOAD-NEXT:    add a1, a5, a2
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i16_z_1_volatile:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB8_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lhu a2, 8(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB8_2: # %entry
-; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
-  %val = load i16, ptr %addr          ; load 16-bit value
-  %val1 = load volatile i64, ptr %base1
-  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  %res1 = add i64 %res, %val1
-  ret i64 %res1
-}
-
-define i64 @test_i32_z_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) nounwind {
-; RV32I-LABEL: test_i32_z_1_volatile:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lw a6, 16(a0)
-; RV32I-NEXT:    lw a5, 4(a4)
-; RV32I-NEXT:    lw a0, 0(a4)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB9_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a6, a2
-; RV32I-NEXT:  .LBB9_2: # %entry
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    add a0, a6, a0
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    sltu a2, a0, a6
-; RV32I-NEXT:    add a1, a1, a5
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i32_z_1_volatile:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    lwu a4, 16(a0)
-; RV64I-NEXT:    ld a0, 0(a3)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB9_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a4, a2
-; RV64I-NEXT:  .LBB9_2: # %entry
-; RV64I-NEXT:    add a0, a4, a0
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i32_z_1_volatile:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lw a5, 16(a0)
-; RV32I-SFB-NEXT:    lw a6, 4(a4)
-; RV32I-SFB-NEXT:    lw a0, 0(a4)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB9_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a5, a2
-; RV32I-SFB-NEXT:  .LBB9_2: # %entry
-; RV32I-SFB-NEXT:    add a0, a5, a0
-; RV32I-SFB-NEXT:    sltu a2, a0, a5
-; RV32I-SFB-NEXT:    bnez a1, .LBB9_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    add a6, a6, a3
-; RV32I-SFB-NEXT:  .LBB9_4: # %entry
-; RV32I-SFB-NEXT:    add a1, a6, a2
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i32_z_1_volatile:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    lwu a0, 16(a0)
-; RV64I-SFB-NEXT:    ld a3, 0(a3)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB9_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB9_2: # %entry
-; RV64I-SFB-NEXT:    add a0, a0, a3
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i32_z_1_volatile:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
-; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB9_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lw a2, 16(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
-; RV32I-SFBILOAD-NEXT:    sltu a2, a0, a2
-; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB9_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    add a5, a5, a3
-; RV32I-SFBILOAD-NEXT:  .LBB9_4: # %entry
-; RV32I-SFBILOAD-NEXT:    add a1, a5, a2
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i32_z_1_volatile:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB9_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lwu a2, 16(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB9_2: # %entry
-; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4
-  %val = load i32, ptr %addr          ; load 32-bit value
-  %val1 = load volatile i64, ptr %base1
-  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
-  %res = select i1 %x, i64 %ext, i64 %b
-  %res1 = add i64 %res, %val1
-  ret i64 %res1
-}
-
-define i64 @test_i64_1_volatile(ptr %base, i1 %x, i64 %b, ptr %base1) nounwind {
-; RV32I-LABEL: test_i64_1_volatile:
-; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lw a7, 32(a0)
-; RV32I-NEXT:    lw a6, 36(a0)
-; RV32I-NEXT:    lw a5, 4(a4)
-; RV32I-NEXT:    lw a0, 0(a4)
-; RV32I-NEXT:    andi a1, a1, 1
-; RV32I-NEXT:    bnez a1, .LBB10_2
-; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a6, a3
-; RV32I-NEXT:    mv a7, a2
-; RV32I-NEXT:  .LBB10_2: # %entry
-; RV32I-NEXT:    add a0, a7, a0
-; RV32I-NEXT:    sltu a1, a0, a7
-; RV32I-NEXT:    add a5, a6, a5
-; RV32I-NEXT:    add a1, a5, a1
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: test_i64_1_volatile:
-; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    ld a4, 32(a0)
-; RV64I-NEXT:    ld a0, 0(a3)
-; RV64I-NEXT:    andi a1, a1, 1
-; RV64I-NEXT:    bnez a1, .LBB10_2
-; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a4, a2
-; RV64I-NEXT:  .LBB10_2: # %entry
-; RV64I-NEXT:    add a0, a4, a0
-; RV64I-NEXT:    ret
-;
-; RV32I-SFB-LABEL: test_i64_1_volatile:
-; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    lw a5, 32(a0)
-; RV32I-SFB-NEXT:    lw a6, 36(a0)
-; RV32I-SFB-NEXT:    lw a7, 4(a4)
-; RV32I-SFB-NEXT:    lw a0, 0(a4)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB10_2
-; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a6, a3
-; RV32I-SFB-NEXT:  .LBB10_2: # %entry
-; RV32I-SFB-NEXT:    bnez a1, .LBB10_4
-; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a5, a2
-; RV32I-SFB-NEXT:  .LBB10_4: # %entry
-; RV32I-SFB-NEXT:    add a0, a5, a0
-; RV32I-SFB-NEXT:    sltu a1, a0, a5
-; RV32I-SFB-NEXT:    add a6, a6, a7
-; RV32I-SFB-NEXT:    add a1, a6, a1
-; RV32I-SFB-NEXT:    ret
-;
-; RV64I-SFB-LABEL: test_i64_1_volatile:
-; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    ld a0, 32(a0)
-; RV64I-SFB-NEXT:    ld a3, 0(a3)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
-; RV64I-SFB-NEXT:    bnez a1, .LBB10_2
-; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, a2
-; RV64I-SFB-NEXT:  .LBB10_2: # %entry
-; RV64I-SFB-NEXT:    add a0, a0, a3
-; RV64I-SFB-NEXT:    ret
-;
-; RV32I-SFBILOAD-LABEL: test_i64_1_volatile:
-; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
-; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
-; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lw a2, 32(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB10_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_4
-; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    lw a3, 36(a0)
-; RV32I-SFBILOAD-NEXT:  .LBB10_4: # %entry
-; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
-; RV32I-SFBILOAD-NEXT:    sltu a1, a0, a2
-; RV32I-SFBILOAD-NEXT:    add a3, a3, a5
-; RV32I-SFBILOAD-NEXT:    add a1, a3, a1
-; RV32I-SFBILOAD-NEXT:    ret
-;
-; RV64I-SFBILOAD-LABEL: test_i64_1_volatile:
-; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
-; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
-; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
-; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    ld a2, 32(a0)
-; RV64I-SFBILOAD-NEXT:  .LBB10_2: # %entry
-; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
-; RV64I-SFBILOAD-NEXT:    ret
-entry:
-  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4
-  %val = load i64, ptr %addr          ; load 64-bit value
-  %val1 = load volatile i64, ptr %base1
-  %res = select i1 %x, i64 %val, i64 %b
-  %res1 = add i64 %res, %val1
-  ret i64 %res1
-}
-
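The remaining hunks update llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll for a signature change from i1 %x to i1 zeroext %x. With zeroext, the calling convention guarantees the caller passes the condition already masked to 0 or 1, so the "andi a1, a1, 1" that previously normalized the low bit is dropped from every check prefix. A minimal standalone illustration of the effect (hypothetical function, not taken from the patch):

    define i32 @sel(ptr %p, i1 zeroext %x, i32 %b) nounwind {
      %v = load i32, ptr %p
      %r = select i1 %x, i32 %v, i32 %b
      ret i32 %r
    }
    ; without zeroext, llc must mask the incoming i1 ("andi a1, a1, 1")
    ; before branching on it; with zeroext the branch tests a1 directly.
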
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
index 984e101e8a937..faf4dd0c57c7f 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
@@ -10,10 +10,9 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
 
-define i32 @test_i8_s(ptr %base, i1 %x, i32 %b) nounwind {
+define i32 @test_i8_s(ptr %base, i1 zeroext %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i8_s:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB0_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    lb a2, 4(a0)
@@ -23,7 +22,6 @@ define i32 @test_i8_s(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV64I-LABEL: test_i8_s:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB0_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    lb a2, 4(a0)
@@ -34,7 +32,6 @@ define i32 @test_i8_s(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-SFB-LABEL: test_i8_s:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    bnez a1, .LBB0_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a0, a2
@@ -44,7 +41,6 @@ define i32 @test_i8_s(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV64I-SFB-LABEL: test_i8_s:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB0_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -53,7 +49,6 @@ define i32 @test_i8_s(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB0_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    lb a2, 4(a0)
@@ -63,7 +58,6 @@ define i32 @test_i8_s(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB0_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    lb a2, 4(a0)
@@ -78,10 +72,9 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i8_z(ptr %base, i1 %x, i32 %b) nounwind {
+define i32 @test_i8_z(ptr %base, i1 zeroext %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i8_z:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    lbu a2, 4(a0)
@@ -91,7 +84,6 @@ define i32 @test_i8_z(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV64I-LABEL: test_i8_z:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB1_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    lbu a2, 4(a0)
@@ -102,7 +94,6 @@ define i32 @test_i8_z(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-SFB-LABEL: test_i8_z:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    bnez a1, .LBB1_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a0, a2
@@ -112,7 +103,6 @@ define i32 @test_i8_z(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV64I-SFB-LABEL: test_i8_z:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB1_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -121,7 +111,6 @@ define i32 @test_i8_z(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB1_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    lbu a2, 4(a0)
@@ -131,7 +120,6 @@ define i32 @test_i8_z(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB1_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    lbu a2, 4(a0)
@@ -146,10 +134,9 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_s(ptr %base, i1 %x, i32 %b) nounwind {
+define i32 @test_i16_s(ptr %base, i1 zeroext %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i16_s:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB2_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    lh a2, 8(a0)
@@ -159,7 +146,6 @@ define i32 @test_i16_s(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV64I-LABEL: test_i16_s:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB2_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    lh a2, 8(a0)
@@ -170,7 +156,6 @@ define i32 @test_i16_s(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-SFB-LABEL: test_i16_s:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    bnez a1, .LBB2_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a0, a2
@@ -180,7 +165,6 @@ define i32 @test_i16_s(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV64I-SFB-LABEL: test_i16_s:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB2_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -189,7 +173,6 @@ define i32 @test_i16_s(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB2_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    lh a2, 8(a0)
@@ -199,7 +182,6 @@ define i32 @test_i16_s(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB2_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    lh a2, 8(a0)
@@ -214,10 +196,9 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_z(ptr %base, i1 %x, i32 %b) nounwind {
+define i32 @test_i16_z(ptr %base, i1 zeroext %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i16_z:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB3_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    lhu a2, 8(a0)
@@ -227,7 +208,6 @@ define i32 @test_i16_z(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV64I-LABEL: test_i16_z:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB3_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    lhu a2, 8(a0)
@@ -238,7 +218,6 @@ define i32 @test_i16_z(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-SFB-LABEL: test_i16_z:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    bnez a1, .LBB3_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a0, a2
@@ -248,7 +227,6 @@ define i32 @test_i16_z(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV64I-SFB-LABEL: test_i16_z:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB3_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -257,7 +235,6 @@ define i32 @test_i16_z(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB3_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    lhu a2, 8(a0)
@@ -267,7 +244,6 @@ define i32 @test_i16_z(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB3_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    lhu a2, 8(a0)
@@ -282,10 +258,9 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i32(ptr %base, i1 %x, i32 %b) nounwind {
+define i32 @test_i32(ptr %base, i1 zeroext %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i32:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB4_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    lw a2, 16(a0)
@@ -295,7 +270,6 @@ define i32 @test_i32(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV64I-LABEL: test_i32:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB4_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    lw a2, 16(a0)
@@ -306,7 +280,6 @@ define i32 @test_i32(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV32I-SFB-LABEL: test_i32:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lw a0, 16(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    bnez a1, .LBB4_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a0, a2
@@ -316,7 +289,6 @@ define i32 @test_i32(ptr %base, i1 %x, i32 %b) nounwind {
 ; RV64I-SFB-LABEL: test_i32:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lw a0, 16(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB4_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -325,7 +297,6 @@ define i32 @test_i32(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB4_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    lw a2, 16(a0)
@@ -335,7 +306,6 @@ define i32 @test_i32(ptr %base, i1 %x, i32 %b) nounwind {
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB4_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    lw a2, 16(a0)
@@ -349,11 +319,10 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i8_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
+define i32 @test_i8_s_store(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i8_s_store:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lb a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    sw a4, 0(a3)
 ; RV32I-NEXT:    bnez a1, .LBB5_2
 ; RV32I-NEXT:  # %bb.1: # %entry
@@ -364,7 +333,6 @@ define i32 @test_i8_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwi
 ; RV64I-LABEL: test_i8_s_store:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    lb a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sw a4, 0(a3)
 ; RV64I-NEXT:    bnez a1, .LBB5_2
 ; RV64I-NEXT:  # %bb.1: # %entry
@@ -375,7 +343,6 @@ define i32 @test_i8_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwi
 ; RV32I-SFB-LABEL: test_i8_s_store:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    bnez a1, .LBB5_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a0, a2
@@ -386,7 +353,6 @@ define i32 @test_i8_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwi
 ; RV64I-SFB-LABEL: test_i8_s_store:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB5_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -397,7 +363,6 @@ define i32 @test_i8_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwi
 ; RV32I-SFBILOAD-LABEL: test_i8_s_store:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
@@ -408,7 +373,6 @@ define i32 @test_i8_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwi
 ; RV64I-SFBILOAD-LABEL: test_i8_s_store:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB5_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
@@ -424,11 +388,10 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i8_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
+define i32 @test_i8_z_store(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i8_z_store:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lbu a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    sw a4, 0(a3)
 ; RV32I-NEXT:    bnez a1, .LBB6_2
 ; RV32I-NEXT:  # %bb.1: # %entry
@@ -439,7 +402,6 @@ define i32 @test_i8_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwi
 ; RV64I-LABEL: test_i8_z_store:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    lbu a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sw a4, 0(a3)
 ; RV64I-NEXT:    bnez a1, .LBB6_2
 ; RV64I-NEXT:  # %bb.1: # %entry
@@ -450,7 +412,6 @@ define i32 @test_i8_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwi
 ; RV32I-SFB-LABEL: test_i8_z_store:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    bnez a1, .LBB6_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a0, a2
@@ -461,7 +422,6 @@ define i32 @test_i8_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwi
 ; RV64I-SFB-LABEL: test_i8_z_store:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB6_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -472,7 +432,6 @@ define i32 @test_i8_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwi
 ; RV32I-SFBILOAD-LABEL: test_i8_z_store:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
@@ -483,7 +442,6 @@ define i32 @test_i8_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwi
 ; RV64I-SFBILOAD-LABEL: test_i8_z_store:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB6_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
@@ -499,11 +457,10 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
+define i32 @test_i16_s_store(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i16_s_store:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lh a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    sw a4, 0(a3)
 ; RV32I-NEXT:    bnez a1, .LBB7_2
 ; RV32I-NEXT:  # %bb.1: # %entry
@@ -514,7 +471,6 @@ define i32 @test_i16_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounw
 ; RV64I-LABEL: test_i16_s_store:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    lh a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sw a4, 0(a3)
 ; RV64I-NEXT:    bnez a1, .LBB7_2
 ; RV64I-NEXT:  # %bb.1: # %entry
@@ -525,7 +481,6 @@ define i32 @test_i16_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounw
 ; RV32I-SFB-LABEL: test_i16_s_store:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    bnez a1, .LBB7_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a0, a2
@@ -536,7 +491,6 @@ define i32 @test_i16_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounw
 ; RV64I-SFB-LABEL: test_i16_s_store:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB7_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -547,7 +501,6 @@ define i32 @test_i16_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounw
 ; RV32I-SFBILOAD-LABEL: test_i16_s_store:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
@@ -558,7 +511,6 @@ define i32 @test_i16_s_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounw
 ; RV64I-SFBILOAD-LABEL: test_i16_s_store:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB7_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
@@ -574,11 +526,10 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i16_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
+define i32 @test_i16_z_store(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i16_z_store:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lhu a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    sw a4, 0(a3)
 ; RV32I-NEXT:    bnez a1, .LBB8_2
 ; RV32I-NEXT:  # %bb.1: # %entry
@@ -589,7 +540,6 @@ define i32 @test_i16_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounw
 ; RV64I-LABEL: test_i16_z_store:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    lhu a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sw a4, 0(a3)
 ; RV64I-NEXT:    bnez a1, .LBB8_2
 ; RV64I-NEXT:  # %bb.1: # %entry
@@ -600,7 +550,6 @@ define i32 @test_i16_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounw
 ; RV32I-SFB-LABEL: test_i16_z_store:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    bnez a1, .LBB8_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a0, a2
@@ -611,7 +560,6 @@ define i32 @test_i16_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounw
 ; RV64I-SFB-LABEL: test_i16_z_store:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB8_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -622,7 +570,6 @@ define i32 @test_i16_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounw
 ; RV32I-SFBILOAD-LABEL: test_i16_z_store:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
@@ -633,7 +580,6 @@ define i32 @test_i16_z_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounw
 ; RV64I-SFBILOAD-LABEL: test_i16_z_store:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB8_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
@@ -649,11 +595,10 @@ entry:
   ret i32 %res
 }
 
-define i32 @test_i32_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwind {
+define i32 @test_i32_store(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i32_store:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lw a0, 16(a0)
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    sw a4, 0(a3)
 ; RV32I-NEXT:    bnez a1, .LBB9_2
 ; RV32I-NEXT:  # %bb.1: # %entry
@@ -664,7 +609,6 @@ define i32 @test_i32_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwin
 ; RV64I-LABEL: test_i32_store:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    lw a0, 16(a0)
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sw a4, 0(a3)
 ; RV64I-NEXT:    bnez a1, .LBB9_2
 ; RV64I-NEXT:  # %bb.1: # %entry
@@ -675,7 +619,6 @@ define i32 @test_i32_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwin
 ; RV32I-SFB-LABEL: test_i32_store:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lw a0, 16(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    bnez a1, .LBB9_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a0, a2
@@ -686,7 +629,6 @@ define i32 @test_i32_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwin
 ; RV64I-SFB-LABEL: test_i32_store:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lw a0, 16(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB9_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -697,7 +639,6 @@ define i32 @test_i32_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwin
 ; RV32I-SFBILOAD-LABEL: test_i32_store:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
@@ -708,7 +649,6 @@ define i32 @test_i32_store(ptr %base, i1 %x, i32 %b, ptr %base1, i32 %c) nounwin
 ; RV64I-SFBILOAD-LABEL: test_i32_store:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB9_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
@@ -723,10 +663,9 @@ entry:
   ret i32 %res
 }
 
-define i64 @test_i8_s_1(ptr %base, i1 %x, i64 %b) nounwind {
+define i64 @test_i8_s_1(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i8_s_1:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB10_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    lb a2, 4(a0)
@@ -738,7 +677,6 @@ define i64 @test_i8_s_1(ptr %base, i1 %x, i64 %b) nounwind {
 ;
 ; RV64I-LABEL: test_i8_s_1:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB10_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    lb a2, 4(a0)
@@ -749,7 +687,6 @@ define i64 @test_i8_s_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-SFB-LABEL: test_i8_s_1:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    beqz a1, .LBB10_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a2, a0
@@ -765,7 +702,6 @@ define i64 @test_i8_s_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV64I-SFB-LABEL: test_i8_s_1:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB10_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -775,7 +711,6 @@ define i64 @test_i8_s_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-SFBILOAD-LABEL: test_i8_s_1:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a2, a0
@@ -790,7 +725,6 @@ define i64 @test_i8_s_1(ptr %base, i1 %x, i64 %b) nounwind {
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_1:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB10_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    lb a2, 4(a0)
@@ -805,10 +739,9 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i8_z_1(ptr %base, i1 %x, i64 %b) nounwind {
+define i64 @test_i8_z_1(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i8_z_1:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB11_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    lbu a2, 4(a0)
@@ -820,7 +753,6 @@ define i64 @test_i8_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ;
 ; RV64I-LABEL: test_i8_z_1:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB11_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    lbu a2, 4(a0)
@@ -831,14 +763,13 @@ define i64 @test_i8_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-SFB-LABEL: test_i8_z_1:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB11_2
+; RV32I-SFB-NEXT:    beqz a1, .LBB11_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB11_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB11_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB11_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB11_4: # %entry
 ; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
@@ -846,7 +777,6 @@ define i64 @test_i8_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV64I-SFB-LABEL: test_i8_z_1:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB11_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -855,22 +785,20 @@ define i64 @test_i8_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_1:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a4, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB11_2
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB11_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    lbu a2, 4(a0)
 ; RV32I-SFBILOAD-NEXT:  .LBB11_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB11_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB11_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB11_4: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_1:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB11_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    lbu a2, 4(a0)
@@ -885,10 +813,9 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_s_1(ptr %base, i1 %x, i64 %b) nounwind {
+define i64 @test_i16_s_1(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i16_s_1:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB12_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    lh a2, 8(a0)
@@ -900,7 +827,6 @@ define i64 @test_i16_s_1(ptr %base, i1 %x, i64 %b) nounwind {
 ;
 ; RV64I-LABEL: test_i16_s_1:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB12_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    lh a2, 8(a0)
@@ -911,7 +837,6 @@ define i64 @test_i16_s_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-SFB-LABEL: test_i16_s_1:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    beqz a1, .LBB12_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a2, a0
@@ -927,7 +852,6 @@ define i64 @test_i16_s_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV64I-SFB-LABEL: test_i16_s_1:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB12_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -937,7 +861,6 @@ define i64 @test_i16_s_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-SFBILOAD-LABEL: test_i16_s_1:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB12_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a2, a0
@@ -952,7 +875,6 @@ define i64 @test_i16_s_1(ptr %base, i1 %x, i64 %b) nounwind {
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_1:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB12_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    lh a2, 8(a0)
@@ -967,10 +889,9 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_z_1(ptr %base, i1 %x, i64 %b) nounwind {
+define i64 @test_i16_z_1(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i16_z_1:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB13_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    lhu a2, 8(a0)
@@ -982,7 +903,6 @@ define i64 @test_i16_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ;
 ; RV64I-LABEL: test_i16_z_1:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB13_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    lhu a2, 8(a0)
@@ -993,14 +913,13 @@ define i64 @test_i16_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-SFB-LABEL: test_i16_z_1:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB13_2
+; RV32I-SFB-NEXT:    beqz a1, .LBB13_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB13_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB13_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB13_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB13_4: # %entry
 ; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
@@ -1008,7 +927,6 @@ define i64 @test_i16_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV64I-SFB-LABEL: test_i16_z_1:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB13_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -1017,22 +935,20 @@ define i64 @test_i16_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_1:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a4, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB13_2
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB13_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    lhu a2, 8(a0)
 ; RV32I-SFBILOAD-NEXT:  .LBB13_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB13_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB13_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB13_4: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_1:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB13_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    lhu a2, 8(a0)
@@ -1047,10 +963,9 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i32_z_1(ptr %base, i1 %x, i64 %b) nounwind {
+define i64 @test_i32_z_1(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i32_z_1:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB14_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    lw a2, 16(a0)
@@ -1062,7 +977,6 @@ define i64 @test_i32_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ;
 ; RV64I-LABEL: test_i32_z_1:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB14_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    lwu a2, 16(a0)
@@ -1073,14 +987,13 @@ define i64 @test_i32_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-SFB-LABEL: test_i32_z_1:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lw a0, 16(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
-; RV32I-SFB-NEXT:    bnez a1, .LBB14_2
+; RV32I-SFB-NEXT:    beqz a1, .LBB14_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB14_2: # %entry
-; RV32I-SFB-NEXT:    beqz a1, .LBB14_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB14_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB14_4: # %entry
 ; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
@@ -1088,7 +1001,6 @@ define i64 @test_i32_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV64I-SFB-LABEL: test_i32_z_1:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lwu a0, 16(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB14_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -1097,22 +1009,20 @@ define i64 @test_i32_z_1(ptr %base, i1 %x, i64 %b) nounwind {
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_z_1:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a4, a1, 1
-; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB14_2
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB14_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    lw a2, 16(a0)
 ; RV32I-SFBILOAD-NEXT:  .LBB14_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a1, a3
-; RV32I-SFBILOAD-NEXT:    beqz a4, .LBB14_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB14_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB14_4: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_z_1:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB14_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    lwu a2, 16(a0)
@@ -1127,10 +1037,9 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) nounwind {
+define i64 @test_i64_1(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i64_1:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB15_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    lw a2, 32(a0)
@@ -1142,7 +1051,6 @@ define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) nounwind {
 ;
 ; RV64I-LABEL: test_i64_1:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB15_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    ld a2, 32(a0)
@@ -1154,7 +1062,6 @@ define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lw a4, 32(a0)
 ; RV32I-SFB-NEXT:    lw a5, 36(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    bnez a1, .LBB15_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a4, a2
@@ -1170,7 +1077,6 @@ define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) nounwind {
 ; RV64I-SFB-LABEL: test_i64_1:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    ld a0, 32(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB15_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -1179,7 +1085,6 @@ define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) nounwind {
 ;
 ; RV32I-SFBILOAD-LABEL: test_i64_1:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB15_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    lw a2, 32(a0)
@@ -1194,7 +1099,6 @@ define i64 @test_i64_1(ptr %base, i1 %x, i64 %b) nounwind {
 ;
 ; RV64I-SFBILOAD-LABEL: test_i64_1:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB15_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    ld a2, 32(a0)
@@ -1208,11 +1112,10 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
+define i64 @test_i8_s_store_64(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i8_s_store_64:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lb a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    sw a5, 0(a4)
 ; RV32I-NEXT:    sw a6, 4(a4)
 ; RV32I-NEXT:    bnez a1, .LBB16_2
@@ -1227,7 +1130,6 @@ define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nou
 ; RV64I-LABEL: test_i8_s_store_64:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    lb a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sd a4, 0(a3)
 ; RV64I-NEXT:    bnez a1, .LBB16_2
 ; RV64I-NEXT:  # %bb.1: # %entry
@@ -1238,7 +1140,7 @@ define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nou
 ; RV32I-SFB-LABEL: test_i8_s_store_64:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lb a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
 ; RV32I-SFB-NEXT:    beqz a1, .LBB16_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a2, a0
@@ -1247,7 +1149,6 @@ define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nou
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
 ; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB16_4: # %entry
-; RV32I-SFB-NEXT:    sw a5, 0(a4)
 ; RV32I-SFB-NEXT:    sw a6, 4(a4)
 ; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:    mv a1, a3
@@ -1256,7 +1157,6 @@ define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nou
 ; RV64I-SFB-LABEL: test_i8_s_store_64:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lb a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB16_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -1267,7 +1167,7 @@ define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nou
 ; RV32I-SFBILOAD-LABEL: test_i8_s_store_64:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
 ; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB16_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a2, a0
@@ -1276,7 +1176,6 @@ define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nou
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
 ; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB16_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
 ; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:    mv a1, a3
@@ -1285,7 +1184,6 @@ define i64 @test_i8_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nou
 ; RV64I-SFBILOAD-LABEL: test_i8_s_store_64:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB16_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
@@ -1301,11 +1199,10 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i8_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
+define i64 @test_i8_z_store_64(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i8_z_store_64:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lbu a0, 4(a0)
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    sw a5, 0(a4)
 ; RV32I-NEXT:    sw a6, 4(a4)
 ; RV32I-NEXT:    bnez a1, .LBB17_2
@@ -1319,7 +1216,6 @@ define i64 @test_i8_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nou
 ; RV64I-LABEL: test_i8_z_store_64:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    lbu a0, 4(a0)
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sd a4, 0(a3)
 ; RV64I-NEXT:    bnez a1, .LBB17_2
 ; RV64I-NEXT:  # %bb.1: # %entry
@@ -1330,7 +1226,6 @@ define i64 @test_i8_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nou
 ; RV32I-SFB-LABEL: test_i8_z_store_64:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lbu a0, 4(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    beqz a1, .LBB17_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    li a3, 0
@@ -1347,7 +1242,6 @@ define i64 @test_i8_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nou
 ; RV64I-SFB-LABEL: test_i8_z_store_64:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lbu a0, 4(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB17_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -1358,7 +1252,6 @@ define i64 @test_i8_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nou
 ; RV32I-SFBILOAD-LABEL: test_i8_z_store_64:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB17_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    li a3, 0
@@ -1375,7 +1268,6 @@ define i64 @test_i8_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nou
 ; RV64I-SFBILOAD-LABEL: test_i8_z_store_64:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB17_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
@@ -1391,11 +1283,10 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
+define i64 @test_i16_s_store_64(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i16_s_store_64:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lh a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    sw a5, 0(a4)
 ; RV32I-NEXT:    sw a6, 4(a4)
 ; RV32I-NEXT:    bnez a1, .LBB18_2
@@ -1410,7 +1301,6 @@ define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV64I-LABEL: test_i16_s_store_64:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    lh a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sd a4, 0(a3)
 ; RV64I-NEXT:    bnez a1, .LBB18_2
 ; RV64I-NEXT:  # %bb.1: # %entry
@@ -1421,7 +1311,7 @@ define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV32I-SFB-LABEL: test_i16_s_store_64:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lh a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
 ; RV32I-SFB-NEXT:    beqz a1, .LBB18_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a2, a0
@@ -1430,7 +1320,6 @@ define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
 ; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB18_4: # %entry
-; RV32I-SFB-NEXT:    sw a5, 0(a4)
 ; RV32I-SFB-NEXT:    sw a6, 4(a4)
 ; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:    mv a1, a3
@@ -1439,7 +1328,6 @@ define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV64I-SFB-LABEL: test_i16_s_store_64:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lh a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB18_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -1450,7 +1338,7 @@ define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV32I-SFBILOAD-LABEL: test_i16_s_store_64:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
 ; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB18_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a2, a0
@@ -1459,7 +1347,6 @@ define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
 ; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB18_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
 ; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
 ; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:    mv a1, a3
@@ -1468,7 +1355,6 @@ define i64 @test_i16_s_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV64I-SFBILOAD-LABEL: test_i16_s_store_64:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB18_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
@@ -1484,11 +1370,10 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i16_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
+define i64 @test_i16_z_store_64(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i16_z_store_64:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lhu a0, 8(a0)
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    sw a5, 0(a4)
 ; RV32I-NEXT:    sw a6, 4(a4)
 ; RV32I-NEXT:    bnez a1, .LBB19_2
@@ -1502,7 +1387,6 @@ define i64 @test_i16_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV64I-LABEL: test_i16_z_store_64:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    lhu a0, 8(a0)
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sd a4, 0(a3)
 ; RV64I-NEXT:    bnez a1, .LBB19_2
 ; RV64I-NEXT:  # %bb.1: # %entry
@@ -1513,7 +1397,6 @@ define i64 @test_i16_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV32I-SFB-LABEL: test_i16_z_store_64:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lhu a0, 8(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    beqz a1, .LBB19_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    li a3, 0
@@ -1530,7 +1413,6 @@ define i64 @test_i16_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV64I-SFB-LABEL: test_i16_z_store_64:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lhu a0, 8(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB19_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -1541,7 +1423,6 @@ define i64 @test_i16_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV32I-SFBILOAD-LABEL: test_i16_z_store_64:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB19_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    li a3, 0
@@ -1558,7 +1439,6 @@ define i64 @test_i16_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV64I-SFBILOAD-LABEL: test_i16_z_store_64:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB19_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
@@ -1574,11 +1454,10 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i32_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
+define i64 @test_i32_z_store_64(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i32_z_store_64:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lw a0, 16(a0)
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    sw a5, 0(a4)
 ; RV32I-NEXT:    sw a6, 4(a4)
 ; RV32I-NEXT:    bnez a1, .LBB20_2
@@ -1592,7 +1471,6 @@ define i64 @test_i32_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV64I-LABEL: test_i32_z_store_64:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    lwu a0, 16(a0)
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sd a4, 0(a3)
 ; RV64I-NEXT:    bnez a1, .LBB20_2
 ; RV64I-NEXT:  # %bb.1: # %entry
@@ -1603,7 +1481,6 @@ define i64 @test_i32_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV32I-SFB-LABEL: test_i32_z_store_64:
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lw a0, 16(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    beqz a1, .LBB20_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    li a3, 0
@@ -1620,7 +1497,6 @@ define i64 @test_i32_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV64I-SFB-LABEL: test_i32_z_store_64:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    lwu a0, 16(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB20_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -1631,7 +1507,6 @@ define i64 @test_i32_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV32I-SFBILOAD-LABEL: test_i32_z_store_64:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB20_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    li a3, 0
@@ -1648,7 +1523,6 @@ define i64 @test_i32_z_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) no
 ; RV64I-SFBILOAD-LABEL: test_i32_z_store_64:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    lwu a0, 16(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB20_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
@@ -1664,14 +1538,13 @@ entry:
   ret i64 %res
 }
 
-define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) nounwind {
+define i64 @test_i64_store_64(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i64_store_64:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    mv a7, a1
 ; RV32I-NEXT:    mv a1, a0
 ; RV32I-NEXT:    lw a0, 32(a0)
 ; RV32I-NEXT:    lw a1, 36(a1)
-; RV32I-NEXT:    andi a7, a7, 1
 ; RV32I-NEXT:    sw a5, 0(a4)
 ; RV32I-NEXT:    sw a6, 4(a4)
 ; RV32I-NEXT:    bnez a7, .LBB21_2
@@ -1684,7 +1557,6 @@ define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) noun
 ; RV64I-LABEL: test_i64_store_64:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    ld a0, 32(a0)
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    sd a4, 0(a3)
 ; RV64I-NEXT:    bnez a1, .LBB21_2
 ; RV64I-NEXT:  # %bb.1: # %entry
@@ -1696,7 +1568,6 @@ define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) noun
 ; RV32I-SFB:       # %bb.0: # %entry
 ; RV32I-SFB-NEXT:    lw a7, 32(a0)
 ; RV32I-SFB-NEXT:    lw t0, 36(a0)
-; RV32I-SFB-NEXT:    andi a1, a1, 1
 ; RV32I-SFB-NEXT:    bnez a1, .LBB21_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
 ; RV32I-SFB-NEXT:    mv a7, a2
@@ -1714,7 +1585,6 @@ define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) noun
 ; RV64I-SFB-LABEL: test_i64_store_64:
 ; RV64I-SFB:       # %bb.0: # %entry
 ; RV64I-SFB-NEXT:    ld a0, 32(a0)
-; RV64I-SFB-NEXT:    andi a1, a1, 1
 ; RV64I-SFB-NEXT:    bnez a1, .LBB21_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
 ; RV64I-SFB-NEXT:    mv a0, a2
@@ -1726,7 +1596,6 @@ define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) noun
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
 ; RV32I-SFBILOAD-NEXT:    lw a7, 32(a0)
 ; RV32I-SFBILOAD-NEXT:    lw t0, 36(a0)
-; RV32I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB21_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV32I-SFBILOAD-NEXT:    mv a7, a2
@@ -1744,7 +1613,6 @@ define i64 @test_i64_store_64(ptr %base, i1 %x, i64 %b, ptr %base1, i64 %c) noun
 ; RV64I-SFBILOAD-LABEL: test_i64_store_64:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
 ; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
-; RV64I-SFBILOAD-NEXT:    andi a1, a1, 1
 ; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB21_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
 ; RV64I-SFBILOAD-NEXT:    mv a0, a2
@@ -1759,3 +1627,4886 @@ entry:
   ret i64 %res
 }
 
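+; The tests below pair the selected load with a volatile load through a second
+; pointer. The CHECK lines show that the volatile access is never made
+; conditional: with SFBILOAD only the plain load sinks into the short forward
+; branch, while the volatile load always executes before the branch.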
+define i32 @test_i8_s_volatile(ptr %base, i1 zeroext %x, i32 %b, ptr %base1) nounwind {
+; RV32I-LABEL: test_i8_s_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lb a4, 4(a0)
+; RV32I-NEXT:    lw a0, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB22_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a4, a2
+; RV32I-NEXT:  .LBB22_2: # %entry
+; RV32I-NEXT:    add a0, a4, a0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lb a4, 4(a0)
+; RV64I-NEXT:    lw a0, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB22_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB22_2: # %entry
+; RV64I-NEXT:    addw a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    lw a3, 0(a3)
+; RV32I-SFB-NEXT:    bnez a1, .LBB22_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB22_2: # %entry
+; RV32I-SFB-NEXT:    add a0, a0, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    lw a3, 0(a3)
+; RV64I-SFB-NEXT:    bnez a1, .LBB22_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB22_2: # %entry
+; RV64I-SFB-NEXT:    addw a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB22_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lb a2, 4(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB22_2: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB22_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lb a2, 4(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB22_2: # %entry
+; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
+  %val1 = load volatile i32, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  %res1 = add i32 %res, %val1
+  ret i32 %res1
+}
+
+define i32 @test_i8_z_volatile(ptr %base, i1 zeroext %x, i32 %b, ptr %base1) nounwind {
+; RV32I-LABEL: test_i8_z_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lw a0, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB23_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a4, a2
+; RV32I-NEXT:  .LBB23_2: # %entry
+; RV32I-NEXT:    add a0, a4, a0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lbu a4, 4(a0)
+; RV64I-NEXT:    lw a0, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB23_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB23_2: # %entry
+; RV64I-NEXT:    addw a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    lw a3, 0(a3)
+; RV32I-SFB-NEXT:    bnez a1, .LBB23_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB23_2: # %entry
+; RV32I-SFB-NEXT:    add a0, a0, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    lw a3, 0(a3)
+; RV64I-SFB-NEXT:    bnez a1, .LBB23_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB23_2: # %entry
+; RV64I-SFB-NEXT:    addw a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB23_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lbu a2, 4(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB23_2: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB23_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lbu a2, 4(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB23_2: # %entry
+; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
+  %val1 = load volatile i32, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  %res1 = add i32 %res, %val1
+  ret i32 %res1
+}
+
+define i32 @test_i16_s_volatile(ptr %base, i1 zeroext %x, i32 %b, ptr %base1) nounwind {
+; RV32I-LABEL: test_i16_s_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lh a4, 8(a0)
+; RV32I-NEXT:    lw a0, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB24_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a4, a2
+; RV32I-NEXT:  .LBB24_2: # %entry
+; RV32I-NEXT:    add a0, a4, a0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lh a4, 8(a0)
+; RV64I-NEXT:    lw a0, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB24_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB24_2: # %entry
+; RV64I-NEXT:    addw a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    lw a3, 0(a3)
+; RV32I-SFB-NEXT:    bnez a1, .LBB24_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB24_2: # %entry
+; RV32I-SFB-NEXT:    add a0, a0, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    lw a3, 0(a3)
+; RV64I-SFB-NEXT:    bnez a1, .LBB24_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB24_2: # %entry
+; RV64I-SFB-NEXT:    addw a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB24_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lh a2, 8(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB24_2: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB24_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lh a2, 8(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB24_2: # %entry
+; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 8 (4 x i16)
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
+  %val1 = load volatile i32, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  %res1 = add i32 %res, %val1
+  ret i32 %res1
+}
+
+define i32 @test_i16_z_volatile(ptr %base, i1 zeroext %x, i32 %b, ptr %base1) nounwind {
+; RV32I-LABEL: test_i16_z_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lhu a4, 8(a0)
+; RV32I-NEXT:    lw a0, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB25_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a4, a2
+; RV32I-NEXT:  .LBB25_2: # %entry
+; RV32I-NEXT:    add a0, a4, a0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lhu a4, 8(a0)
+; RV64I-NEXT:    lw a0, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB25_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB25_2: # %entry
+; RV64I-NEXT:    addw a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    lw a3, 0(a3)
+; RV32I-SFB-NEXT:    bnez a1, .LBB25_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB25_2: # %entry
+; RV32I-SFB-NEXT:    add a0, a0, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    lw a3, 0(a3)
+; RV64I-SFB-NEXT:    bnez a1, .LBB25_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB25_2: # %entry
+; RV64I-SFB-NEXT:    addw a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB25_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lhu a2, 8(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB25_2: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB25_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lhu a2, 8(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB25_2: # %entry
+; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 8 (4 x i16)
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
+  %val1 = load volatile i32, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  %res1 = add i32 %res, %val1
+  ret i32 %res1
+}
+
+define i32 @test_i32_volatile(ptr %base, i1 zeroext %x, i32 %b, ptr %base1) nounwind {
+; RV32I-LABEL: test_i32_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lw a4, 16(a0)
+; RV32I-NEXT:    lw a0, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB26_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a4, a2
+; RV32I-NEXT:  .LBB26_2: # %entry
+; RV32I-NEXT:    add a0, a4, a0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lw a4, 16(a0)
+; RV64I-NEXT:    lw a0, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB26_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB26_2: # %entry
+; RV64I-NEXT:    addw a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    lw a3, 0(a3)
+; RV32I-SFB-NEXT:    bnez a1, .LBB26_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:  .LBB26_2: # %entry
+; RV32I-SFB-NEXT:    add a0, a0, a3
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    lw a3, 0(a3)
+; RV64I-SFB-NEXT:    bnez a1, .LBB26_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB26_2: # %entry
+; RV64I-SFB-NEXT:    addw a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB26_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a2, 16(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB26_2: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    lw a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB26_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lw a2, 16(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB26_2: # %entry
+; RV64I-SFBILOAD-NEXT:    addw a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 16 (4 x i32)
+  %val = load i32, ptr %addr          ; load 32-bit value
+  %val1 = load volatile i32, ptr %base1
+  %res = select i1 %x, i32 %val, i32 %b
+  %res1 = add i32 %res, %val1
+  ret i32 %res1
+}
+
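+; 64-bit result variants: on RV32 the value occupies a register pair, so the
+; select needs conditional updates of both halves and sltu-based carry
+; propagation for the final add; on RV64 a single conditional load suffices.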
+define i64 @test_i8_s_1_volatile(ptr %base, i1 zeroext %x, i64 %b, ptr %base1) nounwind {
+; RV32I-LABEL: test_i8_s_1_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lb a6, 4(a0)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a0, 0(a4)
+; RV32I-NEXT:    bnez a1, .LBB27_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a6, a2
+; RV32I-NEXT:    j .LBB27_3
+; RV32I-NEXT:  .LBB27_2:
+; RV32I-NEXT:    srai a3, a6, 31
+; RV32I-NEXT:  .LBB27_3: # %entry
+; RV32I-NEXT:    add a0, a6, a0
+; RV32I-NEXT:    sltu a1, a0, a6
+; RV32I-NEXT:    add a3, a3, a5
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_1_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lb a4, 4(a0)
+; RV64I-NEXT:    ld a0, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB27_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB27_2: # %entry
+; RV64I-NEXT:    add a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_1_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    lw a5, 4(a4)
+; RV32I-SFB-NEXT:    lw a4, 0(a4)
+; RV32I-SFB-NEXT:    beqz a1, .LBB27_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB27_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB27_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB27_4: # %entry
+; RV32I-SFB-NEXT:    add a0, a2, a4
+; RV32I-SFB-NEXT:    sltu a1, a0, a2
+; RV32I-SFB-NEXT:    add a3, a3, a5
+; RV32I-SFB-NEXT:    add a1, a3, a1
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_1_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    ld a3, 0(a3)
+; RV64I-SFB-NEXT:    bnez a1, .LBB27_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB27_2: # %entry
+; RV64I-SFB-NEXT:    add a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_1_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
+; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB27_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB27_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB27_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB27_4: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
+; RV32I-SFBILOAD-NEXT:    sltu a1, a0, a2
+; RV32I-SFBILOAD-NEXT:    add a3, a3, a5
+; RV32I-SFBILOAD-NEXT:    add a1, a3, a1
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_1_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB27_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lb a2, 4(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB27_2: # %entry
+; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %val1 = load volatile i64, ptr %base1
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  %res1 = add i64 %res, %val1
+  ret i64 %res1
+}
+
+define i64 @test_i8_z_1_volatile(ptr %base, i1 zeroext %x, i64 %b, ptr %base1) nounwind {
+; RV32I-LABEL: test_i8_z_1_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a0, 0(a4)
+; RV32I-NEXT:    bnez a1, .LBB28_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a6, a2
+; RV32I-NEXT:  .LBB28_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    add a0, a6, a0
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    sltu a2, a0, a6
+; RV32I-NEXT:    add a1, a1, a5
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_1_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lbu a4, 4(a0)
+; RV64I-NEXT:    ld a0, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB28_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB28_2: # %entry
+; RV64I-NEXT:    add a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_1_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lbu a5, 4(a0)
+; RV32I-SFB-NEXT:    lw a6, 4(a4)
+; RV32I-SFB-NEXT:    lw a0, 0(a4)
+; RV32I-SFB-NEXT:    bnez a1, .LBB28_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a5, a2
+; RV32I-SFB-NEXT:  .LBB28_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB28_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    add a6, a6, a3
+; RV32I-SFB-NEXT:  .LBB28_4: # %entry
+; RV32I-SFB-NEXT:    add a0, a5, a0
+; RV32I-SFB-NEXT:    sltu a1, a0, a5
+; RV32I-SFB-NEXT:    add a1, a6, a1
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_1_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    ld a3, 0(a3)
+; RV64I-SFB-NEXT:    bnez a1, .LBB28_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB28_2: # %entry
+; RV64I-SFB-NEXT:    add a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_1_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
+; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB28_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lbu a2, 4(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB28_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB28_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    add a5, a5, a3
+; RV32I-SFBILOAD-NEXT:  .LBB28_4: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
+; RV32I-SFBILOAD-NEXT:    sltu a1, a0, a2
+; RV32I-SFBILOAD-NEXT:    add a1, a5, a1
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_1_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB28_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lbu a2, 4(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB28_2: # %entry
+; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load i8, ptr %addr          ; load 8-bit value
+  %val1 = load volatile i64, ptr %base1
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  %res1 = add i64 %res, %val1
+  ret i64 %res1
+}
+
+define i64 @test_i16_s_1_volatile(ptr %base, i1 zeroext %x, i64 %b, ptr %base1) nounwind {
+; RV32I-LABEL: test_i16_s_1_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lh a6, 8(a0)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a0, 0(a4)
+; RV32I-NEXT:    bnez a1, .LBB29_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a6, a2
+; RV32I-NEXT:    j .LBB29_3
+; RV32I-NEXT:  .LBB29_2:
+; RV32I-NEXT:    srai a3, a6, 31
+; RV32I-NEXT:  .LBB29_3: # %entry
+; RV32I-NEXT:    add a0, a6, a0
+; RV32I-NEXT:    sltu a1, a0, a6
+; RV32I-NEXT:    add a3, a3, a5
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_1_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lh a4, 8(a0)
+; RV64I-NEXT:    ld a0, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB29_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB29_2: # %entry
+; RV64I-NEXT:    add a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_1_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    lw a5, 4(a4)
+; RV32I-SFB-NEXT:    lw a4, 0(a4)
+; RV32I-SFB-NEXT:    beqz a1, .LBB29_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai a3, a0, 31
+; RV32I-SFB-NEXT:  .LBB29_2: # %entry
+; RV32I-SFB-NEXT:    beqz a1, .LBB29_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a2, a0
+; RV32I-SFB-NEXT:  .LBB29_4: # %entry
+; RV32I-SFB-NEXT:    add a0, a2, a4
+; RV32I-SFB-NEXT:    sltu a1, a0, a2
+; RV32I-SFB-NEXT:    add a3, a3, a5
+; RV32I-SFB-NEXT:    add a1, a3, a1
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_1_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    ld a3, 0(a3)
+; RV64I-SFB-NEXT:    bnez a1, .LBB29_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB29_2: # %entry
+; RV64I-SFB-NEXT:    add a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_1_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
+; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB29_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB29_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB29_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB29_4: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
+; RV32I-SFBILOAD-NEXT:    sltu a1, a0, a2
+; RV32I-SFBILOAD-NEXT:    add a3, a3, a5
+; RV32I-SFBILOAD-NEXT:    add a1, a3, a1
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_1_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB29_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lh a2, 8(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB29_2: # %entry
+; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 8 (4 x i16)
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %val1 = load volatile i64, ptr %base1
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  %res1 = add i64 %res, %val1
+  ret i64 %res1
+}
+
+define i64 @test_i16_z_1_volatile(ptr %base, i1 zeroext %x, i64 %b, ptr %base1) nounwind {
+; RV32I-LABEL: test_i16_z_1_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lhu a6, 8(a0)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a0, 0(a4)
+; RV32I-NEXT:    bnez a1, .LBB30_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a6, a2
+; RV32I-NEXT:  .LBB30_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    add a0, a6, a0
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    sltu a2, a0, a6
+; RV32I-NEXT:    add a1, a1, a5
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_1_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lhu a4, 8(a0)
+; RV64I-NEXT:    ld a0, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB30_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB30_2: # %entry
+; RV64I-NEXT:    add a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_1_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lhu a5, 8(a0)
+; RV32I-SFB-NEXT:    lw a6, 4(a4)
+; RV32I-SFB-NEXT:    lw a0, 0(a4)
+; RV32I-SFB-NEXT:    bnez a1, .LBB30_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a5, a2
+; RV32I-SFB-NEXT:  .LBB30_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB30_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    add a6, a6, a3
+; RV32I-SFB-NEXT:  .LBB30_4: # %entry
+; RV32I-SFB-NEXT:    add a0, a5, a0
+; RV32I-SFB-NEXT:    sltu a1, a0, a5
+; RV32I-SFB-NEXT:    add a1, a6, a1
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_1_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    ld a3, 0(a3)
+; RV64I-SFB-NEXT:    bnez a1, .LBB30_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB30_2: # %entry
+; RV64I-SFB-NEXT:    add a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_1_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
+; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB30_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lhu a2, 8(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB30_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB30_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    add a5, a5, a3
+; RV32I-SFBILOAD-NEXT:  .LBB30_4: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
+; RV32I-SFBILOAD-NEXT:    sltu a1, a0, a2
+; RV32I-SFBILOAD-NEXT:    add a1, a5, a1
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_1_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB30_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lhu a2, 8(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB30_2: # %entry
+; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 8 (4 x i16)
+  %val = load i16, ptr %addr          ; load 16-bit value
+  %val1 = load volatile i64, ptr %base1
+  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  %res1 = add i64 %res, %val1
+  ret i64 %res1
+}
+
+define i64 @test_i32_z_1_volatile(ptr %base, i1 zeroext %x, i64 %b, ptr %base1) nounwind {
+; RV32I-LABEL: test_i32_z_1_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lw a6, 16(a0)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a0, 0(a4)
+; RV32I-NEXT:    bnez a1, .LBB31_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a6, a2
+; RV32I-NEXT:  .LBB31_2: # %entry
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    add a0, a6, a0
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    sltu a2, a0, a6
+; RV32I-NEXT:    add a1, a1, a5
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_1_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lwu a4, 16(a0)
+; RV64I-NEXT:    ld a0, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB31_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB31_2: # %entry
+; RV64I-NEXT:    add a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_1_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lw a5, 16(a0)
+; RV32I-SFB-NEXT:    lw a6, 4(a4)
+; RV32I-SFB-NEXT:    lw a0, 0(a4)
+; RV32I-SFB-NEXT:    bnez a1, .LBB31_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a5, a2
+; RV32I-SFB-NEXT:  .LBB31_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB31_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    add a6, a6, a3
+; RV32I-SFB-NEXT:  .LBB31_4: # %entry
+; RV32I-SFB-NEXT:    add a0, a5, a0
+; RV32I-SFB-NEXT:    sltu a1, a0, a5
+; RV32I-SFB-NEXT:    add a1, a6, a1
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_1_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    lwu a0, 16(a0)
+; RV64I-SFB-NEXT:    ld a3, 0(a3)
+; RV64I-SFB-NEXT:    bnez a1, .LBB31_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB31_2: # %entry
+; RV64I-SFB-NEXT:    add a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_1_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
+; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB31_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a2, 16(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB31_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB31_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    add a5, a5, a3
+; RV32I-SFBILOAD-NEXT:  .LBB31_4: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
+; RV32I-SFBILOAD-NEXT:    sltu a1, a0, a2
+; RV32I-SFBILOAD-NEXT:    add a1, a5, a1
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_1_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB31_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lwu a2, 16(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB31_2: # %entry
+; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4 i32 elements (byte offset 16)
+  %val = load i32, ptr %addr          ; load 32-bit value
+  %val1 = load volatile i64, ptr %base1
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  %res1 = add i64 %res, %val1
+  ret i64 %res1
+}
+
+define i64 @test_i64_1_volatile(ptr %base, i1 zeroext %x, i64 %b, ptr %base1) nounwind {
+; RV32I-LABEL: test_i64_1_volatile:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lw a7, 32(a0)
+; RV32I-NEXT:    lw a6, 36(a0)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a0, 0(a4)
+; RV32I-NEXT:    bnez a1, .LBB32_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a6, a3
+; RV32I-NEXT:    mv a7, a2
+; RV32I-NEXT:  .LBB32_2: # %entry
+; RV32I-NEXT:    add a0, a7, a0
+; RV32I-NEXT:    sltu a1, a0, a7
+; RV32I-NEXT:    add a5, a6, a5
+; RV32I-NEXT:    add a1, a5, a1
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_1_volatile:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    ld a4, 32(a0)
+; RV64I-NEXT:    ld a0, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB32_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a4, a2
+; RV64I-NEXT:  .LBB32_2: # %entry
+; RV64I-NEXT:    add a0, a4, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_1_volatile:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    lw a5, 32(a0)
+; RV32I-SFB-NEXT:    lw a6, 36(a0)
+; RV32I-SFB-NEXT:    lw a7, 4(a4)
+; RV32I-SFB-NEXT:    lw a0, 0(a4)
+; RV32I-SFB-NEXT:    bnez a1, .LBB32_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a6, a3
+; RV32I-SFB-NEXT:  .LBB32_2: # %entry
+; RV32I-SFB-NEXT:    bnez a1, .LBB32_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a5, a2
+; RV32I-SFB-NEXT:  .LBB32_4: # %entry
+; RV32I-SFB-NEXT:    add a0, a5, a0
+; RV32I-SFB-NEXT:    sltu a1, a0, a5
+; RV32I-SFB-NEXT:    add a6, a6, a7
+; RV32I-SFB-NEXT:    add a1, a6, a1
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_1_volatile:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    ld a3, 0(a3)
+; RV64I-SFB-NEXT:    bnez a1, .LBB32_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, a2
+; RV64I-SFB-NEXT:  .LBB32_2: # %entry
+; RV64I-SFB-NEXT:    add a0, a0, a3
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_1_volatile:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a5, 4(a4)
+; RV32I-SFBILOAD-NEXT:    lw a4, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB32_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a2, 32(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB32_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB32_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    lw a3, 36(a0)
+; RV32I-SFBILOAD-NEXT:  .LBB32_4: # %entry
+; RV32I-SFBILOAD-NEXT:    add a0, a2, a4
+; RV32I-SFBILOAD-NEXT:    sltu a1, a0, a2
+; RV32I-SFBILOAD-NEXT:    add a3, a3, a5
+; RV32I-SFBILOAD-NEXT:    add a1, a3, a1
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_1_volatile:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    ld a3, 0(a3)
+; RV64I-SFBILOAD-NEXT:    beqz a1, .LBB32_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    ld a2, 32(a0)
+; RV64I-SFBILOAD-NEXT:  .LBB32_2: # %entry
+; RV64I-SFBILOAD-NEXT:    add a0, a2, a3
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4 i64 elements (byte offset 32)
+  %val = load i64, ptr %addr          ; load 64-bit value
+  %val1 = load volatile i64, ptr %base1
+  %res = select i1 %x, i64 %val, i64 %b
+  %res1 = add i64 %res, %val1
+  ret i64 %res1
+}
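+
+; Note: in the SFBILOAD checks above, the plain load from %base is sunk into a
+; conditionally executed block, while the volatile load from %base1 is still
+; performed unconditionally, since a volatile access may not be predicated away.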
+
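+; The tests below use monotonic atomic loads, which are lowered to
+; __atomic_load_N libcalls here; only the extension or select after the call is
+; predicated, never the call itself.
+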
+define i32 @test_i8_s_2(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i8_s_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s1, .LBB33_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:  .LBB33_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB33_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB33_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    mv s1, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s1, .LBB33_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 24
+; RV32I-SFB-NEXT:  .LBB33_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    mv s1, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s1, .LBB33_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:  .LBB33_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    mv s1, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB33_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB33_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    mv s1, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB33_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB33_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i8_z_2(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i8_z_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s1, .LBB34_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:  .LBB34_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB34_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB34_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    mv s1, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s1, .LBB34_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    zext.b s0, a0
+; RV32I-SFB-NEXT:  .LBB34_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    mv s1, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s1, .LBB34_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:  .LBB34_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    mv s1, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB34_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV32I-SFBILOAD-NEXT:  .LBB34_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    mv s1, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB34_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:  .LBB34_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_s_2(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i16_s_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s1, .LBB35_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:  .LBB35_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB35_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB35_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    mv s1, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s1, .LBB35_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 16
+; RV32I-SFB-NEXT:  .LBB35_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    mv s1, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB35_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB35_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    mv s1, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB35_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB35_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    mv s1, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB35_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB35_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 i16 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_z_2(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i16_z_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s1, .LBB36_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:  .LBB36_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB36_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB36_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    mv s1, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s1, .LBB36_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srli s0, a0, 16
+; RV32I-SFB-NEXT:  .LBB36_2: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    mv s1, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB36_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB36_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    mv s1, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB36_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s0, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB36_2: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    mv s1, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB36_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB36_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4 i16 elements (byte offset 8)
+  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i32_2(ptr %base, i1 zeroext %x, i32 %b) nounwind {
+; RV32I-LABEL: test_i32_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_4
+; RV32I-NEXT:    bnez s1, .LBB37_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:  .LBB37_2: # %entry
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    addi a0, a0, 16
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_4
+; RV64I-NEXT:    bnez s1, .LBB37_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB37_2: # %entry
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a2
+; RV32I-SFB-NEXT:    mv s1, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 16
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_4
+; RV32I-SFB-NEXT:    bnez s1, .LBB37_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:  .LBB37_2: # %entry
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    mv s1, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 16
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_4
+; RV64I-SFB-NEXT:    bnez s1, .LBB37_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:  .LBB37_2: # %entry
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a2
+; RV32I-SFBILOAD-NEXT:    mv s1, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV32I-SFBILOAD-NEXT:    bnez s1, .LBB37_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:  .LBB37_2: # %entry
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    mv s1, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB37_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:  .LBB37_2: # %entry
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4 i32 elements (byte offset 16)
+  %val = load atomic i32, ptr %addr monotonic, align 4          ; load 32-bit value
+  %res = select i1 %x, i32 %val, i32 %b
+  ret i32 %res
+}
+
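+; In the *_store_2 variants below, the independent store to %base1 stays outside
+; the predicated region: the SFB configurations schedule it after the short
+; forward branch, whereas the plain RV32I/RV64I output stores before branching.
+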
+define i32 @test_i8_s_store_2(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i8_s_store_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s3, a1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB38_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:  .LBB38_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_store_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s3, a1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB38_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB38_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_store_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    mv s3, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s3, .LBB38_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s2, a0, 24
+; RV32I-SFB-NEXT:  .LBB38_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_store_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    mv s3, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s3, .LBB38_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:  .LBB38_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_store_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    mv s3, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB38_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s2, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB38_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_store_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    mv s3, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB38_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB38_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i32         ; sign-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i8_z_store_2(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i8_z_store_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s3, a1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB39_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:  .LBB39_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_store_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s3, a1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB39_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB39_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_store_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    mv s3, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s3, .LBB39_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    zext.b s2, a0
+; RV32I-SFB-NEXT:  .LBB39_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_store_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    mv s3, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s3, .LBB39_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:  .LBB39_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_store_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    mv s3, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB39_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV32I-SFBILOAD-NEXT:  .LBB39_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_store_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    mv s3, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB39_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:  .LBB39_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i32         ; zero-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i16_s_store_2(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i16_s_store_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s3, a1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB40_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:  .LBB40_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_store_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s3, a1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB40_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB40_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_store_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    mv s3, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s3, .LBB40_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s2, a0, 16
+; RV32I-SFB-NEXT:  .LBB40_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    mv s3, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB40_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB40_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_store_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    mv s3, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB40_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s2, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB40_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_store_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    mv s3, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB40_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB40_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i32         ; sign-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
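+; Note: in the SFB/SFBILOAD outputs above, the `slli` half of the sign
+; extension is hoisted above the branch so that only the final `srai` is
+; predicated, and the independent `sw` through %base1 sinks below the
+; conditional block.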
+
+define i32 @test_i16_z_store_2(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i16_z_store_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s3, a1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    beqz s3, .LBB41_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:  .LBB41_2: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_store_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s3, a1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB41_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB41_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_store_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    mv s3, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s3, .LBB41_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srli s2, a0, 16
+; RV32I-SFB-NEXT:  .LBB41_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_store_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    mv s3, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB41_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB41_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_store_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    mv s3, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB41_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s2, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB41_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_store_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    mv s3, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB41_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB41_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i32 4   ; address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i32         ; zero-extend to 32 bits
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %ext, i32 %b
+  ret i32 %res
+}
+
+define i32 @test_i32_store_2(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
+; RV32I-LABEL: test_i32_store_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s1, a4
+; RV32I-NEXT:    mv s2, a3
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s3, a1
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_4
+; RV32I-NEXT:    sw s1, 0(s2)
+; RV32I-NEXT:    bnez s3, .LBB42_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:  .LBB42_2: # %entry
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_store_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s3, a1
+; RV64I-NEXT:    addi a0, a0, 16
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_4
+; RV64I-NEXT:    sw s1, 0(s2)
+; RV64I-NEXT:    bnez s3, .LBB42_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB42_2: # %entry
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_store_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a4
+; RV32I-SFB-NEXT:    mv s1, a3
+; RV32I-SFB-NEXT:    mv s2, a2
+; RV32I-SFB-NEXT:    mv s3, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 16
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_4
+; RV32I-SFB-NEXT:    bnez s3, .LBB42_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:  .LBB42_2: # %entry
+; RV32I-SFB-NEXT:    sw s0, 0(s1)
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_store_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    mv s3, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 16
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_4
+; RV64I-SFB-NEXT:    bnez s3, .LBB42_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:  .LBB42_2: # %entry
+; RV64I-SFB-NEXT:    sw s0, 0(s1)
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_store_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a4
+; RV32I-SFBILOAD-NEXT:    mv s1, a3
+; RV32I-SFBILOAD-NEXT:    mv s2, a2
+; RV32I-SFBILOAD-NEXT:    mv s3, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV32I-SFBILOAD-NEXT:    bnez s3, .LBB42_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:  .LBB42_2: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_store_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    mv s3, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
+; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB42_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:  .LBB42_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i32 4   ; address of element 4 (base + 16 bytes)
+  %val = load atomic i32, ptr %addr monotonic, align 4          ; load 32-bit value
+  store i32 %c, ptr %base1
+  %res = select i1 %x, i32 %val, i32 %b
+  ret i32 %res
+}
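+; Note: with no extension to fold, the select above reduces to a single
+; predicated `mv` in the SFB outputs. A minimal non-atomic sketch of the
+; pattern the new conditional-load pseudos should catch directly
+; (hypothetical reduced example, not generated from this file):
+;   define i32 @sfb_lw(ptr %p, i1 zeroext %x, i32 %b) {
+;     %v = load i32, ptr %p
+;     %r = select i1 %x, i32 %v, i32 %b
+;     ret i32 %r
+;   }
+; Under short-forward-branch-i-load this is expected to lower to a short
+; forward branch around a single `lw` rather than an unconditional load
+; followed by a register select.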
+
+define i64 @test_i8_s_1_2(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i8_s_1_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s2, .LBB43_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 24
+; RV32I-NEXT:  .LBB43_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_1_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB43_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB43_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_1_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    mv s2, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s2, .LBB43_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s1, a0, 24
+; RV32I-SFB-NEXT:  .LBB43_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB43_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:  .LBB43_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_1_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    mv s1, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s1, .LBB43_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:  .LBB43_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_1_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    mv s2, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB43_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s1, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB43_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB43_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB43_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_1_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    mv s1, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB43_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB43_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; address of element 4 (base + 4 bytes)
+  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
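+; Note: returning i64 on RV32 makes both halves of the result conditional, so
+; the RV32 SFB outputs above need two short forward branches: `srai ..., 24`
+; for the low word and `srai ..., 31` for the sign bits of the high word.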
+
+define i64 @test_i8_z_1_2(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i8_z_1_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    beqz s2, .LBB44_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:  .LBB44_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_1_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    beqz s1, .LBB44_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB44_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_1_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    mv s2, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s2, .LBB44_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:  .LBB44_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB44_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    zext.b s1, a0
+; RV32I-SFB-NEXT:  .LBB44_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_1_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    mv s1, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s1, .LBB44_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:  .LBB44_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_1_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    mv s2, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB44_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:  .LBB44_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB44_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s1, a0
+; RV32I-SFBILOAD-NEXT:  .LBB44_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_1_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    mv s1, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB44_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:  .LBB44_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; address of element 4 (base + 4 bytes)
+  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
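+; Note: for the zero-extended high word, plain RV32I stays branchless via the
+; `addi a1, s2, -1` / `and` mask trick, whereas the RV32 SFB outputs predicate
+; a plain `li s0, 0` instead.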
+
+define i64 @test_i16_s_1_2(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i16_s_1_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s2, .LBB45_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 16
+; RV32I-NEXT:  .LBB45_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_1_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB45_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB45_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_1_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    mv s2, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s2, .LBB45_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s1, a0, 16
+; RV32I-SFB-NEXT:  .LBB45_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB45_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:  .LBB45_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_1_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    mv s1, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB45_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB45_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_1_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    mv s2, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB45_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s1, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB45_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB45_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB45_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_1_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    mv s1, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB45_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB45_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_z_1_2(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i16_z_1_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    beqz s2, .LBB46_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:  .LBB46_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_1_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    beqz s1, .LBB46_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB46_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_1_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    mv s2, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s2, .LBB46_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:  .LBB46_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB46_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srli s1, a0, 16
+; RV32I-SFB-NEXT:  .LBB46_4: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_1_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    mv s1, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s1, .LBB46_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:  .LBB46_2: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_1_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    mv s2, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB46_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:  .LBB46_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB46_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s1, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB46_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_1_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    mv s1, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB46_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB46_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; address of element 4 (base + 8 bytes)
+  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i32_z_1_2(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i32_z_1_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    addi a1, a0, 16
+; RV32I-NEXT:    li a0, 4
+; RV32I-NEXT:    addi a2, sp, 12
+; RV32I-NEXT:    li a3, 0
+; RV32I-NEXT:    call __atomic_load
+; RV32I-NEXT:    beqz s2, .LBB47_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    lw s1, 12(sp)
+; RV32I-NEXT:  .LBB47_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_1_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    addi a1, a0, 16
+; RV64I-NEXT:    li a0, 4
+; RV64I-NEXT:    addi a2, sp, 4
+; RV64I-NEXT:    li a3, 0
+; RV64I-NEXT:    call __atomic_load
+; RV64I-NEXT:    beqz s1, .LBB47_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    lwu s0, 4(sp)
+; RV64I-NEXT:  .LBB47_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_1_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    mv s2, a1
+; RV32I-SFB-NEXT:    addi a1, a0, 16
+; RV32I-SFB-NEXT:    li a0, 4
+; RV32I-SFB-NEXT:    addi a2, sp, 12
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:    call __atomic_load
+; RV32I-SFB-NEXT:    lw a0, 12(sp)
+; RV32I-SFB-NEXT:    bnez s2, .LBB47_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:  .LBB47_2: # %entry
+; RV32I-SFB-NEXT:    beqz s2, .LBB47_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:  .LBB47_4: # %entry
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_1_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    mv s1, a1
+; RV64I-SFB-NEXT:    addi a1, a0, 16
+; RV64I-SFB-NEXT:    li a0, 4
+; RV64I-SFB-NEXT:    addi a2, sp, 4
+; RV64I-SFB-NEXT:    li a3, 0
+; RV64I-SFB-NEXT:    call __atomic_load
+; RV64I-SFB-NEXT:    lwu a0, 4(sp)
+; RV64I-SFB-NEXT:    bnez s1, .LBB47_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:  .LBB47_2: # %entry
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_1_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    mv s2, a1
+; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a0, 4
+; RV32I-SFBILOAD-NEXT:    addi a2, sp, 12
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB47_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    lw s1, 12(sp)
+; RV32I-SFBILOAD-NEXT:  .LBB47_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB47_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:  .LBB47_4: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_1_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    mv s1, a1
+; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a0, 4
+; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
+; RV64I-SFBILOAD-NEXT:    li a3, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load
+; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB47_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    lwu s0, 4(sp)
+; RV64I-SFBILOAD-NEXT:  .LBB47_2: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; address of element 4 (base + 16 bytes)
+  %val = load atomic i32, ptr %addr monotonic, align 2          ; under-aligned 32-bit atomic load (lowers to the generic __atomic_load libcall)
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
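+; Note: `align 2` leaves this 32-bit atomic load under-aligned, forcing the
+; generic `__atomic_load` libcall through a stack temporary; the instruction
+; predicated in the SFBILOAD outputs is therefore the `lw`/`lwu` reload from
+; the stack slot, not a load from %base.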
+
+define i64 @test_i64_1_2(ptr %base, i1 zeroext %x, i64 %b) nounwind {
+; RV32I-LABEL: test_i64_1_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_8
+; RV32I-NEXT:    bnez s2, .LBB48_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:  .LBB48_2: # %entry
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_1_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    addi a0, a0, 32
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_8
+; RV64I-NEXT:    bnez s1, .LBB48_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB48_2: # %entry
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_1_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -16
+; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a3
+; RV32I-SFB-NEXT:    mv s1, a2
+; RV32I-SFB-NEXT:    mv s2, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 32
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_8
+; RV32I-SFB-NEXT:    bnez s2, .LBB48_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:  .LBB48_2: # %entry
+; RV32I-SFB-NEXT:    bnez s2, .LBB48_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a1, s0
+; RV32I-SFB-NEXT:  .LBB48_4: # %entry
+; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_1_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -32
+; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a2
+; RV64I-SFB-NEXT:    mv s1, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 32
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_8
+; RV64I-SFB-NEXT:    bnez s1, .LBB48_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:  .LBB48_2: # %entry
+; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_1_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
+; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a3
+; RV32I-SFBILOAD-NEXT:    mv s1, a2
+; RV32I-SFBILOAD-NEXT:    mv s2, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB48_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s1
+; RV32I-SFBILOAD-NEXT:  .LBB48_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s2, .LBB48_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, s0
+; RV32I-SFBILOAD-NEXT:  .LBB48_4: # %entry
+; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_1_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a2
+; RV64I-SFBILOAD-NEXT:    mv s1, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB48_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:  .LBB48_2: # %entry
+; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; address of element 4 (base + 32 bytes)
+  %val = load atomic i64, ptr %addr monotonic, align 8          ; load 64-bit value
+  %res = select i1 %x, i64 %val, i64 %b
+  ret i64 %res
+}
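+; Note: the naturally aligned i64 load goes through `__atomic_load_8`, so the
+; result is already in registers and both SFB variants reduce the select to
+; conditional `mv`s (two on RV32, one on RV64); there is no load left to
+; predicate.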
+
+define i64 @test_i8_s_store_64_2(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i8_s_store_64_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s5, a1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    beqz s5, .LBB49_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 24
+; RV32I-NEXT:  .LBB49_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_s_store_64_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s3, a1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB49_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:  .LBB49_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_s_store_64_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    mv s5, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    slli a0, a0, 24
+; RV32I-SFB-NEXT:    beqz s5, .LBB49_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s4, a0, 24
+; RV32I-SFB-NEXT:  .LBB49_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB49_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s3, a0, 31
+; RV32I-SFB-NEXT:  .LBB49_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_s_store_64_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    mv s3, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    slli a0, a0, 56
+; RV64I-SFB-NEXT:    beqz s3, .LBB49_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:  .LBB49_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_s_store_64_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    mv s5, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB49_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s4, a0, 24
+; RV32I-SFBILOAD-NEXT:  .LBB49_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB49_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB49_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_s_store_64_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    mv s3, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB49_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:  .LBB49_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
+  %ext = sext i8 %val to i64         ; sign-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
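
The sign-extension in the outputs above is done with a shift pair because
plain RV32I/RV64I lack Zbb's sext.b. A hedged C++ equivalent (sext8 is an
illustrative name, not part of the patch):

  #include <cstdint>

  // Shift the loaded byte to the top of the register, then shift it back
  // arithmetically, as the RV64 slli 56 / srai 56 pair does. On RV32 the
  // same slli 24 feeds two srai's: >> 24 for the low result word and
  // >> 31 for the sign-replicated high word.
  int64_t sext8(uint64_t v) {
    return static_cast<int64_t>(v << 56) >> 56;
  }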
+
+define i64 @test_i8_z_store_64_2(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i8_z_store_64_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s3, a6
+; RV32I-NEXT:    mv s4, a5
+; RV32I-NEXT:    mv s5, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    addi a0, a0, 4
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_1
+; RV32I-NEXT:    sw s4, 0(s5)
+; RV32I-NEXT:    sw s3, 4(s5)
+; RV32I-NEXT:    beqz s2, .LBB50_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:  .LBB50_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i8_z_store_64_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s3, a1
+; RV64I-NEXT:    addi a0, a0, 4
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_1
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB50_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:  .LBB50_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i8_z_store_64_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    mv s5, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 4
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_1
+; RV32I-SFB-NEXT:    beqz s5, .LBB50_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:  .LBB50_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB50_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    zext.b s4, a0
+; RV32I-SFB-NEXT:  .LBB50_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i8_z_store_64_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    mv s3, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 4
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_1
+; RV64I-SFB-NEXT:    beqz s3, .LBB50_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:  .LBB50_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i8_z_store_64_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    mv s5, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB50_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB50_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB50_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    zext.b s4, a0
+; RV32I-SFBILOAD-NEXT:  .LBB50_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i8_z_store_64_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    mv s3, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB50_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:  .LBB50_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
+  %val = load atomic i8, ptr %addr monotonic, align 1          ; load 8-bit value
+  %ext = zext i8 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
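
The plain RV32I output above picks the high result word without a branch:
"addi a1, s2, -1" followed by "and a1, a1, s0" computes (x - 1) & b_hi. A
hedged C++ equivalent (selectHighWord is an illustrative name, not part of
the patch):

  #include <cstdint>

  // With x restricted to {0, 1}: x == 1 gives x - 1 == 0, so the result is 0
  // (the high half of the zero-extended load); x == 0 gives x - 1 == ~0u, so
  // the result is bHi (the high half of %b).
  uint32_t selectHighWord(uint32_t x, uint32_t bHi) {
    return (x - 1) & bHi;
  }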
+
+define i64 @test_i16_s_store_64_2(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i16_s_store_64_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s5, a1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    beqz s5, .LBB51_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    srai s1, a0, 16
+; RV32I-NEXT:  .LBB51_2: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_s_store_64_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s3, a1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB51_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:  .LBB51_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_s_store_64_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    mv s5, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s5, .LBB51_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    srai s4, a0, 16
+; RV32I-SFB-NEXT:  .LBB51_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB51_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srai s3, a0, 31
+; RV32I-SFB-NEXT:  .LBB51_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_s_store_64_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    mv s3, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB51_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB51_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_s_store_64_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    mv s5, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB51_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s4, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB51_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB51_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:  .LBB51_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_s_store_64_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    mv s3, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB51_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB51_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; element 4 = byte offset 8
+  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
+  %ext = sext i16 %val to i64         ; sign-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i16_z_store_64_2(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i16_z_store_64_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s3, a6
+; RV32I-NEXT:    mv s4, a5
+; RV32I-NEXT:    mv s5, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    addi a0, a0, 8
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_2
+; RV32I-NEXT:    sw s4, 0(s5)
+; RV32I-NEXT:    sw s3, 4(s5)
+; RV32I-NEXT:    beqz s2, .LBB52_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:  .LBB52_2: # %entry
+; RV32I-NEXT:    addi a1, s2, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i16_z_store_64_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s3, a1
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_2
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    beqz s3, .LBB52_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:  .LBB52_2: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i16_z_store_64_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    mv s5, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 8
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_2
+; RV32I-SFB-NEXT:    slli a0, a0, 16
+; RV32I-SFB-NEXT:    beqz s5, .LBB52_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:  .LBB52_2: # %entry
+; RV32I-SFB-NEXT:    beqz s5, .LBB52_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    srli s4, a0, 16
+; RV32I-SFB-NEXT:  .LBB52_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i16_z_store_64_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    mv s3, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 8
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_2
+; RV64I-SFB-NEXT:    slli a0, a0, 48
+; RV64I-SFB-NEXT:    beqz s3, .LBB52_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:  .LBB52_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i16_z_store_64_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    mv s5, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB52_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB52_2: # %entry
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB52_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    srli s4, a0, 16
+; RV32I-SFBILOAD-NEXT:  .LBB52_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i16_z_store_64_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    mv s3, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
+; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
+; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB52_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:  .LBB52_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i16, ptr %base, i64 4   ; element 4 = byte offset 8
+  %val = load atomic i16, ptr %addr monotonic, align 2          ; load 16-bit value
+  %ext = zext i16 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
+
+define i64 @test_i32_z_store_64_2(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i32_z_store_64_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s3, a6
+; RV32I-NEXT:    mv s4, a5
+; RV32I-NEXT:    mv s5, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s2, a2
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    addi a1, a0, 16
+; RV32I-NEXT:    li a0, 4
+; RV32I-NEXT:    mv a2, sp
+; RV32I-NEXT:    li a3, 0
+; RV32I-NEXT:    call __atomic_load
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    sw s4, 0(s5)
+; RV32I-NEXT:    sw s3, 4(s5)
+; RV32I-NEXT:    bnez s1, .LBB53_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:  .LBB53_2: # %entry
+; RV32I-NEXT:    addi a1, s1, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i32_z_store_64_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s3, a1
+; RV64I-NEXT:    addi a1, a0, 16
+; RV64I-NEXT:    li a0, 4
+; RV64I-NEXT:    addi a2, sp, 4
+; RV64I-NEXT:    li a3, 0
+; RV64I-NEXT:    call __atomic_load
+; RV64I-NEXT:    lwu a0, 4(sp)
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    bnez s3, .LBB53_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB53_2: # %entry
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i32_z_store_64_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    mv s5, a1
+; RV32I-SFB-NEXT:    addi a1, a0, 16
+; RV32I-SFB-NEXT:    li a0, 4
+; RV32I-SFB-NEXT:    mv a2, sp
+; RV32I-SFB-NEXT:    li a3, 0
+; RV32I-SFB-NEXT:    call __atomic_load
+; RV32I-SFB-NEXT:    lw a0, 0(sp)
+; RV32I-SFB-NEXT:    beqz s5, .LBB53_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:  .LBB53_2: # %entry
+; RV32I-SFB-NEXT:    bnez s5, .LBB53_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:  .LBB53_4: # %entry
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i32_z_store_64_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    mv s3, a1
+; RV64I-SFB-NEXT:    addi a1, a0, 16
+; RV64I-SFB-NEXT:    li a0, 4
+; RV64I-SFB-NEXT:    addi a2, sp, 4
+; RV64I-SFB-NEXT:    li a3, 0
+; RV64I-SFB-NEXT:    call __atomic_load
+; RV64I-SFB-NEXT:    lwu a0, 4(sp)
+; RV64I-SFB-NEXT:    bnez s3, .LBB53_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:  .LBB53_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    mv s5, a1
+; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV32I-SFBILOAD-NEXT:    li a0, 4
+; RV32I-SFBILOAD-NEXT:    mv a2, sp
+; RV32I-SFBILOAD-NEXT:    li a3, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load
+; RV32I-SFBILOAD-NEXT:    lw a0, 0(sp)
+; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB53_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:  .LBB53_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB53_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:  .LBB53_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    mv s3, a1
+; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
+; RV64I-SFBILOAD-NEXT:    li a0, 4
+; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
+; RV64I-SFBILOAD-NEXT:    li a3, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load
+; RV64I-SFBILOAD-NEXT:    lwu a0, 4(sp)
+; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB53_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:  .LBB53_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i32, ptr %base, i64 4   ; element 4 = byte offset 16
+  %val = load atomic i32, ptr %addr monotonic, align 2          ; under-aligned 32-bit load (align 2 < 4) -> generic __atomic_load libcall
+  %ext = zext i32 %val to i64         ; zero-extend to 64 bits
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %ext, i64 %b
+  ret i64 %res
+}
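
Note that this i32 test loads with align 2, below i32's natural alignment, so
the backend cannot call the sized __atomic_load_4 helper and instead emits the
generic libcall with an explicit size argument and a stack slot for the result
(the "li a0, 4" / "mv a2, sp" setup in the output). For reference, assuming
the standard __atomic libcall ABI as implemented by libatomic/compiler-rt:

  #include <cstddef>

  // Generic entry point used for under-aligned (or oddly sized) atomics;
  // memorder 0 is monotonic/relaxed, matching the "li a3, 0" above.
  extern "C" void __atomic_load(size_t size, void *src, void *dest,
                                int memorder);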
+
+define i64 @test_i64_store_64_2(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
+; RV32I-LABEL: test_i64_store_64_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s2, a6
+; RV32I-NEXT:    mv s3, a5
+; RV32I-NEXT:    mv s4, a4
+; RV32I-NEXT:    mv s0, a3
+; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s5, a1
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call __atomic_load_8
+; RV32I-NEXT:    sw s3, 0(s4)
+; RV32I-NEXT:    sw s2, 4(s4)
+; RV32I-NEXT:    bnez s5, .LBB54_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:  .LBB54_2: # %entry
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_i64_store_64_2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s1, a4
+; RV64I-NEXT:    mv s2, a3
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s3, a1
+; RV64I-NEXT:    addi a0, a0, 32
+; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    call __atomic_load_8
+; RV64I-NEXT:    sd s1, 0(s2)
+; RV64I-NEXT:    bnez s3, .LBB54_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:  .LBB54_2: # %entry
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV32I-SFB-LABEL: test_i64_store_64_2:
+; RV32I-SFB:       # %bb.0: # %entry
+; RV32I-SFB-NEXT:    addi sp, sp, -32
+; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFB-NEXT:    mv s0, a6
+; RV32I-SFB-NEXT:    mv s1, a5
+; RV32I-SFB-NEXT:    mv s2, a4
+; RV32I-SFB-NEXT:    mv s3, a3
+; RV32I-SFB-NEXT:    mv s4, a2
+; RV32I-SFB-NEXT:    mv s5, a1
+; RV32I-SFB-NEXT:    addi a0, a0, 32
+; RV32I-SFB-NEXT:    li a1, 0
+; RV32I-SFB-NEXT:    call __atomic_load_8
+; RV32I-SFB-NEXT:    sw s1, 0(s2)
+; RV32I-SFB-NEXT:    bnez s5, .LBB54_2
+; RV32I-SFB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:  .LBB54_2: # %entry
+; RV32I-SFB-NEXT:    bnez s5, .LBB54_4
+; RV32I-SFB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-NEXT:    mv a1, s3
+; RV32I-SFB-NEXT:  .LBB54_4: # %entry
+; RV32I-SFB-NEXT:    sw s0, 4(s2)
+; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    ret
+;
+; RV64I-SFB-LABEL: test_i64_store_64_2:
+; RV64I-SFB:       # %bb.0: # %entry
+; RV64I-SFB-NEXT:    addi sp, sp, -48
+; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFB-NEXT:    mv s0, a4
+; RV64I-SFB-NEXT:    mv s1, a3
+; RV64I-SFB-NEXT:    mv s2, a2
+; RV64I-SFB-NEXT:    mv s3, a1
+; RV64I-SFB-NEXT:    addi a0, a0, 32
+; RV64I-SFB-NEXT:    li a1, 0
+; RV64I-SFB-NEXT:    call __atomic_load_8
+; RV64I-SFB-NEXT:    bnez s3, .LBB54_2
+; RV64I-SFB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:  .LBB54_2: # %entry
+; RV64I-SFB-NEXT:    sd s0, 0(s1)
+; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    ret
+;
+; RV32I-SFBILOAD-LABEL: test_i64_store_64_2:
+; RV32I-SFBILOAD:       # %bb.0: # %entry
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
+; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-SFBILOAD-NEXT:    mv s0, a6
+; RV32I-SFBILOAD-NEXT:    mv s1, a5
+; RV32I-SFBILOAD-NEXT:    mv s2, a4
+; RV32I-SFBILOAD-NEXT:    mv s3, a3
+; RV32I-SFBILOAD-NEXT:    mv s4, a2
+; RV32I-SFBILOAD-NEXT:    mv s5, a1
+; RV32I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV32I-SFBILOAD-NEXT:    li a1, 0
+; RV32I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB54_2
+; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:  .LBB54_2: # %entry
+; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB54_4
+; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
+; RV32I-SFBILOAD-NEXT:    mv a1, s3
+; RV32I-SFBILOAD-NEXT:  .LBB54_4: # %entry
+; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
+; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    ret
+;
+; RV64I-SFBILOAD-LABEL: test_i64_store_64_2:
+; RV64I-SFBILOAD:       # %bb.0: # %entry
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
+; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-SFBILOAD-NEXT:    mv s0, a4
+; RV64I-SFBILOAD-NEXT:    mv s1, a3
+; RV64I-SFBILOAD-NEXT:    mv s2, a2
+; RV64I-SFBILOAD-NEXT:    mv s3, a1
+; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
+; RV64I-SFBILOAD-NEXT:    li a1, 0
+; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
+; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB54_2
+; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
+; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:  .LBB54_2: # %entry
+; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
+; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    ret
+entry:
+  %addr = getelementptr i64, ptr %base, i64 4   ; element 4 = byte offset 32
+  %val = load atomic i64, ptr %addr monotonic, align 8          ; load 64-bit value
+  store i64 %c, ptr %base1
+  %res = select i1 %x, i64 %val, i64 %b
+  ret i64 %res
+}
+

>From 52e47be331dd7b0dfb506a81098f4ad665cc02e4 Mon Sep 17 00:00:00 2001
From: Harsh Chandel <hchandel at qti.qualcomm.com>
Date: Mon, 8 Dec 2025 11:25:29 +0530
Subject: [PATCH 09/11] fixup! Address comments

Change-Id: I19e55ada194ba2616c77eb82174ca059cbff29d2
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 11688476a2554..27b9f45877c40 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -926,24 +926,19 @@ MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
   if (MI.getOpcode() != RISCV::PseudoCCMOVGPR)
     return nullptr;
 
-  if (!STI.hasShortForwardBranchILoad() ||
-      (LoadMI.getOpcode() != RISCV::LB && LoadMI.getOpcode() != RISCV::LBU &&
-       LoadMI.getOpcode() != RISCV::LH && LoadMI.getOpcode() != RISCV::LHU &&
-       LoadMI.getOpcode() != RISCV::LW && LoadMI.getOpcode() != RISCV::LWU &&
-       LoadMI.getOpcode() != RISCV::LD))
+  unsigned PredOpc = getLoadPredicatedOpcode(LoadMI.getOpcode());
+
+  if (!STI.hasShortForwardBranchILoad() || !PredOpc)
     return nullptr;
 
   MachineRegisterInfo &MRI = MF.getRegInfo();
   bool Invert = MRI.getVRegDef(MI.getOperand(4).getReg()) == &LoadMI;
-  MachineOperand FalseReg = MI.getOperand(Invert ? 5 : 4);
+  const MachineOperand &FalseReg = MI.getOperand(Invert ? 5 : 4);
   Register DestReg = MI.getOperand(0).getReg();
   const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
   if (!MRI.constrainRegClass(DestReg, PreviousClass))
     return nullptr;
 
-  unsigned PredOpc = getLoadPredicatedOpcode(LoadMI.getOpcode());
-  assert(PredOpc != 0 && "Unexpected opcode!");
-
   // Create a new predicated version of DefMI.
   MachineInstrBuilder NewMI = BuildMI(*MI.getParent(), InsertPt,
                                       MI.getDebugLoc(), get(PredOpc), DestReg);
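
For readers following the refactor above: computing the predicated opcode up
front keeps the set of foldable loads in one helper and lets
foldMemoryOperandImpl bail out with a single !PredOpc check, instead of an
inline opcode whitelist followed by a later assert. A minimal sketch of the
shape such a helper takes, assuming PseudoCCL* opcodes exist for all seven
loads named in the PR title (the in-tree getLoadPredicatedOpcode may differ
in detail):

  // Map a scalar load opcode to its short-forward-branch pseudo, or 0 if
  // the load cannot be predicated.
  static unsigned getLoadPredicatedOpcode(unsigned Opc) {
    switch (Opc) {
    case RISCV::LB:  return RISCV::PseudoCCLB;   // sign-extending byte load
    case RISCV::LBU: return RISCV::PseudoCCLBU;  // zero-extending byte load
    case RISCV::LH:  return RISCV::PseudoCCLH;   // sign-extending halfword load
    case RISCV::LHU: return RISCV::PseudoCCLHU;  // zero-extending halfword load
    case RISCV::LW:  return RISCV::PseudoCCLW;   // word load
    case RISCV::LWU: return RISCV::PseudoCCLWU;  // zero-extending word load (RV64)
    case RISCV::LD:  return RISCV::PseudoCCLD;   // doubleword load (RV64)
    default:         return 0;
    }
  }

Taking FalseReg by const reference in the same hunk is a small cleanup on
top: the operand is only read, so there is no need to copy it.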

>From 65403e2d08bc9decc3b13ed1220c3352163f8ddd Mon Sep 17 00:00:00 2001
From: Harsh Chandel <hchandel at qti.qualcomm.com>
Date: Mon, 8 Dec 2025 12:55:56 +0530
Subject: [PATCH 10/11] fixup! Address comments

Change-Id: I3098b45d7ca8b2cb164b56ecd92a46aca3dccf74
---
 llvm/lib/Target/RISCV/RISCVFeatures.td        |    2 -
 ...-branch-opt-load-atomic-acquire-seq_cst.ll |  782 +---
 .../RISCV/short-forward-branch-opt-load.ll    | 3405 +++--------------
 3 files changed, 760 insertions(+), 3429 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index dd7f36136ea4e..8425a9a231e97 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1928,8 +1928,6 @@ def TuneShortForwardBranchIMul
                        [TuneShortForwardBranchIALU]>;
 def HasShortForwardBranchIMul : Predicate<"Subtarget->hasShortForwardBranchIMul()">;
 
-
-
 def TuneShortForwardBranchILoad
     : SubtargetFeature<"short-forward-branch-iload", "HasShortForwardBranchILoad",
                        "true", "Enable short forward branch optimization for load instructions",
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire-seq_cst.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire-seq_cst.ll
index d4e418ebb8fd3..d8217fa397a3c 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire-seq_cst.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load-atomic-acquire-seq_cst.ll
@@ -1052,171 +1052,78 @@ entry:
 define i64 @test_i32_z_1_3(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i32_z_1_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    mv s2, a1
-; RV32I-NEXT:    addi a1, a0, 16
-; RV32I-NEXT:    li a0, 4
-; RV32I-NEXT:    addi a2, sp, 12
-; RV32I-NEXT:    li a3, 2
-; RV32I-NEXT:    call __atomic_load
-; RV32I-NEXT:    beqz s2, .LBB14_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    lw s1, 12(sp)
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB14_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB14_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_z_1_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    addi a1, a0, 16
-; RV64I-NEXT:    li a0, 4
-; RV64I-NEXT:    addi a2, sp, 4
-; RV64I-NEXT:    li a3, 2
-; RV64I-NEXT:    call __atomic_load
-; RV64I-NEXT:    beqz s1, .LBB14_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    lwu s0, 4(sp)
+; RV64I-NEXT:    lwu a0, 16(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB14_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB14_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_z_1_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    mv s2, a1
-; RV32I-SFB-NEXT:    addi a1, a0, 16
-; RV32I-SFB-NEXT:    li a0, 4
-; RV32I-SFB-NEXT:    addi a2, sp, 12
-; RV32I-SFB-NEXT:    li a3, 2
-; RV32I-SFB-NEXT:    call __atomic_load
-; RV32I-SFB-NEXT:    lw a0, 12(sp)
-; RV32I-SFB-NEXT:    bnez s2, .LBB14_2
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB14_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB14_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB14_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB14_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB14_4: # %entry
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_z_1_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    mv s1, a1
-; RV64I-SFB-NEXT:    addi a1, a0, 16
-; RV64I-SFB-NEXT:    li a0, 4
-; RV64I-SFB-NEXT:    addi a2, sp, 4
-; RV64I-SFB-NEXT:    li a3, 2
-; RV64I-SFB-NEXT:    call __atomic_load
-; RV64I-SFB-NEXT:    lwu a0, 4(sp)
-; RV64I-SFB-NEXT:    bnez s1, .LBB14_2
+; RV64I-SFB-NEXT:    lwu a0, 16(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB14_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB14_2: # %entry
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_z_1_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    mv s2, a1
-; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a0, 4
-; RV32I-SFBILOAD-NEXT:    addi a2, sp, 12
-; RV32I-SFBILOAD-NEXT:    li a3, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_2
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB14_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lw s1, 12(sp)
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB14_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB14_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB14_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB14_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_z_1_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    mv s1, a1
-; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a0, 4
-; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
-; RV64I-SFBILOAD-NEXT:    li a3, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB14_2
+; RV64I-SFBILOAD-NEXT:    lwu a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB14_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lwu s0, 4(sp)
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB14_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i32, ptr %addr acquire, align 2          ; load 32-bit value
+  %val = load atomic i32, ptr %addr acquire, align 4          ; load 32-bit value
   %ext = zext i32 %val to i64         ; zero-extend to 64 bits
   %res = select i1 %x, i64 %ext, i64 %b
   ret i64 %res
@@ -1709,225 +1616,87 @@ entry:
 define i64 @test_i32_z_store_64_3(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i32_z_store_64_3:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s3, a6
-; RV32I-NEXT:    mv s4, a5
-; RV32I-NEXT:    mv s5, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s2, a2
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    addi a1, a0, 16
-; RV32I-NEXT:    li a0, 4
-; RV32I-NEXT:    mv a2, sp
-; RV32I-NEXT:    li a3, 2
-; RV32I-NEXT:    call __atomic_load
-; RV32I-NEXT:    lw a0, 0(sp)
-; RV32I-NEXT:    sw s4, 0(s5)
-; RV32I-NEXT:    sw s3, 4(s5)
-; RV32I-NEXT:    bnez s1, .LBB20_2
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB20_2
 ; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB20_2: # %entry
-; RV32I-NEXT:    addi a1, s1, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_z_store_64_3:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s3, a1
-; RV64I-NEXT:    addi a1, a0, 16
-; RV64I-NEXT:    li a0, 4
-; RV64I-NEXT:    addi a2, sp, 4
-; RV64I-NEXT:    li a3, 2
-; RV64I-NEXT:    call __atomic_load
-; RV64I-NEXT:    lwu a0, 4(sp)
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    bnez s3, .LBB20_2
+; RV64I-NEXT:    lwu a0, 16(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB20_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB20_2: # %entry
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_z_store_64_3:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    mv s5, a1
-; RV32I-SFB-NEXT:    addi a1, a0, 16
-; RV32I-SFB-NEXT:    li a0, 4
-; RV32I-SFB-NEXT:    mv a2, sp
-; RV32I-SFB-NEXT:    li a3, 2
-; RV32I-SFB-NEXT:    call __atomic_load
-; RV32I-SFB-NEXT:    lw a0, 0(sp)
-; RV32I-SFB-NEXT:    beqz s5, .LBB20_2
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    beqz a1, .LBB20_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB20_2: # %entry
-; RV32I-SFB-NEXT:    bnez s5, .LBB20_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB20_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB20_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_z_store_64_3:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    mv s3, a1
-; RV64I-SFB-NEXT:    addi a1, a0, 16
-; RV64I-SFB-NEXT:    li a0, 4
-; RV64I-SFB-NEXT:    addi a2, sp, 4
-; RV64I-SFB-NEXT:    li a3, 2
-; RV64I-SFB-NEXT:    call __atomic_load
-; RV64I-SFB-NEXT:    lwu a0, 4(sp)
-; RV64I-SFB-NEXT:    bnez s3, .LBB20_2
+; RV64I-SFB-NEXT:    lwu a0, 16(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB20_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB20_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_3:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    mv s5, a1
-; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a0, 4
-; RV32I-SFBILOAD-NEXT:    mv a2, sp
-; RV32I-SFBILOAD-NEXT:    li a3, 2
-; RV32I-SFBILOAD-NEXT:    call __atomic_load
-; RV32I-SFBILOAD-NEXT:    lw a0, 0(sp)
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB20_2
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB20_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB20_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB20_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB20_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB20_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_3:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    mv s3, a1
-; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a0, 4
-; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
-; RV64I-SFBILOAD-NEXT:    li a3, 2
-; RV64I-SFBILOAD-NEXT:    call __atomic_load
-; RV64I-SFBILOAD-NEXT:    lwu a0, 4(sp)
-; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB20_2
+; RV64I-SFBILOAD-NEXT:    lwu a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB20_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB20_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i32, ptr %addr acquire, align 2          ; load 32-bit value
+  %val = load atomic i32, ptr %addr acquire, align 4          ; load 32-bit value
   %ext = zext i32 %val to i64         ; zero-extend to 64 bits
   store i64 %c, ptr %base1
   %res = select i1 %x, i64 %ext, i64 %b
@@ -3215,171 +2984,84 @@ entry:
 define i64 @test_i32_z_1_4(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i32_z_1_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    mv s2, a1
-; RV32I-NEXT:    addi a1, a0, 16
-; RV32I-NEXT:    li a0, 4
-; RV32I-NEXT:    addi a2, sp, 12
-; RV32I-NEXT:    li a3, 5
-; RV32I-NEXT:    call __atomic_load
-; RV32I-NEXT:    beqz s2, .LBB36_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    lw s1, 12(sp)
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    bnez a1, .LBB36_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB36_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_z_1_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    addi a1, a0, 16
-; RV64I-NEXT:    li a0, 4
-; RV64I-NEXT:    addi a2, sp, 4
-; RV64I-NEXT:    li a3, 5
-; RV64I-NEXT:    call __atomic_load
-; RV64I-NEXT:    beqz s1, .LBB36_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    lwu s0, 4(sp)
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lwu a0, 16(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    bnez a1, .LBB36_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB36_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_z_1_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    mv s2, a1
-; RV32I-SFB-NEXT:    addi a1, a0, 16
-; RV32I-SFB-NEXT:    li a0, 4
-; RV32I-SFB-NEXT:    addi a2, sp, 12
-; RV32I-SFB-NEXT:    li a3, 5
-; RV32I-SFB-NEXT:    call __atomic_load
-; RV32I-SFB-NEXT:    lw a0, 12(sp)
-; RV32I-SFB-NEXT:    bnez s2, .LBB36_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB36_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB36_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB36_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB36_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB36_4: # %entry
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_z_1_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    mv s1, a1
-; RV64I-SFB-NEXT:    addi a1, a0, 16
-; RV64I-SFB-NEXT:    li a0, 4
-; RV64I-SFB-NEXT:    addi a2, sp, 4
-; RV64I-SFB-NEXT:    li a3, 5
-; RV64I-SFB-NEXT:    call __atomic_load
-; RV64I-SFB-NEXT:    lwu a0, 4(sp)
-; RV64I-SFB-NEXT:    bnez s1, .LBB36_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lwu a0, 16(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB36_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB36_2: # %entry
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
+; RV64I-SFB-NEXT:    fence r, rw
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_z_1_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    mv s2, a1
-; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a0, 4
-; RV32I-SFBILOAD-NEXT:    addi a2, sp, 12
-; RV32I-SFBILOAD-NEXT:    li a3, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB36_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB36_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lw s1, 12(sp)
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB36_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB36_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB36_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB36_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_z_1_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    mv s1, a1
-; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a0, 4
-; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
-; RV64I-SFBILOAD-NEXT:    li a3, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB36_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lwu a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB36_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lwu s0, 4(sp)
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB36_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV64I-SFBILOAD-NEXT:    fence r, rw
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i32, ptr %addr seq_cst, align 2          ; load 32-bit value
+  %val = load atomic i32, ptr %addr seq_cst, align 4          ; load 32-bit value
   %ext = zext i32 %val to i64         ; zero-extend to 64 bits
   %res = select i1 %x, i64 %ext, i64 %b
   ret i64 %res
@@ -3899,225 +3581,93 @@ entry:
 define i64 @test_i32_z_store_64_4(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i32_z_store_64_4:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s3, a6
-; RV32I-NEXT:    mv s4, a5
-; RV32I-NEXT:    mv s5, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s2, a2
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    addi a1, a0, 16
-; RV32I-NEXT:    li a0, 4
-; RV32I-NEXT:    mv a2, sp
-; RV32I-NEXT:    li a3, 5
-; RV32I-NEXT:    call __atomic_load
-; RV32I-NEXT:    lw a0, 0(sp)
-; RV32I-NEXT:    sw s4, 0(s5)
-; RV32I-NEXT:    sw s3, 4(s5)
-; RV32I-NEXT:    bnez s1, .LBB42_2
+; RV32I-NEXT:    fence rw, rw
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    fence r, rw
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB42_2
 ; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB42_2: # %entry
-; RV32I-NEXT:    addi a1, s1, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_z_store_64_4:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s3, a1
-; RV64I-NEXT:    addi a1, a0, 16
-; RV64I-NEXT:    li a0, 4
-; RV64I-NEXT:    addi a2, sp, 4
-; RV64I-NEXT:    li a3, 5
-; RV64I-NEXT:    call __atomic_load
-; RV64I-NEXT:    lwu a0, 4(sp)
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    bnez s3, .LBB42_2
+; RV64I-NEXT:    fence rw, rw
+; RV64I-NEXT:    lwu a0, 16(a0)
+; RV64I-NEXT:    fence r, rw
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB42_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB42_2: # %entry
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_z_store_64_4:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    mv s5, a1
-; RV32I-SFB-NEXT:    addi a1, a0, 16
-; RV32I-SFB-NEXT:    li a0, 4
-; RV32I-SFB-NEXT:    mv a2, sp
-; RV32I-SFB-NEXT:    li a3, 5
-; RV32I-SFB-NEXT:    call __atomic_load
-; RV32I-SFB-NEXT:    lw a0, 0(sp)
-; RV32I-SFB-NEXT:    beqz s5, .LBB42_2
+; RV32I-SFB-NEXT:    fence rw, rw
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    fence r, rw
+; RV32I-SFB-NEXT:    beqz a1, .LBB42_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB42_2: # %entry
-; RV32I-SFB-NEXT:    bnez s5, .LBB42_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB42_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB42_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_z_store_64_4:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    mv s3, a1
-; RV64I-SFB-NEXT:    addi a1, a0, 16
-; RV64I-SFB-NEXT:    li a0, 4
-; RV64I-SFB-NEXT:    addi a2, sp, 4
-; RV64I-SFB-NEXT:    li a3, 5
-; RV64I-SFB-NEXT:    call __atomic_load
-; RV64I-SFB-NEXT:    lwu a0, 4(sp)
-; RV64I-SFB-NEXT:    bnez s3, .LBB42_2
+; RV64I-SFB-NEXT:    fence rw, rw
+; RV64I-SFB-NEXT:    lwu a0, 16(a0)
+; RV64I-SFB-NEXT:    fence r, rw
+; RV64I-SFB-NEXT:    bnez a1, .LBB42_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB42_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_4:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    mv s5, a1
-; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a0, 4
-; RV32I-SFBILOAD-NEXT:    mv a2, sp
-; RV32I-SFBILOAD-NEXT:    li a3, 5
-; RV32I-SFBILOAD-NEXT:    call __atomic_load
-; RV32I-SFBILOAD-NEXT:    lw a0, 0(sp)
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB42_2
+; RV32I-SFBILOAD-NEXT:    fence rw, rw
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    fence r, rw
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB42_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB42_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB42_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB42_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB42_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_4:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    mv s3, a1
-; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a0, 4
-; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
-; RV64I-SFBILOAD-NEXT:    li a3, 5
-; RV64I-SFBILOAD-NEXT:    call __atomic_load
-; RV64I-SFBILOAD-NEXT:    lwu a0, 4(sp)
-; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB42_2
+; RV64I-SFBILOAD-NEXT:    fence rw, rw
+; RV64I-SFBILOAD-NEXT:    lwu a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    fence r, rw
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB42_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB42_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i32, ptr %addr seq_cst, align 2          ; load 32-bit value
+  %val = load atomic i32, ptr %addr seq_cst, align 4          ; load 32-bit value
   %ext = zext i32 %val to i64         ; zero-extend to 64 bits
   store i64 %c, ptr %base1
   %res = select i1 %x, i64 %ext, i64 %b
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
index faf4dd0c57c7f..c64f5318fa3a5 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-load.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 | FileCheck %s --check-prefixes=RV32I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-ialu | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a | FileCheck %s --check-prefixes=RV32I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a | FileCheck %s --check-prefixes=RV64I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-ialu | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-ialu | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFB
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+short-forward-branch-iload | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+a,+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV32I-SFBILOAD
-; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-iload | \
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+a,+short-forward-branch-iload | \
 ; RUN:   FileCheck %s --check-prefixes=RV64I-SFBILOAD
 
 define i32 @test_i8_s(ptr %base, i1 zeroext %x, i32 %b) nounwind {
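
The +a additions above are what shrink the checks that follow: with the A
extension, aligned atomic loads lower inline to ordinary loads (plus fences
for the acquire and seq_cst orderings) instead of to libatomic calls, so the
short-forward-branch patterns can actually see a plain load instruction. A
minimal IR example of the effect, with a hypothetical function name that is
not taken from the test file:

  define i8 @sketch(ptr %p) {
    %v = load atomic i8, ptr %p monotonic, align 1
    ret i8 %v
  }

Compiled with llc -mtriple=riscv64 -mattr=+a this is a single byte load;
without +a it becomes a call to __atomic_load_1, which is the pattern the
old checks below had to spill and reload registers around.
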
@@ -2574,140 +2574,56 @@ entry:
 define i32 @test_i8_s_2(ptr %base, i1 zeroext %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i8_s_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s1, .LBB33_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    bnez a1, .LBB33_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB33_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB33_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    bnez a1, .LBB33_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB33_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    mv s1, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s1, .LBB33_2
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB33_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 24
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB33_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 16
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    mv s1, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s1, .LBB33_2
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB33_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB33_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    mv s1, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB33_2
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB33_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB33_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    mv s1, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB33_2
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB33_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB33_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -2720,134 +2636,56 @@ entry:
 define i32 @test_i8_z_2(ptr %base, i1 zeroext %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i8_z_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s1, .LBB34_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    bnez a1, .LBB34_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB34_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB34_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    bnez a1, .LBB34_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB34_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    mv s1, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s1, .LBB34_2
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB34_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    zext.b s0, a0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB34_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 16
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    mv s1, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s1, .LBB34_2
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB34_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB34_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    mv s1, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB34_2
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB34_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB34_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    mv s1, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB34_2
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB34_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB34_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -2860,140 +2698,56 @@ entry:
 define i32 @test_i16_s_2(ptr %base, i1 zeroext %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i16_s_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s1, .LBB35_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    bnez a1, .LBB35_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB35_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB35_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    bnez a1, .LBB35_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB35_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    mv s1, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s1, .LBB35_2
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB35_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB35_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 16
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_s_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    mv s1, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB35_2
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB35_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB35_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    mv s1, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB35_2
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB35_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB35_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    mv s1, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB35_2
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB35_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB35_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
@@ -3006,140 +2760,56 @@ entry:
 define i32 @test_i16_z_2(ptr %base, i1 zeroext %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i16_z_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s1, .LBB36_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    bnez a1, .LBB36_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB36_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB36_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    bnez a1, .LBB36_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB36_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    mv s1, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s1, .LBB36_2
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB36_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srli s0, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB36_2: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 16
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    mv s1, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB36_2
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB36_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB36_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    mv s1, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s1, .LBB36_2
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB36_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s0, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB36_2: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    mv s1, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB36_2
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB36_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB36_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
@@ -3152,128 +2822,56 @@ entry:
 define i32 @test_i32_2(ptr %base, i1 zeroext %x, i32 %b) nounwind {
 ; RV32I-LABEL: test_i32_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    addi a0, a0, 16
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_4
-; RV32I-NEXT:    bnez s1, .LBB37_2
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    bnez a1, .LBB37_2
 ; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB37_2: # %entry
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    addi a0, a0, 16
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_4
-; RV64I-NEXT:    bnez s1, .LBB37_2
+; RV64I-NEXT:    lw a0, 16(a0)
+; RV64I-NEXT:    bnez a1, .LBB37_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB37_2: # %entry
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a2
-; RV32I-SFB-NEXT:    mv s1, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 16
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_4
-; RV32I-SFB-NEXT:    bnez s1, .LBB37_2
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB37_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB37_2: # %entry
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 16
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    mv s1, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 16
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_4
-; RV64I-SFB-NEXT:    bnez s1, .LBB37_2
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB37_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB37_2: # %entry
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a2
-; RV32I-SFBILOAD-NEXT:    mv s1, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV32I-SFBILOAD-NEXT:    bnez s1, .LBB37_2
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB37_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB37_2: # %entry
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    mv s1, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB37_2
+; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB37_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB37_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4
@@ -3285,182 +2883,62 @@ entry:
 define i32 @test_i8_s_store_2(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i8_s_store_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s3, a1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB38_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 24
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB38_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB38_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_store_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s3, a1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB38_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB38_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB38_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_store_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    mv s3, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s3, .LBB38_2
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB38_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s2, a0, 24
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB38_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_store_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    mv s3, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s3, .LBB38_2
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB38_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB38_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_store_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    mv s3, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB38_2
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB38_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s2, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB38_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_store_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    mv s3, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB38_2
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB38_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB38_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -3474,176 +2952,62 @@ entry:
 define i32 @test_i8_z_store_2(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i8_z_store_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s3, a1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB39_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s0, a0
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB39_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB39_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_store_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s3, a1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB39_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB39_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB39_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_store_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    mv s3, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s3, .LBB39_2
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB39_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    zext.b s2, a0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB39_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_store_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    mv s3, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s3, .LBB39_2
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB39_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB39_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_store_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    mv s3, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB39_2
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB39_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB39_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_store_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    mv s3, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB39_2
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB39_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB39_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i32 4   ; compute base + 4
@@ -3657,182 +3021,62 @@ entry:
 define i32 @test_i16_s_store_2(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i16_s_store_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s3, a1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB40_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 16
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB40_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB40_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_store_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s3, a1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB40_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB40_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB40_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_store_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    mv s3, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s3, .LBB40_2
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB40_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s2, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB40_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_s_store_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    mv s3, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB40_2
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB40_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB40_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_store_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    mv s3, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB40_2
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB40_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s2, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB40_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_store_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    mv s3, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB40_2
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB40_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB40_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
@@ -3846,182 +3090,62 @@ entry:
 define i32 @test_i16_z_store_2(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i16_z_store_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s3, a1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    beqz s3, .LBB41_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s0, a0, 16
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB41_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB41_2: # %entry
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_store_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s3, a1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB41_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB41_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB41_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_store_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    mv s3, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s3, .LBB41_2
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB41_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srli s2, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB41_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    mv a0, s2
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_store_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    mv s3, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB41_2
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB41_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB41_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_store_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    mv s3, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s3, .LBB41_2
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB41_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s2, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB41_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_store_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    mv s3, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB41_2
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB41_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB41_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i32 4   ; compute base + 4
@@ -4035,170 +3159,62 @@ entry:
 define i32 @test_i32_store_2(ptr %base, i1 zeroext %x, i32 %b, ptr %base1, i32 %c) nounwind {
 ; RV32I-LABEL: test_i32_store_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s1, a4
-; RV32I-NEXT:    mv s2, a3
-; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s3, a1
-; RV32I-NEXT:    addi a0, a0, 16
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_4
-; RV32I-NEXT:    sw s1, 0(s2)
-; RV32I-NEXT:    bnez s3, .LBB42_2
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    sw a4, 0(a3)
+; RV32I-NEXT:    bnez a1, .LBB42_2
 ; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB42_2: # %entry
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_store_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s3, a1
-; RV64I-NEXT:    addi a0, a0, 16
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_4
-; RV64I-NEXT:    sw s1, 0(s2)
-; RV64I-NEXT:    bnez s3, .LBB42_2
+; RV64I-NEXT:    lw a0, 16(a0)
+; RV64I-NEXT:    sw a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB42_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB42_2: # %entry
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_store_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a4
-; RV32I-SFB-NEXT:    mv s1, a3
-; RV32I-SFB-NEXT:    mv s2, a2
-; RV32I-SFB-NEXT:    mv s3, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 16
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_4
-; RV32I-SFB-NEXT:    bnez s3, .LBB42_2
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    bnez a1, .LBB42_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s2
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB42_2: # %entry
-; RV32I-SFB-NEXT:    sw s0, 0(s1)
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    sw a4, 0(a3)
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_store_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    mv s3, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 16
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_4
-; RV64I-SFB-NEXT:    bnez s3, .LBB42_2
+; RV64I-SFB-NEXT:    lw a0, 16(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB42_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB42_2: # %entry
-; RV64I-SFB-NEXT:    sw s0, 0(s1)
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    sw a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_store_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a4
-; RV32I-SFBILOAD-NEXT:    mv s1, a3
-; RV32I-SFBILOAD-NEXT:    mv s2, a2
-; RV32I-SFBILOAD-NEXT:    mv s3, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV32I-SFBILOAD-NEXT:    bnez s3, .LBB42_2
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB42_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s2
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB42_2: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_store_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    mv s3, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_4
-; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB42_2
+; RV64I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB42_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB42_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sw s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    sw a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
  %addr = getelementptr i32, ptr %base, i32 4   ; compute base + 4 elements (byte offset 16)
@@ -4211,161 +3227,71 @@ entry:
 define i64 @test_i8_s_1_2(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i8_s_1_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    mv s2, a1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s2, .LBB43_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 24
-; RV32I-NEXT:  .LBB43_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    bnez a1, .LBB43_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB43_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_1_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB43_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    bnez a1, .LBB43_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB43_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_1_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    mv s2, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s2, .LBB43_2
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB43_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s1, a0, 24
+; RV32I-SFB-NEXT:    mv a2, a0
 ; RV32I-SFB-NEXT:  .LBB43_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB43_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB43_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB43_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_1_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    mv s1, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s1, .LBB43_2
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB43_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB43_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_1_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    mv s2, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB43_2
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB43_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s1, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB43_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB43_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB43_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB43_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_1_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    mv s1, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB43_2
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB43_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB43_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -4378,155 +3304,68 @@ entry:
 define i64 @test_i8_z_1_2(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i8_z_1_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    mv s2, a1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    beqz s2, .LBB44_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    bnez a1, .LBB44_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB44_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_1_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    beqz s1, .LBB44_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    bnez a1, .LBB44_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB44_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_1_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    mv s2, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s2, .LBB44_2
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB44_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB44_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB44_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB44_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    zext.b s1, a0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB44_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_1_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    mv s1, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s1, .LBB44_2
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB44_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s0, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB44_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_1_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    mv s2, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB44_2
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB44_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB44_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB44_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB44_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s1, a0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB44_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_1_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    mv s1, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB44_2
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB44_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s0, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB44_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -4539,161 +3378,71 @@ entry:
 define i64 @test_i16_s_1_2(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i16_s_1_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    mv s2, a1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s2, .LBB45_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 16
-; RV32I-NEXT:  .LBB45_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    bnez a1, .LBB45_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB45_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_1_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB45_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    bnez a1, .LBB45_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB45_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_1_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    mv s2, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s2, .LBB45_2
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB45_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s1, a0, 16
+; RV32I-SFB-NEXT:    mv a2, a0
 ; RV32I-SFB-NEXT:  .LBB45_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB45_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB45_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s0, a0, 31
+; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB45_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_s_1_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    mv s1, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB45_2
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB45_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB45_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_1_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    mv s2, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB45_2
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB45_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s1, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB45_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB45_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB45_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s0, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB45_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_1_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    mv s1, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB45_2
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB45_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB45_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
@@ -4706,161 +3455,68 @@ entry:
 define i64 @test_i16_z_1_2(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i16_z_1_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    mv s2, a1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    beqz s2, .LBB46_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    bnez a1, .LBB46_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB46_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_1_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    beqz s1, .LBB46_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    bnez a1, .LBB46_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB46_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_1_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -16
-; RV32I-SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    mv s2, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s2, .LBB46_2
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB46_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB46_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB46_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB46_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srli s1, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB46_4: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 16
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_1_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    mv s1, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s1, .LBB46_2
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB46_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s0, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB46_2: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_1_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -16
-; RV32I-SFBILOAD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    mv s2, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB46_2
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB46_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB46_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB46_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB46_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s1, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB46_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 16
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_1_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    mv s1, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB46_2
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB46_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s0, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB46_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
  %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4 elements (byte offset 8)
@@ -4873,171 +3529,72 @@ entry:
 define i64 @test_i32_z_1_2(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ; RV32I-LABEL: test_i32_z_1_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    mv s2, a1
-; RV32I-NEXT:    addi a1, a0, 16
-; RV32I-NEXT:    li a0, 4
-; RV32I-NEXT:    addi a2, sp, 12
-; RV32I-NEXT:    li a3, 0
-; RV32I-NEXT:    call __atomic_load
-; RV32I-NEXT:    beqz s2, .LBB47_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    lw s1, 12(sp)
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    bnez a1, .LBB47_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB47_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_z_1_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    addi a1, a0, 16
-; RV64I-NEXT:    li a0, 4
-; RV64I-NEXT:    addi a2, sp, 4
-; RV64I-NEXT:    li a3, 0
-; RV64I-NEXT:    call __atomic_load
-; RV64I-NEXT:    beqz s1, .LBB47_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    lwu s0, 4(sp)
+; RV64I-NEXT:    lwu a0, 16(a0)
+; RV64I-NEXT:    bnez a1, .LBB47_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB47_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_z_1_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a3
-; RV32I-SFB-NEXT:    mv s1, a2
-; RV32I-SFB-NEXT:    mv s2, a1
-; RV32I-SFB-NEXT:    addi a1, a0, 16
-; RV32I-SFB-NEXT:    li a0, 4
-; RV32I-SFB-NEXT:    addi a2, sp, 12
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:    call __atomic_load
-; RV32I-SFB-NEXT:    lw a0, 12(sp)
-; RV32I-SFB-NEXT:    bnez s2, .LBB47_2
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB47_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    mv a0, s1
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB47_2: # %entry
-; RV32I-SFB-NEXT:    beqz s2, .LBB47_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB47_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    li s0, 0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB47_4: # %entry
-; RV32I-SFB-NEXT:    mv a1, s0
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_z_1_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    mv s1, a1
-; RV64I-SFB-NEXT:    addi a1, a0, 16
-; RV64I-SFB-NEXT:    li a0, 4
-; RV64I-SFB-NEXT:    addi a2, sp, 4
-; RV64I-SFB-NEXT:    li a3, 0
-; RV64I-SFB-NEXT:    call __atomic_load
-; RV64I-SFB-NEXT:    lwu a0, 4(sp)
-; RV64I-SFB-NEXT:    bnez s1, .LBB47_2
+; RV64I-SFB-NEXT:    lwu a0, 16(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB47_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB47_2: # %entry
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_z_1_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a3
-; RV32I-SFBILOAD-NEXT:    mv s1, a2
-; RV32I-SFBILOAD-NEXT:    mv s2, a1
-; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a0, 4
-; RV32I-SFBILOAD-NEXT:    addi a2, sp, 12
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB47_2
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB47_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    lw s1, 12(sp)
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB47_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s2, .LBB47_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB47_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    li s0, 0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB47_4: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s1
-; RV32I-SFBILOAD-NEXT:    mv a1, s0
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_z_1_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    mv s1, a1
-; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a0, 4
-; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
-; RV64I-SFBILOAD-NEXT:    li a3, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load
-; RV64I-SFBILOAD-NEXT:    beqz s1, .LBB47_2
+; RV64I-SFBILOAD-NEXT:    lwu a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB47_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    lwu s0, 4(sp)
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB47_2: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
  %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4 elements (byte offset 16)
-  %val = load atomic i32, ptr %addr monotonic, align 2          ; load 32-bit value
+  %val = load atomic i32, ptr %addr monotonic, align 4          ; load 32-bit value
   %ext = zext i32 %val to i64         ; zero-extend to 64 bits
   %res = select i1 %x, i64 %ext, i64 %b
   ret i64 %res
@@ -5071,23 +3628,11 @@ define i64 @test_i64_1_2(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ;
 ; RV64I-LABEL: test_i64_1_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    addi a0, a0, 32
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_8
-; RV64I-NEXT:    bnez s1, .LBB48_2
+; RV64I-NEXT:    ld a0, 32(a0)
+; RV64I-NEXT:    bnez a1, .LBB48_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB48_2: # %entry
-; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i64_1_2:
@@ -5120,23 +3665,11 @@ define i64 @test_i64_1_2(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ;
 ; RV64I-SFB-LABEL: test_i64_1_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -32
-; RV64I-SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a2
-; RV64I-SFB-NEXT:    mv s1, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 32
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_8
-; RV64I-SFB-NEXT:    bnez s1, .LBB48_2
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB48_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB48_2: # %entry
-; RV64I-SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 32
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i64_1_2:
@@ -5169,23 +3702,11 @@ define i64 @test_i64_1_2(ptr %base, i1 zeroext %x, i64 %b) nounwind {
 ;
 ; RV64I-SFBILOAD-LABEL: test_i64_1_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV64I-SFBILOAD-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a2
-; RV64I-SFBILOAD-NEXT:    mv s1, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
-; RV64I-SFBILOAD-NEXT:    bnez s1, .LBB48_2
+; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB48_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB48_2: # %entry
-; RV64I-SFBILOAD-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 32
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
  %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4 elements (byte offset 32)
@@ -5197,215 +3718,80 @@ entry:
 define i64 @test_i8_s_store_64_2(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i8_s_store_64_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    mv s5, a1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    beqz s5, .LBB49_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 24
-; RV32I-NEXT:  .LBB49_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    lb a0, 4(a0)
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB49_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB49_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_s_store_64_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s3, a1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB49_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai s0, a0, 56
+; RV64I-NEXT:    lb a0, 4(a0)
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB49_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB49_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_s_store_64_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    mv s5, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    slli a0, a0, 24
-; RV32I-SFB-NEXT:    beqz s5, .LBB49_2
+; RV32I-SFB-NEXT:    lb a0, 4(a0)
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    beqz a1, .LBB49_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s4, a0, 24
+; RV32I-SFB-NEXT:    mv a2, a0
 ; RV32I-SFB-NEXT:  .LBB49_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB49_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB49_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s3, a0, 31
+; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB49_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_s_store_64_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    mv s3, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    slli a0, a0, 56
-; RV64I-SFB-NEXT:    beqz s3, .LBB49_2
+; RV64I-SFB-NEXT:    lb a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB49_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 56
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB49_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_s_store_64_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    mv s5, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 24
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB49_2
+; RV32I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB49_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s4, a0, 24
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB49_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB49_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB49_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB49_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_s_store_64_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    mv s3, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 56
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB49_2
+; RV64I-SFBILOAD-NEXT:    lb a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB49_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 56
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB49_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -5419,209 +3805,77 @@ entry:
 define i64 @test_i8_z_store_64_2(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i8_z_store_64_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s3, a6
-; RV32I-NEXT:    mv s4, a5
-; RV32I-NEXT:    mv s5, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    mv s2, a1
-; RV32I-NEXT:    addi a0, a0, 4
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_1
-; RV32I-NEXT:    sw s4, 0(s5)
-; RV32I-NEXT:    sw s3, 4(s5)
-; RV32I-NEXT:    beqz s2, .LBB50_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    zext.b s1, a0
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB50_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB50_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i8_z_store_64_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s3, a1
-; RV64I-NEXT:    addi a0, a0, 4
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_1
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB50_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    zext.b s0, a0
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB50_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB50_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i8_z_store_64_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    mv s5, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 4
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_1
-; RV32I-SFB-NEXT:    beqz s5, .LBB50_2
+; RV32I-SFB-NEXT:    lbu a0, 4(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB50_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB50_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB50_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB50_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    zext.b s4, a0
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB50_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i8_z_store_64_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    mv s3, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 4
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_1
-; RV64I-SFB-NEXT:    beqz s3, .LBB50_2
+; RV64I-SFB-NEXT:    lbu a0, 4(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB50_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    zext.b s2, a0
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB50_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i8_z_store_64_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    mv s5, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB50_2
+; RV32I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB50_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB50_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB50_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB50_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    zext.b s4, a0
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB50_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i8_z_store_64_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    mv s3, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 4
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_1
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB50_2
+; RV64I-SFBILOAD-NEXT:    lbu a0, 4(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB50_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    zext.b s2, a0
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB50_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i8, ptr %base, i64 4   ; compute base + 4
@@ -5635,215 +3889,80 @@ entry:
 define i64 @test_i16_s_store_64_2(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i16_s_store_64_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s2, a6
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:    mv s4, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    mv s5, a1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s3, 0(s4)
-; RV32I-NEXT:    sw s2, 4(s4)
-; RV32I-NEXT:    beqz s5, .LBB51_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    srai s1, a0, 16
-; RV32I-NEXT:  .LBB51_2: # %entry
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    lh a0, 8(a0)
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB51_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB51_2:
+; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_s_store_64_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s3, a1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB51_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai s0, a0, 48
+; RV64I-NEXT:    lh a0, 8(a0)
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB51_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB51_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_s_store_64_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    mv s5, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s5, .LBB51_2
+; RV32I-SFB-NEXT:    lh a0, 8(a0)
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    beqz a1, .LBB51_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    srai s4, a0, 16
+; RV32I-SFB-NEXT:    mv a2, a0
 ; RV32I-SFB-NEXT:  .LBB51_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB51_4
+; RV32I-SFB-NEXT:    beqz a1, .LBB51_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srai s3, a0, 31
+; RV32I-SFB-NEXT:    srai a3, a0, 31
 ; RV32I-SFB-NEXT:  .LBB51_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a0, a2
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_s_store_64_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    mv s3, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB51_2
+; RV64I-SFB-NEXT:    lh a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB51_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srai s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB51_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_s_store_64_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    mv s5, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB51_2
+; RV32I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB51_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s4, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a2, a0
 ; RV32I-SFBILOAD-NEXT:  .LBB51_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB51_4
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB51_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srai s3, a0, 31
+; RV32I-SFBILOAD-NEXT:    srai a3, a0, 31
 ; RV32I-SFBILOAD-NEXT:  .LBB51_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_s_store_64_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    mv s3, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB51_2
+; RV64I-SFBILOAD-NEXT:    lh a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB51_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srai s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB51_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
@@ -5857,215 +3976,77 @@ entry:
 define i64 @test_i16_z_store_64_2(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i16_z_store_64_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s3, a6
-; RV32I-NEXT:    mv s4, a5
-; RV32I-NEXT:    mv s5, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s1, a2
-; RV32I-NEXT:    mv s2, a1
-; RV32I-NEXT:    addi a0, a0, 8
-; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    call __atomic_load_2
-; RV32I-NEXT:    sw s4, 0(s5)
-; RV32I-NEXT:    sw s3, 4(s5)
-; RV32I-NEXT:    beqz s2, .LBB52_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srli s1, a0, 16
+; RV32I-NEXT:    lhu a0, 8(a0)
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB52_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB52_2: # %entry
-; RV32I-NEXT:    addi a1, s2, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i16_z_store_64_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s3, a1
-; RV64I-NEXT:    addi a0, a0, 8
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_2
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    beqz s3, .LBB52_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli s0, a0, 48
+; RV64I-NEXT:    lhu a0, 8(a0)
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB52_2
+; RV64I-NEXT:  # %bb.1: # %entry
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB52_2: # %entry
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i16_z_store_64_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    mv s5, a1
-; RV32I-SFB-NEXT:    addi a0, a0, 8
-; RV32I-SFB-NEXT:    li a1, 0
-; RV32I-SFB-NEXT:    call __atomic_load_2
-; RV32I-SFB-NEXT:    slli a0, a0, 16
-; RV32I-SFB-NEXT:    beqz s5, .LBB52_2
+; RV32I-SFB-NEXT:    lhu a0, 8(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB52_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB52_2: # %entry
-; RV32I-SFB-NEXT:    beqz s5, .LBB52_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB52_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    srli s4, a0, 16
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB52_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a0, s4
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i16_z_store_64_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    mv s3, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 8
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_2
-; RV64I-SFB-NEXT:    slli a0, a0, 48
-; RV64I-SFB-NEXT:    beqz s3, .LBB52_2
+; RV64I-SFB-NEXT:    lhu a0, 8(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB52_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    srli s2, a0, 48
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB52_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    mv a0, s2
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i16_z_store_64_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    mv s5, a1
-; RV32I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV32I-SFBILOAD-NEXT:    li a1, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV32I-SFBILOAD-NEXT:    slli a0, a0, 16
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB52_2
+; RV32I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB52_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB52_2: # %entry
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB52_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB52_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    srli s4, a0, 16
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB52_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i16_z_store_64_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    mv s3, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 8
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_2
-; RV64I-SFBILOAD-NEXT:    slli a0, a0, 48
-; RV64I-SFBILOAD-NEXT:    beqz s3, .LBB52_2
+; RV64I-SFBILOAD-NEXT:    lhu a0, 8(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB52_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    srli s2, a0, 48
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB52_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i16, ptr %base, i64 4   ; compute base + 4
@@ -6079,225 +4060,81 @@ entry:
 define i64 @test_i32_z_store_64_2(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i64 %c) nounwind {
 ; RV32I-LABEL: test_i32_z_store_64_2:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s3, a6
-; RV32I-NEXT:    mv s4, a5
-; RV32I-NEXT:    mv s5, a4
-; RV32I-NEXT:    mv s0, a3
-; RV32I-NEXT:    mv s2, a2
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    addi a1, a0, 16
-; RV32I-NEXT:    li a0, 4
-; RV32I-NEXT:    mv a2, sp
-; RV32I-NEXT:    li a3, 0
-; RV32I-NEXT:    call __atomic_load
-; RV32I-NEXT:    lw a0, 0(sp)
-; RV32I-NEXT:    sw s4, 0(s5)
-; RV32I-NEXT:    sw s3, 4(s5)
-; RV32I-NEXT:    bnez s1, .LBB53_2
+; RV32I-NEXT:    lw a0, 16(a0)
+; RV32I-NEXT:    sw a5, 0(a4)
+; RV32I-NEXT:    sw a6, 4(a4)
+; RV32I-NEXT:    bnez a1, .LBB53_2
 ; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB53_2: # %entry
-; RV32I-NEXT:    addi a1, s1, -1
-; RV32I-NEXT:    and a1, a1, s0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_i32_z_store_64_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s3, a1
-; RV64I-NEXT:    addi a1, a0, 16
-; RV64I-NEXT:    li a0, 4
-; RV64I-NEXT:    addi a2, sp, 4
-; RV64I-NEXT:    li a3, 0
-; RV64I-NEXT:    call __atomic_load
-; RV64I-NEXT:    lwu a0, 4(sp)
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    bnez s3, .LBB53_2
+; RV64I-NEXT:    lwu a0, 16(a0)
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB53_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB53_2: # %entry
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i32_z_store_64_2:
 ; RV32I-SFB:       # %bb.0: # %entry
-; RV32I-SFB-NEXT:    addi sp, sp, -32
-; RV32I-SFB-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFB-NEXT:    mv s0, a6
-; RV32I-SFB-NEXT:    mv s1, a5
-; RV32I-SFB-NEXT:    mv s2, a4
-; RV32I-SFB-NEXT:    mv s3, a3
-; RV32I-SFB-NEXT:    mv s4, a2
-; RV32I-SFB-NEXT:    mv s5, a1
-; RV32I-SFB-NEXT:    addi a1, a0, 16
-; RV32I-SFB-NEXT:    li a0, 4
-; RV32I-SFB-NEXT:    mv a2, sp
-; RV32I-SFB-NEXT:    li a3, 0
-; RV32I-SFB-NEXT:    call __atomic_load
-; RV32I-SFB-NEXT:    lw a0, 0(sp)
-; RV32I-SFB-NEXT:    beqz s5, .LBB53_2
+; RV32I-SFB-NEXT:    lw a0, 16(a0)
+; RV32I-SFB-NEXT:    beqz a1, .LBB53_2
 ; RV32I-SFB-NEXT:  # %bb.1: # %entry
-; RV32I-SFB-NEXT:    li s3, 0
+; RV32I-SFB-NEXT:    li a3, 0
 ; RV32I-SFB-NEXT:  .LBB53_2: # %entry
-; RV32I-SFB-NEXT:    bnez s5, .LBB53_4
+; RV32I-SFB-NEXT:    bnez a1, .LBB53_4
 ; RV32I-SFB-NEXT:  # %bb.3: # %entry
-; RV32I-SFB-NEXT:    mv a0, s4
+; RV32I-SFB-NEXT:    mv a0, a2
 ; RV32I-SFB-NEXT:  .LBB53_4: # %entry
-; RV32I-SFB-NEXT:    sw s1, 0(s2)
-; RV32I-SFB-NEXT:    sw s0, 4(s2)
-; RV32I-SFB-NEXT:    mv a1, s3
-; RV32I-SFB-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFB-NEXT:    addi sp, sp, 32
+; RV32I-SFB-NEXT:    sw a5, 0(a4)
+; RV32I-SFB-NEXT:    sw a6, 4(a4)
+; RV32I-SFB-NEXT:    mv a1, a3
 ; RV32I-SFB-NEXT:    ret
 ;
 ; RV64I-SFB-LABEL: test_i32_z_store_64_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    mv s3, a1
-; RV64I-SFB-NEXT:    addi a1, a0, 16
-; RV64I-SFB-NEXT:    li a0, 4
-; RV64I-SFB-NEXT:    addi a2, sp, 4
-; RV64I-SFB-NEXT:    li a3, 0
-; RV64I-SFB-NEXT:    call __atomic_load
-; RV64I-SFB-NEXT:    lwu a0, 4(sp)
-; RV64I-SFB-NEXT:    bnez s3, .LBB53_2
+; RV64I-SFB-NEXT:    lwu a0, 16(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB53_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB53_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i32_z_store_64_2:
 ; RV32I-SFBILOAD:       # %bb.0: # %entry
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, -32
-; RV32I-SFBILOAD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-SFBILOAD-NEXT:    mv s0, a6
-; RV32I-SFBILOAD-NEXT:    mv s1, a5
-; RV32I-SFBILOAD-NEXT:    mv s2, a4
-; RV32I-SFBILOAD-NEXT:    mv s3, a3
-; RV32I-SFBILOAD-NEXT:    mv s4, a2
-; RV32I-SFBILOAD-NEXT:    mv s5, a1
-; RV32I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV32I-SFBILOAD-NEXT:    li a0, 4
-; RV32I-SFBILOAD-NEXT:    mv a2, sp
-; RV32I-SFBILOAD-NEXT:    li a3, 0
-; RV32I-SFBILOAD-NEXT:    call __atomic_load
-; RV32I-SFBILOAD-NEXT:    lw a0, 0(sp)
-; RV32I-SFBILOAD-NEXT:    beqz s5, .LBB53_2
+; RV32I-SFBILOAD-NEXT:    lw a0, 16(a0)
+; RV32I-SFBILOAD-NEXT:    beqz a1, .LBB53_2
 ; RV32I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV32I-SFBILOAD-NEXT:    li s3, 0
+; RV32I-SFBILOAD-NEXT:    li a3, 0
 ; RV32I-SFBILOAD-NEXT:  .LBB53_2: # %entry
-; RV32I-SFBILOAD-NEXT:    bnez s5, .LBB53_4
+; RV32I-SFBILOAD-NEXT:    bnez a1, .LBB53_4
 ; RV32I-SFBILOAD-NEXT:  # %bb.3: # %entry
-; RV32I-SFBILOAD-NEXT:    mv a0, s4
+; RV32I-SFBILOAD-NEXT:    mv a0, a2
 ; RV32I-SFBILOAD-NEXT:  .LBB53_4: # %entry
-; RV32I-SFBILOAD-NEXT:    sw s1, 0(s2)
-; RV32I-SFBILOAD-NEXT:    sw s0, 4(s2)
-; RV32I-SFBILOAD-NEXT:    mv a1, s3
-; RV32I-SFBILOAD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-SFBILOAD-NEXT:    addi sp, sp, 32
+; RV32I-SFBILOAD-NEXT:    sw a5, 0(a4)
+; RV32I-SFBILOAD-NEXT:    sw a6, 4(a4)
+; RV32I-SFBILOAD-NEXT:    mv a1, a3
 ; RV32I-SFBILOAD-NEXT:    ret
 ;
 ; RV64I-SFBILOAD-LABEL: test_i32_z_store_64_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    mv s3, a1
-; RV64I-SFBILOAD-NEXT:    addi a1, a0, 16
-; RV64I-SFBILOAD-NEXT:    li a0, 4
-; RV64I-SFBILOAD-NEXT:    addi a2, sp, 4
-; RV64I-SFBILOAD-NEXT:    li a3, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load
-; RV64I-SFBILOAD-NEXT:    lwu a0, 4(sp)
-; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB53_2
+; RV64I-SFBILOAD-NEXT:    lwu a0, 16(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB53_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB53_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i32, ptr %base, i64 4   ; compute base + 4
-  %val = load atomic i32, ptr %addr monotonic, align 2          ; load 32-bit value
+  %val = load atomic i32, ptr %addr monotonic, align 4          ; load 32-bit value
   %ext = zext i32 %val to i64         ; zero-extend to 64 bits
   store i64 %c, ptr %base1
   %res = select i1 %x, i64 %ext, i64 %b
@@ -6343,30 +4180,12 @@ define i64 @test_i64_store_64_2(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i6
 ;
 ; RV64I-LABEL: test_i64_store_64_2:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    addi sp, sp, -48
-; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s1, a4
-; RV64I-NEXT:    mv s2, a3
-; RV64I-NEXT:    mv s0, a2
-; RV64I-NEXT:    mv s3, a1
-; RV64I-NEXT:    addi a0, a0, 32
-; RV64I-NEXT:    li a1, 0
-; RV64I-NEXT:    call __atomic_load_8
-; RV64I-NEXT:    sd s1, 0(s2)
-; RV64I-NEXT:    bnez s3, .LBB54_2
+; RV64I-NEXT:    ld a0, 32(a0)
+; RV64I-NEXT:    sd a4, 0(a3)
+; RV64I-NEXT:    bnez a1, .LBB54_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:  .LBB54_2: # %entry
-; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 48
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-SFB-LABEL: test_i64_store_64_2:
@@ -6410,30 +4229,12 @@ define i64 @test_i64_store_64_2(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i6
 ;
 ; RV64I-SFB-LABEL: test_i64_store_64_2:
 ; RV64I-SFB:       # %bb.0: # %entry
-; RV64I-SFB-NEXT:    addi sp, sp, -48
-; RV64I-SFB-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFB-NEXT:    mv s0, a4
-; RV64I-SFB-NEXT:    mv s1, a3
-; RV64I-SFB-NEXT:    mv s2, a2
-; RV64I-SFB-NEXT:    mv s3, a1
-; RV64I-SFB-NEXT:    addi a0, a0, 32
-; RV64I-SFB-NEXT:    li a1, 0
-; RV64I-SFB-NEXT:    call __atomic_load_8
-; RV64I-SFB-NEXT:    bnez s3, .LBB54_2
+; RV64I-SFB-NEXT:    ld a0, 32(a0)
+; RV64I-SFB-NEXT:    bnez a1, .LBB54_2
 ; RV64I-SFB-NEXT:  # %bb.1: # %entry
-; RV64I-SFB-NEXT:    mv a0, s2
+; RV64I-SFB-NEXT:    mv a0, a2
 ; RV64I-SFB-NEXT:  .LBB54_2: # %entry
-; RV64I-SFB-NEXT:    sd s0, 0(s1)
-; RV64I-SFB-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFB-NEXT:    addi sp, sp, 48
+; RV64I-SFB-NEXT:    sd a4, 0(a3)
 ; RV64I-SFB-NEXT:    ret
 ;
 ; RV32I-SFBILOAD-LABEL: test_i64_store_64_2:
@@ -6477,30 +4278,12 @@ define i64 @test_i64_store_64_2(ptr %base, i1 zeroext %x, i64 %b, ptr %base1, i6
 ;
 ; RV64I-SFBILOAD-LABEL: test_i64_store_64_2:
 ; RV64I-SFBILOAD:       # %bb.0: # %entry
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, -48
-; RV64I-SFBILOAD-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-SFBILOAD-NEXT:    mv s0, a4
-; RV64I-SFBILOAD-NEXT:    mv s1, a3
-; RV64I-SFBILOAD-NEXT:    mv s2, a2
-; RV64I-SFBILOAD-NEXT:    mv s3, a1
-; RV64I-SFBILOAD-NEXT:    addi a0, a0, 32
-; RV64I-SFBILOAD-NEXT:    li a1, 0
-; RV64I-SFBILOAD-NEXT:    call __atomic_load_8
-; RV64I-SFBILOAD-NEXT:    bnez s3, .LBB54_2
+; RV64I-SFBILOAD-NEXT:    ld a0, 32(a0)
+; RV64I-SFBILOAD-NEXT:    bnez a1, .LBB54_2
 ; RV64I-SFBILOAD-NEXT:  # %bb.1: # %entry
-; RV64I-SFBILOAD-NEXT:    mv a0, s2
+; RV64I-SFBILOAD-NEXT:    mv a0, a2
 ; RV64I-SFBILOAD-NEXT:  .LBB54_2: # %entry
-; RV64I-SFBILOAD-NEXT:    sd s0, 0(s1)
-; RV64I-SFBILOAD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
-; RV64I-SFBILOAD-NEXT:    addi sp, sp, 48
+; RV64I-SFBILOAD-NEXT:    sd a4, 0(a3)
 ; RV64I-SFBILOAD-NEXT:    ret
 entry:
   %addr = getelementptr i64, ptr %base, i64 4   ; compute base + 4

>From 633230a1f3a9bca8c9ce4848d6da6fcce1fb5649 Mon Sep 17 00:00:00 2001
From: Harsh Chandel <hchandel at qti.qualcomm.com>
Date: Wed, 10 Dec 2025 09:50:37 +0530
Subject: [PATCH 11/11] fixup! Address comments

Change-Id: I24e728250783c8331d12882ab8198e3824c819b2
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 27b9f45877c40..489cf96f38750 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -897,7 +897,7 @@ MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
       .addImm(0);
 }
 
-unsigned getLoadPredicatedOpcode(unsigned Opcode) {
+static unsigned getLoadPredicatedOpcode(unsigned Opcode) {
   switch (Opcode) {
   case RISCV::LB:
     return RISCV::PseudoCCLB;
@@ -941,11 +941,8 @@ MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
 
   // Create a new predicated version of DefMI.
   MachineInstrBuilder NewMI = BuildMI(*MI.getParent(), InsertPt,
-                                      MI.getDebugLoc(), get(PredOpc), DestReg);
-
-  // Copy the condition portion.
-  NewMI.add(MI.getOperand(1));
-  NewMI.add(MI.getOperand(2));
+                                      MI.getDebugLoc(), get(PredOpc), DestReg)
+                                  .add({MI.getOperand(1), MI.getOperand(2)});
 
   // Add condition code, inverting if necessary.
   auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());


