[llvm] 6500268 - [RISCV][SDAG] Prefer ShortForwardBranch to lower sdiv by pow2 (#67364)

Fri Nov 10 05:38:51 PST 2023


Author: Yingwei Zheng
Date: 2023-11-10T21:38:47+08:00
New Revision: 650026897c38ffebabd8c8377596141c37b65699

URL: https://github.com/llvm/llvm-project/commit/650026897c38ffebabd8c8377596141c37b65699
DIFF: https://github.com/llvm/llvm-project/commit/650026897c38ffebabd8c8377596141c37b65699.diff

LOG: [RISCV][SDAG] Prefer ShortForwardBranch to lower sdiv by pow2 (#67364)

This patch lowers `sdiv x, +/-2**k` to `add + select + shift` when the
short forward branch optimization is enabled. This instruction sequence is
faster than the one generated by the target-independent DAGCombiner. The
algorithm is described in ***Hacker's Delight*** (section 10-1).
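For reference, a minimal standalone C++ sketch of the identity used here (not part of
the patch; it assumes two's-complement integers with an arithmetic right shift for
negative values, and the helper name `sdivByPow2` is purely illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Quotient of X / (+/-)2**K via the conditional-add-then-shift identity:
//   X / 2**K == (X < 0 ? X + (2**K - 1) : X) >> K   (arithmetic shift)
// and the result is negated when the divisor is -2**K.
int64_t sdivByPow2(int64_t X, unsigned K, bool NegativeDivisor) {
  int64_t Biased = X < 0 ? X + ((int64_t(1) << K) - 1) : X; // add 2**K-1 only if X < 0
  int64_t Quot = Biased >> K;                               // shift now rounds toward zero
  return NegativeDivisor ? -Quot : Quot;
}

int main() {
  for (int64_t X : {-100, -9, -8, -1, 0, 1, 7, 8, 100}) {
    assert(sdivByPow2(X, 3, /*NegativeDivisor=*/false) == X / 8);
    assert(sdivByPow2(X, 3, /*NegativeDivisor=*/true) == X / -8);
  }
  return 0;
}
```

With short forward branches available, the conditional add becomes a `bgez` over a
single `addi`, as the new `sdiv-pow2-cmov.ll` test below demonstrates.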

This patch also removes the duplicated logic from the X86 and AArch64
backends. We cannot do the same for the PowerPC backend, since its lowering
emits the special instruction `addze`.

Added: 
    llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/lib/Target/X86/X86ISelLowering.cpp

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 58aad70c4bb36e6..862d8a2b3dc48bb 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4919,6 +4919,10 @@ class TargetLowering : public TargetLoweringBase {
                     SmallVectorImpl<SDNode *> &Created) const;
   SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
                     SmallVectorImpl<SDNode *> &Created) const;
+  // Build sdiv by power-of-2 with conditional move instructions
+  SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor,
+                                SelectionDAG &DAG,
+                                SmallVectorImpl<SDNode *> &Created) const;
 
   /// Targets may override this function to provide custom SDIV lowering for
   /// power-of-2 denominators.  If the target returns an empty SDValue, LLVM

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 80f595ac4d4d9c0..ed352c86eca06e5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6081,6 +6081,49 @@ TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
   return SDValue();
 }
 
+/// Build sdiv by power-of-2 with conditional move instructions
+/// Ref: "Hacker's Delight" by Henry Warren 10-1
+/// If conditional move/branch is preferred, we lower sdiv x, +/-2**k into:
+///   bgez x, label
+///   add x, x, 2**k-1
+/// label:
+///   sra res, x, k
+///   neg res, res (when the divisor is negative)
+SDValue TargetLowering::buildSDIVPow2WithCMov(
+    SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+    SmallVectorImpl<SDNode *> &Created) const {
+  unsigned Lg2 = Divisor.countr_zero();
+  EVT VT = N->getValueType(0);
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
+  SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
+
+  // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
+  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+  SDValue Cmp = DAG.getSetCC(DL, CCVT, N0, Zero, ISD::SETLT);
+  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
+  SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
+
+  Created.push_back(Cmp.getNode());
+  Created.push_back(Add.getNode());
+  Created.push_back(CMov.getNode());
+
+  // Divide by pow2.
+  SDValue SRA =
+      DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, VT));
+
+  // If we're dividing by a positive value, we're done.  Otherwise, we must
+  // negate the result.
+  if (Divisor.isNonNegative())
+    return SRA;
+
+  Created.push_back(SRA.getNode());
+  return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
+}
+
 /// Given an ISD::SDIV node expressing a divide by constant,
 /// return a DAG expression to select that will generate the same value by
 /// multiplying by a magic number.

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9b5c7381dfe8342..b61e4be705709e2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16356,33 +16356,7 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
       !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
     return SDValue();
 
-  SDLoc DL(N);
-  SDValue N0 = N->getOperand(0);
-  unsigned Lg2 = Divisor.countr_zero();
-  SDValue Zero = DAG.getConstant(0, DL, VT);
-  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
-
-  // Add (N0 < 0) ? Pow2 - 1 : 0;
-  SDValue CCVal;
-  SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
-  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
-  SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
-
-  Created.push_back(Cmp.getNode());
-  Created.push_back(Add.getNode());
-  Created.push_back(CSel.getNode());
-
-  // Divide by pow2.
-  SDValue SRA =
-      DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
-
-  // If we're dividing by a positive value, we're done.  Otherwise, we must
-  // negate the result.
-  if (Divisor.isNonNegative())
-    return SRA;
-
-  Created.push_back(SRA.getNode());
-  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
+  return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
 }
 
 SDValue

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8d13563eb138150..a4cd8327f45f82a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -19654,6 +19654,26 @@ bool RISCVTargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
   return false;
 }
 
+SDValue
+RISCVTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+                                   SelectionDAG &DAG,
+                                   SmallVectorImpl<SDNode *> &Created) const {
+  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+  if (isIntDivCheap(N->getValueType(0), Attr))
+    return SDValue(N, 0); // Lower SDIV as SDIV
+
+  // Only perform this transform if short forward branch opt is supported.
+  if (!Subtarget.hasShortForwardBranchOpt())
+    return SDValue();
+  EVT VT = N->getValueType(0);
+  if (!(VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit())))
+    return SDValue();
+
+  // Ensure 2**k-1 < 2048 so that we can just emit a single addi/addiw.
+  if (Divisor.sgt(2048) || Divisor.slt(-2048))
+    return SDValue();
+  return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
+}
 namespace llvm::RISCVVIntrinsicsTable {
 
 #define GET_RISCVVIntrinsicsTable_IMPL

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index f5764e7d4ba8cfa..8f3ff4be22a2d1b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -956,6 +956,9 @@ class RISCVTargetLowering : public TargetLowering {
   /// For available scheduling models FDIV + two independent FMULs are much
   /// faster than two FDIVs.
   unsigned combineRepeatedFPDivisors() const override;
+
+  SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+                        SmallVectorImpl<SDNode *> &Created) const override;
 };
 
 namespace RISCV {

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e6045a4de51ebe2..8a883ad26a78d96 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -22648,38 +22648,12 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
       !(Subtarget.is64Bit() && VT == MVT::i64))
     return SDValue();
 
-  unsigned Lg2 = Divisor.countr_zero();
-
   // If the divisor is 2 or -2, the default expansion is better.
-  if (Lg2 == 1)
+  if (Divisor == 2 ||
+      Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
     return SDValue();
 
-  SDLoc DL(N);
-  SDValue N0 = N->getOperand(0);
-  SDValue Zero = DAG.getConstant(0, DL, VT);
-  APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
-  SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
-
-  // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
-  SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
-  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
-  SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
-
-  Created.push_back(Cmp.getNode());
-  Created.push_back(Add.getNode());
-  Created.push_back(CMov.getNode());
-
-  // Divide by pow2.
-  SDValue SRA =
-      DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
-
-  // If we're dividing by a positive value, we're done.  Otherwise, we must
-  // negate the result.
-  if (Divisor.isNonNegative())
-    return SRA;
-
-  Created.push_back(SRA.getNode());
-  return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
+  return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
 }
 
 /// Result of 'and' is compared against zero. Change to a BT node if possible.

diff --git a/llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll b/llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll
new file mode 100644
index 000000000000000..f7dda8288567876
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll
@@ -0,0 +1,378 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+c,+m -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefixes=CHECK,NOSFB %s
+; RUN: llc -mtriple=riscv64 -mcpu=sifive-u74 -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefixes=CHECK,SFB %s
+
+define signext i32 @sdiv2_32(i32 signext %0) {
+; NOSFB-LABEL: sdiv2_32:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    srliw a1, a0, 31
+; NOSFB-NEXT:    add a0, a0, a1
+; NOSFB-NEXT:    sraiw a0, a0, 1
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: sdiv2_32:
+; SFB:       # %bb.0:
+; SFB-NEXT:    bgez a0, .LBB0_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a0, a0, 1
+; SFB-NEXT:  .LBB0_2:
+; SFB-NEXT:    sraiw a0, a0, 1
+; SFB-NEXT:    ret
+  %res = sdiv i32 %0, 2
+  ret i32 %res
+}
+
+define signext i32 @sdivneg2_32(i32 signext %0) {
+; NOSFB-LABEL: sdivneg2_32:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    srliw a1, a0, 31
+; NOSFB-NEXT:    add a0, a0, a1
+; NOSFB-NEXT:    sraiw a0, a0, 1
+; NOSFB-NEXT:    neg a0, a0
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: sdivneg2_32:
+; SFB:       # %bb.0:
+; SFB-NEXT:    bgez a0, .LBB1_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a0, a0, 1
+; SFB-NEXT:  .LBB1_2:
+; SFB-NEXT:    sraiw a0, a0, 1
+; SFB-NEXT:    neg a0, a0
+; SFB-NEXT:    ret
+  %res = sdiv i32 %0, -2
+  ret i32 %res
+}
+
+define i64 @sdiv2_64(i64 %0) {
+; NOSFB-LABEL: sdiv2_64:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    srli a1, a0, 63
+; NOSFB-NEXT:    add a0, a0, a1
+; NOSFB-NEXT:    srai a0, a0, 1
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: sdiv2_64:
+; SFB:       # %bb.0:
+; SFB-NEXT:    bgez a0, .LBB2_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a0, a0, 1
+; SFB-NEXT:  .LBB2_2:
+; SFB-NEXT:    srai a0, a0, 1
+; SFB-NEXT:    ret
+  %res = sdiv i64 %0, 2
+  ret i64 %res
+}
+
+define i64 @sdivneg2_64(i64 %0) {
+; NOSFB-LABEL: sdivneg2_64:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    srli a1, a0, 63
+; NOSFB-NEXT:    add a0, a0, a1
+; NOSFB-NEXT:    srai a0, a0, 1
+; NOSFB-NEXT:    neg a0, a0
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: sdivneg2_64:
+; SFB:       # %bb.0:
+; SFB-NEXT:    bgez a0, .LBB3_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a0, a0, 1
+; SFB-NEXT:  .LBB3_2:
+; SFB-NEXT:    srai a0, a0, 1
+; SFB-NEXT:    neg a0, a0
+; SFB-NEXT:    ret
+  %res = sdiv i64 %0, -2
+  ret i64 %res
+}
+
+define signext i32 @srem2_32(i32 signext %0) {
+; NOSFB-LABEL: srem2_32:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    srliw a1, a0, 31
+; NOSFB-NEXT:    add a1, a1, a0
+; NOSFB-NEXT:    andi a1, a1, -2
+; NOSFB-NEXT:    subw a0, a0, a1
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: srem2_32:
+; SFB:       # %bb.0:
+; SFB-NEXT:    mv a1, a0
+; SFB-NEXT:    bgez a0, .LBB4_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a1, a0, 1
+; SFB-NEXT:  .LBB4_2:
+; SFB-NEXT:    andi a1, a1, -2
+; SFB-NEXT:    subw a0, a0, a1
+; SFB-NEXT:    ret
+  %res = srem i32 %0, 2
+  ret i32 %res
+}
+
+define signext i32 @sremneg2_32(i32 signext %0) {
+; NOSFB-LABEL: sremneg2_32:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    srliw a1, a0, 31
+; NOSFB-NEXT:    add a1, a1, a0
+; NOSFB-NEXT:    andi a1, a1, -2
+; NOSFB-NEXT:    subw a0, a0, a1
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: sremneg2_32:
+; SFB:       # %bb.0:
+; SFB-NEXT:    mv a1, a0
+; SFB-NEXT:    bgez a0, .LBB5_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a1, a0, 1
+; SFB-NEXT:  .LBB5_2:
+; SFB-NEXT:    andi a1, a1, -2
+; SFB-NEXT:    subw a0, a0, a1
+; SFB-NEXT:    ret
+  %res = srem i32 %0, -2
+  ret i32 %res
+}
+
+define i64 @srem2_64(i64 %0) {
+; NOSFB-LABEL: srem2_64:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    srli a1, a0, 63
+; NOSFB-NEXT:    add a1, a1, a0
+; NOSFB-NEXT:    andi a1, a1, -2
+; NOSFB-NEXT:    sub a0, a0, a1
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: srem2_64:
+; SFB:       # %bb.0:
+; SFB-NEXT:    mv a1, a0
+; SFB-NEXT:    bgez a0, .LBB6_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a1, a0, 1
+; SFB-NEXT:  .LBB6_2:
+; SFB-NEXT:    andi a1, a1, -2
+; SFB-NEXT:    sub a0, a0, a1
+; SFB-NEXT:    ret
+  %res = srem i64 %0, 2
+  ret i64 %res
+}
+
+define i64 @sremneg2_64(i64 %0) {
+; NOSFB-LABEL: sremneg2_64:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    srli a1, a0, 63
+; NOSFB-NEXT:    add a1, a1, a0
+; NOSFB-NEXT:    andi a1, a1, -2
+; NOSFB-NEXT:    sub a0, a0, a1
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: sremneg2_64:
+; SFB:       # %bb.0:
+; SFB-NEXT:    mv a1, a0
+; SFB-NEXT:    bgez a0, .LBB7_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a1, a0, 1
+; SFB-NEXT:  .LBB7_2:
+; SFB-NEXT:    andi a1, a1, -2
+; SFB-NEXT:    sub a0, a0, a1
+; SFB-NEXT:    ret
+  %res = srem i64 %0, -2
+  ret i64 %res
+}
+
+define signext i32 @sdiv8_32(i32 signext %0) {
+; NOSFB-LABEL: sdiv8_32:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    slli a1, a0, 1
+; NOSFB-NEXT:    srli a1, a1, 61
+; NOSFB-NEXT:    add a0, a0, a1
+; NOSFB-NEXT:    sraiw a0, a0, 3
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: sdiv8_32:
+; SFB:       # %bb.0:
+; SFB-NEXT:    bgez a0, .LBB8_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a0, a0, 7
+; SFB-NEXT:  .LBB8_2:
+; SFB-NEXT:    sraiw a0, a0, 3
+; SFB-NEXT:    ret
+  %res = sdiv i32 %0, 8
+  ret i32 %res
+}
+
+define signext i32 @sdivneg8_32(i32 signext %0) {
+; NOSFB-LABEL: sdivneg8_32:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    slli a1, a0, 1
+; NOSFB-NEXT:    srli a1, a1, 61
+; NOSFB-NEXT:    add a0, a0, a1
+; NOSFB-NEXT:    sraiw a0, a0, 3
+; NOSFB-NEXT:    neg a0, a0
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: sdivneg8_32:
+; SFB:       # %bb.0:
+; SFB-NEXT:    bgez a0, .LBB9_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a0, a0, 7
+; SFB-NEXT:  .LBB9_2:
+; SFB-NEXT:    sraiw a0, a0, 3
+; SFB-NEXT:    neg a0, a0
+; SFB-NEXT:    ret
+  %res = sdiv i32 %0, -8
+  ret i32 %res
+}
+
+define i64 @sdiv8_64(i64 %0) {
+; NOSFB-LABEL: sdiv8_64:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    srai a1, a0, 63
+; NOSFB-NEXT:    srli a1, a1, 61
+; NOSFB-NEXT:    add a0, a0, a1
+; NOSFB-NEXT:    srai a0, a0, 3
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: sdiv8_64:
+; SFB:       # %bb.0:
+; SFB-NEXT:    bgez a0, .LBB10_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a0, a0, 7
+; SFB-NEXT:  .LBB10_2:
+; SFB-NEXT:    srai a0, a0, 3
+; SFB-NEXT:    ret
+  %res = sdiv i64 %0, 8
+  ret i64 %res
+}
+
+define i64 @sdivneg8_64(i64 %0) {
+; NOSFB-LABEL: sdivneg8_64:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    srai a1, a0, 63
+; NOSFB-NEXT:    srli a1, a1, 61
+; NOSFB-NEXT:    add a0, a0, a1
+; NOSFB-NEXT:    srai a0, a0, 3
+; NOSFB-NEXT:    neg a0, a0
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: sdivneg8_64:
+; SFB:       # %bb.0:
+; SFB-NEXT:    bgez a0, .LBB11_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a0, a0, 7
+; SFB-NEXT:  .LBB11_2:
+; SFB-NEXT:    srai a0, a0, 3
+; SFB-NEXT:    neg a0, a0
+; SFB-NEXT:    ret
+  %res = sdiv i64 %0, -8
+  ret i64 %res
+}
+
+define signext i32 @srem8_32(i32 signext %0) {
+; NOSFB-LABEL: srem8_32:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    slli a1, a0, 1
+; NOSFB-NEXT:    srli a1, a1, 61
+; NOSFB-NEXT:    add a1, a1, a0
+; NOSFB-NEXT:    andi a1, a1, -8
+; NOSFB-NEXT:    subw a0, a0, a1
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: srem8_32:
+; SFB:       # %bb.0:
+; SFB-NEXT:    mv a1, a0
+; SFB-NEXT:    bgez a0, .LBB12_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a1, a0, 7
+; SFB-NEXT:  .LBB12_2:
+; SFB-NEXT:    andi a1, a1, -8
+; SFB-NEXT:    subw a0, a0, a1
+; SFB-NEXT:    ret
+  %res = srem i32 %0, 8
+  ret i32 %res
+}
+
+define signext i32 @sremneg8_32(i32 signext %0) {
+; NOSFB-LABEL: sremneg8_32:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    slli a1, a0, 1
+; NOSFB-NEXT:    srli a1, a1, 61
+; NOSFB-NEXT:    add a1, a1, a0
+; NOSFB-NEXT:    andi a1, a1, -8
+; NOSFB-NEXT:    subw a0, a0, a1
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: sremneg8_32:
+; SFB:       # %bb.0:
+; SFB-NEXT:    mv a1, a0
+; SFB-NEXT:    bgez a0, .LBB13_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a1, a0, 7
+; SFB-NEXT:  .LBB13_2:
+; SFB-NEXT:    andi a1, a1, -8
+; SFB-NEXT:    subw a0, a0, a1
+; SFB-NEXT:    ret
+  %res = srem i32 %0, -8
+  ret i32 %res
+}
+
+define i64 @srem8_64(i64 %0) {
+; NOSFB-LABEL: srem8_64:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    srai a1, a0, 63
+; NOSFB-NEXT:    srli a1, a1, 61
+; NOSFB-NEXT:    add a1, a1, a0
+; NOSFB-NEXT:    andi a1, a1, -8
+; NOSFB-NEXT:    sub a0, a0, a1
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: srem8_64:
+; SFB:       # %bb.0:
+; SFB-NEXT:    mv a1, a0
+; SFB-NEXT:    bgez a0, .LBB14_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a1, a0, 7
+; SFB-NEXT:  .LBB14_2:
+; SFB-NEXT:    andi a1, a1, -8
+; SFB-NEXT:    sub a0, a0, a1
+; SFB-NEXT:    ret
+  %res = srem i64 %0, 8
+  ret i64 %res
+}
+
+define i64 @sremneg8_64(i64 %0) {
+; NOSFB-LABEL: sremneg8_64:
+; NOSFB:       # %bb.0:
+; NOSFB-NEXT:    srai a1, a0, 63
+; NOSFB-NEXT:    srli a1, a1, 61
+; NOSFB-NEXT:    add a1, a1, a0
+; NOSFB-NEXT:    andi a1, a1, -8
+; NOSFB-NEXT:    sub a0, a0, a1
+; NOSFB-NEXT:    ret
+;
+; SFB-LABEL: sremneg8_64:
+; SFB:       # %bb.0:
+; SFB-NEXT:    mv a1, a0
+; SFB-NEXT:    bgez a0, .LBB15_2
+; SFB-NEXT:  # %bb.1:
+; SFB-NEXT:    addi a1, a0, 7
+; SFB-NEXT:  .LBB15_2:
+; SFB-NEXT:    andi a1, a1, -8
+; SFB-NEXT:    sub a0, a0, a1
+; SFB-NEXT:    ret
+  %res = srem i64 %0, -8
+  ret i64 %res
+}
+
+; Negative tests
+define i64 @sdiv4096(i64 %0) {
+; CHECK-LABEL: sdiv4096:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srai a1, a0, 63
+; CHECK-NEXT:    srli a1, a1, 52
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    srai a0, a0, 12
+; CHECK-NEXT:    ret
+  %res = sdiv i64 %0, 4096
+  ret i64 %res
+}


        

