[llvm] dde2a7f - [RISCV] Exploit fact that vscale is always power of two to replace urem sequence
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 13 10:55:17 PDT 2022
Author: Philip Reames
Date: 2022-07-13T10:54:47-07:00
New Revision: dde2a7fb6da46da2b2f765fa569d8fddb4270eb6
URL: https://github.com/llvm/llvm-project/commit/dde2a7fb6da46da2b2f765fa569d8fddb4270eb6
DIFF: https://github.com/llvm/llvm-project/commit/dde2a7fb6da46da2b2f765fa569d8fddb4270eb6.diff
LOG: [RISCV] Exploit fact that vscale is always power of two to replace urem sequence
When doing scalable vectorization, the loop vectorizer uses a urem in the computation of the vector trip count. The RHS of that urem is a (possibly shifted) call to @llvm.vscale.
vscale is effectively the number of "blocks" in the vector register. (That is, types such as <vscale x 8 x i8> and <vscale x 1 x i8> both fill one 64 bit block, and vscale is essentially how many of those blocks there are in a single vector register at runtime.)
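As a rough illustration of that relationship (a standalone sketch, not code from this patch; the constant name only mirrors RISCV::RVVBitsPerBlock), a VLEN of 128 gives vscale = 2, and any power-of-two VLEN >= 64 gives a power-of-two vscale:

  #include <cstdint>

  constexpr uint64_t RVVBitsPerBlock = 64; // 64-bit "block" size assumed above

  constexpr uint64_t vscaleFor(uint64_t VLEN) {
    return VLEN / RVVBitsPerBlock; // VLEN=128 -> vscale=2, VLEN=512 -> vscale=8
  }

  static_assert(vscaleFor(128) == 2, "two 64-bit blocks per register");
  static_assert((vscaleFor(512) & (vscaleFor(512) - 1)) == 0,
                "power-of-two VLEN >= 64 gives power-of-two vscale");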
We know from the RISCV V extension specification that VLEN must be a power of two between ELEN and 2^16. Since our block size is 64 bits, there must be a power-of-two number of blocks. (This holds for everything other than VLEN<=32, and that configuration is already broken elsewhere.)
It is worth noting that the AArch64 SVE specification explicitly allows non-power-of-two sizes for the vector registers, so AArch64 can't claim that vscale is a power of two by this logic.
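For reference, these are the scalar identities the existing power-of-two combines can now apply to vscale-based trip counts; this is a minimal standalone sketch with illustrative helper names, not the actual SelectionDAG code:

  #include <cassert>
  #include <cstdint>

  // urem by a known power of two becomes a mask; this replaces the remu
  // instruction in the vscale-power-of-two.ll tests.
  uint64_t uremPow2(uint64_t N, uint64_t Pow2) {
    assert(Pow2 != 0 && (Pow2 & (Pow2 - 1)) == 0 && "divisor must be a power of two");
    return N & (Pow2 - 1);
  }

  // "N - (N % Pow2)" rounds down to a multiple of Pow2; this replaces the
  // remu+sub pair used for the vector trip count.
  uint64_t roundDownToMultiple(uint64_t N, uint64_t Pow2) {
    assert(Pow2 != 0 && (Pow2 & (Pow2 - 1)) == 0);
    return N & ~(Pow2 - 1);
  }

  int main() {
    assert(uremPow2(1027, 8) == 1027 % 8);
    assert(roundDownToMultiple(1027, 8) == 1027 - 1027 % 8);
  }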
Differential Revision: https://reviews.llvm.org/D129609
Added:
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.h
llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 98b9a416ea59a..ab5d3ba0164d2 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -546,6 +546,9 @@ class TargetLoweringBase {
return BypassSlowDivWidths;
}
+ /// Return true only if vscale must be a power of two.
+ virtual bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
+
/// Return true if Flow Control is an expensive operation that should be
/// avoided.
bool isJumpExpensive() const { return JumpIsExpensive; }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 832fc3a564adf..c8d0f5faf6471 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3869,6 +3869,12 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
if (C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2())
return true;
+ // vscale(power-of-two) is a power-of-two for some targets
+ if (Val.getOpcode() == ISD::VSCALE &&
+ getTargetLoweringInfo().isVScaleKnownToBeAPowerOfTwo() &&
+ isKnownToBeAPowerOfTwo(Val.getOperand(0)))
+ return true;
+
// More could be done here, though the above checks are enough
// to handle some common cases.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0ee88bfba2709..9481ff704806e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -12130,6 +12130,17 @@ const MCExpr *RISCVTargetLowering::LowerCustomJumpTableEntry(
return MCSymbolRefExpr::create(MBB->getSymbol(), Ctx);
}
+bool RISCVTargetLowering::isVScaleKnownToBeAPowerOfTwo() const {
+ // We define vscale to be VLEN/RVVBitsPerBlock. VLEN is always a power
+ // of two >= 64, and RVVBitsPerBlock is 64. Thus, vscale must be
+ // a power of two as well.
+ // FIXME: This doesn't work for zve32, but that's already broken
+ // elsewhere for the same reason.
+ assert(Subtarget.getRealMinVLen() >= 64 && "zve32* unsupported");
+ assert(RISCV::RVVBitsPerBlock == 64 && "RVVBitsPerBlock changed, audit needed");
+ return true;
+}
+
bool RISCVTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
VT = VT.getScalarType();
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 1af34d511b463..5e15176de59ca 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -597,6 +597,8 @@ class RISCVTargetLowering : public TargetLowering {
unsigned uid,
MCContext &Ctx) const override;
+ bool isVScaleKnownToBeAPowerOfTwo() const override;
+
private:
/// RISCVCCAssignFn - This target-specific function extends the default
/// CCValAssign with additional information used to lower RISC-V calling
diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
index ef476d0297d61..0aaa83773a237 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
@@ -266,8 +266,9 @@ define void @sink_splat_mul_scalable(i32* nocapture %a, i32 signext %x) {
; CHECK-NEXT: j .LBB7_5
; CHECK-NEXT: .LBB7_2: # %vector.ph
; CHECK-NEXT: li a6, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a7, zero, e32, m2, ta, mu
; CHECK-NEXT: mv a7, a0
@@ -358,8 +359,9 @@ define void @sink_splat_add_scalable(i32* nocapture %a, i32 signext %x) {
; CHECK-NEXT: j .LBB8_5
; CHECK-NEXT: .LBB8_2: # %vector.ph
; CHECK-NEXT: li a6, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a7, zero, e32, m2, ta, mu
; CHECK-NEXT: mv a7, a0
@@ -450,8 +452,9 @@ define void @sink_splat_sub_scalable(i32* nocapture %a, i32 signext %x) {
; CHECK-NEXT: j .LBB9_5
; CHECK-NEXT: .LBB9_2: # %vector.ph
; CHECK-NEXT: li a6, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a7, zero, e32, m2, ta, mu
; CHECK-NEXT: mv a7, a0
@@ -542,8 +545,9 @@ define void @sink_splat_rsub_scalable(i32* nocapture %a, i32 signext %x) {
; CHECK-NEXT: j .LBB10_5
; CHECK-NEXT: .LBB10_2: # %vector.ph
; CHECK-NEXT: li a6, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a7, zero, e32, m2, ta, mu
; CHECK-NEXT: mv a7, a0
@@ -634,8 +638,9 @@ define void @sink_splat_and_scalable(i32* nocapture %a, i32 signext %x) {
; CHECK-NEXT: j .LBB11_5
; CHECK-NEXT: .LBB11_2: # %vector.ph
; CHECK-NEXT: li a6, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a7, zero, e32, m2, ta, mu
; CHECK-NEXT: mv a7, a0
@@ -726,8 +731,9 @@ define void @sink_splat_or_scalable(i32* nocapture %a, i32 signext %x) {
; CHECK-NEXT: j .LBB12_5
; CHECK-NEXT: .LBB12_2: # %vector.ph
; CHECK-NEXT: li a6, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a7, zero, e32, m2, ta, mu
; CHECK-NEXT: mv a7, a0
@@ -818,8 +824,9 @@ define void @sink_splat_xor_scalable(i32* nocapture %a, i32 signext %x) {
; CHECK-NEXT: j .LBB13_5
; CHECK-NEXT: .LBB13_2: # %vector.ph
; CHECK-NEXT: li a6, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a7, zero, e32, m2, ta, mu
; CHECK-NEXT: mv a7, a0
@@ -1018,8 +1025,9 @@ define void @sink_splat_shl_scalable(i32* nocapture %a, i32 signext %x) {
; CHECK-NEXT: j .LBB17_5
; CHECK-NEXT: .LBB17_2: # %vector.ph
; CHECK-NEXT: li a6, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a7, zero, e32, m2, ta, mu
; CHECK-NEXT: mv a7, a0
@@ -1110,8 +1118,9 @@ define void @sink_splat_lshr_scalable(i32* nocapture %a, i32 signext %x) {
; CHECK-NEXT: j .LBB18_5
; CHECK-NEXT: .LBB18_2: # %vector.ph
; CHECK-NEXT: li a6, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a7, zero, e32, m2, ta, mu
; CHECK-NEXT: mv a7, a0
@@ -1202,8 +1211,9 @@ define void @sink_splat_ashr_scalable(i32* nocapture %a) {
; CHECK-NEXT: j .LBB19_5
; CHECK-NEXT: .LBB19_2: # %vector.ph
; CHECK-NEXT: li a5, 0
-; CHECK-NEXT: remu a3, a2, a1
-; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: addiw a2, a1, -1
+; CHECK-NEXT: andi a3, a2, 1024
+; CHECK-NEXT: xori a2, a3, 1024
; CHECK-NEXT: slli a4, a4, 1
; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, mu
; CHECK-NEXT: mv a6, a0
@@ -1510,8 +1520,9 @@ define void @sink_splat_fmul_scalable(float* nocapture %a, float %x) {
; CHECK-NEXT: j .LBB26_5
; CHECK-NEXT: .LBB26_2: # %vector.ph
; CHECK-NEXT: li a5, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, mu
; CHECK-NEXT: mv a6, a0
; CHECK-NEXT: .LBB26_3: # %vector.body
@@ -1601,8 +1612,9 @@ define void @sink_splat_fdiv_scalable(float* nocapture %a, float %x) {
; CHECK-NEXT: j .LBB27_5
; CHECK-NEXT: .LBB27_2: # %vector.ph
; CHECK-NEXT: li a5, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, mu
; CHECK-NEXT: mv a6, a0
; CHECK-NEXT: .LBB27_3: # %vector.body
@@ -1692,8 +1704,9 @@ define void @sink_splat_frdiv_scalable(float* nocapture %a, float %x) {
; CHECK-NEXT: j .LBB28_5
; CHECK-NEXT: .LBB28_2: # %vector.ph
; CHECK-NEXT: li a5, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, mu
; CHECK-NEXT: mv a6, a0
; CHECK-NEXT: .LBB28_3: # %vector.body
@@ -1783,8 +1796,9 @@ define void @sink_splat_fadd_scalable(float* nocapture %a, float %x) {
; CHECK-NEXT: j .LBB29_5
; CHECK-NEXT: .LBB29_2: # %vector.ph
; CHECK-NEXT: li a5, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, mu
; CHECK-NEXT: mv a6, a0
; CHECK-NEXT: .LBB29_3: # %vector.body
@@ -1874,8 +1888,9 @@ define void @sink_splat_fsub_scalable(float* nocapture %a, float %x) {
; CHECK-NEXT: j .LBB30_5
; CHECK-NEXT: .LBB30_2: # %vector.ph
; CHECK-NEXT: li a5, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, mu
; CHECK-NEXT: mv a6, a0
; CHECK-NEXT: .LBB30_3: # %vector.body
@@ -1965,8 +1980,9 @@ define void @sink_splat_frsub_scalable(float* nocapture %a, float %x) {
; CHECK-NEXT: j .LBB31_5
; CHECK-NEXT: .LBB31_2: # %vector.ph
; CHECK-NEXT: li a5, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, mu
; CHECK-NEXT: mv a6, a0
; CHECK-NEXT: .LBB31_3: # %vector.body
@@ -2139,8 +2155,9 @@ define void @sink_splat_fma_scalable(float* noalias nocapture %a, float* noalias
; CHECK-NEXT: .LBB34_2: # %vector.ph
; CHECK-NEXT: li a6, 0
; CHECK-NEXT: li a7, 0
-; CHECK-NEXT: remu a5, a4, a3
-; CHECK-NEXT: sub a4, a4, a5
+; CHECK-NEXT: addiw a4, a3, -1
+; CHECK-NEXT: andi a5, a4, 1024
+; CHECK-NEXT: xori a4, a5, 1024
; CHECK-NEXT: vsetvli t0, zero, e32, m1, ta, mu
; CHECK-NEXT: .LBB34_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -2241,8 +2258,9 @@ define void @sink_splat_fma_commute_scalable(float* noalias nocapture %a, float*
; CHECK-NEXT: .LBB35_2: # %vector.ph
; CHECK-NEXT: li a6, 0
; CHECK-NEXT: li a7, 0
-; CHECK-NEXT: remu a5, a4, a3
-; CHECK-NEXT: sub a4, a4, a5
+; CHECK-NEXT: addiw a4, a3, -1
+; CHECK-NEXT: andi a5, a4, 1024
+; CHECK-NEXT: xori a4, a5, 1024
; CHECK-NEXT: vsetvli t0, zero, e32, m1, ta, mu
; CHECK-NEXT: .LBB35_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -2567,8 +2585,9 @@ define void @sink_splat_udiv_scalable(i32* nocapture %a, i32 signext %x) {
; CHECK-NEXT: j .LBB42_5
; CHECK-NEXT: .LBB42_2: # %vector.ph
; CHECK-NEXT: li a6, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a7, zero, e32, m2, ta, mu
; CHECK-NEXT: mv a7, a0
@@ -2659,8 +2678,9 @@ define void @sink_splat_sdiv_scalable(i32* nocapture %a, i32 signext %x) {
; CHECK-NEXT: j .LBB43_5
; CHECK-NEXT: .LBB43_2: # %vector.ph
; CHECK-NEXT: li a6, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a7, zero, e32, m2, ta, mu
; CHECK-NEXT: mv a7, a0
@@ -2751,8 +2771,9 @@ define void @sink_splat_urem_scalable(i32* nocapture %a, i32 signext %x) {
; CHECK-NEXT: j .LBB44_5
; CHECK-NEXT: .LBB44_2: # %vector.ph
; CHECK-NEXT: li a6, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a7, zero, e32, m2, ta, mu
; CHECK-NEXT: mv a7, a0
@@ -2843,8 +2864,9 @@ define void @sink_splat_srem_scalable(i32* nocapture %a, i32 signext %x) {
; CHECK-NEXT: j .LBB45_5
; CHECK-NEXT: .LBB45_2: # %vector.ph
; CHECK-NEXT: li a6, 0
-; CHECK-NEXT: remu a4, a3, a2
-; CHECK-NEXT: sub a3, a3, a4
+; CHECK-NEXT: addiw a3, a2, -1
+; CHECK-NEXT: andi a4, a3, 1024
+; CHECK-NEXT: xori a3, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a7, zero, e32, m2, ta, mu
; CHECK-NEXT: mv a7, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll
index 3d42224655169..919f33a2b18c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll
@@ -8,7 +8,8 @@ define i64 @vscale_lshr(i64 %TC) {
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 6
-; CHECK-NEXT: remu a0, a0, a1
+; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: and a0, a0, a1
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%shifted = lshr i64 %vscale, 3
@@ -21,7 +22,8 @@ define i64 @vscale(i64 %TC) {
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 3
-; CHECK-NEXT: remu a0, a0, a1
+; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: and a0, a0, a1
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%urem = urem i64 %TC, %vscale
@@ -32,7 +34,8 @@ define i64 @vscale_shl(i64 %TC) {
; CHECK-LABEL: vscale_shl:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: remu a0, a0, a1
+; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: and a0, a0, a1
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%shifted = shl i64 %vscale, 3
@@ -45,8 +48,8 @@ define i64 @TC_minus_rem(i64 %TC) {
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 3
-; CHECK-NEXT: remu a1, a0, a1
-; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a0, a1
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%urem = urem i64 %TC, %vscale
@@ -58,8 +61,8 @@ define i64 @TC_minus_rem_shl(i64 %TC) {
; CHECK-LABEL: TC_minus_rem_shl:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: remu a1, a0, a1
-; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a0, a1
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%shifted = shl i64 %vscale, 3
@@ -73,9 +76,8 @@ define i64 @con1024_minus_rem() {
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: li a1, 1024
-; CHECK-NEXT: remu a0, a1, a0
-; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: negw a0, a0
+; CHECK-NEXT: andi a0, a0, 1024
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%urem = urem i64 1024, %vscale
@@ -90,10 +92,10 @@ define i64 @con2048_minus_rem() {
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: neg a0, a0
; CHECK-NEXT: lui a1, 1
; CHECK-NEXT: addiw a1, a1, -2048
-; CHECK-NEXT: remu a0, a1, a0
-; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: and a0, a0, a1
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%urem = urem i64 2048, %vscale