[llvm] 11cf807 - [AArch64][CodeGen] Always use SVE (when enabled) to lower integer divides

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 2 01:46:08 PST 2022


Author: David Sherwood
Date: 2022-02-02T09:46:02Z
New Revision: 11cf80779654f90faa4e44bda24c7eab049c4a3b

URL: https://github.com/llvm/llvm-project/commit/11cf80779654f90faa4e44bda24c7eab049c4a3b
DIFF: https://github.com/llvm/llvm-project/commit/11cf80779654f90faa4e44bda24c7eab049c4a3b.diff

LOG: [AArch64][CodeGen] Always use SVE (when enabled) to lower integer divides

This patch adds custom lowering support for ISD::SDIV and ISD::UDIV
when SVE is enabled, regardless of the minimum SVE vector length. We do
this because NEON simply does not have vector integer divide support, so
we want to take advantage of the divide instructions that SVE provides.
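
For illustration only (not part of the patch), here is a minimal IR sketch of
the kind of fixed-length divide that now takes the SVE path even at the
minimum 128-bit vector length. The file and function names are placeholders,
and the expected assembly in the comment mirrors the VBITS_EQ_128 checks added
to sdiv_v4i32 in the test diff below:

  ; Compile with: llc -aarch64-sve-vector-bits-min=128 example.ll -o -
  target triple = "aarch64-unknown-linux-gnu"

  ; NEON has no vector integer divide, so this was previously scalarized;
  ; with SVE enabled it now lowers to a predicated divide, roughly:
  ;   ptrue p0.s, vl4
  ;   sdiv  z0.s, p0/m, z0.s, z1.s
  define <4 x i32> @example_sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
    %res = sdiv <4 x i32> %op1, %op2
    ret <4 x i32> %res
  }

  attributes #0 = { "target-features"="+sve" }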

As part of this patch I've also simplified LowerToPredicatedOp so that it
no longer re-asks whether we should be using SVE for fixed-length vectors.
Once the decision to call LowerToPredicatedOp has been made, we can simply
assert that SVE should be used.

I've updated the tests that set the minimum SVE vector length to 128 bits:

  CodeGen/AArch64/sve-fixed-length-int-div.ll
  CodeGen/AArch64/sve-fixed-length-int-rem.ll

Differential Revision: https://reviews.llvm.org/D117871

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 850b03947906c..5c43d8a0d4bc6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1331,6 +1331,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
 
+    // NEON doesn't support integer divides, but SVE does
+    for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
+                    MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
+      setOperationAction(ISD::SDIV, VT, Custom);
+      setOperationAction(ISD::UDIV, VT, Custom);
+    }
+
     // NOTE: Currently this has to happen after computeRegisterProperties rather
     // than the preferred option of combining it with the addRegisterClass call.
     if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -1363,26 +1370,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
       setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
       setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
-      setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
-      setOperationAction(ISD::SDIV, MVT::v16i8, Custom);
-      setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
-      setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
-      setOperationAction(ISD::SDIV, MVT::v2i32, Custom);
-      setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
-      setOperationAction(ISD::SDIV, MVT::v1i64, Custom);
-      setOperationAction(ISD::SDIV, MVT::v2i64, Custom);
       setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
       setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
       setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
       setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
-      setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
-      setOperationAction(ISD::UDIV, MVT::v16i8, Custom);
-      setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
-      setOperationAction(ISD::UDIV, MVT::v8i16, Custom);
-      setOperationAction(ISD::UDIV, MVT::v2i32, Custom);
-      setOperationAction(ISD::UDIV, MVT::v4i32, Custom);
-      setOperationAction(ISD::UDIV, MVT::v1i64, Custom);
-      setOperationAction(ISD::UDIV, MVT::v2i64, Custom);
       setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
       setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
       setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
@@ -3956,7 +3947,7 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
   bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
 
   if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
 
   // Multiplications are only custom-lowered for 128-bit vectors so that
   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
@@ -5157,11 +5148,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   case ISD::MUL:
     return LowerMUL(Op, DAG);
   case ISD::MULHS:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED,
-                               /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
   case ISD::MULHU:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED,
-                               /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
   case ISD::INTRINSIC_W_CHAIN:
     return LowerINTRINSIC_W_CHAIN(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
@@ -5252,8 +5241,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   case ISD::BSWAP:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
   case ISD::CTLZ:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU,
-                               /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
   case ISD::CTTZ:
     return LowerCTTZ(Op, DAG);
   case ISD::VECTOR_SPLICE:
@@ -7514,17 +7502,13 @@ SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
     default:
       llvm_unreachable("Wrong instruction");
     case ISD::SMAX:
-      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
-                                 /*OverrideNEON=*/true);
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
     case ISD::SMIN:
-      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
-                                 /*OverrideNEON=*/true);
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
     case ISD::UMAX:
-      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
-                                 /*OverrideNEON=*/true);
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
     case ISD::UMIN:
-      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
-                                 /*OverrideNEON=*/true);
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
     }
   }
 
@@ -7540,8 +7524,7 @@ SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
 
   if (VT.isScalableVector() ||
       useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
-                               true);
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
 
   SDLoc DL(Op);
   SDValue REVB;
@@ -11189,7 +11172,7 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   SDLoc dl(Op);
 
-  if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
+  if (VT.isFixedLengthVector() && Subtarget->hasSVE())
     return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
 
   assert(VT.isScalableVector() && "Expected a scalable vector.");
@@ -19224,7 +19207,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
 
   // Scalable vector i32/i64 DIV is supported.
   if (EltVT == MVT::i32 || EltVT == MVT::i64)
-    return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true);
+    return LowerToPredicatedOp(Op, DAG, PredOpcode);
 
   // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
@@ -19379,13 +19362,14 @@ SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
 // NOTE: The results for inactive lanes are undefined.
 SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
                                                    SelectionDAG &DAG,
-                                                   unsigned NewOp,
-                                                   bool OverrideNEON) const {
+                                                   unsigned NewOp) const {
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
   auto Pg = getPredicateForVector(DAG, DL, VT);
 
-  if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) {
+  if (VT.isFixedLengthVector()) {
+    assert(VT.getFixedSizeInBits() <= Subtarget->getMinSVEVectorSizeInBits() &&
+           "Cannot use SVE to lower fixed length predicated op!");
     EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
 
     // Create list of operands by converting existing ones to scalable types.
@@ -19403,7 +19387,8 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
         continue;
       }
 
-      assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) &&
+      assert(V.getValueType().getFixedSizeInBits() <=
+                 Subtarget->getMinSVEVectorSizeInBits() &&
              "Only fixed length vectors are supported!");
       Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
     }

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2138c0ffe70ae..03d00302c56ba 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -980,8 +980,8 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp,
-                              bool OverrideNEON = false) const;
+  SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG,
+                              unsigned NewOp) const;
   SDValue LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
index 537670c2e5677..f95860d55a401 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
@@ -1,4 +1,4 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=VBITS_EQ_128
 ; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
 ; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
@@ -17,14 +17,12 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; SDIV
 ;
 
 ; Vector vXi8 sdiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-LABEL: sdiv_v8i8:
 ; CHECK: ptrue [[PG0:p[0-9]+]].s, vl8
@@ -51,6 +49,21 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[7]
 ; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]]
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v8i8:
+; VBITS_EQ_128:         sshll v1.8h, v1.8b, #0
+; VBITS_EQ_128-NEXT:    sshll v0.8h, v0.8b, #0
+; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    sunpkhi z2.s, z1.h
+; VBITS_EQ_128-NEXT:    sunpkhi z3.s, z0.h
+; VBITS_EQ_128-NEXT:    sunpklo z1.s, z1.h
+; VBITS_EQ_128-NEXT:    sunpklo z0.s, z0.h
+; VBITS_EQ_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT:    uzp1 z0.h, z0.h, z2.h
+; VBITS_EQ_128-NEXT:    xtn v0.8b, v0.8h
+; VBITS_EQ_128-NEXT:    ret
+
   %res = sdiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -81,6 +94,30 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_512-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV]].h, [[DIV]].h
 ; VBITS_GE_512-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v16i8:
+; VBITS_EQ_128:         sunpkhi z2.h, z1.b
+; VBITS_EQ_128-NEXT:    sunpkhi z3.h, z0.b
+; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    sunpklo z1.h, z1.b
+; VBITS_EQ_128-NEXT:    sunpkhi z4.s, z2.h
+; VBITS_EQ_128-NEXT:    sunpkhi z5.s, z3.h
+; VBITS_EQ_128-NEXT:    sunpklo z2.s, z2.h
+; VBITS_EQ_128-NEXT:    sunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT:    sunpklo z0.h, z0.b
+; VBITS_EQ_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_EQ_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT:    sunpkhi z3.s, z1.h
+; VBITS_EQ_128-NEXT:    sunpkhi z5.s, z0.h
+; VBITS_EQ_128-NEXT:    sunpklo z1.s, z1.h
+; VBITS_EQ_128-NEXT:    sunpklo z0.s, z0.h
+; VBITS_EQ_128-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
+; VBITS_EQ_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT:    uzp1 z1.h, z2.h, z4.h
+; VBITS_EQ_128-NEXT:    uzp1 z0.h, z0.h, z3.h
+; VBITS_EQ_128-NEXT:    uzp1 z0.b, z0.b, z1.b
+; VBITS_EQ_128-NEXT:    ret
+
   %res = sdiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -309,6 +346,7 @@ define void @sdiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Vector vXi16 sdiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-LABEL: sdiv_v4i16:
 ; CHECK: sshll v1.4s, v1.4h, #0
@@ -323,6 +361,15 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-NEXT: mov v0.h[2], w9
 ; CHECK-NEXT: mov v0.h[3], w8
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v4i16:
+; VBITS_EQ_128:         sshll v1.4s, v1.4h, #0
+; VBITS_EQ_128-NEXT:    sshll v0.4s, v0.4h, #0
+; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT:    xtn v0.4h, v0.4s
+; VBITS_EQ_128-NEXT:    ret
+
   %res = sdiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -335,6 +382,18 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; CHECK-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
 ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v8i16:
+; VBITS_EQ_128:         ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    sunpkhi z2.s, z1.h
+; VBITS_EQ_128-NEXT:    sunpkhi z3.s, z0.h
+; VBITS_EQ_128-NEXT:    sunpklo z1.s, z1.h
+; VBITS_EQ_128-NEXT:    sunpklo z0.s, z0.h
+; VBITS_EQ_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT:    uzp1 z0.h, z0.h, z2.h
+; VBITS_EQ_128-NEXT:    ret
+
   %res = sdiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -472,6 +531,12 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
 ; CHECK: sdiv z0.s, [[PG]]/m, z0.s, z1.s
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v2i32:
+; VBITS_EQ_128:         ptrue p0.s, vl2
+; VBITS_EQ_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT:    ret
+
   %res = sdiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -482,6 +547,12 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
 ; CHECK: sdiv z0.s, [[PG]]/m, z0.s, z1.s
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v4i32:
+; VBITS_EQ_128:         ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT:    ret
+
   %res = sdiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -552,6 +623,12 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
 ; CHECK: sdiv z0.d, [[PG]]/m, z0.d, z1.d
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v1i64:
+; VBITS_EQ_128:         ptrue p0.d, vl1
+; VBITS_EQ_128-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT:    ret
+
   %res = sdiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -562,6 +639,12 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
 ; CHECK: sdiv z0.d, [[PG]]/m, z0.d, z1.d
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: sdiv_v2i64:
+; VBITS_EQ_128:         ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT:    ret
+
   %res = sdiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -631,6 +714,7 @@ define void @sdiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; Vector vXi8 udiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-LABEL: udiv_v8i8:
 ; CHECK: ptrue [[PG0:p[0-9]+]].s, vl8
@@ -657,6 +741,21 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[7]
 ; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]]
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v8i8:
+; VBITS_EQ_128:         ushll v1.8h, v1.8b, #0
+; VBITS_EQ_128-NEXT:    ushll v0.8h, v0.8b, #0
+; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    uunpkhi z2.s, z1.h
+; VBITS_EQ_128-NEXT:    uunpkhi z3.s, z0.h
+; VBITS_EQ_128-NEXT:    uunpklo z1.s, z1.h
+; VBITS_EQ_128-NEXT:    uunpklo z0.s, z0.h
+; VBITS_EQ_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT:    uzp1 z0.h, z0.h, z2.h
+; VBITS_EQ_128-NEXT:    xtn v0.8b, v0.8h
+; VBITS_EQ_128-NEXT:    ret
+
   %res = udiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -687,6 +786,30 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_512-NEXT: uzp1 [[RES1:z[0-9]+]].h, [[DIV]].h, [[DIV]].h
 ; VBITS_GE_512-NEXT: uzp1 [[RES2:z[0-9]+]].b, [[RES1]].b, [[RES1]].b
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v16i8:
+; VBITS_EQ_128:         uunpkhi z2.h, z1.b
+; VBITS_EQ_128-NEXT:    uunpkhi z3.h, z0.b
+; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    uunpklo z1.h, z1.b
+; VBITS_EQ_128-NEXT:    uunpkhi z4.s, z2.h
+; VBITS_EQ_128-NEXT:    uunpkhi z5.s, z3.h
+; VBITS_EQ_128-NEXT:    uunpklo z2.s, z2.h
+; VBITS_EQ_128-NEXT:    uunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT:    uunpklo z0.h, z0.b
+; VBITS_EQ_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_EQ_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT:    uunpkhi z3.s, z1.h
+; VBITS_EQ_128-NEXT:    uunpkhi z5.s, z0.h
+; VBITS_EQ_128-NEXT:    uunpklo z1.s, z1.h
+; VBITS_EQ_128-NEXT:    uunpklo z0.s, z0.h
+; VBITS_EQ_128-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
+; VBITS_EQ_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT:    uzp1 z1.h, z2.h, z4.h
+; VBITS_EQ_128-NEXT:    uzp1 z0.h, z0.h, z3.h
+; VBITS_EQ_128-NEXT:    uzp1 z0.b, z0.b, z1.b
+; VBITS_EQ_128-NEXT:    ret
+
   %res = udiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -913,6 +1036,7 @@ define void @udiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Vector vXi16 udiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-LABEL: udiv_v4i16:
 ; CHECK: ushll v1.4s, v1.4h, #0
@@ -927,6 +1051,15 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-NEXT: mov v0.h[2], w9
 ; CHECK-NEXT: mov v0.h[3], w8
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v4i16:
+; VBITS_EQ_128:         ushll v1.4s, v1.4h, #0
+; VBITS_EQ_128-NEXT:    ushll v0.4s, v0.4h, #0
+; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT:    xtn v0.4h, v0.4s
+; VBITS_EQ_128-NEXT:    ret
+
   %res = udiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -939,6 +1072,18 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; CHECK-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
 ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v8i16:
+; VBITS_EQ_128:         ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    uunpkhi z2.s, z1.h
+; VBITS_EQ_128-NEXT:    uunpkhi z3.s, z0.h
+; VBITS_EQ_128-NEXT:    uunpklo z1.s, z1.h
+; VBITS_EQ_128-NEXT:    uunpklo z0.s, z0.h
+; VBITS_EQ_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT:    uzp1 z0.h, z0.h, z2.h
+; VBITS_EQ_128-NEXT:    ret
+
   %res = udiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -1076,6 +1221,12 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
 ; CHECK: udiv z0.s, [[PG]]/m, z0.s, z1.s
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v2i32:
+; VBITS_EQ_128:         ptrue p0.s, vl2
+; VBITS_EQ_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT:    ret
+
   %res = udiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -1086,6 +1237,12 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
 ; CHECK: udiv z0.s, [[PG]]/m, z0.s, z1.s
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v4i32:
+; VBITS_EQ_128:         ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_EQ_128-NEXT:    ret
+
   %res = udiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -1098,6 +1255,7 @@ define void @udiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 ; CHECK-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 ; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
 ; CHECK-NEXT: ret
+
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %res = udiv <8 x i32> %op1, %op2
@@ -1156,6 +1314,12 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
 ; CHECK: udiv z0.d, [[PG]]/m, z0.d, z1.d
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v1i64:
+; VBITS_EQ_128:         ptrue p0.d, vl1
+; VBITS_EQ_128-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT:    ret
+
   %res = udiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -1166,6 +1330,12 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
 ; CHECK: udiv z0.d, [[PG]]/m, z0.d, z1.d
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: udiv_v2i64:
+; VBITS_EQ_128:         ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT:    ret
+
   %res = udiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
index 8137d601c97f8..c86e4f13dade3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
@@ -1,4 +1,4 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=VBITS_EQ_128
 ; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
 ; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
@@ -17,14 +17,12 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; SREM
 ;
 
 ; Vector vXi8 sdiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-LABEL: srem_v8i8:
 ; CHECK: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2:z[0-9]+]].b
@@ -52,6 +50,22 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR8]]
 ; CHECK-NEXT: mls v0.8b, [[FINAL]].8b, v1.8b
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: srem_v8i8:
+; VBITS_EQ_128:         sshll v2.8h, v1.8b, #0
+; VBITS_EQ_128-NEXT:    sshll v3.8h, v0.8b, #0
+; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    sunpkhi z4.s, z2.h
+; VBITS_EQ_128-NEXT:    sunpkhi z5.s, z3.h
+; VBITS_EQ_128-NEXT:    sunpklo z2.s, z2.h
+; VBITS_EQ_128-NEXT:    sunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_EQ_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT:    uzp1 z2.h, z2.h, z4.h
+; VBITS_EQ_128-NEXT:    xtn v2.8b, v2.8h
+; VBITS_EQ_128-NEXT:    mls v0.8b, v2.8b, v1.8b
+; VBITS_EQ_128-NEXT:    ret
+
   %res = srem <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -84,6 +98,31 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
 ; VBITS_GE_512-NEXT: mls v0.16b, v2.16b, v1.16b
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: srem_v16i8:
+; VBITS_EQ_128:         sunpkhi z2.h, z1.b
+; VBITS_EQ_128-NEXT:    sunpkhi z3.h, z0.b
+; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    sunpkhi z5.s, z2.h
+; VBITS_EQ_128-NEXT:    sunpkhi z6.s, z3.h
+; VBITS_EQ_128-NEXT:    sunpklo z2.s, z2.h
+; VBITS_EQ_128-NEXT:    sunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT:    sunpklo z4.h, z1.b
+; VBITS_EQ_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT:    sunpklo z3.h, z0.b
+; VBITS_EQ_128-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
+; VBITS_EQ_128-NEXT:    sunpkhi z6.s, z4.h
+; VBITS_EQ_128-NEXT:    sunpkhi z7.s, z3.h
+; VBITS_EQ_128-NEXT:    sunpklo z4.s, z4.h
+; VBITS_EQ_128-NEXT:    sunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
+; VBITS_EQ_128-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
+; VBITS_EQ_128-NEXT:    uzp1 z2.h, z2.h, z5.h
+; VBITS_EQ_128-NEXT:    uzp1 z3.h, z3.h, z6.h
+; VBITS_EQ_128-NEXT:    uzp1 z2.b, z3.b, z2.b
+; VBITS_EQ_128-NEXT:    mls v0.16b, v2.16b, v1.16b
+; VBITS_EQ_128-NEXT:    ret
+
   %res = srem <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -330,6 +369,7 @@ define void @srem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Vector vXi16 sdiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-LABEL: srem_v4i16:
 ; CHECK: sshll v2.4s, v1.4h, #0
@@ -345,6 +385,16 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-NEXT: mov [[VEC2]].h[3], [[SCALAR3]]
 ; CHECK-NEXT: mls v0.4h, [[VEC2]].4h, v1.4h
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: srem_v4i16:
+; VBITS_EQ_128:         sshll v2.4s, v1.4h, #0
+; VBITS_EQ_128-NEXT:    sshll v3.4s, v0.4h, #0
+; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT:    xtn v2.4h, v2.4s
+; VBITS_EQ_128-NEXT:    mls v0.4h, v2.4h, v1.4h
+; VBITS_EQ_128-NEXT:    ret
+
   %res = srem <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -358,6 +408,20 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; CHECK-NEXT: mls v0.8h, v2.8h, v1.8h
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: srem_v8i16:
+; VBITS_EQ_128:         ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    sunpkhi z2.s, z1.h
+; VBITS_EQ_128-NEXT:    sunpkhi z3.s, z0.h
+; VBITS_EQ_128-NEXT:    sunpklo z4.s, z1.h
+; VBITS_EQ_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT:    sunpklo z5.s, z0.h
+; VBITS_EQ_128-NEXT:    movprfx z3, z5
+; VBITS_EQ_128-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
+; VBITS_EQ_128-NEXT:    uzp1 z2.h, z3.h, z2.h
+; VBITS_EQ_128-NEXT:    mls v0.8h, v2.8h, v1.8h
+; VBITS_EQ_128-NEXT:    ret
+
   %res = srem <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -395,6 +459,7 @@ define void @srem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 ; VBITS_GE_512-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h
 ; VBITS_GE_512-NEXT: st1h { [[OP1:z[0-9]+]].h }, [[PG1]], [x0]
 ; CHECK: ret
+
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
   %res = srem <16 x i16> %op1, %op2
@@ -513,6 +578,14 @@ define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 ; CHECK-NEXT: sdiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
 ; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: srem_v2i32:
+; VBITS_EQ_128:         ptrue p0.s, vl2
+; VBITS_EQ_128-NEXT:    movprfx z2, z0
+; VBITS_EQ_128-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
+; VBITS_EQ_128-NEXT:    mls v0.2s, v2.2s, v1.2s
+; VBITS_EQ_128-NEXT:    ret
+
   %res = srem <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -525,6 +598,14 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
 ; CHECK-NEXT: sdiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
 ; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: srem_v4i32:
+; VBITS_EQ_128:         ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    movprfx z2, z0
+; VBITS_EQ_128-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
+; VBITS_EQ_128-NEXT:    mls v0.4s, v2.4s, v1.4s
+; VBITS_EQ_128-NEXT:    ret
+
   %res = srem <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -602,6 +683,7 @@ define void @srem_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Vector i64 sdiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for the 128 bits case here.
 define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK-LABEL: srem_v1i64:
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
@@ -610,11 +692,24 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
 ; CHECK-NEXT: sub d0, d0, d1
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: srem_v1i64:
+; VBITS_EQ_128:         ptrue p0.d, vl1
+; VBITS_EQ_128-NEXT:    movprfx z2, z0
+; VBITS_EQ_128-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
+; VBITS_EQ_128-NEXT:    fmov x8, d2
+; VBITS_EQ_128-NEXT:    fmov x9, d1
+; VBITS_EQ_128-NEXT:    mul x8, x8, x9
+; VBITS_EQ_128-NEXT:    fmov d1, x8
+; VBITS_EQ_128-NEXT:    sub d0, d0, d1
+; VBITS_EQ_128-NEXT:    ret
+
   %res = srem <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Vector i64 sdiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for the 128 bits case here.
 define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-LABEL: srem_v2i64:
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
@@ -623,6 +718,22 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
 ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: srem_v2i64:
+; VBITS_EQ_128:         ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT:    movprfx z2, z0
+; VBITS_EQ_128-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
+; VBITS_EQ_128-NEXT:    fmov x9, d2
+; VBITS_EQ_128-NEXT:    fmov x10, d1
+; VBITS_EQ_128-NEXT:    mov x8, v2.d[1]
+; VBITS_EQ_128-NEXT:    mov x11, v1.d[1]
+; VBITS_EQ_128-NEXT:    mul x9, x9, x10
+; VBITS_EQ_128-NEXT:    mul x8, x8, x11
+; VBITS_EQ_128-NEXT:    fmov d1, x9
+; VBITS_EQ_128-NEXT:    mov v1.d[1], x8
+; VBITS_EQ_128-NEXT:    sub v0.2d, v0.2d, v1.2d
+; VBITS_EQ_128-NEXT:    ret
+
   %res = srem <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -704,6 +815,7 @@ define void @srem_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; Vector vXi8 udiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-LABEL: urem_v8i8:
 ; CHECK: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2:z[0-9]+]].b
@@ -731,6 +843,22 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]]
 ; CHECK-NEXT: mls v0.8b, [[FINAL]].8b, v1.8b
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: urem_v8i8:
+; VBITS_EQ_128:         ushll v2.8h, v1.8b, #0
+; VBITS_EQ_128-NEXT:    ushll v3.8h, v0.8b, #0
+; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    uunpkhi z4.s, z2.h
+; VBITS_EQ_128-NEXT:    uunpkhi z5.s, z3.h
+; VBITS_EQ_128-NEXT:    uunpklo z2.s, z2.h
+; VBITS_EQ_128-NEXT:    uunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_EQ_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT:    uzp1 z2.h, z2.h, z4.h
+; VBITS_EQ_128-NEXT:    xtn v2.8b, v2.8h
+; VBITS_EQ_128-NEXT:    mls v0.8b, v2.8b, v1.8b
+; VBITS_EQ_128-NEXT:    ret
+
   %res = urem <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -763,6 +891,31 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
 ; VBITS_GE_512-NEXT: mls v0.16b, v2.16b, v1.16b
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: urem_v16i8:
+; VBITS_EQ_128:         uunpkhi z2.h, z1.b
+; VBITS_EQ_128-NEXT:    uunpkhi z3.h, z0.b
+; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    uunpkhi z5.s, z2.h
+; VBITS_EQ_128-NEXT:    uunpkhi z6.s, z3.h
+; VBITS_EQ_128-NEXT:    uunpklo z2.s, z2.h
+; VBITS_EQ_128-NEXT:    uunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT:    uunpklo z4.h, z1.b
+; VBITS_EQ_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT:    uunpklo z3.h, z0.b
+; VBITS_EQ_128-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
+; VBITS_EQ_128-NEXT:    uunpkhi z6.s, z4.h
+; VBITS_EQ_128-NEXT:    uunpkhi z7.s, z3.h
+; VBITS_EQ_128-NEXT:    uunpklo z4.s, z4.h
+; VBITS_EQ_128-NEXT:    uunpklo z3.s, z3.h
+; VBITS_EQ_128-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
+; VBITS_EQ_128-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
+; VBITS_EQ_128-NEXT:    uzp1 z2.h, z2.h, z5.h
+; VBITS_EQ_128-NEXT:    uzp1 z3.h, z3.h, z6.h
+; VBITS_EQ_128-NEXT:    uzp1 z2.b, z3.b, z2.b
+; VBITS_EQ_128-NEXT:    mls v0.16b, v2.16b, v1.16b
+; VBITS_EQ_128-NEXT:    ret
+
   %res = urem <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -1007,6 +1160,7 @@ define void @urem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Vector vXi16 udiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for >= 256 bits here.
 define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-LABEL: urem_v4i16:
 ; CHECK: ushll v2.4s, v1.4h, #0
@@ -1022,6 +1176,16 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-NEXT: mov [[VECO]].h[3], [[SCALAR3]]
 ; CHECK-NEXT: mls v0.4h, [[VECO]].4h, v1.4h
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: urem_v4i16:
+; VBITS_EQ_128:         ushll v2.4s, v1.4h, #0
+; VBITS_EQ_128-NEXT:    ushll v3.4s, v0.4h, #0
+; VBITS_EQ_128-NEXT:    ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT:    xtn v2.4h, v2.4s
+; VBITS_EQ_128-NEXT:    mls v0.4h, v2.4h, v1.4h
+; VBITS_EQ_128-NEXT:    ret
+
   %res = urem <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -1035,6 +1199,20 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; CHECK-NEXT: mls v0.8h, v2.8h, v1.8h
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: urem_v8i16:
+; VBITS_EQ_128:         ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    uunpkhi z2.s, z1.h
+; VBITS_EQ_128-NEXT:    uunpkhi z3.s, z0.h
+; VBITS_EQ_128-NEXT:    uunpklo z4.s, z1.h
+; VBITS_EQ_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_EQ_128-NEXT:    uunpklo z5.s, z0.h
+; VBITS_EQ_128-NEXT:    movprfx z3, z5
+; VBITS_EQ_128-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
+; VBITS_EQ_128-NEXT:    uzp1 z2.h, z3.h, z2.h
+; VBITS_EQ_128-NEXT:    mls v0.8h, v2.8h, v1.8h
+; VBITS_EQ_128-NEXT:    ret
+
   %res = urem <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -1190,6 +1368,14 @@ define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 ; CHECK-NEXT: udiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
 ; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: urem_v2i32:
+; VBITS_EQ_128:         ptrue p0.s, vl2
+; VBITS_EQ_128-NEXT:    movprfx z2, z0
+; VBITS_EQ_128-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
+; VBITS_EQ_128-NEXT:    mls v0.2s, v2.2s, v1.2s
+; VBITS_EQ_128-NEXT:    ret
+
   %res = urem <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -1202,6 +1388,14 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
 ; CHECK-NEXT: udiv z2.s, [[PG]]/m, [[PFX]].s, z1.s
 ; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: urem_v4i32:
+; VBITS_EQ_128:         ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT:    movprfx z2, z0
+; VBITS_EQ_128-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
+; VBITS_EQ_128-NEXT:    mls v0.4s, v2.4s, v1.4s
+; VBITS_EQ_128-NEXT:    ret
+
   %res = urem <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -1279,6 +1473,7 @@ define void @urem_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 }
 
 ; Vector i64 udiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for the 128 bits case here.
 define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK-LABEL: urem_v1i64:
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
@@ -1287,11 +1482,24 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
 ; CHECK-NEXT: sub d0, d0, d1
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: urem_v1i64:
+; VBITS_EQ_128:         ptrue p0.d, vl1
+; VBITS_EQ_128-NEXT:    movprfx z2, z0
+; VBITS_EQ_128-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
+; VBITS_EQ_128-NEXT:    fmov x8, d2
+; VBITS_EQ_128-NEXT:    fmov x9, d1
+; VBITS_EQ_128-NEXT:    mul x8, x8, x9
+; VBITS_EQ_128-NEXT:    fmov d1, x8
+; VBITS_EQ_128-NEXT:    sub d0, d0, d1
+; VBITS_EQ_128-NEXT:    ret
+
   %res = urem <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
 ; Vector i64 udiv are not legal for NEON so use SVE when available.
+; FIXME: We should be able to improve the codegen for the 128 bits case here.
 define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-LABEL: urem_v2i64:
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
@@ -1300,6 +1508,22 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-NEXT: mul z1.d, [[PG]]/m, [[OP2]].d, [[DIV]].d
 ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: urem_v2i64:
+; VBITS_EQ_128:         ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT:    movprfx z2, z0
+; VBITS_EQ_128-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
+; VBITS_EQ_128-NEXT:    fmov x9, d2
+; VBITS_EQ_128-NEXT:    fmov x10, d1
+; VBITS_EQ_128-NEXT:    mov x8, v2.d[1]
+; VBITS_EQ_128-NEXT:    mov x11, v1.d[1]
+; VBITS_EQ_128-NEXT:    mul x9, x9, x10
+; VBITS_EQ_128-NEXT:    mul x8, x8, x11
+; VBITS_EQ_128-NEXT:    fmov d1, x9
+; VBITS_EQ_128-NEXT:    mov v1.d[1], x8
+; VBITS_EQ_128-NEXT:    sub v0.2d, v0.2d, v1.2d
+; VBITS_EQ_128-NEXT:    ret
+
   %res = urem <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
