[llvm] [AArch64] Sink vscale calls into loops for better isel (PR #70304)

Graham Hunter via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 31 03:07:06 PDT 2023


https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/70304

>From 4287197c3880522b4d163fefd0d7d70ee39034b7 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 26 Oct 2023 09:21:36 +0100
Subject: [PATCH 1/3] [AArch64] Sink vscale calls into loops for better isel

For more recent SVE-capable CPUs it is beneficial to use the inc* instructions
to increment a value by vscale (potentially shifted or multiplied), even in
short loops.

This patch tells CodeGenPrepare to sink appropriate vscale calls into the
blocks where they are used so that isel can match them.
---
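As an illustration (not part of the patch): a minimal before/after sketch of
the transform this enables, modelled on the inc_add test added below; the
function name @count_up is made up. Before CodeGenPrepare the vscale-based
step is computed once in the entry block; after sinking, the vscale call and
shift sit next to the add in the loop body, so isel sees the whole increment
in one block and can select incw:

declare i64 @llvm.vscale.i64()

define i64 @count_up(i64 %n) "target-features"="+sve2" {
entry:
  ; Before sinking, the step would be computed here:
  ;   %vs   = call i64 @llvm.vscale.i64()
  ;   %step = shl nuw nsw i64 %vs, 2
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; After sinking, the vscale call and shift live next to their user, so
  ; SelectionDAG can fold (add %iv, (shl vscale, 2)) into a single incw.
  %vs = call i64 @llvm.vscale.i64()
  %step = shl nuw nsw i64 %vs, 2
  %iv.next = add nuw i64 %iv, %step
  %done = icmp uge i64 %iv.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret i64 %iv.next
}
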
 .../Target/AArch64/AArch64ISelLowering.cpp    |  40 ++++++-
 .../AArch64/sve2-vscale-sinking-codegen.ll    |  96 +++++++++++++++
 .../CodeGen/AArch64/sve2-vscale-sinking.ll    | 112 ++++++++++++++++++
 3 files changed, 245 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 038c23b5e8d50ad..5b4f6531244c259 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14507,6 +14507,19 @@ static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
   return true;
 }
 
+/// We want to sink the following cases:
+/// (add|sub) A, ((mul|shl) vscale, imm); (add|sub) A, vscale
+static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
+  if (match(Op, m_VScale()))
+    return true;
+  if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
+      match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
+    Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
+    return true;
+  }
+  return false;
+}
+
 /// Check if sinking \p I's operands to I's basic block is profitable, because
 /// the operands can be folded into a target instruction, e.g.
 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -14623,12 +14636,29 @@ bool AArch64TargetLowering::shouldSinkOperands(
     }
   }
 
-  if (!I->getType()->isVectorTy())
-    return false;
-
   switch (I->getOpcode()) {
   case Instruction::Sub:
   case Instruction::Add: {
+    // If the subtarget wants to make use of sve inc* instructions, then sink
+    // vscale intrinsic (along with any shifts or multiplies) so that the
+    // appropriate folds can be made.
+    if (Subtarget->useScalarIncVL()) {
+      bool Sink = false;
+      if (shouldSinkVScale(I->getOperand(0), Ops)) {
+        Ops.push_back(&I->getOperandUse(0));
+        Sink = true;
+      }
+      
+      if (shouldSinkVScale(I->getOperand(1), Ops)) {
+        Ops.push_back(&I->getOperandUse(1));
+        Sink = true;
+      }
+
+      if (Sink)
+        return true;
+    }
+    if (!I->getType()->isVectorTy())
+      return false;
     if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
       return false;
 
@@ -14647,6 +14677,8 @@ bool AArch64TargetLowering::shouldSinkOperands(
     return true;
   }
   case Instruction::Or: {
+    if (!I->getType()->isVectorTy())
+      return false;
     // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
     // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
     if (Subtarget->hasNEON()) {
@@ -14684,6 +14716,8 @@ bool AArch64TargetLowering::shouldSinkOperands(
     return false;
   }
   case Instruction::Mul: {
+    if (!I->getType()->isVectorTy())
+      return false;
     int NumZExts = 0, NumSExts = 0;
     for (auto &Op : I->operands()) {
       // Make sure we are not already sinking this operand
diff --git a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll
new file mode 100644
index 000000000000000..afd171aeda758d5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: inc_add:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    mov w9, w1
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x2, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x3, x8, lsl #2]
+; CHECK-NEXT:    fmul z0.s, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x4, x8, lsl #2]
+; CHECK-NEXT:    incw x8
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %wide.trip.count = zext i32 %N to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds float, ptr %in1, i64 %index
+  %wide.load = load <vscale x 4 x float>, ptr %2, align 4
+  %3 = getelementptr inbounds float, ptr %in2, i64 %index
+  %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %5 = getelementptr inbounds float, ptr %out, i64 %index
+  store <vscale x 4 x float> %4, ptr %5, align 4
+  %index.next = add nuw i64 %index, %1
+  %6 = icmp eq i64 %index.next, %wide.trip.count
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: dec_sub:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    rdvl x9, #-1
+; CHECK-NEXT:    mov w8, w1
+; CHECK-NEXT:    add x11, x9, #4
+; CHECK-NEXT:    add x9, x2, x11
+; CHECK-NEXT:    add x10, x3, x11
+; CHECK-NEXT:    add x11, x4, x11
+; CHECK-NEXT:  .LBB1_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x10, x8, lsl #2]
+; CHECK-NEXT:    fmul z0.s, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x11, x8, lsl #2]
+; CHECK-NEXT:    decw x8
+; CHECK-NEXT:    cbnz x8, .LBB1_1
+; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %0 = zext i32 %N to i64
+  %1 = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %1, 2
+  %3 = sub nsw i64 1, %2
+  %invariant.gep = getelementptr float, ptr %in1, i64 %3
+  %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
+  %invariant.gep22 = getelementptr float, ptr %out, i64 %3
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %offset.idx = sub i64 %0, %index
+  %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
+  %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
+  %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
+  %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
+  store <vscale x 4 x float> %4, ptr %gep23, align 4
+  %index.next = add nuw i64 %index, %2
+  %5 = icmp eq i64 %index.next, %0
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+attributes #0 = { "target-features"="+sve2" }
diff --git a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
new file mode 100644
index 000000000000000..88d2f468d6a171c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -codegenprepare -S -o - %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @inc_add
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[IN2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[INDEX]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %wide.trip.count = zext i32 %N to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds float, ptr %in1, i64 %index
+  %wide.load = load <vscale x 4 x float>, ptr %2, align 4
+  %3 = getelementptr inbounds float, ptr %in2, i64 %index
+  %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %5 = getelementptr inbounds float, ptr %out, i64 %index
+  store <vscale x 4 x float> %4, ptr %5, align 4
+  %index.next = add nuw i64 %index, %1
+  %6 = icmp eq i64 %index.next, %wide.trip.count
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @dec_sub
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw i64 1, [[TMP2]]
+; CHECK-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[IN1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[INVARIANT_GEP20:%.*]] = getelementptr float, ptr [[IN2]], i64 [[TMP3]]
+; CHECK-NEXT:    [[INVARIANT_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 [[TMP3]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[GEP21:%.*]] = getelementptr float, ptr [[INVARIANT_GEP20]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[GEP21]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT:    [[GEP23:%.*]] = getelementptr float, ptr [[INVARIANT_GEP22]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP4]], ptr [[GEP23]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = zext i32 %N to i64
+  %1 = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %1, 2
+  %3 = sub nsw i64 1, %2
+  %invariant.gep = getelementptr float, ptr %in1, i64 %3
+  %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
+  %invariant.gep22 = getelementptr float, ptr %out, i64 %3
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %offset.idx = sub i64 %0, %index
+  %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
+  %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
+  %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
+  %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
+  store <vscale x 4 x float> %4, ptr %gep23, align 4
+  %index.next = add nuw i64 %index, %2
+  %5 = icmp eq i64 %index.next, %0
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+attributes #0 = { "target-features"="+sve2" }

>From 92f332d7b2be8338525dacee935666a1af69170a Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 26 Oct 2023 10:03:53 +0100
Subject: [PATCH 2/3] remove invalid whitespace

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5b4f6531244c259..bf6d2dd1e029ca2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14648,7 +14648,7 @@ bool AArch64TargetLowering::shouldSinkOperands(
         Ops.push_back(&I->getOperandUse(0));
         Sink = true;
       }
-      
+
       if (shouldSinkVScale(I->getOperand(1), Ops)) {
         Ops.push_back(&I->getOperandUse(1));
         Sink = true;

>From ccd604e6f2e162689a944f934280da6b3553ad0e Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 31 Oct 2023 09:55:16 +0000
Subject: [PATCH 3/3] Simplified logic, added GEP support, removed codegen
 test.

---
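As an illustration (not part of the patch): a minimal sketch of the GEP case
this revision now handles, modelled on the @gep test added below; the function
name @walk is made up. The pointer is bumped by a vscale-scaled byte offset,
and sinking the vscale call and shift next to each GEP lets isel fold the
address increment (e.g. into an addvl):

declare i64 @llvm.vscale.i64()

define void @walk(ptr %p, i64 %n) "target-features"="+sve2" {
entry:
  br label %loop

loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %cur = phi ptr [ %p, %entry ], [ %next, %loop ]
  store i8 0, ptr %cur, align 1
  ; The vscale-based offset (vscale x 16 bytes) now sits next to the GEP
  ; that uses it instead of being hoisted out of the loop.
  %vs = call i64 @llvm.vscale.i64()
  %bytes = shl i64 %vs, 4
  %next = getelementptr inbounds i8, ptr %cur, i64 %bytes
  %i.next = add i64 %i, 1
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret void
}
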
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |  1 -
 .../Target/AArch64/AArch64ISelLowering.cpp    | 41 ++++----
 ...rleaving-reductions-predicated-scalable.ll | 51 +++++-----
 ...plex-deinterleaving-reductions-scalable.ll | 76 +++++++--------
 llvm/test/CodeGen/AArch64/sve-int-arith.ll    | 14 +--
 .../CodeGen/AArch64/sve-ptest-removal-sink.ll |  8 +-
 .../AArch64/sve2-vscale-sinking-codegen.ll    | 96 -------------------
 .../CodeGen/AArch64/sve2-vscale-sinking.ll    | 56 +++++++++++
 8 files changed, 145 insertions(+), 198 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 187820717b6fd5c..fa0c753bff2f073 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8230,7 +8230,6 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
     if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
       return true;
     }
-    return false;
   }
 
   if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bf6d2dd1e029ca2..307ded14d6a68e0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14508,7 +14508,7 @@ static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
 }
 
 /// We want to sink the following cases:
-/// (add|sub) A, ((mul|shl) vscale, imm); (add|sub) A, vscale
+/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
 static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
   if (match(Op, m_VScale()))
     return true;
@@ -14636,29 +14636,28 @@ bool AArch64TargetLowering::shouldSinkOperands(
     }
   }
 
+  // Sink vscales close to uses for better isel
   switch (I->getOpcode()) {
+  case Instruction::GetElementPtr:
+  case Instruction::Add:
   case Instruction::Sub:
-  case Instruction::Add: {
-    // If the subtarget wants to make use of sve inc* instructions, then sink
-    // vscale intrinsic (along with any shifts or multiplies) so that the
-    // appropriate folds can be made.
-    if (Subtarget->useScalarIncVL()) {
-      bool Sink = false;
-      if (shouldSinkVScale(I->getOperand(0), Ops)) {
-        Ops.push_back(&I->getOperandUse(0));
-        Sink = true;
+    for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
+      if (shouldSinkVScale(I->getOperand(Op), Ops)) {
+        Ops.push_back(&I->getOperandUse(Op));
+        return true;
       }
+    }
+    break;
+  default:
+    break;
+  }
 
-      if (shouldSinkVScale(I->getOperand(1), Ops)) {
-        Ops.push_back(&I->getOperandUse(1));
-        Sink = true;
-      }
+  if (!I->getType()->isVectorTy())
+    return false;
 
-      if (Sink)
-        return true;
-    }
-    if (!I->getType()->isVectorTy())
-      return false;
+  switch (I->getOpcode()) {
+  case Instruction::Sub:
+  case Instruction::Add: {
     if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
       return false;
 
@@ -14677,8 +14676,6 @@ bool AArch64TargetLowering::shouldSinkOperands(
     return true;
   }
   case Instruction::Or: {
-    if (!I->getType()->isVectorTy())
-      return false;
     // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
     // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
     if (Subtarget->hasNEON()) {
@@ -14716,8 +14713,6 @@ bool AArch64TargetLowering::shouldSinkOperands(
     return false;
   }
   case Instruction::Mul: {
-    if (!I->getType()->isVectorTy())
-      return false;
     int NumZExts = 0, NumSExts = 0;
     for (auto &Op : I->operands()) {
       // Make sure we are not already sinking this operand
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
index 01fd2b1113b000b..3c6714e67a64646 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
@@ -20,25 +20,24 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    whilelo p1.d, xzr, x9
 ; CHECK-NEXT:    cntd x10
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    rdvl x11, #2
-; CHECK-NEXT:    mov x12, x10
+; CHECK-NEXT:    mov x11, x10
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
 ; CHECK-NEXT:  .LBB0_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    zip2 p3.d, p1.d, p1.d
-; CHECK-NEXT:    add x13, x0, x8
-; CHECK-NEXT:    add x14, x1, x8
+; CHECK-NEXT:    add x12, x0, x8
+; CHECK-NEXT:    add x13, x1, x8
 ; CHECK-NEXT:    zip1 p2.d, p1.d, p1.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    mov z7.d, z0.d
-; CHECK-NEXT:    whilelo p1.d, x12, x9
-; CHECK-NEXT:    add x8, x8, x11
-; CHECK-NEXT:    add x12, x12, x10
-; CHECK-NEXT:    ld1d { z2.d }, p3/z, [x13, #1, mul vl]
-; CHECK-NEXT:    ld1d { z3.d }, p2/z, [x13]
-; CHECK-NEXT:    ld1d { z4.d }, p3/z, [x14, #1, mul vl]
-; CHECK-NEXT:    ld1d { z5.d }, p2/z, [x14]
+; CHECK-NEXT:    whilelo p1.d, x11, x9
+; CHECK-NEXT:    addvl x8, x8, #2
+; CHECK-NEXT:    add x11, x11, x10
+; CHECK-NEXT:    ld1d { z2.d }, p3/z, [x12, #1, mul vl]
+; CHECK-NEXT:    ld1d { z3.d }, p2/z, [x12]
+; CHECK-NEXT:    ld1d { z4.d }, p3/z, [x13, #1, mul vl]
+; CHECK-NEXT:    ld1d { z5.d }, p2/z, [x13]
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #90
@@ -121,26 +120,25 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr %
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    mov x9, xzr
 ; CHECK-NEXT:    and x11, x11, x12
-; CHECK-NEXT:    rdvl x12, #2
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
 ; CHECK-NEXT:  .LBB1_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld1w { z2.d }, p0/z, [x2, x9, lsl #2]
-; CHECK-NEXT:    add x13, x0, x8
-; CHECK-NEXT:    add x14, x1, x8
+; CHECK-NEXT:    add x12, x0, x8
+; CHECK-NEXT:    add x13, x1, x8
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    add x9, x9, x10
-; CHECK-NEXT:    add x8, x8, x12
+; CHECK-NEXT:    addvl x8, x8, #2
 ; CHECK-NEXT:    cmpne p1.d, p0/z, z2.d, #0
 ; CHECK-NEXT:    cmp x11, x9
 ; CHECK-NEXT:    zip2 p2.d, p1.d, p1.d
 ; CHECK-NEXT:    zip1 p1.d, p1.d, p1.d
-; CHECK-NEXT:    ld1d { z2.d }, p2/z, [x13, #1, mul vl]
-; CHECK-NEXT:    ld1d { z3.d }, p1/z, [x13]
-; CHECK-NEXT:    ld1d { z4.d }, p2/z, [x14, #1, mul vl]
-; CHECK-NEXT:    ld1d { z5.d }, p1/z, [x14]
+; CHECK-NEXT:    ld1d { z2.d }, p2/z, [x12, #1, mul vl]
+; CHECK-NEXT:    ld1d { z3.d }, p1/z, [x12]
+; CHECK-NEXT:    ld1d { z4.d }, p2/z, [x13, #1, mul vl]
+; CHECK-NEXT:    ld1d { z5.d }, p1/z, [x13]
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #90
@@ -224,26 +222,25 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    mov x9, xzr
 ; CHECK-NEXT:    cntd x11
-; CHECK-NEXT:    rdvl x12, #2
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld1w { z2.d }, p1/z, [x2, x9, lsl #2]
-; CHECK-NEXT:    add x13, x0, x8
-; CHECK-NEXT:    add x14, x1, x8
+; CHECK-NEXT:    add x12, x0, x8
+; CHECK-NEXT:    add x13, x1, x8
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    add x9, x9, x11
-; CHECK-NEXT:    add x8, x8, x12
+; CHECK-NEXT:    addvl x8, x8, #2
 ; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
 ; CHECK-NEXT:    zip2 p3.d, p1.d, p1.d
 ; CHECK-NEXT:    zip1 p2.d, p1.d, p1.d
 ; CHECK-NEXT:    whilelo p1.d, x9, x10
-; CHECK-NEXT:    ld1d { z2.d }, p3/z, [x13, #1, mul vl]
-; CHECK-NEXT:    ld1d { z3.d }, p2/z, [x13]
-; CHECK-NEXT:    ld1d { z4.d }, p3/z, [x14, #1, mul vl]
-; CHECK-NEXT:    ld1d { z5.d }, p2/z, [x14]
+; CHECK-NEXT:    ld1d { z2.d }, p3/z, [x12, #1, mul vl]
+; CHECK-NEXT:    ld1d { z3.d }, p2/z, [x12]
+; CHECK-NEXT:    ld1d { z4.d }, p3/z, [x13, #1, mul vl]
+; CHECK-NEXT:    ld1d { z5.d }, p2/z, [x13]
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #90
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index aefacc605474fa7..c5870ff469fd39c 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -18,23 +18,22 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    neg x10, x9
-; CHECK-NEXT:    mov w11, #100 // =0x64
+; CHECK-NEXT:    neg x9, x9
+; CHECK-NEXT:    mov w10, #100 // =0x64
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    and x10, x10, x11
-; CHECK-NEXT:    rdvl x11, #2
+; CHECK-NEXT:    and x10, x9, x10
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
 ; CHECK-NEXT:  .LBB0_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x12, x0, x8
-; CHECK-NEXT:    add x13, x1, x8
+; CHECK-NEXT:    add x11, x0, x8
+; CHECK-NEXT:    add x12, x1, x8
 ; CHECK-NEXT:    ld1b { z2.b }, p1/z, [x0, x8]
-; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x12, #1, mul vl]
+; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x11, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x1, x8]
-; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT:    subs x10, x10, x9
-; CHECK-NEXT:    add x8, x8, x11
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x12, #1, mul vl]
+; CHECK-NEXT:    adds x10, x10, x9
+; CHECK-NEXT:    addvl x8, x8, #2
 ; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #90
@@ -106,12 +105,11 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.d, #0 // =0x0
 ; CHECK-NEXT:    fmov d2, #2.00000000
 ; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    mov w11, #100 // =0x64
+; CHECK-NEXT:    mov w10, #100 // =0x64
 ; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    neg x10, x9
+; CHECK-NEXT:    neg x9, x9
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    and x10, x10, x11
-; CHECK-NEXT:    rdvl x11, #2
+; CHECK-NEXT:    and x10, x9, x10
 ; CHECK-NEXT:    sel z3.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    mov z1.d, p0/m, z2.d
 ; CHECK-NEXT:    ptrue p0.d
@@ -119,14 +117,14 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z3.d
 ; CHECK-NEXT:  .LBB1_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x12, x0, x8
-; CHECK-NEXT:    add x13, x1, x8
+; CHECK-NEXT:    add x11, x0, x8
+; CHECK-NEXT:    add x12, x1, x8
 ; CHECK-NEXT:    ld1b { z2.b }, p1/z, [x0, x8]
-; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x12, #1, mul vl]
+; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x11, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x1, x8]
-; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT:    subs x10, x10, x9
-; CHECK-NEXT:    add x8, x8, x11
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x12, #1, mul vl]
+; CHECK-NEXT:    adds x10, x10, x9
+; CHECK-NEXT:    addvl x8, x8, #2
 ; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #90
@@ -193,33 +191,32 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    cntw x9
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    neg x10, x9
-; CHECK-NEXT:    mov w11, #1000 // =0x3e8
+; CHECK-NEXT:    neg x9, x9
+; CHECK-NEXT:    mov w10, #1000 // =0x3e8
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    and x10, x10, x11
-; CHECK-NEXT:    rdvl x11, #4
+; CHECK-NEXT:    and x10, x9, x10
+; CHECK-NEXT:    addvl x11, x1, #2
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
-; CHECK-NEXT:    addvl x12, x1, #2
-; CHECK-NEXT:    addvl x13, x0, #2
+; CHECK-NEXT:    addvl x12, x0, #2
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x14, x0, x8
-; CHECK-NEXT:    add x15, x13, x8
-; CHECK-NEXT:    add x16, x1, x8
-; CHECK-NEXT:    add x17, x12, x8
+; CHECK-NEXT:    add x13, x0, x8
+; CHECK-NEXT:    add x14, x12, x8
+; CHECK-NEXT:    add x15, x1, x8
+; CHECK-NEXT:    add x16, x11, x8
 ; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x0, x8]
-; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x14, #1, mul vl]
-; CHECK-NEXT:    ld1b { z6.b }, p1/z, [x13, x8]
-; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x15, #1, mul vl]
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x13, #1, mul vl]
+; CHECK-NEXT:    ld1b { z6.b }, p1/z, [x12, x8]
+; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x14, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z16.b }, p1/z, [x1, x8]
-; CHECK-NEXT:    ld1d { z17.d }, p0/z, [x16, #1, mul vl]
-; CHECK-NEXT:    ld1b { z18.b }, p1/z, [x12, x8]
-; CHECK-NEXT:    ld1d { z19.d }, p0/z, [x17, #1, mul vl]
-; CHECK-NEXT:    subs x10, x10, x9
-; CHECK-NEXT:    add x8, x8, x11
+; CHECK-NEXT:    ld1d { z17.d }, p0/z, [x15, #1, mul vl]
+; CHECK-NEXT:    ld1b { z18.b }, p1/z, [x11, x8]
+; CHECK-NEXT:    ld1d { z19.d }, p0/z, [x16, #1, mul vl]
+; CHECK-NEXT:    adds x10, x10, x9
+; CHECK-NEXT:    addvl x8, x8, #4
 ; CHECK-NEXT:    fcmla z1.d, p0/m, z16.d, z4.d, #0
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z17.d, z5.d, #0
 ; CHECK-NEXT:    fcmla z2.d, p0/m, z18.d, z6.d, #0
@@ -329,7 +326,6 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia
 ; CHECK-NEXT:    mov w11, #100 // =0x64
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    and x10, x10, x11
-; CHECK-NEXT:    rdvl x11, #2
 ; CHECK-NEXT:    zip2 z0.d, z2.d, z2.d
 ; CHECK-NEXT:    zip1 z1.d, z2.d, z2.d
 ; CHECK-NEXT:  .LBB3_1: // %vector.body
@@ -338,7 +334,7 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia
 ; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x0, #1, mul vl]
 ; CHECK-NEXT:    add x8, x8, x9
-; CHECK-NEXT:    add x0, x0, x11
+; CHECK-NEXT:    addvl x0, x0, #2
 ; CHECK-NEXT:    cmp x10, x8
 ; CHECK-NEXT:    fadd z0.d, z5.d, z0.d
 ; CHECK-NEXT:    fadd z1.d, z4.d, z1.d
diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-int-arith.ll
index 1bace71db0c1184..486f59d7900e95d 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-arith.ll
@@ -770,19 +770,19 @@ define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) {
 ; CHECK-NEXT:    b.lt .LBB70_3
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov z0.s, #1 // =0x1
-; CHECK-NEXT:    whilelo p0.s, xzr, x9
+; CHECK-NEXT:    whilelo p1.s, xzr, x9
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    cntw x10
 ; CHECK-NEXT:  .LBB70_2: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
-; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2, x8, lsl #2]
-; CHECK-NEXT:    mad z1.s, p1/m, z2.s, z0.s
-; CHECK-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z2.s }, p1/z, [x2, x8, lsl #2]
+; CHECK-NEXT:    mad z1.s, p0/m, z2.s, z0.s
+; CHECK-NEXT:    st1w { z1.s }, p1, [x0, x8, lsl #2]
 ; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    whilelo p0.s, x8, x9
+; CHECK-NEXT:    whilelo p1.s, x8, x9
 ; CHECK-NEXT:    b.mi .LBB70_2
 ; CHECK-NEXT:  .LBB70_3: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
index 39fe92aae061990..124f81e7864d11e 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
@@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) {
 ; CHECK-NEXT:    whilelt p0.s, wzr, w0
 ; CHECK-NEXT:    b.pl .LBB0_3
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov w9, wzr
-; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    cntw x9
 ; CHECK-NEXT:  .LBB0_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    whilelt p0.s, w9, w0
-; CHECK-NEXT:    add w9, w9, w8
+; CHECK-NEXT:    whilelt p0.s, w8, w0
+; CHECK-NEXT:    add w8, w8, w9
 ; CHECK-NEXT:    b.mi .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: // %exit
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll
deleted file mode 100644
index afd171aeda758d5..000000000000000
--- a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll
+++ /dev/null
@@ -1,96 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s | FileCheck %s
-
-target triple = "aarch64-unknown-linux-gnu"
-
-define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
-; CHECK-LABEL: inc_add:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    mov w9, w1
-; CHECK-NEXT:  .LBB0_1: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x2, x8, lsl #2]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x3, x8, lsl #2]
-; CHECK-NEXT:    fmul z0.s, z0.s, z1.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [x4, x8, lsl #2]
-; CHECK-NEXT:    incw x8
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB0_1
-; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
-; CHECK-NEXT:    ret
-entry:
-  %wide.trip.count = zext i32 %N to i64
-  %0 = tail call i64 @llvm.vscale.i64()
-  %1 = shl nuw nsw i64 %0, 2
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
-  %2 = getelementptr inbounds float, ptr %in1, i64 %index
-  %wide.load = load <vscale x 4 x float>, ptr %2, align 4
-  %3 = getelementptr inbounds float, ptr %in2, i64 %index
-  %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
-  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
-  %5 = getelementptr inbounds float, ptr %out, i64 %index
-  store <vscale x 4 x float> %4, ptr %5, align 4
-  %index.next = add nuw i64 %index, %1
-  %6 = icmp eq i64 %index.next, %wide.trip.count
-  br i1 %6, label %for.cond.cleanup, label %vector.body
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
-; CHECK-LABEL: dec_sub:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    rdvl x9, #-1
-; CHECK-NEXT:    mov w8, w1
-; CHECK-NEXT:    add x11, x9, #4
-; CHECK-NEXT:    add x9, x2, x11
-; CHECK-NEXT:    add x10, x3, x11
-; CHECK-NEXT:    add x11, x4, x11
-; CHECK-NEXT:  .LBB1_1: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x10, x8, lsl #2]
-; CHECK-NEXT:    fmul z0.s, z0.s, z1.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [x11, x8, lsl #2]
-; CHECK-NEXT:    decw x8
-; CHECK-NEXT:    cbnz x8, .LBB1_1
-; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
-; CHECK-NEXT:    ret
-entry:
-  %0 = zext i32 %N to i64
-  %1 = tail call i64 @llvm.vscale.i64()
-  %2 = shl nuw nsw i64 %1, 2
-  %3 = sub nsw i64 1, %2
-  %invariant.gep = getelementptr float, ptr %in1, i64 %3
-  %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
-  %invariant.gep22 = getelementptr float, ptr %out, i64 %3
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
-  %offset.idx = sub i64 %0, %index
-  %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
-  %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
-  %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
-  %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
-  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
-  %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
-  store <vscale x 4 x float> %4, ptr %gep23, align 4
-  %index.next = add nuw i64 %index, %2
-  %5 = icmp eq i64 %index.next, %0
-  br i1 %5, label %for.cond.cleanup, label %vector.body
-
-for.cond.cleanup:
-  ret void
-}
-
-declare i64 @llvm.vscale.i64()
-
-attributes #0 = { "target-features"="+sve2" }
diff --git a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
index 88d2f468d6a171c..c80aa82ef9a837d 100644
--- a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
@@ -107,6 +107,62 @@ for.cond.cleanup:
   ret void
 }
 
+define void @gep(i32 noundef %first, i32 noundef %N, ptr nocapture noundef writeonly %ptr, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %val) #0 {
+; CHECK-LABEL: define void @gep
+; CHECK-SAME: (i32 noundef [[FIRST:%.*]], i32 noundef [[N:%.*]], ptr nocapture noundef writeonly [[PTR:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[PTR_ADDR:%.*]] = phi ptr [ [[PTR]], [[ENTRY]] ], [ [[ADD_PTR_3:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[PTR_ADDR]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PTR_ADDR]], i64 [[TMP1]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 4
+; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[TMP3]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_1]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 4
+; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[TMP5]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_2]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 4
+; CHECK-NEXT:    [[ADD_PTR_3]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[TMP7]]
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       for.exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl i64 %0, 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph.new
+  %lsr.iv = phi i32 [ %N, %entry ], [ %lsr.iv.next, %for.body ]
+  %ptr.addr = phi ptr [ %ptr, %entry ], [ %add.ptr.3, %for.body ]
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %ptr.addr, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr = getelementptr inbounds i8, ptr %ptr.addr, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.1, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.2, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %1
+  %lsr.iv.next = add i32 %lsr.iv, -4
+  %cmp = icmp eq i32 %lsr.iv.next, 0
+  br i1 %cmp, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+}
+
+declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>)
+
 declare i64 @llvm.vscale.i64()
 
 attributes #0 = { "target-features"="+sve2" }


