[llvm] a850dbc - [AArch64] Sink vscale calls into loops for better isel (#70304)

via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 7 02:29:46 PST 2023


Author: Graham Hunter
Date: 2023-11-07T10:29:42Z
New Revision: a850dbcc5cf146b071a82812ea8cb29d4feb8709

URL: https://github.com/llvm/llvm-project/commit/a850dbcc5cf146b071a82812ea8cb29d4feb8709
DIFF: https://github.com/llvm/llvm-project/commit/a850dbcc5cf146b071a82812ea8cb29d4feb8709.diff

LOG: [AArch64] Sink vscale calls into loops for better isel (#70304)

For more recent SVE-capable CPUs it is beneficial to use the inc*
instructions to increment a value by vscale (potentially shifted or
multiplied), even in short loops.

This patch tells CodeGenPrepare to sink appropriate vscale calls into
the blocks where they are used so that isel can match them.
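
As an illustrative sketch (not taken from the patch itself; the function
name and loop shape are made up), consider a loop whose vscale-based step
was hoisted into the entry block:

  define void @sketch(i64 %n) {
  entry:
    %vs = tail call i64 @llvm.vscale.i64()
    %step = shl nuw nsw i64 %vs, 2
    br label %loop

  loop:
    %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
    ; ... loads/stores indexed by %index ...
    %index.next = add nuw i64 %index, %step
    %done = icmp uge i64 %index.next, %n
    br i1 %done, label %exit, label %loop

  exit:
    ret void
  }

  declare i64 @llvm.vscale.i64()

Sinking the vscale call and the shl into the loop block keeps them next
to the add, so isel can fold the step computation into a single
inc*-style increment instead of keeping the step live in a register
across the loop.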

Added: 
    llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll

Modified: 
    llvm/lib/CodeGen/CodeGenPrepare.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
    llvm/test/CodeGen/AArch64/sve-int-arith.ll
    llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 187820717b6fd5c..fa0c753bff2f073 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8230,7 +8230,6 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
     if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
       return true;
     }
-    return false;
   }
 
   if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f5193a9f2adf30c..6ea7e6813d4ce37 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14552,6 +14552,19 @@ static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
   return true;
 }
 
+/// We want to sink following cases:
+/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
+static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
+  if (match(Op, m_VScale()))
+    return true;
+  if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
+      match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
+    Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
+    return true;
+  }
+  return false;
+}
+
 /// Check if sinking \p I's operands to I's basic block is profitable, because
 /// the operands can be folded into a target instruction, e.g.
 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -14668,6 +14681,22 @@ bool AArch64TargetLowering::shouldSinkOperands(
     }
   }
 
+  // Sink vscales closer to uses for better isel
+  switch (I->getOpcode()) {
+  case Instruction::GetElementPtr:
+  case Instruction::Add:
+  case Instruction::Sub:
+    for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
+      if (shouldSinkVScale(I->getOperand(Op), Ops)) {
+        Ops.push_back(&I->getOperandUse(Op));
+        return true;
+      }
+    }
+    break;
+  default:
+    break;
+  }
+
   if (!I->getType()->isVectorTy())
     return false;
 

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index bb58248c6f60e09..956d2d941ac714d 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -18,10 +18,10 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    neg x10, x9
-; CHECK-NEXT:    mov w11, #100 // =0x64
+; CHECK-NEXT:    neg x9, x9
+; CHECK-NEXT:    mov w10, #100 // =0x64
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    and x10, x10, x11
+; CHECK-NEXT:    and x10, x9, x10
 ; CHECK-NEXT:    rdvl x11, #2
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
@@ -33,7 +33,7 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x12, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x1, x8]
 ; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT:    subs x10, x10, x9
+; CHECK-NEXT:    adds x10, x10, x9
 ; CHECK-NEXT:    add x8, x8, x11
 ; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -106,11 +106,11 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.d, #0 // =0x0
 ; CHECK-NEXT:    fmov d2, #2.00000000
 ; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    mov w11, #100 // =0x64
+; CHECK-NEXT:    mov w10, #100 // =0x64
 ; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    neg x10, x9
+; CHECK-NEXT:    neg x9, x9
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    and x10, x10, x11
+; CHECK-NEXT:    and x10, x9, x10
 ; CHECK-NEXT:    rdvl x11, #2
 ; CHECK-NEXT:    sel z3.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    mov z1.d, p0/m, z2.d
@@ -125,7 +125,7 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x12, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x1, x8]
 ; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT:    subs x10, x10, x9
+; CHECK-NEXT:    adds x10, x10, x9
 ; CHECK-NEXT:    add x8, x8, x11
 ; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -193,34 +193,34 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    cntw x9
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    neg x10, x9
-; CHECK-NEXT:    mov w11, #1000 // =0x3e8
-; CHECK-NEXT:    rdvl x13, #2
+; CHECK-NEXT:    neg x9, x9
+; CHECK-NEXT:    mov w10, #1000 // =0x3e8
+; CHECK-NEXT:    rdvl x12, #2
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    and x10, x10, x11
+; CHECK-NEXT:    and x10, x9, x10
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
-; CHECK-NEXT:    rdvl x11, #4
-; CHECK-NEXT:    add x12, x1, x13
-; CHECK-NEXT:    add x13, x0, x13
+; CHECK-NEXT:    add x11, x1, x12
+; CHECK-NEXT:    add x12, x0, x12
+; CHECK-NEXT:    rdvl x13, #4
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x14, x0, x8
-; CHECK-NEXT:    add x15, x13, x8
+; CHECK-NEXT:    add x15, x12, x8
 ; CHECK-NEXT:    add x16, x1, x8
-; CHECK-NEXT:    add x17, x12, x8
+; CHECK-NEXT:    add x17, x11, x8
 ; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x0, x8]
 ; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x14, #1, mul vl]
-; CHECK-NEXT:    ld1b { z6.b }, p1/z, [x13, x8]
+; CHECK-NEXT:    ld1b { z6.b }, p1/z, [x12, x8]
 ; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x15, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z16.b }, p1/z, [x1, x8]
 ; CHECK-NEXT:    ld1d { z17.d }, p0/z, [x16, #1, mul vl]
-; CHECK-NEXT:    ld1b { z18.b }, p1/z, [x12, x8]
+; CHECK-NEXT:    ld1b { z18.b }, p1/z, [x11, x8]
 ; CHECK-NEXT:    ld1d { z19.d }, p0/z, [x17, #1, mul vl]
-; CHECK-NEXT:    subs x10, x10, x9
-; CHECK-NEXT:    add x8, x8, x11
+; CHECK-NEXT:    adds x10, x10, x9
+; CHECK-NEXT:    add x8, x8, x13
 ; CHECK-NEXT:    fcmla z1.d, p0/m, z16.d, z4.d, #0
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z17.d, z5.d, #0
 ; CHECK-NEXT:    fcmla z2.d, p0/m, z18.d, z6.d, #0

diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-int-arith.ll
index 1bace71db0c1184..486f59d7900e95d 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-arith.ll
@@ -770,19 +770,19 @@ define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) {
 ; CHECK-NEXT:    b.lt .LBB70_3
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov z0.s, #1 // =0x1
-; CHECK-NEXT:    whilelo p0.s, xzr, x9
+; CHECK-NEXT:    whilelo p1.s, xzr, x9
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    cntw x10
 ; CHECK-NEXT:  .LBB70_2: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
-; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2, x8, lsl #2]
-; CHECK-NEXT:    mad z1.s, p1/m, z2.s, z0.s
-; CHECK-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z2.s }, p1/z, [x2, x8, lsl #2]
+; CHECK-NEXT:    mad z1.s, p0/m, z2.s, z0.s
+; CHECK-NEXT:    st1w { z1.s }, p1, [x0, x8, lsl #2]
 ; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    whilelo p0.s, x8, x9
+; CHECK-NEXT:    whilelo p1.s, x8, x9
 ; CHECK-NEXT:    b.mi .LBB70_2
 ; CHECK-NEXT:  .LBB70_3: // %for.cond.cleanup
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
index 39fe92aae061990..124f81e7864d11e 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
@@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) {
 ; CHECK-NEXT:    whilelt p0.s, wzr, w0
 ; CHECK-NEXT:    b.pl .LBB0_3
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov w9, wzr
-; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    cntw x9
 ; CHECK-NEXT:  .LBB0_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    whilelt p0.s, w9, w0
-; CHECK-NEXT:    add w9, w9, w8
+; CHECK-NEXT:    whilelt p0.s, w8, w0
+; CHECK-NEXT:    add w8, w8, w9
 ; CHECK-NEXT:    b.mi .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: // %exit
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
new file mode 100644
index 000000000000000..c80aa82ef9a837d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -codegenprepare -S -o - %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @inc_add
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[IN2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[INDEX]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %wide.trip.count = zext i32 %N to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds float, ptr %in1, i64 %index
+  %wide.load = load <vscale x 4 x float>, ptr %2, align 4
+  %3 = getelementptr inbounds float, ptr %in2, i64 %index
+  %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %5 = getelementptr inbounds float, ptr %out, i64 %index
+  store <vscale x 4 x float> %4, ptr %5, align 4
+  %index.next = add nuw i64 %index, %1
+  %6 = icmp eq i64 %index.next, %wide.trip.count
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @dec_sub
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw i64 1, [[TMP2]]
+; CHECK-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[IN1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[INVARIANT_GEP20:%.*]] = getelementptr float, ptr [[IN2]], i64 [[TMP3]]
+; CHECK-NEXT:    [[INVARIANT_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 [[TMP3]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[GEP21:%.*]] = getelementptr float, ptr [[INVARIANT_GEP20]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[GEP21]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT:    [[GEP23:%.*]] = getelementptr float, ptr [[INVARIANT_GEP22]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP4]], ptr [[GEP23]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = zext i32 %N to i64
+  %1 = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %1, 2
+  %3 = sub nsw i64 1, %2
+  %invariant.gep = getelementptr float, ptr %in1, i64 %3
+  %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
+  %invariant.gep22 = getelementptr float, ptr %out, i64 %3
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %offset.idx = sub i64 %0, %index
+  %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
+  %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
+  %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
+  %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
+  store <vscale x 4 x float> %4, ptr %gep23, align 4
+  %index.next = add nuw i64 %index, %2
+  %5 = icmp eq i64 %index.next, %0
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @gep(i32 noundef %first, i32 noundef %N, ptr nocapture noundef writeonly %ptr, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %val) #0 {
+; CHECK-LABEL: define void @gep
+; CHECK-SAME: (i32 noundef [[FIRST:%.*]], i32 noundef [[N:%.*]], ptr nocapture noundef writeonly [[PTR:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[PTR_ADDR:%.*]] = phi ptr [ [[PTR]], [[ENTRY]] ], [ [[ADD_PTR_3:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[PTR_ADDR]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PTR_ADDR]], i64 [[TMP1]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 4
+; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[TMP3]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_1]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 4
+; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[TMP5]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_2]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 4
+; CHECK-NEXT:    [[ADD_PTR_3]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[TMP7]]
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       for.exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl i64 %0, 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph.new
+  %lsr.iv = phi i32 [ %N, %entry ], [ %lsr.iv.next, %for.body ]
+  %ptr.addr = phi ptr [ %ptr, %entry ], [ %add.ptr.3, %for.body ]
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %ptr.addr, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr = getelementptr inbounds i8, ptr %ptr.addr, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.1, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.2, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %1
+  %lsr.iv.next = add i32 %lsr.iv, -4
+  %cmp = icmp eq i32 %lsr.iv.next, 0
+  br i1 %cmp, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+}
+
+declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>)
+
+declare i64 @llvm.vscale.i64()
+
+attributes #0 = { "target-features"="+sve2" }


        

