[llvm] WIP:[LV] Support strided memory accesses with a stride of -1 (PR #128718)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 10 01:28:01 PDT 2025
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/128718
From 4ae69dc59ec9fc448db2e38a979563fcc378536f Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 25 Feb 2025 00:31:28 -0800
Subject: [PATCH 01/11] Pre-commit for cm_stride
---
.../RISCV/riscv-vector-reverse-output.ll | 417 ++++++++++++++++++
1 file changed, 417 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
new file mode 100644
index 0000000000000..2b8ac0ab685ab
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -0,0 +1,417 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+;; This is the C++ loop being vectorized in this file with
+;; vector.reverse
+;; #pragma clang loop vectorize_width(4, scalable)
+;; for (int i = N-1; i >= 0; --i)
+;; a[i] = b[i] + 1.0;
+
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
+; RUN: -scalable-vectorization=on -riscv-v-vector-bits-min=128 -S < %s \
+; RUN: | FileCheck --check-prefix=RV64 %s
+
+; RUN: opt -passes=loop-vectorize -mtriple=riscv32 -mattr=+v \
+; RUN: -scalable-vectorization=on -riscv-v-vector-bits-min=128 -S < %s \
+; RUN: | FileCheck --check-prefix=RV32 %s
+
+define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
+; RV64-LABEL: define void @vector_reverse_i64(
+; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; RV64-NEXT: [[ENTRY:.*:]]
+; RV64-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV64-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV64: [[FOR_BODY_PREHEADER]]:
+; RV64-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; RV64: [[VECTOR_SCEVCHECK]]:
+; RV64-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1
+; RV64-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; RV64-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]])
+; RV64-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV64-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV64-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]]
+; RV64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]]
+; RV64-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; RV64-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; RV64-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; RV64-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV64: [[VECTOR_PH]]:
+; RV64-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]]
+; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV64-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; RV64-NEXT: [[TMP15:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV64-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-NEXT: [[TMP16:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64: [[VECTOR_BODY]]:
+; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV64-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV64-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1
+; RV64-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+; RV64-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP19]]
+; RV64-NEXT: [[TMP21:%.*]] = mul i64 0, [[TMP14]]
+; RV64-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP14]]
+; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i64 [[TMP21]]
+; RV64-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]]
+; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP24]], align 4
+; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV64-NEXT: [[TMP25:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV64-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]]
+; RV64-NEXT: [[TMP27:%.*]] = mul i64 0, [[TMP14]]
+; RV64-NEXT: [[TMP28:%.*]] = sub i64 1, [[TMP14]]
+; RV64-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP27]]
+; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i64 [[TMP28]]
+; RV64-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP25]])
+; RV64-NEXT: store <vscale x 4 x i32> [[REVERSE2]], ptr [[TMP30]], align 4
+; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; RV64-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64: [[MIDDLE_BLOCK]]:
+; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV64: [[SCALAR_PH]]:
+; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV64-NEXT: br label %[[FOR_COND_CLEANUP]]
+; RV64: [[FOR_COND_CLEANUP]]:
+; RV64-NEXT: ret void
+; RV64: [[FOR_BODY]]:
+; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1
+; RV64-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
+; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]]
+; RV64-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; RV64-NEXT: [[ADD9:%.*]] = add i32 [[TMP32]], 1
+; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]]
+; RV64-NEXT: store i32 [[ADD9]], ptr [[ARRAYIDX3]], align 4
+; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1
+; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; RV64-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
+;
+; RV32-LABEL: define void @vector_reverse_i64(
+; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; RV32-NEXT: [[ENTRY:.*:]]
+; RV32-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV32-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV32: [[FOR_BODY_PREHEADER]]:
+; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV32: [[VECTOR_PH]]:
+; RV32-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV32-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; RV32-NEXT: [[TMP7:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV32-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV32-NEXT: [[TMP8:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV32-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV32: [[VECTOR_BODY]]:
+; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV32-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV32-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1
+; RV32-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]]
+; RV32-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32
+; RV32-NEXT: [[TMP14:%.*]] = mul i32 0, [[TMP13]]
+; RV32-NEXT: [[TMP15:%.*]] = sub i32 1, [[TMP13]]
+; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[TMP14]]
+; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP15]]
+; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP17]], align 4
+; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV32-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
+; RV32-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP6]] to i32
+; RV32-NEXT: [[TMP21:%.*]] = mul i32 0, [[TMP20]]
+; RV32-NEXT: [[TMP22:%.*]] = sub i32 1, [[TMP20]]
+; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 [[TMP21]]
+; RV32-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 [[TMP22]]
+; RV32-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; RV32-NEXT: store <vscale x 4 x i32> [[REVERSE2]], ptr [[TMP24]], align 4
+; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; RV32-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV32: [[MIDDLE_BLOCK]]:
+; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV32: [[SCALAR_PH]]:
+; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ]
+; RV32-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
+; RV32-NEXT: br label %[[FOR_BODY:.*]]
+; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV32-NEXT: br label %[[FOR_COND_CLEANUP]]
+; RV32: [[FOR_COND_CLEANUP]]:
+; RV32-NEXT: ret void
+; RV32: [[FOR_BODY]]:
+; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1
+; RV32-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
+; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]]
+; RV32-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; RV32-NEXT: [[ADD9:%.*]] = add i32 [[TMP26]], 1
+; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]]
+; RV32-NEXT: store i32 [[ADD9]], ptr [[ARRAYIDX3]], align 4
+; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1
+; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
+;
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %0 = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+ %i.0 = add nsw i32 %i.0.in8, -1
+ %idxprom = zext i32 %i.0 to i64
+ %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
+ %1 = load i32, ptr %arrayidx, align 4
+ %add9 = add i32 %1, 1
+ %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
+ store i32 %add9, ptr %arrayidx3, align 4
+ %cmp = icmp ugt i64 %indvars.iv, 1
+ %indvars.iv.next = add nsw i64 %indvars.iv, -1
+ br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
+; RV64-LABEL: define void @vector_reverse_f32(
+; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; RV64-NEXT: [[ENTRY:.*:]]
+; RV64-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV64-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV64: [[FOR_BODY_PREHEADER]]:
+; RV64-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; RV64: [[VECTOR_SCEVCHECK]]:
+; RV64-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1
+; RV64-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; RV64-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]])
+; RV64-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV64-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV64-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]]
+; RV64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]]
+; RV64-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; RV64-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; RV64-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; RV64-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV64: [[VECTOR_PH]]:
+; RV64-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]]
+; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV64-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; RV64-NEXT: [[TMP15:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV64-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-NEXT: [[TMP16:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64: [[VECTOR_BODY]]:
+; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV64-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV64-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1
+; RV64-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+; RV64-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]]
+; RV64-NEXT: [[TMP21:%.*]] = mul i64 0, [[TMP14]]
+; RV64-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP14]]
+; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]]
+; RV64-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP22]]
+; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP24]], align 4
+; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV64-NEXT: [[TMP25:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV64-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
+; RV64-NEXT: [[TMP27:%.*]] = mul i64 0, [[TMP14]]
+; RV64-NEXT: [[TMP28:%.*]] = sub i64 1, [[TMP14]]
+; RV64-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP27]]
+; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP28]]
+; RV64-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP25]])
+; RV64-NEXT: store <vscale x 4 x float> [[REVERSE2]], ptr [[TMP30]], align 4
+; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; RV64-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; RV64: [[MIDDLE_BLOCK]]:
+; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV64: [[SCALAR_PH]]:
+; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV64-NEXT: br label %[[FOR_COND_CLEANUP]]
+; RV64: [[FOR_COND_CLEANUP]]:
+; RV64-NEXT: ret void
+; RV64: [[FOR_BODY]]:
+; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1
+; RV64-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
+; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]]
+; RV64-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; RV64-NEXT: [[CONV1:%.*]] = fadd float [[TMP32]], 1.000000e+00
+; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]]
+; RV64-NEXT: store float [[CONV1]], ptr [[ARRAYIDX3]], align 4
+; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1
+; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; RV64-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; RV32-LABEL: define void @vector_reverse_f32(
+; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; RV32-NEXT: [[ENTRY:.*:]]
+; RV32-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV32-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV32: [[FOR_BODY_PREHEADER]]:
+; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV32: [[VECTOR_PH]]:
+; RV32-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV32-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; RV32-NEXT: [[TMP7:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV32-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV32-NEXT: [[TMP8:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV32-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV32: [[VECTOR_BODY]]:
+; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV32-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV32-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1
+; RV32-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
+; RV32-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32
+; RV32-NEXT: [[TMP14:%.*]] = mul i32 0, [[TMP13]]
+; RV32-NEXT: [[TMP15:%.*]] = sub i32 1, [[TMP13]]
+; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 [[TMP14]]
+; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP15]]
+; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
+; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV32-NEXT: [[TMP18:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
+; RV32-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP6]] to i32
+; RV32-NEXT: [[TMP21:%.*]] = mul i32 0, [[TMP20]]
+; RV32-NEXT: [[TMP22:%.*]] = sub i32 1, [[TMP20]]
+; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i32 [[TMP21]]
+; RV32-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP22]]
+; RV32-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP18]])
+; RV32-NEXT: store <vscale x 4 x float> [[REVERSE2]], ptr [[TMP24]], align 4
+; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; RV32-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; RV32: [[MIDDLE_BLOCK]]:
+; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV32: [[SCALAR_PH]]:
+; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ]
+; RV32-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
+; RV32-NEXT: br label %[[FOR_BODY:.*]]
+; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV32-NEXT: br label %[[FOR_COND_CLEANUP]]
+; RV32: [[FOR_COND_CLEANUP]]:
+; RV32-NEXT: ret void
+; RV32: [[FOR_BODY]]:
+; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1
+; RV32-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
+; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]]
+; RV32-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; RV32-NEXT: [[CONV1:%.*]] = fadd float [[TMP26]], 1.000000e+00
+; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]]
+; RV32-NEXT: store float [[CONV1]], ptr [[ARRAYIDX3]], align 4
+; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1
+; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %0 = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+ %i.0 = add nsw i32 %i.0.in8, -1
+ %idxprom = zext i32 %i.0 to i64
+ %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
+ %1 = load float, ptr %arrayidx, align 4
+ %conv1 = fadd float %1, 1.000000e+00
+ %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
+ store float %conv1, ptr %arrayidx3, align 4
+ %cmp = icmp ugt i64 %indvars.iv, 1
+ %indvars.iv.next = add nsw i64 %indvars.iv, -1
+ br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+;.
+; RV64: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; RV64: [[META1]] = !{!"llvm.loop.mustprogress"}
+; RV64: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; RV64: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; RV64: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; RV64: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
+; RV64: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+;.
+; RV32: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; RV32: [[META1]] = !{!"llvm.loop.mustprogress"}
+; RV32: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; RV32: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; RV32: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META3]], [[META2]]}
+; RV32: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
+; RV32: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META3]], [[META2]]}
+;.
From bdcb512e0878e52f26e6fa4a51e6d4ab96f919dd Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 25 Feb 2025 05:14:10 -0800
Subject: [PATCH 02/11] UF2 test
---
.../RISCV/riscv-vector-reverse-output.ll | 221 +++++++++++++++++-
1 file changed, 212 insertions(+), 9 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index 2b8ac0ab685ab..a15a8342154ff 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -7,13 +7,17 @@
;; a[i] = b[i] + 1.0;
; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
-; RUN: -scalable-vectorization=on -riscv-v-vector-bits-min=128 -S < %s \
+; RUN: -riscv-v-vector-bits-min=128 -S < %s \
; RUN: | FileCheck --check-prefix=RV64 %s
; RUN: opt -passes=loop-vectorize -mtriple=riscv32 -mattr=+v \
-; RUN: -scalable-vectorization=on -riscv-v-vector-bits-min=128 -S < %s \
+; RUN: -riscv-v-vector-bits-min=128 -S < %s \
; RUN: | FileCheck --check-prefix=RV32 %s
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
+; RUN: -riscv-v-vector-bits-min=128 -force-vector-interleave=2 -S < %s \
+; RUN: | FileCheck --check-prefix=RV64-UF2 %s
+
define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-LABEL: define void @vector_reverse_i64(
; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -174,6 +178,105 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
;
+; RV64-UF2-LABEL: define void @vector_reverse_i64(
+; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; RV64-UF2-NEXT: [[ENTRY:.*:]]
+; RV64-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV64-UF2-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV64-UF2: [[FOR_BODY_PREHEADER]]:
+; RV64-UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV64-UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; RV64-UF2: [[VECTOR_SCEVCHECK]]:
+; RV64-UF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-UF2-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1
+; RV64-UF2-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; RV64-UF2-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]])
+; RV64-UF2-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV64-UF2-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]]
+; RV64-UF2-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]]
+; RV64-UF2-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; RV64-UF2-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; RV64-UF2-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; RV64-UF2-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV64-UF2: [[VECTOR_PH]]:
+; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]]
+; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV64-UF2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV64-UF2-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-UF2-NEXT: [[TMP17:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64-UF2: [[VECTOR_BODY]]:
+; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV64-UF2-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV64-UF2-NEXT: [[TMP19:%.*]] = add nsw i32 [[TMP18]], -1
+; RV64-UF2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
+; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP20]]
+; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]]
+; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
+; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]]
+; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]]
+; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP25]], align 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV64-UF2-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i32>, ptr [[TMP29]], align 4
+; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD2]])
+; RV64-UF2-NEXT: [[TMP30:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV64-UF2-NEXT: [[TMP31:%.*]] = add <vscale x 4 x i32> [[REVERSE3]], splat (i32 1)
+; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
+; RV64-UF2-NEXT: [[TMP33:%.*]] = mul i64 0, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i64 [[TMP33]]
+; RV64-UF2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP35]], i64 [[TMP34]]
+; RV64-UF2-NEXT: [[TMP37:%.*]] = mul i64 -1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP38:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i64 [[TMP37]]
+; RV64-UF2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i64 [[TMP38]]
+; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP30]])
+; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP36]], align 4
+; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP31]])
+; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE5]], ptr [[TMP40]], align 4
+; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; RV64-UF2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64-UF2: [[MIDDLE_BLOCK]]:
+; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV64-UF2: [[SCALAR_PH]]:
+; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-UF2-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
+; RV64-UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV64-UF2-NEXT: br label %[[FOR_COND_CLEANUP]]
+; RV64-UF2: [[FOR_COND_CLEANUP]]:
+; RV64-UF2-NEXT: ret void
+; RV64-UF2: [[FOR_BODY]]:
+; RV64-UF2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1
+; RV64-UF2-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
+; RV64-UF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]]
+; RV64-UF2-NEXT: [[TMP42:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; RV64-UF2-NEXT: [[ADD9:%.*]] = add i32 [[TMP42]], 1
+; RV64-UF2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]]
+; RV64-UF2-NEXT: store i32 [[ADD9]], ptr [[ARRAYIDX3]], align 4
+; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1
+; RV64-UF2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; RV64-UF2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
+;
entry:
%cmp7 = icmp sgt i32 %n, 0
br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
@@ -360,6 +463,105 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]]
;
+; RV64-UF2-LABEL: define void @vector_reverse_f32(
+; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; RV64-UF2-NEXT: [[ENTRY:.*:]]
+; RV64-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV64-UF2-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV64-UF2: [[FOR_BODY_PREHEADER]]:
+; RV64-UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV64-UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; RV64-UF2: [[VECTOR_SCEVCHECK]]:
+; RV64-UF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-UF2-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1
+; RV64-UF2-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; RV64-UF2-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]])
+; RV64-UF2-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV64-UF2-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]]
+; RV64-UF2-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]]
+; RV64-UF2-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; RV64-UF2-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; RV64-UF2-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; RV64-UF2-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV64-UF2: [[VECTOR_PH]]:
+; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]]
+; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV64-UF2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV64-UF2-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-UF2-NEXT: [[TMP17:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64-UF2: [[VECTOR_BODY]]:
+; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV64-UF2-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV64-UF2-NEXT: [[TMP19:%.*]] = add nsw i32 [[TMP18]], -1
+; RV64-UF2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
+; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP20]]
+; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]]
+; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]]
+; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]]
+; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]]
+; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP25]], align 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV64-UF2-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP29]], align 4
+; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD2]])
+; RV64-UF2-NEXT: [[TMP30:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT: [[TMP31:%.*]] = fadd <vscale x 4 x float> [[REVERSE3]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]]
+; RV64-UF2-NEXT: [[TMP33:%.*]] = mul i64 0, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i64 [[TMP33]]
+; RV64-UF2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP35]], i64 [[TMP34]]
+; RV64-UF2-NEXT: [[TMP37:%.*]] = mul i64 -1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP38:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i64 [[TMP37]]
+; RV64-UF2-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP39]], i64 [[TMP38]]
+; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP30]])
+; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP36]], align 4
+; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP31]])
+; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE5]], ptr [[TMP40]], align 4
+; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; RV64-UF2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; RV64-UF2: [[MIDDLE_BLOCK]]:
+; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV64-UF2: [[SCALAR_PH]]:
+; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-UF2-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
+; RV64-UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV64-UF2-NEXT: br label %[[FOR_COND_CLEANUP]]
+; RV64-UF2: [[FOR_COND_CLEANUP]]:
+; RV64-UF2-NEXT: ret void
+; RV64-UF2: [[FOR_BODY]]:
+; RV64-UF2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1
+; RV64-UF2-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
+; RV64-UF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]]
+; RV64-UF2-NEXT: [[TMP42:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; RV64-UF2-NEXT: [[CONV1:%.*]] = fadd float [[TMP42]], 1.000000e+00
+; RV64-UF2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]]
+; RV64-UF2-NEXT: store float [[CONV1]], ptr [[ARRAYIDX3]], align 4
+; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1
+; RV64-UF2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; RV64-UF2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]]
+;
entry:
%cmp7 = icmp sgt i32 %n, 0
br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
@@ -391,13 +593,6 @@ for.body: ; preds = %for.body.preheader,
!2 = !{!"llvm.loop.vectorize.width", i32 4}
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!4 = !{!"llvm.loop.vectorize.enable", i1 true}
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"}
-; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
;.
; RV64: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
; RV64: [[META1]] = !{!"llvm.loop.mustprogress"}
@@ -415,3 +610,11 @@ for.body: ; preds = %for.body.preheader,
; RV32: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
; RV32: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META3]], [[META2]]}
;.
+; RV64-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; RV64-UF2: [[META1]] = !{!"llvm.loop.mustprogress"}
+; RV64-UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; RV64-UF2: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; RV64-UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; RV64-UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
+; RV64-UF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+;.
From fe79615096f4e1c08893d00931b569f6815f81ad Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 19 Feb 2025 01:37:53 -0800
Subject: [PATCH 03/11] Step 1: Patch legal check for strided accesses
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b987863127994..bb5582b183adf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1312,6 +1312,18 @@ class LoopVectorizationCostModel {
(SI && TTI.isLegalMaskedScatter(Ty, Align));
}
+ /// Returns true if the target machine can represent \p V as a strided load
+ /// or store operation.
+ bool isLegalStridedLoadStore(Value *V, ElementCount VF) {
+ if (!isa<LoadInst, StoreInst>(V))
+ return false;
+ auto *Ty = getLoadStoreType(V);
+ Align Align = getLoadStoreAlignment(V);
+ if (VF.isVector())
+ Ty = VectorType::get(Ty, VF);
+ return TTI.isLegalStridedLoadStore(Ty, Align);
+ }
+
/// Returns true if the target machine supports all of the reduction
/// variables found for the given VF.
bool canVectorizeReductions(ElementCount VF) const {
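The isLegalStridedLoadStore helper above deliberately mirrors the existing isLegalGatherOrScatter query, but asks TTI about a strided access on the widened type instead. A rough sketch of how it is expected to be consumed once the widening decision is wired up (CM_Strided is a hypothetical InstWidening enumerator for this series, and the cost helper it pairs with only appears in the next patch):

  // Sketch only: prefer a strided access for a reverse load/store when the
  // target supports it; otherwise keep the existing reverse widening.
  // CM_Strided is an assumed enumerator, not part of the patches shown here.
  if (isLegalStridedLoadStore(I, VF))
    setWideningDecision(I, VF, CM_Strided, getStridedLoadStoreCost(I, VF));
  else
    setWideningDecision(I, VF, CM_Widen_Reverse,
                        getConsecutiveMemOpCost(I, VF));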
>From 1b15d36e13c0d24c784413a900a285d7670083ed Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 19 Feb 2025 01:38:46 -0800
Subject: [PATCH 04/11] Step 2: Patch cost for strided accesses
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bb5582b183adf..0e7132b1adbdf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1657,6 +1657,9 @@ class LoopVectorizationCostModel {
/// element)
InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
+ /// The cost computation for strided load/store instruction.
+ InstructionCost getStridedLoadStoreCost(Instruction *I, ElementCount VF);
+
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
InstructionCost getScalarizationOverhead(Instruction *I,
@@ -5825,6 +5828,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
return Cost;
}
+InstructionCost
+LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
+ ElementCount VF) {
+ Type *ValTy = getLoadStoreType(I);
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
+ const Value *Ptr = getLoadStorePointerOperand(I);
+
+ return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr,
+ Legal->isMaskRequired(I), Alignment,
+ CostKind, I);
+}
+
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
ElementCount VF,
From 766c1a3faf7cd0e1ef48e7f7443e031f2d9a5ea7 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 20 Feb 2025 06:09:52 -0800
Subject: [PATCH 05/11] Step 3: Patch the implementation of load/store recipes
---
.../Transforms/Vectorize/LoopVectorize.cpp | 4 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 29 +++--
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 113 ++++++++++++++----
.../Transforms/Vectorize/VPlanTransforms.cpp | 4 +-
.../Transforms/Vectorize/VPlanTest.cpp | 7 +-
5 files changed, 118 insertions(+), 39 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0e7132b1adbdf..0b5de21dae6d3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8382,12 +8382,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
Ptr = VectorPtr;
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
- return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
+ return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, false,
I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, I->getDebugLoc());
+ Reverse, false, I->getDebugLoc());
}
/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f68a2283c0c79..60a90893ff7d4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2570,6 +2570,9 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// Whether the consecutive accessed addresses are in reverse order.
bool Reverse;
+ /// Whether the accessed addresses are evenly spaced apart by a fixed stride.
+ bool Strided;
+
/// Whether the memory access is masked.
bool IsMasked = false;
@@ -2583,9 +2586,9 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
- bool Consecutive, bool Reverse, DebugLoc DL)
+ bool Consecutive, bool Reverse, bool Strided, DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), Ingredient(I), Consecutive(Consecutive),
- Reverse(Reverse) {
+ Reverse(Reverse), Strided(Strided) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
}
@@ -2613,6 +2616,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// order.
bool isReverse() const { return Reverse; }
+ /// Return whether the accessed addresses are evenly spaced apart by a fixed
+ /// stride.
+ bool isStrided() const { return Strided; }
+
/// Return the address accessed by this recipe.
VPValue *getAddr() const { return getOperand(0); }
@@ -2642,16 +2649,16 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// optional mask.
struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
- bool Consecutive, bool Reverse, DebugLoc DL)
+ bool Consecutive, bool Reverse, bool Strided, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
- Reverse, DL),
+ Reverse, Strided, DL),
VPValue(this, &Load) {
setMask(Mask);
}
VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getMask(), Consecutive, Reverse,
+ getMask(), Consecutive, Reverse, Strided,
getDebugLoc());
}
@@ -2683,7 +2690,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
{L.getAddr(), &EVL}, L.isConsecutive(),
- L.isReverse(), L.getDebugLoc()),
+ L.isReverse(), L.isStrided(), L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -2720,16 +2727,17 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
/// to store to and an optional mask.
struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
- VPValue *Mask, bool Consecutive, bool Reverse, DebugLoc DL)
+ VPValue *Mask, bool Consecutive, bool Reverse,
+ bool Strided, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
- Consecutive, Reverse, DL) {
+ Consecutive, Reverse, Strided, DL) {
setMask(Mask);
}
VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
- Reverse, getDebugLoc());
+ Reverse, Strided, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -2763,7 +2771,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{S.getAddr(), S.getStoredValue(), &EVL},
- S.isConsecutive(), S.isReverse(), S.getDebugLoc()) {
+ S.isConsecutive(), S.isReverse(), S.isStrided(),
+ S.getDebugLoc()) {
setMask(Mask);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 5010bf029d140..85daeaff035e6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2566,10 +2566,15 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
assert(!Reverse &&
"Inconsecutive memory access should not have the order.");
- return Ctx.TTI.getAddressComputationCost(Ty) +
- Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
- IsMasked, Alignment, Ctx.CostKind,
- &Ingredient);
+ if (Strided)
+ return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr,
+ IsMasked, Alignment, Ctx.CostKind,
+ &Ingredient);
+ else
+ return Ctx.TTI.getAddressComputationCost(Ty) +
+ Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
+ IsMasked, Alignment, Ctx.CostKind,
+ &Ingredient);
}
InstructionCost Cost = 0;
@@ -2596,11 +2601,13 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
- bool CreateGather = !isConsecutive();
+ bool CreateGather = !isConsecutive() && !isStrided();
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
- Value *Mask = nullptr;
+ Value *Mask = isStrided()
+ ? Builder.CreateVectorSplat(State.VF, Builder.getTrue())
+ : nullptr;
if (auto *VPMask = getMask()) {
// Mask reversal is only needed for non-all-one (null) masks, as reverse
// of a null all-one mask is a null mask.
@@ -2615,9 +2622,25 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
"wide.masked.gather");
} else if (Mask) {
- NewLI =
- Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
- PoisonValue::get(DataTy), "wide.masked.load");
+ if (isStrided()) {
+ const DataLayout &DL = LI->getDataLayout();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(ScalarDataTy));
+ Value *RuntimeVF =
+ getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
+ NewLI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+ {Addr, StrideVal, Mask, RuntimeVF}, nullptr, "wide.strided.load");
+ cast<CallInst>(NewLI)->addParamAttr(
+ 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+ } else {
+ NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+ PoisonValue::get(DataTy),
+ "wide.masked.load");
+ }
} else {
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
}
@@ -2655,7 +2678,7 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
- bool CreateGather = !isConsecutive();
+ bool CreateGather = !isConsecutive() && !isStrided();
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
@@ -2675,6 +2698,16 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
NewLI =
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
nullptr, "wide.masked.gather");
+ } else if (isStrided()) {
+ const DataLayout &DL = LI->getDataLayout();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(ScalarDataTy));
+ NewLI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+ {Addr, StrideVal, Mask, EVL}, nullptr, "wide.strided.load");
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
@@ -2729,13 +2762,15 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
auto *SI = cast<StoreInst>(&Ingredient);
VPValue *StoredVPValue = getStoredValue();
- bool CreateScatter = !isConsecutive();
+ bool CreateScatter = !isConsecutive() && !isStrided();
const Align Alignment = getLoadStoreAlignment(&Ingredient);
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
- Value *Mask = nullptr;
+ Value *Mask = isStrided()
+ ? Builder.CreateVectorSplat(State.VF, Builder.getTrue())
+ : nullptr;
if (auto *VPMask = getMask()) {
// Mask reversal is only needed for non-all-one (null) masks, as reverse
// of a null all-one mask is a null mask.
@@ -2754,12 +2789,32 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
}
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
Instruction *NewSI = nullptr;
- if (CreateScatter)
+ if (CreateScatter) {
NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
- else if (Mask)
- NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
- else
+ } else if (Mask) {
+ if (isStrided()) {
+ const DataLayout &DL = SI->getDataLayout();
+ auto *StoredVecTy = cast<VectorType>(StoredVal->getType());
+ Type *StoredEltTy = StoredVecTy->getElementType();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(StoredEltTy));
+ Value *RuntimeVF =
+ getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
+ NewSI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_store,
+ {StoredVecTy, PtrTy, StrideTy},
+ {StoredVal, Addr, StrideVal, Mask, RuntimeVF});
+ cast<CallInst>(NewSI)->addParamAttr(
+ 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
+ } else {
+ NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+ }
+ } else {
NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
+ }
State.addMetadata(NewSI, SI);
}
@@ -2775,7 +2830,7 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
auto *SI = cast<StoreInst>(&Ingredient);
VPValue *StoredValue = getStoredValue();
- bool CreateScatter = !isConsecutive();
+ bool CreateScatter = !isConsecutive() && !isStrided();
const Align Alignment = getLoadStoreAlignment(&Ingredient);
auto &Builder = State.Builder;
@@ -2800,11 +2855,25 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
- VectorBuilder VBuilder(Builder);
- VBuilder.setEVL(EVL).setMask(Mask);
- NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
- Instruction::Store, Type::getVoidTy(EVL->getContext()),
- {StoredVal, Addr}));
+ if (isStrided()) {
+ const DataLayout &DL = SI->getDataLayout();
+ auto *StoredVecTy = cast<VectorType>(StoredVal->getType());
+ Type *StoredEltTy = StoredVecTy->getElementType();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(StoredEltTy));
+ NewSI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
+ {StoredVecTy, PtrTy, StrideTy},
+ {StoredVal, Addr, StrideVal, Mask, EVL});
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Store, Type::getVoidTy(EVL->getContext()),
+ {StoredVal, Addr}));
+ }
}
NewSI->addParamAttr(
1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
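The execute() changes above lower a consecutive stride -1 access to llvm.experimental.vp.strided.load / llvm.experimental.vp.strided.store with a constant byte stride of -1 * getTypeAllocSize(element type), passing either the runtime VF (non-EVL recipes) or the EVL as the operation length. As a purely illustrative aid, here is a small standalone C++ model of those lane semantics; ModelStridedLoad is a made-up name for this sketch, not an LLVM API:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Scalar model of a vp.strided.load: read EVL lanes starting at Base,
// advancing StrideBytes per lane (a negative stride walks backwards).
static std::vector<int32_t> ModelStridedLoad(const int32_t *Base,
                                             int64_t StrideBytes,
                                             unsigned EVL) {
  std::vector<int32_t> Lanes(EVL);
  const char *P = reinterpret_cast<const char *>(Base);
  for (unsigned L = 0; L < EVL; ++L)
    std::memcpy(&Lanes[L], P + static_cast<int64_t>(L) * StrideBytes,
                sizeof(int32_t));
  return Lanes;
}

int main() {
  int32_t B[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  // Reverse access over i32: the base points at the highest element touched,
  // the stride is -4 bytes, so lane k reads B[7 - k].
  std::vector<int32_t> Lanes = ModelStridedLoad(&B[7], -4, 4);
  assert(Lanes[0] == 7 && Lanes[1] == 6 && Lanes[2] == 5 && Lanes[3] == 4);
  return 0;
}

With the base at the highest element of the group and a stride of -4 bytes, lane k reads element hi - k, which is the same order the previous load-then-vector.reverse sequence produced.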
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7646350ca0ed2..25f07fa5a3669 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -73,13 +73,13 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenLoadRecipe(
*Load, Ingredient.getOperand(0), nullptr /*Mask*/,
- false /*Consecutive*/, false /*Reverse*/,
+ false /*Consecutive*/, false /*Reverse*/, false /*Strided*/,
Ingredient.getDebugLoc());
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenStoreRecipe(
*Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
- Ingredient.getDebugLoc());
+ false /*Strided*/, Ingredient.getDebugLoc());
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index f9a85869e3142..08b594f21ec21 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1084,7 +1084,7 @@ TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
- VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {});
+ VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, false, {});
EXPECT_TRUE(isa<VPUser>(&Recipe));
VPRecipeBase *BaseR = &Recipe;
EXPECT_TRUE(isa<VPUser>(BaseR));
@@ -1195,7 +1195,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
- VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {});
+ VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, false, {});
EXPECT_FALSE(Recipe.mayHaveSideEffects());
EXPECT_TRUE(Recipe.mayReadFromMemory());
EXPECT_FALSE(Recipe.mayWriteToMemory());
@@ -1209,7 +1209,8 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
VPValue *StoredV = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3));
- VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, {});
+ VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, false,
+ {});
EXPECT_TRUE(Recipe.mayHaveSideEffects());
EXPECT_FALSE(Recipe.mayReadFromMemory());
EXPECT_TRUE(Recipe.mayWriteToMemory());
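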
>From 8b284829466d69d7b9e740f975b389fe3486ee62 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 21 Feb 2025 02:38:30 -0800
Subject: [PATCH 06/11] Step 4: Connect the recipes and the cost model via InstWidening
---
.../Transforms/Vectorize/LoopVectorize.cpp | 21 ++++++++++++++++---
1 file changed, 18 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0b5de21dae6d3..0b9b9be249594 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1147,6 +1147,7 @@ class LoopVectorizationCostModel {
CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
+ CM_Strided,
CM_Scalarize,
CM_VectorCall,
CM_IntrinsicCall
@@ -6160,6 +6161,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
"Expected consecutive stride.");
InstWidening Decision =
ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+ // Consider using strided load/store for consecutive reverse accesses to
+ // achieve more efficient memory operations.
+ if (ConsecutiveStride == -1) {
+ const InstructionCost StridedLoadStoreCost =
+ isLegalStridedLoadStore(&I, VF) ? getStridedLoadStoreCost(&I, VF)
+ : InstructionCost::getInvalid();
+ if (StridedLoadStoreCost < Cost) {
+ Decision = CM_Strided;
+ Cost = StridedLoadStoreCost;
+ }
+ }
setWideningDecision(&I, VF, Decision, Cost);
continue;
}
@@ -6806,6 +6818,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return TTI::CastContextHint::Normal;
switch (getWideningDecision(I, VF)) {
+ // TODO: New CastContextHint for strided accesses.
+ case LoopVectorizationCostModel::CM_Strided:
case LoopVectorizationCostModel::CM_GatherScatter:
return TTI::CastContextHint::GatherScatter;
case LoopVectorizationCostModel::CM_Interleave:
@@ -8356,6 +8370,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
bool Consecutive =
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+ bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
if (Consecutive) {
@@ -8382,12 +8397,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
Ptr = VectorPtr;
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
- return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, false,
- I->getDebugLoc());
+ return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
+ Strided, I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, false, I->getDebugLoc());
+ Reverse, Strided, I->getDebugLoc());
}
/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
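The hook above only switches the widening decision to CM_Strided when the strided form is legal and strictly cheaper than the reverse-widened form; the check updates later in this series show the estimated cost of the reverse i32 load/store dropping from 13 to 8 once that happens. A minimal standalone sketch of the selection, using those two numbers as stand-ins for the TTI-derived costs (hypothetical values, not the real isLegalStridedLoadStore / getStridedLoadStoreCost results):

#include <iostream>
#include <limits>
#include <string>

int main() {
  // Stand-ins for the values the cost model would compute for one access.
  const long ReverseCost = 13;     // hypothetical CM_Widen_Reverse cost
  const bool StridedLegal = true;  // stands in for isLegalStridedLoadStore
  const long StridedCost = StridedLegal
                               ? 8  // hypothetical strided load/store cost
                               : std::numeric_limits<long>::max();

  std::string Decision = "CM_Widen_Reverse";
  long Cost = ReverseCost;
  if (StridedCost < Cost) {  // only switch when strictly cheaper
    Decision = "CM_Strided";
    Cost = StridedCost;
  }
  std::cout << Decision << " (cost " << Cost << ")\n";
  return 0;
}

Keeping the comparison strict means ties fall back to CM_Widen_Reverse, so targets without a cheaper strided form are unaffected.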
>From f155f9115814a757334a060575066cbae1ddc16d Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 25 Feb 2025 23:26:03 -0800
Subject: [PATCH 07/11] Opt: Remove dependency on VPWidenIntOrFpInductionRecipe
and extend VPVectorPointerRecipe
---
.../Transforms/Vectorize/LoopVectorize.cpp | 20 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 11 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 +-
.../RISCV/riscv-vector-reverse-output.ll | 194 ++++++++----------
...-force-tail-with-evl-reverse-load-store.ll | 42 +---
...orize-force-tail-with-evl-uniform-store.ll | 9 +-
6 files changed, 119 insertions(+), 161 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0b9b9be249594..92c69f7a6d16d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3621,9 +3621,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (IsUniformMemOpUse(I))
return true;
- return (WideningDecision == CM_Widen ||
- WideningDecision == CM_Widen_Reverse ||
- WideningDecision == CM_Interleave);
+ return (
+ WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+ WideningDecision == CM_Strided || WideningDecision == CM_Interleave);
};
// Returns true if Ptr is the pointer operand of a memory access instruction
@@ -8367,17 +8367,27 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
// reverse consecutive.
LoopVectorizationCostModel::InstWidening Decision =
CM.getWideningDecision(I, Range.Start);
+
+ auto SameWiden = [&](ElementCount VF) -> bool {
+ return Decision == CM.getWideningDecision(I, VF);
+ };
+ bool ContainsWidenVF =
+ LoopVectorizationPlanner::getDecisionAndClampRange(SameWiden, Range);
+ assert(ContainsWidenVF &&
+ "At least widen the memory accesses by the Start VF.");
+
bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
bool Consecutive =
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
- if (Consecutive) {
+ if (Consecutive || Strided) {
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
VPSingleDefRecipe *VectorPtr;
if (Reverse) {
+ assert(!Strided && "Reverse and Strided are mutually exclusive.");
// When folding the tail, we may compute an address that we don't in the
// original scalar loop and it may not be inbounds. Drop Inbounds in that
// case.
@@ -8388,7 +8398,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VectorPtr = new VPReverseVectorPointerRecipe(
Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
} else {
- VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
+ VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), Strided,
GEP ? GEP->getNoWrapFlags()
: GEPNoWrapFlags::none(),
I->getDebugLoc());
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 60a90893ff7d4..cc36f4a56af51 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1571,12 +1571,15 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<1> {
Type *IndexedTy;
+ /// Indicate whether to compute the pointer for strided memory accesses.
+ bool Strided;
+
public:
- VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags,
- DebugLoc DL)
+ VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool Strided,
+ GEPNoWrapFlags GEPFlags, DebugLoc DL)
: VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
GEPFlags, DL),
- IndexedTy(IndexedTy) {}
+ IndexedTy(IndexedTy), Strided(Strided) {}
VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
@@ -1597,7 +1600,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
}
VPVectorPointerRecipe *clone() override {
- return new VPVectorPointerRecipe(getOperand(0), IndexedTy,
+ return new VPVectorPointerRecipe(getOperand(0), IndexedTy, Strided,
getGEPNoWrapFlags(), getDebugLoc());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 85daeaff035e6..5060de04b266d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2125,7 +2125,9 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) {
CurrentPart, Builder);
Value *Ptr = State.get(getOperand(0), VPLane(0));
- Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
+ // TODO: Support non-unit-reverse strided accesses.
+ int64_t Step = Strided ? -1 * CurrentPart : CurrentPart;
+ Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Step);
Value *ResultPtr =
Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
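With the Strided flag threaded into VPVectorPointerRecipe, the per-part base pointer for a unit-reverse access steps backwards: part k should start VF elements below the scalar address rather than above it. A standalone sketch of the offsets this is meant to produce for i32 with UF = 2 (vscale pinned to 1 here only to make the numbers concrete; not the recipe code itself):

#include <cstdint>
#include <iostream>

int main() {
  const int64_t VScale = 1;           // assumed for illustration
  const int64_t VFElems = VScale * 4; // vscale x 4 lanes
  const int64_t EltBytes = 4;         // i32
  const bool Strided = true;          // unit-reverse access

  for (int64_t Part = 0; Part < 2; ++Part) { // UF == 2
    int64_t StepElems = VFElems * Part;
    int64_t IndexElems = Strided ? -StepElems : StepElems;
    std::cout << "part " << Part << ": GEP index " << IndexElems
              << " elements (" << IndexElems * EltBytes << " bytes)\n";
  }
  return 0;
}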
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index a15a8342154ff..96828b644ffb7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -62,20 +62,16 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1
; RV64-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
; RV64-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP19]]
-; RV64-NEXT: [[TMP21:%.*]] = mul i64 0, [[TMP14]]
-; RV64-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP14]]
-; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i64 [[TMP21]]
-; RV64-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]]
-; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP24]], align 4
-; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV64-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 0
+; RV64-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-NEXT: [[TMP23:%.*]] = mul i32 [[TMP22]], 4
+; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP21]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP23]])
; RV64-NEXT: [[TMP25:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV64-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]]
-; RV64-NEXT: [[TMP27:%.*]] = mul i64 0, [[TMP14]]
-; RV64-NEXT: [[TMP28:%.*]] = sub i64 1, [[TMP14]]
-; RV64-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP27]]
-; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i64 [[TMP28]]
-; RV64-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP25]])
-; RV64-NEXT: store <vscale x 4 x i32> [[REVERSE2]], ptr [[TMP30]], align 4
+; RV64-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0
+; RV64-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], 4
+; RV64-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP25]], ptr align 4 [[TMP29]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP28]])
; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
; RV64-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; RV64-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -84,7 +80,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
; RV64: [[SCALAR_PH]]:
; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
-; RV64-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
; RV64-NEXT: br label %[[FOR_BODY:.*]]
; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]:
; RV64-NEXT: br label %[[FOR_COND_CLEANUP]]
@@ -92,7 +88,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-NEXT: ret void
; RV64: [[FOR_BODY]]:
; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
; RV64-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1
; RV64-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]]
@@ -134,22 +130,16 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV32-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1
; RV32-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]]
-; RV32-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32
-; RV32-NEXT: [[TMP14:%.*]] = mul i32 0, [[TMP13]]
-; RV32-NEXT: [[TMP15:%.*]] = sub i32 1, [[TMP13]]
-; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[TMP14]]
-; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP15]]
-; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP17]], align 4
-; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; RV32-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; RV32-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP13]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
; RV32-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
-; RV32-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP6]] to i32
-; RV32-NEXT: [[TMP21:%.*]] = mul i32 0, [[TMP20]]
-; RV32-NEXT: [[TMP22:%.*]] = sub i32 1, [[TMP20]]
-; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 [[TMP21]]
-; RV32-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 [[TMP22]]
-; RV32-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
-; RV32-NEXT: store <vscale x 4 x i32> [[REVERSE2]], ptr [[TMP24]], align 4
+; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; RV32-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
+; RV32-NEXT: [[TMP20:%.*]] = mul i32 [[TMP22]], 4
+; RV32-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP21]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; RV32-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; RV32-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -158,7 +148,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
; RV32: [[SCALAR_PH]]:
; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ]
-; RV32-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
+; RV32-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
; RV32-NEXT: br label %[[FOR_BODY:.*]]
; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]:
; RV32-NEXT: br label %[[FOR_COND_CLEANUP]]
@@ -166,7 +156,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV32-NEXT: ret void
; RV32: [[FOR_BODY]]:
; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV32-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
; RV32-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1
; RV32-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]]
@@ -208,8 +198,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]]
; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; RV64-UF2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 8
; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
; RV64-UF2-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
; RV64-UF2-NEXT: [[TMP17:%.*]] = sub i32 [[N]], [[DOTCAST]]
@@ -222,33 +211,29 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-UF2-NEXT: [[TMP19:%.*]] = add nsw i32 [[TMP18]], -1
; RV64-UF2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP20]]
-; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP14]]
-; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP14]]
-; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]]
-; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
-; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP14]]
-; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
+; RV64-UF2-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 [[TMP22]], 4294967292
; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]]
-; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]]
-; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP25]], align 4
-; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
-; RV64-UF2-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i32>, ptr [[TMP29]], align 4
-; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD2]])
+; RV64-UF2-NEXT: [[TMP25:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP29:%.*]] = mul i32 [[TMP25]], 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP23]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP29]])
+; RV64-UF2-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP34:%.*]] = mul i32 [[TMP27]], 4
+; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP28]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP34]])
; RV64-UF2-NEXT: [[TMP30:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV64-UF2-NEXT: [[TMP31:%.*]] = add <vscale x 4 x i32> [[REVERSE3]], splat (i32 1)
; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
-; RV64-UF2-NEXT: [[TMP33:%.*]] = mul i64 0, [[TMP14]]
-; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 1, [[TMP14]]
-; RV64-UF2-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i64 [[TMP33]]
-; RV64-UF2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP35]], i64 [[TMP34]]
-; RV64-UF2-NEXT: [[TMP37:%.*]] = mul i64 -1, [[TMP14]]
-; RV64-UF2-NEXT: [[TMP38:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0
+; RV64-UF2-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP37:%.*]] = mul i64 [[TMP33]], 4294967292
; RV64-UF2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i64 [[TMP37]]
-; RV64-UF2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i64 [[TMP38]]
-; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP30]])
-; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP36]], align 4
-; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP31]])
-; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE5]], ptr [[TMP40]], align 4
+; RV64-UF2-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP40:%.*]] = mul i32 [[TMP36]], 4
+; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP30]], ptr align 4 [[TMP35]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP40]])
+; RV64-UF2-NEXT: [[TMP38:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP43:%.*]] = mul i32 [[TMP38]], 4
+; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP31]], ptr align 4 [[TMP39]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]])
; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
; RV64-UF2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; RV64-UF2-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -257,7 +242,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
; RV64-UF2: [[SCALAR_PH]]:
; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
-; RV64-UF2-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-UF2-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
; RV64-UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]:
; RV64-UF2-NEXT: br label %[[FOR_COND_CLEANUP]]
@@ -265,7 +250,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-UF2-NEXT: ret void
; RV64-UF2: [[FOR_BODY]]:
; RV64-UF2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-UF2-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
; RV64-UF2-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1
; RV64-UF2-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
; RV64-UF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]]
@@ -347,20 +332,16 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1
; RV64-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
; RV64-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]]
-; RV64-NEXT: [[TMP21:%.*]] = mul i64 0, [[TMP14]]
-; RV64-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP14]]
-; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]]
-; RV64-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP22]]
-; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP24]], align 4
-; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV64-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
+; RV64-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-NEXT: [[TMP23:%.*]] = mul i32 [[TMP22]], 4
+; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP21]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP23]])
; RV64-NEXT: [[TMP25:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV64-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
-; RV64-NEXT: [[TMP27:%.*]] = mul i64 0, [[TMP14]]
-; RV64-NEXT: [[TMP28:%.*]] = sub i64 1, [[TMP14]]
-; RV64-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP27]]
-; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP28]]
-; RV64-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP25]])
-; RV64-NEXT: store <vscale x 4 x float> [[REVERSE2]], ptr [[TMP30]], align 4
+; RV64-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i32 0
+; RV64-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], 4
+; RV64-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP25]], ptr align 4 [[TMP29]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP28]])
; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
; RV64-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; RV64-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -369,7 +350,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
; RV64: [[SCALAR_PH]]:
; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
-; RV64-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
; RV64-NEXT: br label %[[FOR_BODY:.*]]
; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]:
; RV64-NEXT: br label %[[FOR_COND_CLEANUP]]
@@ -377,7 +358,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-NEXT: ret void
; RV64: [[FOR_BODY]]:
; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
; RV64-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1
; RV64-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]]
@@ -419,22 +400,16 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV32-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1
; RV32-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
-; RV32-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32
-; RV32-NEXT: [[TMP14:%.*]] = mul i32 0, [[TMP13]]
-; RV32-NEXT: [[TMP15:%.*]] = sub i32 1, [[TMP13]]
-; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 [[TMP14]]
-; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP15]]
-; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
-; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
+; RV32-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; RV32-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i32(ptr align 4 [[TMP13]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
; RV32-NEXT: [[TMP18:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
-; RV32-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP6]] to i32
-; RV32-NEXT: [[TMP21:%.*]] = mul i32 0, [[TMP20]]
-; RV32-NEXT: [[TMP22:%.*]] = sub i32 1, [[TMP20]]
-; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i32 [[TMP21]]
-; RV32-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP22]]
-; RV32-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP18]])
-; RV32-NEXT: store <vscale x 4 x float> [[REVERSE2]], ptr [[TMP24]], align 4
+; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i32 0
+; RV32-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
+; RV32-NEXT: [[TMP20:%.*]] = mul i32 [[TMP22]], 4
+; RV32-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i32(<vscale x 4 x float> [[TMP18]], ptr align 4 [[TMP21]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; RV32-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; RV32-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -443,7 +418,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
; RV32: [[SCALAR_PH]]:
; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ]
-; RV32-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
+; RV32-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
; RV32-NEXT: br label %[[FOR_BODY:.*]]
; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]:
; RV32-NEXT: br label %[[FOR_COND_CLEANUP]]
@@ -451,7 +426,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV32-NEXT: ret void
; RV32: [[FOR_BODY]]:
; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV32-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
; RV32-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1
; RV32-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]]
@@ -493,8 +468,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]]
; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; RV64-UF2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 8
; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
; RV64-UF2-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
; RV64-UF2-NEXT: [[TMP17:%.*]] = sub i32 [[N]], [[DOTCAST]]
@@ -507,33 +481,29 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-UF2-NEXT: [[TMP19:%.*]] = add nsw i32 [[TMP18]], -1
; RV64-UF2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP20]]
-; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP14]]
-; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP14]]
-; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]]
-; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]]
-; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP14]]
-; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 0
+; RV64-UF2-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 [[TMP22]], 4294967292
; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]]
-; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]]
-; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP25]], align 4
-; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
-; RV64-UF2-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP29]], align 4
-; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD2]])
+; RV64-UF2-NEXT: [[TMP25:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP29:%.*]] = mul i32 [[TMP25]], 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP23]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP29]])
+; RV64-UF2-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP34:%.*]] = mul i32 [[TMP27]], 4
+; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP28]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP34]])
; RV64-UF2-NEXT: [[TMP30:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV64-UF2-NEXT: [[TMP31:%.*]] = fadd <vscale x 4 x float> [[REVERSE3]], splat (float 1.000000e+00)
; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]]
-; RV64-UF2-NEXT: [[TMP33:%.*]] = mul i64 0, [[TMP14]]
-; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 1, [[TMP14]]
-; RV64-UF2-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i64 [[TMP33]]
-; RV64-UF2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP35]], i64 [[TMP34]]
-; RV64-UF2-NEXT: [[TMP37:%.*]] = mul i64 -1, [[TMP14]]
-; RV64-UF2-NEXT: [[TMP38:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i32 0
+; RV64-UF2-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP37:%.*]] = mul i64 [[TMP33]], 4294967292
; RV64-UF2-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i64 [[TMP37]]
-; RV64-UF2-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP39]], i64 [[TMP38]]
-; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP30]])
-; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP36]], align 4
-; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP31]])
-; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE5]], ptr [[TMP40]], align 4
+; RV64-UF2-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP40:%.*]] = mul i32 [[TMP36]], 4
+; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP30]], ptr align 4 [[TMP35]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP40]])
+; RV64-UF2-NEXT: [[TMP38:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP43:%.*]] = mul i32 [[TMP38]], 4
+; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP31]], ptr align 4 [[TMP39]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]])
; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
; RV64-UF2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; RV64-UF2-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -542,7 +512,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
; RV64-UF2: [[SCALAR_PH]]:
; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
-; RV64-UF2-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-UF2-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
; RV64-UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]:
; RV64-UF2-NEXT: br label %[[FOR_COND_CLEANUP]]
@@ -550,7 +520,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-UF2-NEXT: ret void
; RV64-UF2: [[FOR_BODY]]:
; RV64-UF2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-UF2-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
; RV64-UF2-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1
; RV64-UF2-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
; RV64-UF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 5b579b0749c67..1ca50a3f4bf17 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -34,21 +34,11 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], -1
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP11]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]]
-; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
-; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], ptr align 4 [[TMP14]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -147,24 +137,12 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
; IF-EVL-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]]
-; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP27]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
+; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP19]], i64 -4, <vscale x 4 x i1> [[TMP15]], i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 0
+; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], ptr align 4 [[TMP21]], i64 -4, <vscale x 4 x i1> [[TMP15]], i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 0829bab26f062..f1286be0ff448 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -38,13 +38,8 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP13:%.*]] = sub nuw nsw i64 1, [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP15]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP14]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP19]], i64 [[TMP18]]
-; CHECK-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP20]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[TMP14]], i32 0
+; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i64(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[TMP15]], i64 -8, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
>From 9da07d13c3bdc96069e46b48a8023e922d78b3b4 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 7 Mar 2025 15:43:20 +0800
Subject: [PATCH 08/11] Update llvm/lib/Transforms/Vectorize/VPlan.h
Initialize Strided to false by default.
Co-authored-by: Alexey Bataev <a.bataev at gmx.com>
---
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index cc36f4a56af51..cd0ca442d7635 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2574,7 +2574,7 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
bool Reverse;
/// Whether the accessed addresses are evenly spaced apart by a fixed stride.
- bool Strided;
+ bool Strided = false;
/// Whether the memory access is masked.
bool IsMasked = false;
>From 96d62b3bee9360ec47c6ee82d94c37d2cdc70269 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 7 Mar 2025 15:45:02 +0800
Subject: [PATCH 09/11] Update llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Remove unnecessary else
Co-authored-by: Alexey Bataev <a.bataev at gmx.com>
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 5060de04b266d..be2f4854af8a9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2572,11 +2572,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr,
IsMasked, Alignment, Ctx.CostKind,
&Ingredient);
- else
- return Ctx.TTI.getAddressComputationCost(Ty) +
- Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
- IsMasked, Alignment, Ctx.CostKind,
- &Ingredient);
+
+ return Ctx.TTI.getAddressComputationCost(Ty) +
+ Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
+ IsMasked, Alignment, Ctx.CostKind,
+ &Ingredient);
}
InstructionCost Cost = 0;
>From bd170770b62ad62fef5b92501ed60baf06a67469 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 7 Mar 2025 00:03:42 -0800
Subject: [PATCH 10/11] Fix: Correct the vector pointer when UF>1
---
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 +-
.../RISCV/riscv-vector-reverse-output.ll | 12 +-
.../RISCV/riscv-vector-reverse.ll | 104 +++++++++---------
3 files changed, 64 insertions(+), 61 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index be2f4854af8a9..39f5221801ad5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2125,11 +2125,14 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) {
CurrentPart, Builder);
Value *Ptr = State.get(getOperand(0), VPLane(0));
+ Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
// TODO: Support non-unit-reverse strided accesses.
- int64_t Step = Strided ? -1 * CurrentPart : CurrentPart;
- Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Step);
+ Value *Index =
+ Strided
+ ? Builder.CreateMul(Increment, ConstantInt::getSigned(IndexTy, -1))
+ : Increment;
Value *ResultPtr =
- Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
+ Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
State.set(this, ResultPtr, /*IsScalar*/ true);
}
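The fix above builds the increment as VF * Part in the pointer's index type first and only then multiplies by -1, which matches the mul i64 ..., 4 followed by mul i64 ..., -1 sequence in the updated checks. A standalone sketch of the corrected arithmetic, plus a guess (an assumption, not verified against the old code path) at where the earlier 4294967292 constant came from:

#include <cstdint>
#include <iostream>

// Corrected per-part pointer step: form VF * Part first (createStepForVF),
// then negate it in the same 64-bit index type.
int main() {
  const int64_t VScale = 2; // any runtime vscale
  const int64_t MinVF = 4;  // vscale x 4
  const int64_t Part = 1;   // second unrolled part

  int64_t Increment = VScale * MinVF * Part; // mul i64 %vscale, 4
  int64_t Index = Increment * -1;            // mul i64 %inc, -1
  std::cout << "part " << Part << " index: " << Index << " elements\n";

  // For comparison: if the -4 step were formed in 32 bits and zero-extended,
  // the constant would print as 4294967292 -- the value visible in the
  // pre-fix check lines (an assumption about its origin, not verified).
  uint64_t ZExtStep = static_cast<uint32_t>(-MinVF);
  std::cout << "zero-extended 32-bit step: " << ZExtStep << '\n';
  return 0;
}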
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index 96828b644ffb7..eb75cf969f1b1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -213,7 +213,8 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP20]]
; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
; RV64-UF2-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 [[TMP22]], 4294967292
+; RV64-UF2-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], 4
+; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 [[TMP24]], -1
; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]]
; RV64-UF2-NEXT: [[TMP25:%.*]] = call i32 @llvm.vscale.i32()
; RV64-UF2-NEXT: [[TMP29:%.*]] = mul i32 [[TMP25]], 4
@@ -226,7 +227,8 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
; RV64-UF2-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0
; RV64-UF2-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP37:%.*]] = mul i64 [[TMP33]], 4294967292
+; RV64-UF2-NEXT: [[TMP44:%.*]] = mul i64 [[TMP33]], 4
+; RV64-UF2-NEXT: [[TMP37:%.*]] = mul i64 [[TMP44]], -1
; RV64-UF2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i64 [[TMP37]]
; RV64-UF2-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
; RV64-UF2-NEXT: [[TMP40:%.*]] = mul i32 [[TMP36]], 4
@@ -483,7 +485,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP20]]
; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 0
; RV64-UF2-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 [[TMP22]], 4294967292
+; RV64-UF2-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], 4
+; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 [[TMP24]], -1
; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]]
; RV64-UF2-NEXT: [[TMP25:%.*]] = call i32 @llvm.vscale.i32()
; RV64-UF2-NEXT: [[TMP29:%.*]] = mul i32 [[TMP25]], 4
@@ -496,7 +499,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]]
; RV64-UF2-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i32 0
; RV64-UF2-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP37:%.*]] = mul i64 [[TMP33]], 4294967292
+; RV64-UF2-NEXT: [[TMP44:%.*]] = mul i64 [[TMP33]], 4
+; RV64-UF2-NEXT: [[TMP37:%.*]] = mul i64 [[TMP44]], -1
; RV64-UF2-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i64 [[TMP37]]
; RV64-UF2-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
; RV64-UF2-NEXT: [[TMP40:%.*]] = mul i32 [[TMP36]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 4e862bf2f7480..c5f4009157a33 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -37,10 +37,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -53,7 +53,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
@@ -73,15 +72,15 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx>, vp<[[VF]]>
-; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, vp<[[VF]]>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -112,10 +111,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -139,7 +138,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop cost is 32
+; CHECK-NEXT: LV: Loop cost is 22
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
; CHECK-NEXT: LV: Not Interleaving.
@@ -148,8 +147,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
+; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]> = VF * UF
; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
; CHECK-EMPTY:
@@ -196,16 +194,16 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%add9> = add ir<[[L]]>, ir<1>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -286,10 +284,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -302,7 +300,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
@@ -322,15 +319,15 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx>, vp<[[VF]]>
-; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, vp<[[VF]]>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -361,10 +358,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -388,7 +385,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop cost is 34
+; CHECK-NEXT: LV: Loop cost is 24
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
; CHECK-NEXT: LV: Not Interleaving.
@@ -397,8 +394,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
+; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]> = VF * UF
; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
; CHECK-EMPTY:
@@ -445,16 +441,16 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%conv1>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
>From c54369eefd5c08e4d9dd7b4ae40b510e5881938c Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 10 Mar 2025 01:27:34 -0700
Subject: [PATCH 11/11] NFC: Add const, and check stride legality in isLegalStridedLoadStore.
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 92c69f7a6d16d..268b64d98683e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1315,10 +1315,14 @@ class LoopVectorizationCostModel {
/// Returns true if the target machine can represent \p V as a strided load
/// or store operation.
- bool isLegalStridedLoadStore(Value *V, ElementCount VF) {
+ bool isLegalStridedLoadStore(Value *V, ElementCount VF) const {
if (!isa<LoadInst, StoreInst>(V))
return false;
auto *Ty = getLoadStoreType(V);
+ Value *Ptr = getLoadStorePointerOperand(V);
+ // TODO: Support non-unit-reverse strided accesses.
+ if (Legal->isConsecutivePtr(Ty, Ptr) != -1)
+ return false;
Align Align = getLoadStoreAlignment(V);
if (VF.isVector())
Ty = VectorType::get(Ty, VF);
@@ -1659,7 +1663,8 @@ class LoopVectorizationCostModel {
InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
/// The cost computation for strided load/store instruction.
- InstructionCost getStridedLoadStoreCost(Instruction *I, ElementCount VF);
+ InstructionCost getStridedLoadStoreCost(Instruction *I,
+ ElementCount VF) const;
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
@@ -5831,7 +5836,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
InstructionCost
LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
- ElementCount VF) {
+ ElementCount VF) const {
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
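As a rough illustration of the guard added to isLegalStridedLoadStore in this patch, the hypothetical helper below (its name and parameters are not LLVM API) mirrors the shape of the check: for now only accesses whose pointer is consecutive with stride -1, i.e. unit-reverse accesses, are handed to the strided load/store lowering, and everything else falls back to the existing paths until non-unit strides are supported.

// Hypothetical sketch, not part of the patch. `ConsecutiveStride` stands in
// for the result of a consecutive-pointer legality query
// (1 = forward unit stride, -1 = reverse unit stride, 0 = not consecutive).
bool isCandidateForStridedLowering(bool IsLoadOrStore, int ConsecutiveStride) {
  if (!IsLoadOrStore)
    return false;
  // TODO (from the patch): support non-unit-reverse strided accesses.
  return ConsecutiveStride == -1;
}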