[llvm] [LV] Implement SWAR loop vectorization (PR #69306)
Sergey Kachkov via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 17 02:15:28 PDT 2023
https://github.com/skachkov-sc created https://github.com/llvm/llvm-project/pull/69306
Implement "SIMD within a register" (SWAR) loop vectorization. This technique can vectorize some loops on targets without vector registers. Currently supported instructions are:
1. Consecutive loads/stores
2. Bitwise operations (add/sub)
3. Shifts (shl, lshr) with constant 2nd operand
4. Addition/Subtraction
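To give a feel for the transformation, here is a hand-written C++ sketch (not code from this patch; the function name and values are made up) of the bytewise SWAR addition that the new cost-model and recipe comments describe, assuming eight i8 lanes packed into one 64-bit scalar register:

#include <cstdint>
#include <cstdio>

// Bytewise SWAR add of eight u8 lanes packed into a 64-bit register,
// following the formula quoted in the patch comments:
//   Add: ((LHS & ~Mask) + (RHS & ~Mask)) ^ ((LHS ^ RHS) & Mask)
// where Mask has the MSB of every lane set (0x8080808080808080).
static uint64_t swar_add_u8x8(uint64_t lhs, uint64_t rhs) {
  const uint64_t Mask = 0x8080808080808080ULL;
  // Add the low 7 bits of each lane; with the lane MSBs cleared, a carry
  // can never cross a lane boundary.
  uint64_t Low = (lhs & ~Mask) + (rhs & ~Mask);
  // Each lane MSB of Low holds the carry into bit 7; XOR-ing the input
  // MSBs back in completes the per-lane sum modulo 256.
  return Low ^ ((lhs ^ rhs) & Mask);
}

int main() {
  // 0xFF + 0x01 wraps to 0x00 in its own lane without disturbing the
  // neighbouring lanes.
  printf("%016llx\n", (unsigned long long)swar_add_u8x8(
                          0xFF01FF01FF01FF01ULL, 0x0101010101010101ULL));
  return 0;
}

The VPSWARRecipe added in the second patch emits the equivalent masked operations as scalar IR, plus analogous lowerings for sub and for shifts by a constant.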
>From 03e46759124b526460bf2f6a10655577d4a96876 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Fri, 1 Sep 2023 15:28:18 +0300
Subject: [PATCH 1/2] [LV][NFC] Add pre-commit tests for SWAR vectorization
---
.../LoopVectorize/swar-vectorization.ll | 618 ++++++++++++++++++
1 file changed, 618 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/swar-vectorization.ll
diff --git a/llvm/test/Transforms/LoopVectorize/swar-vectorization.ll b/llvm/test/Transforms/LoopVectorize/swar-vectorization.ll
new file mode 100644
index 000000000000000..d3640dfe5d439d2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/swar-vectorization.ll
@@ -0,0 +1,618 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -S 2>%t | FileCheck %s
+
+; Tests for SWAR (SIMD within a register) vectorization
+
+define void @test1(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[TMP0]], ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+ %cmp6 = icmp sgt i32 %N, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+ store i8 %0, ptr %arrayidx2, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test2(ptr writeonly %dst1, ptr writeonly %dst2, ptr readonly %src, i32 signext %N) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP13]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[TMP0]], ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[TMP0]], ptr [[ARRAYIDX6]], align 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+ %cmp13 = icmp sgt i32 %N, 0
+ br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %dst1, i64 %indvars.iv
+ store i8 %0, ptr %arrayidx2, align 1
+ %arrayidx6 = getelementptr inbounds i8, ptr %dst2, i64 %indvars.iv
+ store i8 %0, ptr %arrayidx6, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test3(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], 66
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+ %cmp7 = icmp sgt i32 %N, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = and i8 %0, 66
+ %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+ store i8 %1, ptr %arrayidx3, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test4(ptr writeonly %dst, ptr readonly %src1, i8 zeroext %src2, i32 noundef signext %N) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[OR8:%.*]] = or i8 [[TMP0]], [[SRC2:%.*]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[OR8]], ptr [[ARRAYIDX4]], align 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+ %cmp9 = icmp sgt i32 %N, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src1, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %or8 = or i8 %0, %src2
+ %arrayidx4 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+ store i8 %or8, ptr %arrayidx4, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test5(ptr writeonly %dst, ptr readonly %src1, ptr readonly %src2, i32 signext %N) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP12]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[XOR11:%.*]] = xor i8 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[XOR11]], ptr [[ARRAYIDX6]], align 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+ %cmp12 = icmp sgt i32 %N, 0
+ br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src1, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %src2, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx2, align 1
+ %xor11 = xor i8 %1, %0
+ %arrayidx6 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+ store i8 %xor11, ptr %arrayidx6, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test6(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[TMP0]], 1
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[SHL]], ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+ %cmp7 = icmp sgt i32 %N, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %shl = shl i8 %0, 1
+ %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+ store i8 %shl, ptr %arrayidx3, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test7(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i8 [[TMP0]], 2
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+ %cmp7 = icmp sgt i32 %N, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = lshr i8 %0, 2
+ %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+ store i8 %1, ptr %arrayidx3, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test8(ptr writeonly %dst, ptr readonly %src1, ptr readonly %src2, i32 signext %N) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX6]], align 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+ %cmp11 = icmp sgt i32 %N, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src1, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %src2, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx2, align 1
+ %add = add i8 %1, %0
+ %arrayidx6 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+ store i8 %add, ptr %arrayidx6, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test9(ptr writeonly %dst, ptr readonly %src1, ptr readonly %src2, i32 signext %N) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[SUB]], ptr [[ARRAYIDX6]], align 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+ %cmp11 = icmp sgt i32 %N, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src1, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %src2, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx2, align 1
+ %sub = sub i8 %0, %1
+ %arrayidx6 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+ store i8 %sub, ptr %arrayidx6, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define zeroext i8 @test_reduction_or(ptr readonly %src, i32 signext %N) {
+; CHECK-LABEL: @test_reduction_or(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i8 [ [[OR:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[OR_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: ret i8 [[RES_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i8 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[OR]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[OR]] = or i8 [[TMP0]], [[RES]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp7 = icmp sgt i32 %N, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ %or.lcssa = phi i8 [ %or, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %res.lcssa = phi i8 [ 0, %entry ], [ %or.lcssa, %for.cond.cleanup.loopexit ]
+ ret i8 %res.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %res = phi i8 [ 0, %for.body.preheader ], [ %or, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %or = or i8 %0, %res
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define zeroext i8 @test_reduction_add(ptr readonly %src, i32 signext %N) {
+; CHECK-LABEL: @test_reduction_add(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i8 [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: ret i8 [[RES_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i8 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ADD]] = or i8 [[TMP0]], [[RES]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp7 = icmp sgt i32 %N, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ %add.lcssa = phi i8 [ %add, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %res.lcssa = phi i8 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+ ret i8 %res.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %res = phi i8 [ 0, %for.body.preheader ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %add = or i8 %0, %res
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test_negative(ptr writeonly %dst, ptr readonly %src1, ptr readonly %src2, ptr readonly %src3, i32 signext %N) {
+; CHECK-LABEL: @test_negative(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[SRC3:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP2]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX9]], align 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+ %cmp15 = icmp sgt i32 %N, 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src1, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %src2, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx2, align 1
+ %mul = mul i8 %1, %0
+ %arrayidx5 = getelementptr inbounds i8, ptr %src3, i64 %indvars.iv
+ %2 = load i8, ptr %arrayidx5, align 1
+ %add = add i8 %mul, %2
+ %arrayidx9 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+ store i8 %add, ptr %arrayidx9, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
>From bf50cde05f9ed8c00d55c9f62f391afaa0da4b97 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Mon, 28 Aug 2023 17:26:53 +0300
Subject: [PATCH 2/2] [LV] Implement SWAR loop vectorization
Implement "SIMD within a register" (SWAR) loop vectorization. This
technique can vectorize some loops on targets without vector registers.
Currently supported instructions are (a usage example follows the list):
1. Consecutive loads/stores
2. Bitwise operations (and, or, xor)
3. Shifts (shl, lshr) with a constant second operand
4. Addition/Subtraction
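With the hidden, off-by-default -enable-swar-vectorization flag added below, the feature can be tried directly; this mirrors the updated RUN line in the test diff, with a placeholder input file name:

  opt -passes=loop-vectorize -enable-swar-vectorization -mtriple riscv64 -S input.ll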
---
.../Transforms/Vectorize/LoopVectorize.cpp | 240 +++++-
.../Transforms/Vectorize/VPRecipeBuilder.h | 4 +
llvm/lib/Transforms/Vectorize/VPlan.h | 72 ++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 122 +++
llvm/lib/Transforms/Vectorize/VPlanValue.h | 3 +
.../LoopVectorize/swar-vectorization.ll | 694 ++++++++++++++++--
6 files changed, 1037 insertions(+), 98 deletions(-)
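Before the diffs, one more hand-written illustration (in addition to the add sketch in the PR description; again not code from the patch, names invented): the masked shift described in the SWARBinOp and cost-model comments below. A plain scalar shift would smear bits across lane boundaries, so the result is masked with a splat of the lane-wise all-ones value shifted by the same amount (e.g. 0xfcfcfcfcfcfcfcfc for a bytewise shl by 2):

#include <cstdint>

// Bytewise SWAR shl: (LHS << ShiftAmnt) & Mask, per the patch comments.
uint64_t swar_shl_u8x8(uint64_t lhs, unsigned shiftAmnt) {
  uint64_t LaneMask = (0xFFull << shiftAmnt) & 0xFFull;  // e.g. 0xFC for 2
  uint64_t Mask = LaneMask * 0x0101010101010101ULL;      // splat to 8 lanes
  return (lhs << shiftAmnt) & Mask;  // drop bits shifted across lane edges
}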
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2ca7e75f97f0f02..153750035b7b045 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -172,6 +172,10 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
+static cl::opt<bool> EnableSWARVectorization(
+ "enable-swar-vectorization", cl::init(false), cl::Hidden,
+ cl::desc("Enable SWAR (SIMD within a register) vectorization"));
+
static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
cl::desc("Enable vectorization of epilogue loops."));
@@ -1203,10 +1207,10 @@ class LoopVectorizationCostModel {
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, const Function *F,
const LoopVectorizeHints *Hints,
- InterleavedAccessInfo &IAI)
+ InterleavedAccessInfo &IAI, bool UseSWAR)
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
- Hints(Hints), InterleaveInfo(IAI) {}
+ Hints(Hints), InterleaveInfo(IAI), UseSWAR(UseSWAR) {}
/// \return An upper bound for the vectorization factors (both fixed and
/// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1712,6 +1716,9 @@ class LoopVectorizationCostModel {
/// of elements.
ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
+ /// Calculate cost of SWAR instruction.
+ InstructionCost getSWARInstructionCost(Instruction *I, unsigned VF);
+
/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
@@ -1921,6 +1928,9 @@ class LoopVectorizationCostModel {
/// All element types found in the loop.
SmallPtrSet<Type *, 16> ElementTypesInLoop;
+
+ /// Use SWAR vectorization mode.
+ const bool UseSWAR;
};
} // end namespace llvm
@@ -5071,9 +5081,11 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
ElementCount MaxSafeVF, bool FoldTailByMasking) {
bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
- const TypeSize WidestRegister = TTI.getRegisterBitWidth(
+ const TargetTransformInfo::RegisterKind RegKind =
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
- : TargetTransformInfo::RGK_FixedWidthVector);
+ : UseSWAR ? TargetTransformInfo::RGK_Scalar
+ : TargetTransformInfo::RGK_FixedWidthVector;
+ const TypeSize WidestRegister = TTI.getRegisterBitWidth(RegKind);
// Convenience function to return the minimum of two ElementCounts.
auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
@@ -5128,9 +5140,6 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
}
- TargetTransformInfo::RegisterKind RegKind =
- ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
- : TargetTransformInfo::RGK_FixedWidthVector;
ElementCount MaxVF = MaxVectorElementCount;
if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
TTI.shouldMaximizeVectorBandwidth(RegKind))) {
@@ -6684,6 +6693,65 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
return getWideningCost(I, VF);
}
+InstructionCost
+LoopVectorizationCostModel::getSWARInstructionCost(Instruction *I,
+ unsigned VF) {
+ uint64_t RegSize =
+ TTI.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar).getFixedValue();
+ auto *RegType = IntegerType::get(I->getModule()->getContext(), RegSize);
+ auto GetMultiplier = [&](IntegerType *Ty) -> uint64_t {
+ return divideCeil(Ty->getBitWidth() * VF, RegSize);
+ };
+ if (isa<LoadInst, StoreInst>(I)) {
+ if (getWideningDecision(I, ElementCount::getFixed(VF)) !=
+ LoopVectorizationCostModel::CM_Widen)
+ return InstructionCost::getInvalid();
+ auto *ValTy = dyn_cast<IntegerType>(getLoadStoreType(I));
+ if (!ValTy)
+ return InstructionCost::getInvalid();
+ const auto &DL = I->getModule()->getDataLayout();
+ const Align Alignment = DL.getPrefTypeAlign(RegType);
+ unsigned AddressSpace =
+ getLoadStorePointerOperand(I)->getType()->getPointerAddressSpace();
+ return GetMultiplier(ValTy) * TTI.getMemoryOpCost(I->getOpcode(), RegType,
+ Alignment, AddressSpace);
+ }
+ auto *ValTy = dyn_cast<IntegerType>(I->getType());
+ if (!ValTy)
+ return InstructionCost::getInvalid();
+ if (auto *PN = dyn_cast<PHINode>(I))
+ if (Legal->isReductionVariable(PN))
+ return TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
+ auto Multiplier = GetMultiplier(ValTy);
+ if (I->isBitwiseLogicOp())
+ return Multiplier * TTI.getArithmeticInstrCost(I->getOpcode(), RegType);
+ switch (I->getOpcode()) {
+ case Instruction::Shl:
+ case Instruction::LShr:
+ // Shl: (LHS << ShiftAmnt) & Mask
+ // LShr: (LHS >> ShiftAmnt) & Mask
+ if (!isa<ConstantInt>(I->getOperand(1)))
+ return InstructionCost::getInvalid();
+ return Multiplier * (TTI.getArithmeticInstrCost(I->getOpcode(), RegType) +
+ TTI.getArithmeticInstrCost(Instruction::And, RegType));
+ case Instruction::Add:
+ // Add: ((LHS & ~Mask) + (RHS & ~Mask)) ^ ((LHS ^ RHS) & Mask)
+ return Multiplier *
+ (TTI.getArithmeticInstrCost(Instruction::Add, RegType) +
+ 2 * TTI.getArithmeticInstrCost(Instruction::Xor, RegType) +
+ 3 * TTI.getArithmeticInstrCost(Instruction::And, RegType));
+ case Instruction::Sub:
+ // Sub: ((LHS | Mask) - (RHS & ~Mask)) ^ ((LHS ^ ~RHS) & Mask)
+ return Multiplier *
+ (TTI.getArithmeticInstrCost(Instruction::Sub, RegType) +
+ TTI.getArithmeticInstrCost(Instruction::Or, RegType) +
+ 2 * TTI.getArithmeticInstrCost(Instruction::And, RegType) +
+ 3 * TTI.getArithmeticInstrCost(Instruction::Xor, RegType));
+ default:
+ return InstructionCost::getInvalid();
+ }
+}
+
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
ElementCount VF) {
@@ -6706,6 +6774,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
false);
}
+ if (UseSWAR && VF.isVector()) {
+ assert(!VF.isScalable() && "Scalable VF not supported");
+ if (!I->isTerminator())
+ return VectorizationCostTy(getSWARInstructionCost(I, VF.getFixedValue()),
+ true);
+ }
+
Type *VectorTy;
InstructionCost C = getInstructionCost(I, VF, VectorTy);
@@ -8208,6 +8283,23 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
return BlockMaskCache[BB] = BlockMask;
}
+VPRecipeBase *VPRecipeBuilder::tryToSWARMemory(Instruction *I,
+ ArrayRef<VPValue *> Operands,
+ VFRange &Range) {
+ if (Legal->isMaskRequired(I))
+ return nullptr;
+ if (CM.getWideningDecision(I, Range.Start) !=
+ LoopVectorizationCostModel::CM_Widen)
+ return nullptr;
+ if (!isa<IntegerType>(getLoadStoreType(I)))
+ return nullptr;
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return new VPSWARMemoryInstructionRecipe(*LI, Operands[0]);
+ if (auto *SI = dyn_cast<StoreInst>(I))
+ return new VPSWARMemoryInstructionRecipe(*SI, Operands[1], Operands[0]);
+ llvm_unreachable("Unhandled instruction!");
+}
+
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
ArrayRef<VPValue *> Operands,
VFRange &Range,
@@ -8474,6 +8566,25 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
Range);
}
+VPRecipeBase *VPRecipeBuilder::tryToSWAR(Instruction *I,
+ ArrayRef<VPValue *> Operands) {
+ switch (I->getOpcode()) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Add:
+ case Instruction::Sub:
+ return new VPSWARRecipe(*I, make_range(Operands.begin(), Operands.end()));
+ case Instruction::Shl:
+ case Instruction::LShr:
+ if (!isa<ConstantInt>(I->getOperand(1)))
+ return nullptr;
+ return new VPSWARRecipe(*I, make_range(Operands.begin(), Operands.end()));
+ default:
+ return nullptr;
+ }
+}
+
VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
ArrayRef<VPValue *> Operands,
VPBasicBlock *VPBB, VPlanPtr &Plan) {
@@ -8656,7 +8767,9 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
- return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
+ return toVPRecipeResult(
+ CM.UseSWAR ? tryToSWARMemory(Instr, Operands, Range)
+ : tryToWidenMemory(Instr, Operands, Range, Plan));
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8675,7 +8788,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI));
}
- return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
+ return toVPRecipeResult(CM.UseSWAR ? tryToSWAR(Instr, Operands)
+ : tryToWiden(Instr, Operands, VPBB, Plan));
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9117,7 +9231,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
"must be a select recipe");
IndexOfFirstOperand = 1;
} else {
- assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
+ assert((MinVF.isScalar() || CM.UseSWAR ||
+ isa<VPWidenRecipe>(CurrentLink)) &&
"Expected to replace a VPWidenSC");
IndexOfFirstOperand = 0;
}
@@ -9454,6 +9569,49 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
}
+static Type *getSWARType(Type *ScalarTy, ElementCount VF) {
+ assert(isa<IntegerType>(ScalarTy));
+ unsigned ScalarBitWidth = cast<IntegerType>(ScalarTy)->getBitWidth();
+ assert(!VF.isScalable() && "Scalable VF not supported");
+ return IntegerType::get(ScalarTy->getContext(),
+ ScalarBitWidth * VF.getFixedValue());
+}
+
+void VPSWARMemoryInstructionRecipe::execute(VPTransformState &State) {
+ auto VF = State.VF;
+ Value *Ptr = State.get(getAddr(), VPIteration(0, 0));
+ bool InBounds = false;
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+ InBounds = GEP->isInBounds();
+ Type *ScalarTy = getLoadStoreType(&Ingredient);
+ Type *SWARTy = getSWARType(ScalarTy, VF);
+ Type *VecTy = VectorType::get(ScalarTy, VF);
+ const auto &DL = Ingredient.getModule()->getDataLayout();
+ const Align Alignment = DL.getPrefTypeAlign(SWARTy);
+
+ auto &Builder = State.Builder;
+ State.setDebugLocFrom(Ingredient.getDebugLoc());
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *GEP = Builder.CreateGEP(ScalarTy, Ptr,
+ Builder.getInt32(VF.getFixedValue() * Part),
+ Ptr->getName() + ".swar", InBounds);
+ Value *SWARPtr = Builder.CreateBitCast(
+ GEP, SWARTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()));
+ Instruction *Res = nullptr;
+ if (isa<LoadInst>(Ingredient)) {
+ Res = Builder.CreateAlignedLoad(SWARTy, SWARPtr, Alignment,
+ Ingredient.getName() + ".swar");
+ State.set(getVPSingleValue(), Builder.CreateBitCast(Res, VecTy), Part);
+ } else if (isa<StoreInst>(Ingredient))
+ Res = Builder.CreateAlignedStore(
+ Builder.CreateBitCast(State.get(getStoredValue(), Part), SWARTy),
+ SWARPtr, Alignment);
+ else
+ llvm_unreachable("Unhandled instruction!");
+ State.addMetadata(Res, &Ingredient);
+ }
+}
+
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
@@ -9643,7 +9801,7 @@ static bool processLoopInVPlanNativePath(
TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
- LoopVectorizationRequirements &Requirements) {
+ LoopVectorizationRequirements &Requirements, bool UseSWAR) {
if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
@@ -9657,7 +9815,7 @@ static bool processLoopInVPlanNativePath(
getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints, IAI);
+ &Hints, IAI, UseSWAR);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
@@ -9841,6 +9999,45 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
return true;
}
+static const SCEVPredicate *getAlignPredicate(ScalarEvolution *SE,
+ const DataLayout &DL,
+ const SCEV *Start,
+ Align Alignment) {
+ Type *IntTy = DL.getIntPtrType(Start->getType());
+ const SCEV *Rem = SE->getURemExpr(SE->getPtrToIntExpr(Start, IntTy),
+ SE->getConstant(IntTy, Alignment.value()));
+ if (Rem->isZero())
+ return nullptr;
+ return SE->getEqualPredicate(Rem, SE->getZero(IntTy));
+}
+
+static void generateAlignChecks(PredicatedScalarEvolution &PSE,
+ const VPlan &Plan, ElementCount VF) {
+ ScalarEvolution *SE = PSE.getSE();
+ const DataLayout &DL = SE->getDataLayout();
+ MapVector<const SCEV *, Align> Checks;
+ for (const auto *VPBlock : vp_depth_first_shallow(Plan.getEntry()))
+ for (const auto &Recipe : *VPBlock->getEntryBasicBlock()) {
+ auto *SWARRecipe = dyn_cast<VPSWARMemoryInstructionRecipe>(&Recipe);
+ if (!SWARRecipe)
+ continue;
+ auto &MemInst = SWARRecipe->getIngredient();
+ const SCEVAddRecExpr *PtrSCEV =
+ PSE.getAsAddRec(getLoadStorePointerOperand(&MemInst));
+ assert(PtrSCEV && "Consecutive Ptr expected");
+ const SCEV *Start = PtrSCEV->getStart();
+ Type *SWARTy = getSWARType(getLoadStoreType(&MemInst), VF);
+ Align Alignment = DL.getPrefTypeAlign(SWARTy);
+ if (auto It = Checks.find(Start); It != Checks.end())
+ It->second = std::max(It->second, Alignment);
+ else
+ Checks.insert({Start, Alignment});
+ }
+ for (auto [Start, Alignment] : Checks)
+ if (auto *Predicate = getAlignPredicate(SE, DL, Start, Alignment))
+ PSE.addPredicate(*Predicate);
+}
+
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
: InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
!EnableLoopInterleaving),
@@ -9905,9 +10102,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// even evaluating whether vectorization is profitable. Since we cannot modify
// the incoming IR, we need to build VPlan upfront in the vectorization
// pipeline.
+ bool UseSWAR =
+ !TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
+ EnableSWARVectorization;
if (!L->isInnermost())
return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
- ORE, BFI, PSI, Hints, Requirements);
+ ORE, BFI, PSI, Hints, Requirements,
+ UseSWAR);
assert(L->isInnermost() && "Inner loop expected.");
@@ -10001,7 +10202,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Use the cost model.
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
- F, &Hints, IAI);
+ F, &Hints, IAI, UseSWAR);
// Use the planner for vectorization.
LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
ORE);
@@ -10026,8 +10227,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
unsigned SelectedIC = std::max(IC, UserIC);
// Optimistically generate runtime checks if they are needed. Drop them if
// they turn out to not be profitable.
- if (VF.Width.isVector() || SelectedIC > 1)
+ if (VF.Width.isVector() || SelectedIC > 1) {
+ if (UseSWAR)
+ generateAlignChecks(PSE, LVP.getBestPlanFor(VF.Width), VF.Width);
Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+ }
// Check if it is profitable to vectorize with runtime checks.
bool ForceVectorization =
@@ -10299,12 +10503,14 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
// Don't attempt if
// 1. the target claims to have no vector registers, and
- // 2. interleaving won't help ILP.
+ // 2. SWAR vectorization is disabled, and
+ // 3. interleaving won't help ILP.
//
- // The second condition is necessary because, even if the target has no
+ // The last condition is necessary because, even if the target has no
// vector registers, loop vectorization may still enable scalar
// interleaving.
if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
+ !EnableSWARVectorization &&
TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
return LoopVectorizeResult(false, false);
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 7ff6749a09089e9..7369de4320cddd8 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -103,6 +103,10 @@ class VPRecipeBuilder {
VPRecipeBase *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands,
VPBasicBlock *VPBB, VPlanPtr &Plan);
+ VPRecipeBase *tryToSWARMemory(Instruction *I, ArrayRef<VPValue *> Operands,
+ VFRange &Range);
+ VPRecipeBase *tryToSWAR(Instruction *I, ArrayRef<VPValue *> Operands);
+
/// Return a VPRecipeOrValueTy with VPRecipeBase * being set. This can be used to force the use as VPRecipeBase* for recipe sub-types that also inherit from VPValue.
VPRecipeOrVPValueTy toVPRecipeResult(VPRecipeBase *R) const { return R; }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e65a7ab2cd028ee..4c29e843401e82c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1148,6 +1148,32 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
}
};
+// VPSWARRecipe is a recipe for producing a SIMD-within-a-register (SWAR)
+// operation for its ingredient. The operation works on values that are packed
+// into scalar registers. This recipe covers the following cases:
+// 1. Bitwise operations (and, or, xor)
+// 2. Shifts (shl, lshr) with constant second operand
+// 3. Add/Sub operations.
+class VPSWARRecipe : public VPRecipeBase, public VPValue {
+public:
+ template <typename IterT>
+ VPSWARRecipe(Instruction &I, iterator_range<IterT> Operands)
+ : VPRecipeBase(VPRecipeBase::VPSWARSC, Operands), VPValue(this, &I) {}
+
+ ~VPSWARRecipe() override = default;
+
+ VP_CLASSOF_IMPL(VPDef::VPSWARSC)
+
+ /// Generate the SWAR operation.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// VPWidenRecipe is a recipe for producing a copy of vector type its
/// ingredient. This recipe covers most of the traditional vectorization cases
/// where each ingredient transforms into a vectorized version of itself.
@@ -1929,6 +1955,52 @@ class VPPredInstPHIRecipe : public VPRecipeBase, public VPValue {
}
};
+// A recipe for SWAR (SIMD-within-a-register) load/store operations.
+class VPSWARMemoryInstructionRecipe : public VPRecipeBase {
+ Instruction &Ingredient;
+
+public:
+ VPSWARMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr)
+ : VPRecipeBase(VPSWARMemoryInstructionSC, {Addr}), Ingredient(Load) {
+ new VPValue(this, &Load);
+ }
+
+ VPSWARMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
+ VPValue *StoredValue)
+ : VPRecipeBase(VPSWARMemoryInstructionSC, {Addr, StoredValue}),
+ Ingredient(Store) {}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPSWARMemoryInstructionSC;
+ }
+
+ /// Return the address accessed by this recipe.
+ VPValue *getAddr() const {
+ return getOperand(0); // Address is the 1st, mandatory operand.
+ }
+
+ /// Returns true if this recipe is a store.
+ bool isStore() const { return isa<StoreInst>(Ingredient); }
+
+  /// Return the value stored by this recipe.
+ VPValue *getStoredValue() const {
+ assert(isStore() && "Stored value only available for store instructions");
+ return getOperand(1); // Stored value is the 2nd, mandatory operand.
+ }
+
+ /// Generate the SWAR load/store.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ Instruction &getIngredient() const { return Ingredient; }
+};
+
/// A Recipe for widening load/store operations.
/// The recipe uses the following VPValues:
/// - For load: Address, optional mask
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2a1213a98095907..767feef39de4cc9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -43,6 +43,9 @@ extern cl::opt<bool> EnableVPlanNativePath;
bool VPRecipeBase::mayWriteToMemory() const {
switch (getVPDefID()) {
+ case VPSWARMemoryInstructionSC: {
+ return cast<VPSWARMemoryInstructionRecipe>(this)->isStore();
+ }
case VPWidenMemoryInstructionSC: {
return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
}
@@ -56,6 +59,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
return false;
case VPBlendSC:
case VPReductionSC:
+ case VPSWARSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
case VPWidenGEPSC:
@@ -77,6 +81,9 @@ bool VPRecipeBase::mayWriteToMemory() const {
bool VPRecipeBase::mayReadFromMemory() const {
switch (getVPDefID()) {
+ case VPSWARMemoryInstructionSC: {
+ return !cast<VPSWARMemoryInstructionRecipe>(this)->isStore();
+ }
case VPWidenMemoryInstructionSC: {
return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
}
@@ -90,6 +97,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
return false;
case VPBlendSC:
case VPReductionSC:
+ case VPSWARSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
case VPWidenGEPSC:
@@ -130,6 +138,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
->mayHaveSideEffects();
case VPBlendSC:
case VPReductionSC:
+ case VPSWARSC:
case VPScalarIVStepsSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -146,6 +155,13 @@ bool VPRecipeBase::mayHaveSideEffects() const {
"underlying instruction has side-effects");
return false;
}
+ case VPSWARMemoryInstructionSC:
+ assert(cast<VPSWARMemoryInstructionRecipe>(this)
+ ->getIngredient()
+ .mayHaveSideEffects() == mayWriteToMemory() &&
+ "mayHaveSideffects result for ingredient differs from this "
+ "implementation");
+ return mayWriteToMemory();
case VPWidenMemoryInstructionSC:
assert(cast<VPWidenMemoryInstructionRecipe>(this)
->getIngredient()
@@ -496,6 +512,112 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
}
#endif
+static Value *SWARBinOp(IRBuilderBase &B, BinaryOperator *BinOp, Value *LHS,
+ Value *RHS, unsigned ScalarBitWidth,
+ unsigned SWARBitWidth) {
+ auto Opc = BinOp->getOpcode();
+ Twine Name = BinOp->getName() + ".swar";
+ switch (Opc) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return B.CreateBinOp(Opc, LHS, RHS, Name);
+ case Instruction::Shl:
+ case Instruction::LShr: {
+ // Shl: (LHS << ShiftAmnt) & Mask
+ // LShr: (LHS >> ShiftAmnt) & Mask
+    // Mask: splat of scalar all-ones value shifted by ShiftAmnt, e.g. for
+    // bytewise operations:
+    // Shl by 2: 0xfcfcfcfcfcfcfcfc
+    // LShr by 3: 0x1f1f1f1f1f1f1f1f
+ assert(isa<ConstantExpr>(RHS));
+ Value *ShiftAmntSplat = cast<ConstantExpr>(RHS)->getOperand(0);
+ assert(isa<Constant>(ShiftAmntSplat));
+ unsigned ShiftAmnt =
+ cast<Constant>(ShiftAmntSplat)->getUniqueInteger().getLimitedValue();
+ auto Mask = APInt::getAllOnes(ScalarBitWidth);
+ if (Opc == Instruction::Shl)
+ Mask = Mask.shl(ShiftAmnt);
+ else
+ Mask = Mask.lshr(ShiftAmnt);
+ Value *Res = Opc == Instruction::Shl ? B.CreateShl(LHS, ShiftAmnt, Name)
+ : B.CreateLShr(LHS, ShiftAmnt, Name);
+ return B.CreateAnd(Res, B.getInt(APInt::getSplat(SWARBitWidth, Mask)));
+ }
+ case Instruction::Add:
+ case Instruction::Sub: {
+ // Add: ((LHS & ~Mask) + (RHS & ~Mask)) ^ ((LHS ^ RHS) & Mask)
+ // Sub: ((LHS | Mask) - (RHS & ~Mask)) ^ ((LHS ^ ~RHS) & Mask)
+ // Mask: MSB set in each element, e.g. for bytewise math in 64-bit register:
+ // Mask = 0x8080808080808080
+ auto MaskVal =
+ APInt::getSplat(SWARBitWidth, APInt::getSignMask(ScalarBitWidth));
+ Value *Mask = B.getInt(MaskVal);
+ Value *InvMask = B.CreateNot(Mask);
+ Value *Res = Opc == Instruction::Add
+ ? B.CreateAdd(B.CreateAnd(LHS, InvMask),
+ B.CreateAnd(RHS, InvMask), Name)
+ : B.CreateSub(B.CreateOr(LHS, Mask),
+ B.CreateAnd(RHS, InvMask), Name);
+ return B.CreateXor(
+ Res,
+ B.CreateAnd(
+ B.CreateXor(LHS, Opc == Instruction::Add ? RHS : B.CreateNot(RHS)),
+ Mask));
+ }
+ default:
+ LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << *BinOp);
+ llvm_unreachable("Unhandled instruction!");
+ }
+}
+
+void VPSWARRecipe::execute(VPTransformState &State) {
+ assert(isa<BinaryOperator>(getUnderlyingValue()));
+ auto *BinOp = cast<BinaryOperator>(getUnderlyingValue());
+ Type *ScalarTy = BinOp->getType();
+ assert(isa<IntegerType>(ScalarTy));
+ unsigned ScalarBitWidth = cast<IntegerType>(ScalarTy)->getBitWidth();
+ assert(!State.VF.isScalable() && "Scalable VF not supported");
+ unsigned VF = State.VF.getFixedValue();
+ Type *SWARTy = IntegerType::get(ScalarTy->getContext(), ScalarBitWidth * VF);
+ Type *VecTy = FixedVectorType::get(ScalarTy, VF);
+
+ auto &Builder = State.Builder;
+ State.setDebugLocFrom(BinOp->getDebugLoc());
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *LHS = Builder.CreateBitCast(State.get(getOperand(0), Part), SWARTy);
+ Value *RHS = Builder.CreateBitCast(State.get(getOperand(1), Part), SWARTy);
+ Value *Res = SWARBinOp(Builder, BinOp, LHS, RHS, ScalarBitWidth,
+ ScalarBitWidth * VF);
+ State.set(this, Builder.CreateBitCast(Res, VecTy), Part);
+ State.addMetadata(Res, BinOp);
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPSWARRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "SWAR ";
+ printAsOperand(O, SlotTracker);
+ const Instruction *UI = getUnderlyingInstr();
+ O << " = " << UI->getOpcodeName() << " ";
+ printOperands(O, SlotTracker);
+}
+
+void VPSWARMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "SWAR ";
+
+ if (!isStore()) {
+ getVPSingleValue()->printAsOperand(O, SlotTracker);
+ O << " = ";
+ }
+ O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
+
+ printOperands(O, SlotTracker);
+}
+#endif
+
void VPWidenCallRecipe::execute(VPTransformState &State) {
assert(State.VF.isVector() && "not widening");
auto &CI = *cast<CallInst>(getUnderlyingInstr());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index ac110bb3b0ef9be..1a82dfcd40d5c1a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -50,6 +50,7 @@ class VPValue {
friend class VPInterleavedAccessInfo;
friend class VPSlotTracker;
friend class VPRecipeBase;
+ friend class VPSWARMemoryInstructionRecipe;
friend class VPWidenMemoryInstructionRecipe;
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
@@ -342,6 +343,8 @@ class VPDef {
VPInterleaveSC,
VPReductionSC,
VPReplicateSC,
+ VPSWARSC,
+ VPSWARMemoryInstructionSC,
VPScalarIVStepsSC,
VPWidenCallSC,
VPWidenCanonicalIVSC,
diff --git a/llvm/test/Transforms/LoopVectorize/swar-vectorization.ll b/llvm/test/Transforms/LoopVectorize/swar-vectorization.ll
index d3640dfe5d439d2..857e525b839c68a 100644
--- a/llvm/test/Transforms/LoopVectorize/swar-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/swar-vectorization.ll
@@ -1,29 +1,70 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -S 2>%t | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -enable-swar-vectorization -mtriple riscv64 -S 2>%t | FileCheck %s
; Tests for SWAR (SIMD within a register) vectorization
define void @test1(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
; CHECK-LABEL: @test1(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SRC4:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
+; CHECK-NEXT: [[DST2:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT: [[SRC1:%.*]] = ptrtoint ptr [[SRC]] to i64
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 24
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SRC1]] to i3
+; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[DST2]] to i3
+; CHECK-NEXT: [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT: [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[DST2]], [[SRC4]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], 8
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT: [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[DOTSWAR5:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[DOTSWAR5]] to <8 x i8>
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT: [[DOTSWAR6:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to i64
+; CHECK-NEXT: store i64 [[TMP10]], ptr [[DOTSWAR6]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i8 [[TMP0]], ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[TMP12]], ptr [[ARRAYIDX2]], align 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
entry:
%cmp6 = icmp sgt i32 %N, 0
@@ -53,26 +94,83 @@ for.body:
define void @test2(ptr writeonly %dst1, ptr writeonly %dst2, ptr readonly %src, i32 signext %N) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SRC7:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
+; CHECK-NEXT: [[DST16:%.*]] = ptrtoint ptr [[DST1:%.*]] to i64
+; CHECK-NEXT: [[DST24:%.*]] = ptrtoint ptr [[DST2:%.*]] to i64
+; CHECK-NEXT: [[DST12:%.*]] = ptrtoint ptr [[DST1]] to i64
+; CHECK-NEXT: [[SRC1:%.*]] = ptrtoint ptr [[SRC]] to i64
; CHECK-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP13]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SRC1]] to i3
+; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[DST12]] to i3
+; CHECK-NEXT: [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT: [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[DST24]] to i3
+; CHECK-NEXT: [[TMP5:%.*]] = zext i3 [[TMP4]] to i64
+; CHECK-NEXT: [[IDENT_CHECK5:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[IDENT_CHECK5]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[DST24]], [[DST16]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], 8
+; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[DST16]], [[SRC7]]
+; CHECK-NEXT: [[DIFF_CHECK8:%.*]] = icmp ult i64 [[TMP9]], 8
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK8]]
+; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[DST24]], [[SRC7]]
+; CHECK-NEXT: [[DIFF_CHECK9:%.*]] = icmp ult i64 [[TMP10]], 8
+; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK9]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX10]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP11]]
+; CHECK-NEXT: [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
+; CHECK-NEXT: [[DOTSWAR11:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64 [[DOTSWAR11]] to <8 x i8>
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[TMP11]]
+; CHECK-NEXT: [[DOTSWAR12:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
+; CHECK-NEXT: store i64 [[TMP15]], ptr [[DOTSWAR12]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[DST2]], i64 [[TMP11]]
+; CHECK-NEXT: [[DOTSWAR13:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
+; CHECK-NEXT: store i64 [[TMP17]], ptr [[DOTSWAR13]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i8 [[TMP0]], ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST2:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i8 [[TMP0]], ptr [[ARRAYIDX6]], align 1
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[TMP19]], ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST2]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[TMP19]], ptr [[ARRAYIDX6]], align 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
;
entry:
%cmp13 = icmp sgt i32 %N, 0
@@ -104,25 +202,69 @@ for.body:
define void @test3(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
; CHECK-LABEL: @test3(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SRC4:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
+; CHECK-NEXT: [[DST2:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT: [[SRC1:%.*]] = ptrtoint ptr [[SRC]] to i64
; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SRC1]] to i3
+; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[DST2]] to i3
+; CHECK-NEXT: [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT: [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[DST2]], [[SRC4]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], 8
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT: [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[DOTSWAR5:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[DOTSWAR5]] to <8 x i8>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP8]] to i64
+; CHECK-NEXT: [[DOTSWAR6:%.*]] = and i64 [[TMP9]], bitcast (<8 x i8> <i8 66, i8 66, i8 66, i8 66, i8 66, i8 66, i8 66, i8 66> to i64)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[DOTSWAR6]] to <8 x i8>
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT: [[DOTSWAR7:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP10]] to i64
+; CHECK-NEXT: store i64 [[TMP12]], ptr [[DOTSWAR7]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], 66
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP15:%.*]] = and i8 [[TMP14]], 66
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[TMP15]], ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
;
entry:
%cmp7 = icmp sgt i32 %N, 0
@@ -153,25 +295,72 @@ for.body:
define void @test4(ptr writeonly %dst, ptr readonly %src1, i8 zeroext %src2, i32 noundef signext %N) {
; CHECK-LABEL: @test4(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SRC14:%.*]] = ptrtoint ptr [[SRC1:%.*]] to i64
+; CHECK-NEXT: [[DST2:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT: [[SRC11:%.*]] = ptrtoint ptr [[SRC1]] to i64
; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SRC11]] to i3
+; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[DST2]] to i3
+; CHECK-NEXT: [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT: [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[DST2]], [[SRC14]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], 8
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[SRC2:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[TMP6]]
+; CHECK-NEXT: [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[DOTSWAR5:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[DOTSWAR5]] to <8 x i8>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP8]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[BROADCAST_SPLAT]] to i64
+; CHECK-NEXT: [[OR8_SWAR:%.*]] = or i64 [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[OR8_SWAR]] to <8 x i8>
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT: [[DOTSWAR6:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT: store i64 [[TMP13]], ptr [[DOTSWAR6]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[OR8:%.*]] = or i8 [[TMP0]], [[SRC2:%.*]]
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[OR8:%.*]] = or i8 [[TMP15]], [[SRC2]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 [[OR8]], ptr [[ARRAYIDX4]], align 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
;
entry:
%cmp9 = icmp sgt i32 %N, 0
@@ -202,27 +391,85 @@ for.body:
define void @test5(ptr writeonly %dst, ptr readonly %src1, ptr readonly %src2, i32 signext %N) {
; CHECK-LABEL: @test5(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SRC27:%.*]] = ptrtoint ptr [[SRC2:%.*]] to i64
+; CHECK-NEXT: [[SRC16:%.*]] = ptrtoint ptr [[SRC1:%.*]] to i64
+; CHECK-NEXT: [[DST4:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT: [[SRC22:%.*]] = ptrtoint ptr [[SRC2]] to i64
+; CHECK-NEXT: [[SRC11:%.*]] = ptrtoint ptr [[SRC1]] to i64
; CHECK-NEXT: [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP12]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 24
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SRC11]] to i3
+; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[SRC22]] to i3
+; CHECK-NEXT: [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT: [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[DST4]] to i3
+; CHECK-NEXT: [[TMP5:%.*]] = zext i3 [[TMP4]] to i64
+; CHECK-NEXT: [[IDENT_CHECK5:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[IDENT_CHECK5]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[DST4]], [[SRC16]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], 8
+; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[DST4]], [[SRC27]]
+; CHECK-NEXT: [[DIFF_CHECK8:%.*]] = icmp ult i64 [[TMP9]], 8
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK8]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[TMP10]]
+; CHECK-NEXT: [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT: [[DOTSWAR9:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 [[DOTSWAR9]] to <8 x i8>
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i64 [[TMP10]]
+; CHECK-NEXT: [[DOTSWAR10:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; CHECK-NEXT: [[DOTSWAR11:%.*]] = load i64, ptr [[DOTSWAR10]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[DOTSWAR11]] to <8 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to i64
+; CHECK-NEXT: [[XOR11_SWAR:%.*]] = xor i64 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64 [[XOR11_SWAR]] to <8 x i8>
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP10]]
+; CHECK-NEXT: [[DOTSWAR12:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP17]] to i64
+; CHECK-NEXT: store i64 [[TMP19]], ptr [[DOTSWAR12]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT: [[XOR11:%.*]] = xor i8 [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[XOR11:%.*]] = xor i8 [[TMP22]], [[TMP21]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 [[XOR11]], ptr [[ARRAYIDX6]], align 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
;
entry:
%cmp12 = icmp sgt i32 %N, 0
@@ -255,25 +502,70 @@ for.body:
define void @test6(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
; CHECK-LABEL: @test6(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SRC4:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
+; CHECK-NEXT: [[DST2:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT: [[SRC1:%.*]] = ptrtoint ptr [[SRC]] to i64
; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SRC1]] to i3
+; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[DST2]] to i3
+; CHECK-NEXT: [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT: [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[DST2]], [[SRC4]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], 8
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT: [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[DOTSWAR5:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[DOTSWAR5]] to <8 x i8>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP8]] to i64
+; CHECK-NEXT: [[SHL_SWAR:%.*]] = shl i64 [[TMP9]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[SHL_SWAR]], -72340172838076674
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[TMP10]] to <8 x i8>
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT: [[DOTSWAR6:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT: store i64 [[TMP13]], ptr [[DOTSWAR6]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[TMP0]], 1
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[TMP15]], 1
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 [[SHL]], ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
;
entry:
%cmp7 = icmp sgt i32 %N, 0
@@ -304,25 +596,70 @@ for.body:
define void @test7(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
; CHECK-LABEL: @test7(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SRC4:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
+; CHECK-NEXT: [[DST2:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT: [[SRC1:%.*]] = ptrtoint ptr [[SRC]] to i64
; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SRC1]] to i3
+; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[DST2]] to i3
+; CHECK-NEXT: [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT: [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[DST2]], [[SRC4]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], 8
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT: [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[DOTSWAR5:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[DOTSWAR5]] to <8 x i8>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP8]] to i64
+; CHECK-NEXT: [[DOTSWAR6:%.*]] = lshr i64 [[TMP9]], 2
+; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[DOTSWAR6]], 4557430888798830399
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[TMP10]] to <8 x i8>
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT: [[DOTSWAR7:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT: store i64 [[TMP13]], ptr [[DOTSWAR7]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = lshr i8 [[TMP0]], 2
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = lshr i8 [[TMP15]], 2
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[TMP16]], ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
;
entry:
%cmp7 = icmp sgt i32 %N, 0
@@ -353,27 +690,90 @@ for.body:
define void @test8(ptr writeonly %dst, ptr readonly %src1, ptr readonly %src2, i32 signext %N) {
; CHECK-LABEL: @test8(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SRC27:%.*]] = ptrtoint ptr [[SRC2:%.*]] to i64
+; CHECK-NEXT: [[SRC16:%.*]] = ptrtoint ptr [[SRC1:%.*]] to i64
+; CHECK-NEXT: [[DST4:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT: [[SRC22:%.*]] = ptrtoint ptr [[SRC2]] to i64
+; CHECK-NEXT: [[SRC11:%.*]] = ptrtoint ptr [[SRC1]] to i64
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 24
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SRC11]] to i3
+; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[SRC22]] to i3
+; CHECK-NEXT: [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT: [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[DST4]] to i3
+; CHECK-NEXT: [[TMP5:%.*]] = zext i3 [[TMP4]] to i64
+; CHECK-NEXT: [[IDENT_CHECK5:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[IDENT_CHECK5]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[DST4]], [[SRC16]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], 8
+; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[DST4]], [[SRC27]]
+; CHECK-NEXT: [[DIFF_CHECK8:%.*]] = icmp ult i64 [[TMP9]], 8
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK8]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[TMP10]]
+; CHECK-NEXT: [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT: [[DOTSWAR9:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 [[DOTSWAR9]] to <8 x i8>
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i64 [[TMP10]]
+; CHECK-NEXT: [[DOTSWAR10:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; CHECK-NEXT: [[DOTSWAR11:%.*]] = load i64, ptr [[DOTSWAR10]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[DOTSWAR11]] to <8 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP15]], 9187201950435737471
+; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP16]], 9187201950435737471
+; CHECK-NEXT: [[ADD_SWAR:%.*]] = add i64 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = xor i64 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP20:%.*]] = and i64 [[TMP19]], -9187201950435737472
+; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[ADD_SWAR]], [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast i64 [[TMP21]] to <8 x i8>
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP10]]
+; CHECK-NEXT: [[DOTSWAR12:%.*]] = getelementptr inbounds i8, ptr [[TMP23]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i8> [[TMP22]] to i64
+; CHECK-NEXT: store i64 [[TMP24]], ptr [[DOTSWAR12]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP27]], [[TMP26]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX6]], align 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
;
entry:
%cmp11 = icmp sgt i32 %N, 0
@@ -406,27 +806,91 @@ for.body:
define void @test9(ptr writeonly %dst, ptr readonly %src1, ptr readonly %src2, i32 signext %N) {
; CHECK-LABEL: @test9(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SRC27:%.*]] = ptrtoint ptr [[SRC2:%.*]] to i64
+; CHECK-NEXT: [[SRC16:%.*]] = ptrtoint ptr [[SRC1:%.*]] to i64
+; CHECK-NEXT: [[DST4:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT: [[SRC22:%.*]] = ptrtoint ptr [[SRC2]] to i64
+; CHECK-NEXT: [[SRC11:%.*]] = ptrtoint ptr [[SRC1]] to i64
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 24
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SRC11]] to i3
+; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[SRC22]] to i3
+; CHECK-NEXT: [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT: [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[DST4]] to i3
+; CHECK-NEXT: [[TMP5:%.*]] = zext i3 [[TMP4]] to i64
+; CHECK-NEXT: [[IDENT_CHECK5:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[IDENT_CHECK5]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[DST4]], [[SRC16]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], 8
+; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[DST4]], [[SRC27]]
+; CHECK-NEXT: [[DIFF_CHECK8:%.*]] = icmp ult i64 [[TMP9]], 8
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK8]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[TMP10]]
+; CHECK-NEXT: [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT: [[DOTSWAR9:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 [[DOTSWAR9]] to <8 x i8>
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i64 [[TMP10]]
+; CHECK-NEXT: [[DOTSWAR10:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; CHECK-NEXT: [[DOTSWAR11:%.*]] = load i64, ptr [[DOTSWAR10]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[DOTSWAR11]] to <8 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP12]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP15]], -9187201950435737472
+; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP16]], 9187201950435737471
+; CHECK-NEXT: [[SUB_SWAR:%.*]] = sub i64 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = xor i64 [[TMP16]], -1
+; CHECK-NEXT: [[TMP20:%.*]] = xor i64 [[TMP15]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], -9187201950435737472
+; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[SUB_SWAR]], [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast i64 [[TMP22]] to <8 x i8>
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP10]]
+; CHECK-NEXT: [[DOTSWAR12:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 0
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i8> [[TMP23]] to i64
+; CHECK-NEXT: store i64 [[TMP25]], ptr [[DOTSWAR12]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 [[SUB]], ptr [[ARRAYIDX6]], align 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
;
entry:
%cmp11 = icmp sgt i32 %N, 0
@@ -459,26 +923,60 @@ for.body:
define zeroext i8 @test_reduction_or(ptr readonly %src, i32 signext %N) {
; CHECK-LABEL: @test_reduction_or(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SRC1:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SRC1]] to i3
+; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP2]]
+; CHECK-NEXT: [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[DOTSWAR2:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[DOTSWAR2]] to <8 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[VEC_PHI]] to i64
+; CHECK-NEXT: [[OR_SWAR:%.*]] = or i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP7]] = bitcast i64 [[OR_SWAR]] to <8 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP9:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP7]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i8 [ [[OR:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i8 [ [[OR:%.*]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[OR_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[OR_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; CHECK-NEXT: ret i8 [[RES_LCSSA]]
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[RES:%.*]] = phi i8 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[OR]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[OR]] = or i8 [[TMP0]], [[RES]]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[OR]] = or i8 [[TMP10]], [[RES]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
;
entry:
%cmp7 = icmp sgt i32 %N, 0
@@ -510,26 +1008,60 @@ for.body:
define zeroext i8 @test_reduction_add(ptr readonly %src, i32 signext %N) {
; CHECK-LABEL: @test_reduction_add(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SRC1:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SRC1]] to i3
+; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP2]]
+; CHECK-NEXT: [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[DOTSWAR2:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[DOTSWAR2]] to <8 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[VEC_PHI]] to i64
+; CHECK-NEXT: [[ADD_SWAR:%.*]] = or i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP7]] = bitcast i64 [[ADD_SWAR]] to <8 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP9:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP7]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i8 [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i8 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; CHECK-NEXT: ret i8 [[RES_LCSSA]]
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[RES:%.*]] = phi i8 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ADD]] = or i8 [[TMP0]], [[RES]]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ADD]] = or i8 [[TMP10]], [[RES]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
;
entry:
%cmp7 = icmp sgt i32 %N, 0