[llvm] [LV] Implement SWAR loop vectorization (PR #69306)

Sergey Kachkov via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 17 02:15:28 PDT 2023


https://github.com/skachkov-sc created https://github.com/llvm/llvm-project/pull/69306

Implement "SIMD within a register" (SWAR) loop vectorization. This technique can vectorize some loops on targets without vector registers. Currently supported instructions are:
1. Consecutive loads/stores
2. Bitwise operations (and, or, xor)
3. Shifts (shl, lshr) with a constant second operand
4. Addition/Subtraction
(A rough C++ sketch of the packed-add trick used for case 4 is included below.)
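
For illustration only (this snippet is not part of the patch), a minimal C++ sketch of the lane-wise addition identity the patch relies on, assuming a 64-bit general-purpose register holding eight i8 lanes; the function name is made up for the example:

#include <cstdint>
#include <cstdio>

// Add eight i8 lanes packed into one 64-bit scalar register without letting a
// carry leak from one lane into the next, following the formula used for the
// Add case: ((LHS & ~Mask) + (RHS & ~Mask)) ^ ((LHS ^ RHS) & Mask),
// where Mask has the MSB of every lane set (0x8080808080808080 for bytes).
static uint64_t swarAddI8x8(uint64_t LHS, uint64_t RHS) {
  const uint64_t Mask = 0x8080808080808080ULL;  // MSB of each byte lane
  uint64_t Low = (LHS & ~Mask) + (RHS & ~Mask); // per-lane add of the low 7 bits
  return Low ^ ((LHS ^ RHS) & Mask);            // fix up the MSB of each lane
}

int main() {
  // Lane 0: 0x7f + 0x01 = 0x80; lane 1: 0xff + 0x01 wraps to 0x00 without
  // spilling a carry into lane 2. Prints 0000000000000080.
  std::printf("%016llx\n", (unsigned long long)swarAddI8x8(0xff7fULL, 0x0101ULL));
  return 0;
}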


From 03e46759124b526460bf2f6a10655577d4a96876 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Fri, 1 Sep 2023 15:28:18 +0300
Subject: [PATCH 1/2] [LV][NFC] Add pre-commit tests for SWAR vectorization

---
 .../LoopVectorize/swar-vectorization.ll       | 618 ++++++++++++++++++
 1 file changed, 618 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/swar-vectorization.ll

diff --git a/llvm/test/Transforms/LoopVectorize/swar-vectorization.ll b/llvm/test/Transforms/LoopVectorize/swar-vectorization.ll
new file mode 100644
index 000000000000000..d3640dfe5d439d2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/swar-vectorization.ll
@@ -0,0 +1,618 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -S 2>%t | FileCheck %s
+
+; Tests for SWAR (SIMD within a register) vectorization
+
+define void @test1(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[TMP0]], ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+  %cmp6 = icmp sgt i32 %N, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %0, ptr %arrayidx2, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test2(ptr writeonly %dst1, ptr writeonly %dst2, ptr readonly %src, i32 signext %N) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP13:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP13]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[TMP0]], ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[TMP0]], ptr [[ARRAYIDX6]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+  %cmp13 = icmp sgt i32 %N, 0
+  br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %dst1, i64 %indvars.iv
+  store i8 %0, ptr %arrayidx2, align 1
+  %arrayidx6 = getelementptr inbounds i8, ptr %dst2, i64 %indvars.iv
+  store i8 %0, ptr %arrayidx6, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test3(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[TMP0]], 66
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[TMP1]], ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %1 = and i8 %0, 66
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %1, ptr %arrayidx3, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test4(ptr writeonly %dst, ptr readonly %src1, i8 zeroext %src2, i32 noundef signext %N) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[OR8:%.*]] = or i8 [[TMP0]], [[SRC2:%.*]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[OR8]], ptr [[ARRAYIDX4]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src1, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %or8 = or i8 %0, %src2
+  %arrayidx4 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %or8, ptr %arrayidx4, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test5(ptr writeonly %dst, ptr readonly %src1, ptr readonly %src2, i32 signext %N) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP12]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[XOR11:%.*]] = xor i8 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[XOR11]], ptr [[ARRAYIDX6]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+  %cmp12 = icmp sgt i32 %N, 0
+  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src1, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %src2, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %xor11 = xor i8 %1, %0
+  %arrayidx6 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %xor11, ptr %arrayidx6, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test6(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[SHL:%.*]] = shl i8 [[TMP0]], 1
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[SHL]], ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %shl = shl i8 %0, 1
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %shl, ptr %arrayidx3, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test7(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 [[TMP0]], 2
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[TMP1]], ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %1 = lshr i8 %0, 2
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %1, ptr %arrayidx3, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test8(ptr writeonly %dst, ptr readonly %src1, ptr readonly %src2, i32 signext %N) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX6]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %N, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src1, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %src2, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %add = add i8 %1, %0
+  %arrayidx6 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %add, ptr %arrayidx6, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test9(ptr writeonly %dst, ptr readonly %src1, ptr readonly %src2, i32 signext %N) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[SUB]], ptr [[ARRAYIDX6]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %N, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src1, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %src2, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %sub = sub i8 %0, %1
+  %arrayidx6 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %sub, ptr %arrayidx6, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define zeroext i8 @test_reduction_or(ptr readonly %src, i32 signext %N) {
+; CHECK-LABEL: @test_reduction_or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[OR_LCSSA:%.*]] = phi i8 [ [[OR:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[OR_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    ret i8 [[RES_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i8 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[OR]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[OR]] = or i8 [[TMP0]], [[RES]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  %or.lcssa = phi i8 [ %or, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %res.lcssa = phi i8 [ 0, %entry ], [ %or.lcssa, %for.cond.cleanup.loopexit ]
+  ret i8 %res.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %res = phi i8 [ 0, %for.body.preheader ], [ %or, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %or = or i8 %0, %res
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define zeroext i8 @test_reduction_add(ptr readonly %src, i32 signext %N) {
+; CHECK-LABEL: @test_reduction_add(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i8 [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    ret i8 [[RES_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i8 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ADD]] = add i8 [[TMP0]], [[RES]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  %add.lcssa = phi i8 [ %add, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %res.lcssa = phi i8 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+  ret i8 %res.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %res = phi i8 [ 0, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %add = add i8 %0, %res
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test_negative(ptr writeonly %dst, ptr readonly %src1, ptr readonly %src2, ptr readonly %src3, i32 signext %N) {
+; CHECK-LABEL: @test_negative(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = mul i8 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[SRC3:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX9]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src1, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %src2, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %mul = mul i8 %1, %0
+  %arrayidx5 = getelementptr inbounds i8, ptr %src3, i64 %indvars.iv
+  %2 = load i8, ptr %arrayidx5, align 1
+  %add = add i8 %mul, %2
+  %arrayidx9 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %add, ptr %arrayidx9, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}

From bf50cde05f9ed8c00d55c9f62f391afaa0da4b97 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Mon, 28 Aug 2023 17:26:53 +0300
Subject: [PATCH 2/2] [LV] Implement SWAR loop vectorization

Implement "SIMD within a register" (SWAR) loop vectorization. This
technique can vectorize some loops on targets without vector registers.
Currently supported instructions are:
1. Consecutive loads/stores
2. Bitwise operations (and, or, xor)
3. Shifts (shl, lshr) with a constant second operand
4. Addition/Subtraction
(A small C++ sketch of the shift-case masking follows this list.)
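
For reviewers' convenience (not part of the patch itself), a minimal C++ sketch of how the shift cases are lowered, assuming a 64-bit scalar register with i8 lanes and illustrative function names: the whole register is shifted once, then a splatted mask clears the bits that crossed a lane boundary, matching the "(LHS << ShiftAmnt) & Mask" comments in the cost model below.

#include <cstdint>

// Shl by 2 on eight packed i8 lanes: shift the whole 64-bit register, then
// mask off the bits that moved into a neighbouring lane.
static uint64_t swarShl2I8x8(uint64_t V) {
  const uint64_t Mask = 0xfcfcfcfcfcfcfcfcULL; // (0xff << 2) splat per lane
  return (V << 2) & Mask;
}

// LShr by 3 on eight packed i8 lanes, with the analogous mask.
static uint64_t swarLShr3I8x8(uint64_t V) {
  const uint64_t Mask = 0x1f1f1f1f1f1f1f1fULL; // (0xff >> 3) splat per lane
  return (V >> 3) & Mask;
}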
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 240 +++++-
 .../Transforms/Vectorize/VPRecipeBuilder.h    |   4 +
 llvm/lib/Transforms/Vectorize/VPlan.h         |  72 ++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 122 +++
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   3 +
 .../LoopVectorize/swar-vectorization.ll       | 694 ++++++++++++++++--
 6 files changed, 1037 insertions(+), 98 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2ca7e75f97f0f02..153750035b7b045 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -172,6 +172,10 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
 
+static cl::opt<bool> EnableSWARVectorization(
+    "enable-swar-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Enable SWAR (SIMD within a register) vectorization"));
+
 static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
     cl::desc("Enable vectorization of epilogue loops."));
@@ -1203,10 +1207,10 @@ class LoopVectorizationCostModel {
                              AssumptionCache *AC,
                              OptimizationRemarkEmitter *ORE, const Function *F,
                              const LoopVectorizeHints *Hints,
-                             InterleavedAccessInfo &IAI)
+                             InterleavedAccessInfo &IAI, bool UseSWAR)
       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
-        Hints(Hints), InterleaveInfo(IAI) {}
+        Hints(Hints), InterleaveInfo(IAI), UseSWAR(UseSWAR) {}
 
   /// \return An upper bound for the vectorization factors (both fixed and
   /// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1712,6 +1716,9 @@ class LoopVectorizationCostModel {
   /// of elements.
   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
 
+  /// Calculate cost of SWAR instruction.
+  InstructionCost getSWARInstructionCost(Instruction *I, unsigned VF);
+
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
@@ -1921,6 +1928,9 @@ class LoopVectorizationCostModel {
 
   /// All element types found in the loop.
   SmallPtrSet<Type *, 16> ElementTypesInLoop;
+
+  /// Use SWAR vectorization mode.
+  const bool UseSWAR;
 };
 } // end namespace llvm
 
@@ -5071,9 +5081,11 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
     ElementCount MaxSafeVF, bool FoldTailByMasking) {
   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
-  const TypeSize WidestRegister = TTI.getRegisterBitWidth(
+  const TargetTransformInfo::RegisterKind RegKind =
       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
-                           : TargetTransformInfo::RGK_FixedWidthVector);
+      : UseSWAR            ? TargetTransformInfo::RGK_Scalar
+                           : TargetTransformInfo::RGK_FixedWidthVector;
+  const TypeSize WidestRegister = TTI.getRegisterBitWidth(RegKind);
 
   // Convenience function to return the minimum of two ElementCounts.
   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
@@ -5128,9 +5140,6 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
   }
 
-  TargetTransformInfo::RegisterKind RegKind =
-      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
-                           : TargetTransformInfo::RGK_FixedWidthVector;
   ElementCount MaxVF = MaxVectorElementCount;
   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
                             TTI.shouldMaximizeVectorBandwidth(RegKind))) {
@@ -6684,6 +6693,65 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
   return getWideningCost(I, VF);
 }
 
+InstructionCost
+LoopVectorizationCostModel::getSWARInstructionCost(Instruction *I,
+                                                   unsigned VF) {
+  uint64_t RegSize =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar).getFixedValue();
+  auto *RegType = IntegerType::get(I->getModule()->getContext(), RegSize);
+  auto GetMultiplier = [&](IntegerType *Ty) -> uint64_t {
+    return divideCeil(Ty->getBitWidth() * VF, RegSize);
+  };
+  if (isa<LoadInst, StoreInst>(I)) {
+    if (getWideningDecision(I, ElementCount::getFixed(VF)) !=
+        LoopVectorizationCostModel::CM_Widen)
+      return InstructionCost::getInvalid();
+    auto *ValTy = dyn_cast<IntegerType>(getLoadStoreType(I));
+    if (!ValTy)
+      return InstructionCost::getInvalid();
+    const auto &DL = I->getModule()->getDataLayout();
+    const Align Alignment = DL.getPrefTypeAlign(RegType);
+    unsigned AddressSpace =
+        getLoadStorePointerOperand(I)->getType()->getPointerAddressSpace();
+    return GetMultiplier(ValTy) * TTI.getMemoryOpCost(I->getOpcode(), RegType,
+                                                      Alignment, AddressSpace);
+  }
+  auto *ValTy = dyn_cast<IntegerType>(I->getType());
+  if (!ValTy)
+    return InstructionCost::getInvalid();
+  if (auto *PN = dyn_cast<PHINode>(I))
+    if (Legal->isReductionVariable(PN))
+      return TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
+  auto Multiplier = GetMultiplier(ValTy);
+  if (I->isBitwiseLogicOp())
+    return Multiplier * TTI.getArithmeticInstrCost(I->getOpcode(), RegType);
+  switch (I->getOpcode()) {
+  case Instruction::Shl:
+  case Instruction::LShr:
+    // Shl: (LHS << ShiftAmnt) & Mask
+    // LShr: (LHS >> ShiftAmnt) & Mask
+    if (!isa<ConstantInt>(I->getOperand(1)))
+      return InstructionCost::getInvalid();
+    return Multiplier * (TTI.getArithmeticInstrCost(I->getOpcode(), RegType) +
+                         TTI.getArithmeticInstrCost(Instruction::And, RegType));
+  case Instruction::Add:
+    // Add: ((LHS & ~Mask) + (RHS & ~Mask)) ^ ((LHS ^  RHS) & Mask)
+    return Multiplier *
+           (TTI.getArithmeticInstrCost(Instruction::Add, RegType) +
+            2 * TTI.getArithmeticInstrCost(Instruction::Xor, RegType) +
+            3 * TTI.getArithmeticInstrCost(Instruction::And, RegType));
+  case Instruction::Sub:
+    // Sub: ((LHS |  Mask) - (RHS & ~Mask)) ^ ((LHS ^ ~RHS) & Mask)
+    return Multiplier *
+           (TTI.getArithmeticInstrCost(Instruction::Sub, RegType) +
+            TTI.getArithmeticInstrCost(Instruction::Or, RegType) +
+            2 * TTI.getArithmeticInstrCost(Instruction::And, RegType) +
+            3 * TTI.getArithmeticInstrCost(Instruction::Xor, RegType));
+  default:
+    return InstructionCost::getInvalid();
+  }
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                ElementCount VF) {
@@ -6706,6 +6774,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
           false);
   }
 
+  if (UseSWAR && VF.isVector()) {
+    assert(!VF.isScalable() && "Scalable VF not supported");
+    if (!I->isTerminator())
+      return VectorizationCostTy(getSWARInstructionCost(I, VF.getFixedValue()),
+                                 true);
+  }
+
   Type *VectorTy;
   InstructionCost C = getInstructionCost(I, VF, VectorTy);
 
@@ -8208,6 +8283,23 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
   return BlockMaskCache[BB] = BlockMask;
 }
 
+VPRecipeBase *VPRecipeBuilder::tryToSWARMemory(Instruction *I,
+                                               ArrayRef<VPValue *> Operands,
+                                               VFRange &Range) {
+  if (Legal->isMaskRequired(I))
+    return nullptr;
+  if (CM.getWideningDecision(I, Range.Start) !=
+      LoopVectorizationCostModel::CM_Widen)
+    return nullptr;
+  if (!isa<IntegerType>(getLoadStoreType(I)))
+    return nullptr;
+  if (auto *LI = dyn_cast<LoadInst>(I))
+    return new VPSWARMemoryInstructionRecipe(*LI, Operands[0]);
+  if (auto *SI = dyn_cast<StoreInst>(I))
+    return new VPSWARMemoryInstructionRecipe(*SI, Operands[1], Operands[0]);
+  llvm_unreachable("Unhandled instruction!");
+}
+
 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                 ArrayRef<VPValue *> Operands,
                                                 VFRange &Range,
@@ -8474,6 +8566,25 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
                                                              Range);
 }
 
+VPRecipeBase *VPRecipeBuilder::tryToSWAR(Instruction *I,
+                                         ArrayRef<VPValue *> Operands) {
+  switch (I->getOpcode()) {
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Add:
+  case Instruction::Sub:
+    return new VPSWARRecipe(*I, make_range(Operands.begin(), Operands.end()));
+  case Instruction::Shl:
+  case Instruction::LShr:
+    if (!isa<ConstantInt>(I->getOperand(1)))
+      return nullptr;
+    return new VPSWARRecipe(*I, make_range(Operands.begin(), Operands.end()));
+  default:
+    return nullptr;
+  }
+}
+
 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
                                           ArrayRef<VPValue *> Operands,
                                           VPBasicBlock *VPBB, VPlanPtr &Plan) {
@@ -8656,7 +8767,9 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
 
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
-    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
+    return toVPRecipeResult(
+        CM.UseSWAR ? tryToSWARMemory(Instr, Operands, Range)
+                   : tryToWidenMemory(Instr, Operands, Range, Plan));
 
   if (!shouldWiden(Instr, Range))
     return nullptr;
@@ -8675,7 +8788,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
         new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI));
   }
 
-  return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
+  return toVPRecipeResult(CM.UseSWAR ? tryToSWAR(Instr, Operands)
+                                     : tryToWiden(Instr, Operands, VPBB, Plan));
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9117,7 +9231,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
                  "must be a select recipe");
           IndexOfFirstOperand = 1;
         } else {
-          assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
+          assert((MinVF.isScalar() || CM.UseSWAR ||
+                  isa<VPWidenRecipe>(CurrentLink)) &&
                  "Expected to replace a VPWidenSC");
           IndexOfFirstOperand = 0;
         }
@@ -9454,6 +9569,49 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
 }
 
+static Type *getSWARType(Type *ScalarTy, ElementCount VF) {
+  assert(isa<IntegerType>(ScalarTy));
+  unsigned ScalarBitWidth = cast<IntegerType>(ScalarTy)->getBitWidth();
+  assert(!VF.isScalable() && "Scalable VF not supported");
+  return IntegerType::get(ScalarTy->getContext(),
+                          ScalarBitWidth * VF.getFixedValue());
+}
+
+void VPSWARMemoryInstructionRecipe::execute(VPTransformState &State) {
+  auto VF = State.VF;
+  Value *Ptr = State.get(getAddr(), VPIteration(0, 0));
+  bool InBounds = false;
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+    InBounds = GEP->isInBounds();
+  Type *ScalarTy = getLoadStoreType(&Ingredient);
+  Type *SWARTy = getSWARType(ScalarTy, VF);
+  Type *VecTy = VectorType::get(ScalarTy, VF);
+  const auto &DL = Ingredient.getModule()->getDataLayout();
+  const Align Alignment = DL.getPrefTypeAlign(SWARTy);
+
+  auto &Builder = State.Builder;
+  State.setDebugLocFrom(Ingredient.getDebugLoc());
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    Value *GEP = Builder.CreateGEP(ScalarTy, Ptr,
+                                   Builder.getInt32(VF.getFixedValue() * Part),
+                                   Ptr->getName() + ".swar", InBounds);
+    Value *SWARPtr = Builder.CreateBitCast(
+        GEP, SWARTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()));
+    Instruction *Res = nullptr;
+    if (isa<LoadInst>(Ingredient)) {
+      Res = Builder.CreateAlignedLoad(SWARTy, SWARPtr, Alignment,
+                                      Ingredient.getName() + ".swar");
+      State.set(getVPSingleValue(), Builder.CreateBitCast(Res, VecTy), Part);
+    } else if (isa<StoreInst>(Ingredient))
+      Res = Builder.CreateAlignedStore(
+          Builder.CreateBitCast(State.get(getStoredValue(), Part), SWARTy),
+          SWARPtr, Alignment);
+    else
+      llvm_unreachable("Unhandled instruction!");
+    State.addMetadata(Res, &Ingredient);
+  }
+}
+
 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
 
@@ -9643,7 +9801,7 @@ static bool processLoopInVPlanNativePath(
     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
-    LoopVectorizationRequirements &Requirements) {
+    LoopVectorizationRequirements &Requirements, bool UseSWAR) {
 
   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
@@ -9657,7 +9815,7 @@ static bool processLoopInVPlanNativePath(
       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
 
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
-                                &Hints, IAI);
+                                &Hints, IAI, UseSWAR);
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
@@ -9841,6 +9999,45 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
   return true;
 }
 
+static const SCEVPredicate *getAlignPredicate(ScalarEvolution *SE,
+                                              const DataLayout &DL,
+                                              const SCEV *Start,
+                                              Align Alignment) {
+  Type *IntTy = DL.getIntPtrType(Start->getType());
+  const SCEV *Rem = SE->getURemExpr(SE->getPtrToIntExpr(Start, IntTy),
+                                    SE->getConstant(IntTy, Alignment.value()));
+  if (Rem->isZero())
+    return nullptr;
+  return SE->getEqualPredicate(Rem, SE->getZero(IntTy));
+}
+
+static void generateAlignChecks(PredicatedScalarEvolution &PSE,
+                                const VPlan &Plan, ElementCount VF) {
+  ScalarEvolution *SE = PSE.getSE();
+  const DataLayout &DL = SE->getDataLayout();
+  MapVector<const SCEV *, Align> Checks;
+  for (const auto *VPBlock : vp_depth_first_shallow(Plan.getEntry()))
+    for (const auto &Recipe : *VPBlock->getEntryBasicBlock()) {
+      auto *SWARRecipe = dyn_cast<VPSWARMemoryInstructionRecipe>(&Recipe);
+      if (!SWARRecipe)
+        continue;
+      auto &MemInst = SWARRecipe->getIngredient();
+      const SCEVAddRecExpr *PtrSCEV =
+          PSE.getAsAddRec(getLoadStorePointerOperand(&MemInst));
+      assert(PtrSCEV && "Consecutive Ptr expected");
+      const SCEV *Start = PtrSCEV->getStart();
+      Type *SWARTy = getSWARType(getLoadStoreType(&MemInst), VF);
+      Align Alignment = DL.getPrefTypeAlign(SWARTy);
+      if (auto It = Checks.find(Start); It != Checks.end())
+        It->second = std::max(It->second, Alignment);
+      else
+        Checks.insert({Start, Alignment});
+    }
+  for (auto [Start, Alignment] : Checks)
+    if (auto *Predicate = getAlignPredicate(SE, DL, Start, Alignment))
+      PSE.addPredicate(*Predicate);
+}
+
 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                                !EnableLoopInterleaving),
@@ -9905,9 +10102,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // even evaluating whether vectorization is profitable. Since we cannot modify
   // the incoming IR, we need to build VPlan upfront in the vectorization
   // pipeline.
+  bool UseSWAR =
+      !TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
+      EnableSWARVectorization;
   if (!L->isInnermost())
     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
-                                        ORE, BFI, PSI, Hints, Requirements);
+                                        ORE, BFI, PSI, Hints, Requirements,
+                                        UseSWAR);
 
   assert(L->isInnermost() && "Inner loop expected.");
 
@@ -10001,7 +10202,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Use the cost model.
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
-                                F, &Hints, IAI);
+                                F, &Hints, IAI, UseSWAR);
   // Use the planner for vectorization.
   LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
                                ORE);
@@ -10026,8 +10227,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     unsigned SelectedIC = std::max(IC, UserIC);
     //  Optimistically generate runtime checks if they are needed. Drop them if
     //  they turn out to not be profitable.
-    if (VF.Width.isVector() || SelectedIC > 1)
+    if (VF.Width.isVector() || SelectedIC > 1) {
+      if (UseSWAR)
+        generateAlignChecks(PSE, LVP.getBestPlanFor(VF.Width), VF.Width);
       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+    }
 
     // Check if it is profitable to vectorize with runtime checks.
     bool ForceVectorization =
@@ -10299,12 +10503,14 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
 
   // Don't attempt if
   // 1. the target claims to have no vector registers, and
-  // 2. interleaving won't help ILP.
+  // 2. SWAR vectorization is disabled, and
+  // 3. interleaving won't help ILP.
   //
-  // The second condition is necessary because, even if the target has no
+  // The last condition is necessary because, even if the target has no
   // vector registers, loop vectorization may still enable scalar
   // interleaving.
   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
+      !EnableSWARVectorization &&
       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
     return LoopVectorizeResult(false, false);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 7ff6749a09089e9..7369de4320cddd8 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -103,6 +103,10 @@ class VPRecipeBuilder {
   VPRecipeBase *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands,
                            VPBasicBlock *VPBB, VPlanPtr &Plan);
 
+  VPRecipeBase *tryToSWARMemory(Instruction *I, ArrayRef<VPValue *> Operands,
+                                VFRange &Range);
+  VPRecipeBase *tryToSWAR(Instruction *I, ArrayRef<VPValue *> Operands);
+
   /// Return a VPRecipeOrValueTy with VPRecipeBase * being set. This can be used to force the use as VPRecipeBase* for recipe sub-types that also inherit from VPValue.
   VPRecipeOrVPValueTy toVPRecipeResult(VPRecipeBase *R) const { return R; }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e65a7ab2cd028ee..4c29e843401e82c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1148,6 +1148,32 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
   }
 };
 
+// VPSWARRecipe is a recipe for producing a SIMD-within-a-register (SWAR)
+// operation for its ingredient. The operation works on values that are packed
+// into scalar registers. This recipe covers the following cases:
+// 1. Bitwise operations (and, or, xor)
+// 2. Shifts (shl, lshr) with constant second operand
+// 3. Add/Sub operations.
+class VPSWARRecipe : public VPRecipeBase, public VPValue {
+public:
+  template <typename IterT>
+  VPSWARRecipe(Instruction &I, iterator_range<IterT> Operands)
+      : VPRecipeBase(VPRecipeBase::VPSWARSC, Operands), VPValue(this, &I) {}
+
+  ~VPSWARRecipe() override = default;
+
+  VP_CLASSOF_IMPL(VPDef::VPSWARSC)
+
+  /// Generate the SWAR operation.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// VPWidenRecipe is a recipe for producing a copy of vector type its
 /// ingredient. This recipe covers most of the traditional vectorization cases
 /// where each ingredient transforms into a vectorized version of itself.
@@ -1929,6 +1955,52 @@ class VPPredInstPHIRecipe : public VPRecipeBase, public VPValue {
   }
 };
 
+// A recipe for SWAR (SIMD-within-a-register) load/store operations.
+class VPSWARMemoryInstructionRecipe : public VPRecipeBase {
+  Instruction &Ingredient;
+
+public:
+  VPSWARMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr)
+      : VPRecipeBase(VPSWARMemoryInstructionSC, {Addr}), Ingredient(Load) {
+    new VPValue(this, &Load);
+  }
+
+  VPSWARMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
+                                VPValue *StoredValue)
+      : VPRecipeBase(VPSWARMemoryInstructionSC, {Addr, StoredValue}),
+        Ingredient(Store) {}
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPDef *D) {
+    return D->getVPDefID() == VPRecipeBase::VPSWARMemoryInstructionSC;
+  }
+
+  /// Return the address accessed by this recipe.
+  VPValue *getAddr() const {
+    return getOperand(0); // Address is the 1st, mandatory operand.
+  }
+
+  /// Returns true if this recipe is a store.
+  bool isStore() const { return isa<StoreInst>(Ingredient); }
+
+  /// Return the value stored by this recipe.
+  VPValue *getStoredValue() const {
+    assert(isStore() && "Stored value only available for store instructions");
+    return getOperand(1); // Stored value is the 2nd, mandatory operand.
+  }
+
+  /// Generate the SWAR load/store.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  Instruction &getIngredient() const { return Ingredient; }
+};
+
 /// A Recipe for widening load/store operations.
 /// The recipe uses the following VPValues:
 /// - For load: Address, optional mask
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2a1213a98095907..767feef39de4cc9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -43,6 +43,9 @@ extern cl::opt<bool> EnableVPlanNativePath;
 
 bool VPRecipeBase::mayWriteToMemory() const {
   switch (getVPDefID()) {
+  case VPSWARMemoryInstructionSC: {
+    return cast<VPSWARMemoryInstructionRecipe>(this)->isStore();
+  }
   case VPWidenMemoryInstructionSC: {
     return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
   }
@@ -56,6 +59,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
     return false;
   case VPBlendSC:
   case VPReductionSC:
+  case VPSWARSC:
   case VPWidenCanonicalIVSC:
   case VPWidenCastSC:
   case VPWidenGEPSC:
@@ -77,6 +81,9 @@ bool VPRecipeBase::mayWriteToMemory() const {
 
 bool VPRecipeBase::mayReadFromMemory() const {
   switch (getVPDefID()) {
+  case VPSWARMemoryInstructionSC: {
+    return !cast<VPSWARMemoryInstructionRecipe>(this)->isStore();
+  }
   case VPWidenMemoryInstructionSC: {
     return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
   }
@@ -90,6 +97,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
     return false;
   case VPBlendSC:
   case VPReductionSC:
+  case VPSWARSC:
   case VPWidenCanonicalIVSC:
   case VPWidenCastSC:
   case VPWidenGEPSC:
@@ -130,6 +138,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
         ->mayHaveSideEffects();
   case VPBlendSC:
   case VPReductionSC:
+  case VPSWARSC:
   case VPScalarIVStepsSC:
   case VPWidenCanonicalIVSC:
   case VPWidenCastSC:
@@ -146,6 +155,13 @@ bool VPRecipeBase::mayHaveSideEffects() const {
            "underlying instruction has side-effects");
     return false;
   }
+  case VPSWARMemoryInstructionSC:
+    assert(cast<VPSWARMemoryInstructionRecipe>(this)
+                   ->getIngredient()
+                   .mayHaveSideEffects() == mayWriteToMemory() &&
+           "mayHaveSideffects result for ingredient differs from this "
+           "implementation");
+    return mayWriteToMemory();
   case VPWidenMemoryInstructionSC:
     assert(cast<VPWidenMemoryInstructionRecipe>(this)
                    ->getIngredient()
@@ -496,6 +512,112 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+static Value *SWARBinOp(IRBuilderBase &B, BinaryOperator *BinOp, Value *LHS,
+                        Value *RHS, unsigned ScalarBitWidth,
+                        unsigned SWARBitWidth) {
+  auto Opc = BinOp->getOpcode();
+  Twine Name = BinOp->getName() + ".swar";
+  switch (Opc) {
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return B.CreateBinOp(Opc, LHS, RHS, Name);
+  case Instruction::Shl:
+  case Instruction::LShr: {
+    // Shl: (LHS << ShiftAmnt) & Mask
+    // LShr: (LHS >> ShiftAmnt) & Mask
+    // Mask: splat of the scalar all-ones value shifted by ShiftAmnt, e.g. for
+    // bytewise operations:
+    // Shl by 2: 0xfcfcfcfcfcfcfcfc
+    // LShr by 3: 0x1f1f1f1f1f1f1f1f
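+    // Worked example (illustrative, not from the original patch): with i8
+    // lanes packed into an i64, LHS = 0x00ff00ff00ff00ff shifted left by 2
+    // gives 0x03fc03fc03fc03fc; the bits that crossed into the neighbouring
+    // lane are cleared by the mask 0xfcfcfcfcfcfcfcfc, yielding
+    // 0x00fc00fc00fc00fc, i.e. each byte shifted independently.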
+    assert(isa<ConstantExpr>(RHS) && "Shift amount must be a constant splat");
+    Value *ShiftAmntSplat = cast<ConstantExpr>(RHS)->getOperand(0);
+    assert(isa<Constant>(ShiftAmntSplat) && "Shift amount must be constant");
+    unsigned ShiftAmnt =
+        cast<Constant>(ShiftAmntSplat)->getUniqueInteger().getLimitedValue();
+    auto Mask = APInt::getAllOnes(ScalarBitWidth);
+    if (Opc == Instruction::Shl)
+      Mask = Mask.shl(ShiftAmnt);
+    else
+      Mask = Mask.lshr(ShiftAmnt);
+    Value *Res = Opc == Instruction::Shl ? B.CreateShl(LHS, ShiftAmnt, Name)
+                                         : B.CreateLShr(LHS, ShiftAmnt, Name);
+    return B.CreateAnd(Res, B.getInt(APInt::getSplat(SWARBitWidth, Mask)));
+  }
+  case Instruction::Add:
+  case Instruction::Sub: {
+    // Add: ((LHS & ~Mask) + (RHS & ~Mask)) ^ ((LHS ^  RHS) & Mask)
+    // Sub: ((LHS |  Mask) - (RHS & ~Mask)) ^ ((LHS ^ ~RHS) & Mask)
+    // Mask: MSB set in each element, e.g. for bytewise math in a 64-bit
+    // register:
+    // Mask = 0x8080808080808080
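+    // Worked example for Add (illustrative, not from the original patch):
+    // with i8 lanes, LHS byte 0xff plus RHS byte 0x01 should wrap to 0x00.
+    // (0xff & 0x7f) + (0x01 & 0x7f) = 0x80; (0xff ^ 0x01) & 0x80 = 0x80;
+    // 0x80 ^ 0x80 = 0x00. Clearing the MSBs keeps the low 7-bit adds from
+    // carrying into the next lane; the MSB is recombined with xor instead.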
+    auto MaskVal =
+        APInt::getSplat(SWARBitWidth, APInt::getSignMask(ScalarBitWidth));
+    Value *Mask = B.getInt(MaskVal);
+    Value *InvMask = B.CreateNot(Mask);
+    Value *Res = Opc == Instruction::Add
+                     ? B.CreateAdd(B.CreateAnd(LHS, InvMask),
+                                   B.CreateAnd(RHS, InvMask), Name)
+                     : B.CreateSub(B.CreateOr(LHS, Mask),
+                                   B.CreateAnd(RHS, InvMask), Name);
+    return B.CreateXor(
+        Res,
+        B.CreateAnd(
+            B.CreateXor(LHS, Opc == Instruction::Add ? RHS : B.CreateNot(RHS)),
+            Mask));
+  }
+  default:
+    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << *BinOp);
+    llvm_unreachable("Unhandled instruction!");
+  }
+}
+
+void VPSWARRecipe::execute(VPTransformState &State) {
+  assert(isa<BinaryOperator>(getUnderlyingValue()) &&
+         "Expected a binary operator as the underlying value");
+  auto *BinOp = cast<BinaryOperator>(getUnderlyingValue());
+  Type *ScalarTy = BinOp->getType();
+  assert(isa<IntegerType>(ScalarTy) && "SWAR recipe expects an integer type");
+  unsigned ScalarBitWidth = cast<IntegerType>(ScalarTy)->getBitWidth();
+  assert(!State.VF.isScalable() && "Scalable VF not supported");
+  unsigned VF = State.VF.getFixedValue();
+  Type *SWARTy = IntegerType::get(ScalarTy->getContext(), ScalarBitWidth * VF);
+  Type *VecTy = FixedVectorType::get(ScalarTy, VF);
+
+  auto &Builder = State.Builder;
+  State.setDebugLocFrom(BinOp->getDebugLoc());
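+  // For each unroll part, reinterpret the vector operands as one wide integer,
+  // emit the SWAR emulation, and bitcast the result back to vector form so
+  // downstream recipes keep seeing the expected vector type.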
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    Value *LHS = Builder.CreateBitCast(State.get(getOperand(0), Part), SWARTy);
+    Value *RHS = Builder.CreateBitCast(State.get(getOperand(1), Part), SWARTy);
+    Value *Res = SWARBinOp(Builder, BinOp, LHS, RHS, ScalarBitWidth,
+                           ScalarBitWidth * VF);
+    State.set(this, Builder.CreateBitCast(Res, VecTy), Part);
+    State.addMetadata(Res, BinOp);
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPSWARRecipe::print(raw_ostream &O, const Twine &Indent,
+                         VPSlotTracker &SlotTracker) const {
+  O << Indent << "SWAR ";
+  printAsOperand(O, SlotTracker);
+  const Instruction *UI = getUnderlyingInstr();
+  O << " = " << UI->getOpcodeName() << " ";
+  printOperands(O, SlotTracker);
+}
+
+void VPSWARMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
+                                          VPSlotTracker &SlotTracker) const {
+  O << Indent << "SWAR ";
+
+  if (!isStore()) {
+    getVPSingleValue()->printAsOperand(O, SlotTracker);
+    O << " = ";
+  }
+  O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
+
+  printOperands(O, SlotTracker);
+}
+#endif
+
 void VPWidenCallRecipe::execute(VPTransformState &State) {
   assert(State.VF.isVector() && "not widening");
   auto &CI = *cast<CallInst>(getUnderlyingInstr());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index ac110bb3b0ef9be..1a82dfcd40d5c1a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -50,6 +50,7 @@ class VPValue {
   friend class VPInterleavedAccessInfo;
   friend class VPSlotTracker;
   friend class VPRecipeBase;
+  friend class VPSWARMemoryInstructionRecipe;
   friend class VPWidenMemoryInstructionRecipe;
 
   const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
@@ -342,6 +343,8 @@ class VPDef {
     VPInterleaveSC,
     VPReductionSC,
     VPReplicateSC,
+    VPSWARSC,
+    VPSWARMemoryInstructionSC,
     VPScalarIVStepsSC,
     VPWidenCallSC,
     VPWidenCanonicalIVSC,
diff --git a/llvm/test/Transforms/LoopVectorize/swar-vectorization.ll b/llvm/test/Transforms/LoopVectorize/swar-vectorization.ll
index d3640dfe5d439d2..857e525b839c68a 100644
--- a/llvm/test/Transforms/LoopVectorize/swar-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/swar-vectorization.ll
@@ -1,29 +1,70 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -S 2>%t | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -enable-swar-vectorization -mtriple riscv64 -S 2>%t | FileCheck %s
 
 ; Tests for SWAR (SIMD within a register) vectorization
 
 define void @test1(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC4:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
+; CHECK-NEXT:    [[DST2:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT:    [[SRC1:%.*]] = ptrtoint ptr [[SRC]] to i64
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 24
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[SRC1]] to i3
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[DST2]] to i3
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[DST2]], [[SRC4]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], 8
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT:    [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[DOTSWAR5:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[DOTSWAR5]] to <8 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT:    [[DOTSWAR6:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to i64
+; CHECK-NEXT:    store i64 [[TMP10]], ptr [[DOTSWAR6]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i8 [[TMP0]], ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[TMP12]], ptr [[ARRAYIDX2]], align 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 entry:
   %cmp6 = icmp sgt i32 %N, 0
@@ -53,26 +94,83 @@ for.body:
 define void @test2(ptr writeonly %dst1, ptr writeonly %dst2, ptr readonly %src, i32 signext %N) {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC7:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
+; CHECK-NEXT:    [[DST16:%.*]] = ptrtoint ptr [[DST1:%.*]] to i64
+; CHECK-NEXT:    [[DST24:%.*]] = ptrtoint ptr [[DST2:%.*]] to i64
+; CHECK-NEXT:    [[DST12:%.*]] = ptrtoint ptr [[DST1]] to i64
+; CHECK-NEXT:    [[SRC1:%.*]] = ptrtoint ptr [[SRC]] to i64
 ; CHECK-NEXT:    [[CMP13:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP13]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[SRC1]] to i3
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[DST12]] to i3
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[DST24]] to i3
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i3 [[TMP4]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK5:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[IDENT_CHECK5]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i64 [[DST24]], [[DST16]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i64 [[DST16]], [[SRC7]]
+; CHECK-NEXT:    [[DIFF_CHECK8:%.*]] = icmp ult i64 [[TMP9]], 8
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = sub i64 [[DST24]], [[SRC7]]
+; CHECK-NEXT:    [[DIFF_CHECK9:%.*]] = icmp ult i64 [[TMP10]], 8
+; CHECK-NEXT:    [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK9]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX10]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP11]]
+; CHECK-NEXT:    [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    [[DOTSWAR11:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i64 [[DOTSWAR11]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[TMP11]]
+; CHECK-NEXT:    [[DOTSWAR12:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
+; CHECK-NEXT:    store i64 [[TMP15]], ptr [[DOTSWAR12]], align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[DST2]], i64 [[TMP11]]
+; CHECK-NEXT:    [[DOTSWAR13:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
+; CHECK-NEXT:    store i64 [[TMP17]], ptr [[DOTSWAR13]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i8 [[TMP0]], ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST2:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i8 [[TMP0]], ptr [[ARRAYIDX6]], align 1
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[TMP19]], ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST2]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[TMP19]], ptr [[ARRAYIDX6]], align 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ;
 entry:
   %cmp13 = icmp sgt i32 %N, 0
@@ -104,25 +202,69 @@ for.body:
 define void @test3(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC4:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
+; CHECK-NEXT:    [[DST2:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT:    [[SRC1:%.*]] = ptrtoint ptr [[SRC]] to i64
 ; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[SRC1]] to i3
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[DST2]] to i3
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[DST2]], [[SRC4]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], 8
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT:    [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[DOTSWAR5:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[DOTSWAR5]] to <8 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP8]] to i64
+; CHECK-NEXT:    [[DOTSWAR6:%.*]] = and i64 [[TMP9]], bitcast (<8 x i8> <i8 66, i8 66, i8 66, i8 66, i8 66, i8 66, i8 66, i8 66> to i64)
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[DOTSWAR6]] to <8 x i8>
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT:    [[DOTSWAR7:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP10]] to i64
+; CHECK-NEXT:    store i64 [[TMP12]], ptr [[DOTSWAR7]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[TMP0]], 66
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i8 [[TMP1]], ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = and i8 [[TMP14]], 66
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[TMP15]], ptr [[ARRAYIDX3]], align 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
 entry:
   %cmp7 = icmp sgt i32 %N, 0
@@ -153,25 +295,72 @@ for.body:
 define void @test4(ptr writeonly %dst, ptr readonly %src1, i8 zeroext %src2, i32 noundef signext %N) {
 ; CHECK-LABEL: @test4(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC14:%.*]] = ptrtoint ptr [[SRC1:%.*]] to i64
+; CHECK-NEXT:    [[DST2:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT:    [[SRC11:%.*]] = ptrtoint ptr [[SRC1]] to i64
 ; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[SRC11]] to i3
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[DST2]] to i3
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[DST2]], [[SRC14]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], 8
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[SRC2:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[TMP6]]
+; CHECK-NEXT:    [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[DOTSWAR5:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[DOTSWAR5]] to <8 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[BROADCAST_SPLAT]] to i64
+; CHECK-NEXT:    [[OR8_SWAR:%.*]] = or i64 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[OR8_SWAR]] to <8 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT:    [[DOTSWAR6:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[DOTSWAR6]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[OR8:%.*]] = or i8 [[TMP0]], [[SRC2:%.*]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[OR8:%.*]] = or i8 [[TMP15]], [[SRC2]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store i8 [[OR8]], ptr [[ARRAYIDX4]], align 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
@@ -202,27 +391,85 @@ for.body:
 define void @test5(ptr writeonly %dst, ptr readonly %src1, ptr readonly %src2, i32 signext %N) {
 ; CHECK-LABEL: @test5(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC27:%.*]] = ptrtoint ptr [[SRC2:%.*]] to i64
+; CHECK-NEXT:    [[SRC16:%.*]] = ptrtoint ptr [[SRC1:%.*]] to i64
+; CHECK-NEXT:    [[DST4:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT:    [[SRC22:%.*]] = ptrtoint ptr [[SRC2]] to i64
+; CHECK-NEXT:    [[SRC11:%.*]] = ptrtoint ptr [[SRC1]] to i64
 ; CHECK-NEXT:    [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP12]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 24
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[SRC11]] to i3
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[SRC22]] to i3
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[DST4]] to i3
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i3 [[TMP4]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK5:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[IDENT_CHECK5]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i64 [[DST4]], [[SRC16]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i64 [[DST4]], [[SRC27]]
+; CHECK-NEXT:    [[DIFF_CHECK8:%.*]] = icmp ult i64 [[TMP9]], 8
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK8]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[TMP10]]
+; CHECK-NEXT:    [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[DOTSWAR9:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i64 [[DOTSWAR9]] to <8 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i64 [[TMP10]]
+; CHECK-NEXT:    [[DOTSWAR10:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    [[DOTSWAR11:%.*]] = load i64, ptr [[DOTSWAR10]], align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[DOTSWAR11]] to <8 x i8>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to i64
+; CHECK-NEXT:    [[XOR11_SWAR:%.*]] = xor i64 [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i64 [[XOR11_SWAR]] to <8 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP10]]
+; CHECK-NEXT:    [[DOTSWAR12:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP17]] to i64
+; CHECK-NEXT:    store i64 [[TMP19]], ptr [[DOTSWAR12]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[XOR11:%.*]] = xor i8 [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[XOR11:%.*]] = xor i8 [[TMP22]], [[TMP21]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store i8 [[XOR11]], ptr [[ARRAYIDX6]], align 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ;
 entry:
   %cmp12 = icmp sgt i32 %N, 0
@@ -255,25 +502,70 @@ for.body:
 define void @test6(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC4:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
+; CHECK-NEXT:    [[DST2:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT:    [[SRC1:%.*]] = ptrtoint ptr [[SRC]] to i64
 ; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[SRC1]] to i3
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[DST2]] to i3
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[DST2]], [[SRC4]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], 8
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT:    [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[DOTSWAR5:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[DOTSWAR5]] to <8 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP8]] to i64
+; CHECK-NEXT:    [[SHL_SWAR:%.*]] = shl i64 [[TMP9]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = and i64 [[SHL_SWAR]], -72340172838076674
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT:    [[DOTSWAR6:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[DOTSWAR6]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[SHL:%.*]] = shl i8 [[TMP0]], 1
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[SHL:%.*]] = shl i8 [[TMP15]], 1
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store i8 [[SHL]], ptr [[ARRAYIDX3]], align 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ;
 entry:
   %cmp7 = icmp sgt i32 %N, 0
@@ -304,25 +596,70 @@ for.body:
 define void @test7(ptr writeonly %dst, ptr readonly %src, i32 signext %N) {
 ; CHECK-LABEL: @test7(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC4:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
+; CHECK-NEXT:    [[DST2:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT:    [[SRC1:%.*]] = ptrtoint ptr [[SRC]] to i64
 ; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[SRC1]] to i3
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[DST2]] to i3
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[DST2]], [[SRC4]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], 8
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT:    [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[DOTSWAR5:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[DOTSWAR5]] to <8 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP8]] to i64
+; CHECK-NEXT:    [[DOTSWAR6:%.*]] = lshr i64 [[TMP9]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = and i64 [[DOTSWAR6]], 4557430888798830399
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT:    [[DOTSWAR7:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[DOTSWAR7]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 [[TMP0]], 2
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i8 [[TMP1]], ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = lshr i8 [[TMP15]], 2
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[TMP16]], ptr [[ARRAYIDX3]], align 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ;
 entry:
   %cmp7 = icmp sgt i32 %N, 0
@@ -353,27 +690,90 @@ for.body:
 define void @test8(ptr writeonly %dst, ptr readonly %src1, ptr readonly %src2, i32 signext %N) {
 ; CHECK-LABEL: @test8(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC27:%.*]] = ptrtoint ptr [[SRC2:%.*]] to i64
+; CHECK-NEXT:    [[SRC16:%.*]] = ptrtoint ptr [[SRC1:%.*]] to i64
+; CHECK-NEXT:    [[DST4:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT:    [[SRC22:%.*]] = ptrtoint ptr [[SRC2]] to i64
+; CHECK-NEXT:    [[SRC11:%.*]] = ptrtoint ptr [[SRC1]] to i64
 ; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 24
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[SRC11]] to i3
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[SRC22]] to i3
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[DST4]] to i3
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i3 [[TMP4]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK5:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[IDENT_CHECK5]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i64 [[DST4]], [[SRC16]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i64 [[DST4]], [[SRC27]]
+; CHECK-NEXT:    [[DIFF_CHECK8:%.*]] = icmp ult i64 [[TMP9]], 8
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK8]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[TMP10]]
+; CHECK-NEXT:    [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[DOTSWAR9:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i64 [[DOTSWAR9]] to <8 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i64 [[TMP10]]
+; CHECK-NEXT:    [[DOTSWAR10:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    [[DOTSWAR11:%.*]] = load i64, ptr [[DOTSWAR10]], align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[DOTSWAR11]] to <8 x i8>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP17:%.*]] = and i64 [[TMP15]], 9187201950435737471
+; CHECK-NEXT:    [[TMP18:%.*]] = and i64 [[TMP16]], 9187201950435737471
+; CHECK-NEXT:    [[ADD_SWAR:%.*]] = add i64 [[TMP17]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[TMP20:%.*]] = and i64 [[TMP19]], -9187201950435737472
+; CHECK-NEXT:    [[TMP21:%.*]] = xor i64 [[ADD_SWAR]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i64 [[TMP21]] to <8 x i8>
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP10]]
+; CHECK-NEXT:    [[DOTSWAR12:%.*]] = getelementptr inbounds i8, ptr [[TMP23]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x i8> [[TMP22]] to i64
+; CHECK-NEXT:    store i64 [[TMP24]], ptr [[DOTSWAR12]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP26:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP27]], [[TMP26]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX6]], align 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ;
 entry:
   %cmp11 = icmp sgt i32 %N, 0
@@ -406,27 +806,91 @@ for.body:
 define void @test9(ptr writeonly %dst, ptr readonly %src1, ptr readonly %src2, i32 signext %N) {
 ; CHECK-LABEL: @test9(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC27:%.*]] = ptrtoint ptr [[SRC2:%.*]] to i64
+; CHECK-NEXT:    [[SRC16:%.*]] = ptrtoint ptr [[SRC1:%.*]] to i64
+; CHECK-NEXT:    [[DST4:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT:    [[SRC22:%.*]] = ptrtoint ptr [[SRC2]] to i64
+; CHECK-NEXT:    [[SRC11:%.*]] = ptrtoint ptr [[SRC1]] to i64
 ; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 24
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[SRC11]] to i3
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[SRC22]] to i3
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i3 [[TMP2]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK3:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[DST4]] to i3
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i3 [[TMP4]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK5:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[IDENT_CHECK5]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i64 [[DST4]], [[SRC16]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i64 [[DST4]], [[SRC27]]
+; CHECK-NEXT:    [[DIFF_CHECK8:%.*]] = icmp ult i64 [[TMP9]], 8
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK8]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[TMP10]]
+; CHECK-NEXT:    [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[DOTSWAR9:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i64 [[DOTSWAR9]] to <8 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i64 [[TMP10]]
+; CHECK-NEXT:    [[DOTSWAR10:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    [[DOTSWAR11:%.*]] = load i64, ptr [[DOTSWAR10]], align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[DOTSWAR11]] to <8 x i8>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[TMP17:%.*]] = or i64 [[TMP15]], -9187201950435737472
+; CHECK-NEXT:    [[TMP18:%.*]] = and i64 [[TMP16]], 9187201950435737471
+; CHECK-NEXT:    [[SUB_SWAR:%.*]] = sub i64 [[TMP17]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP16]], -1
+; CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP15]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = and i64 [[TMP20]], -9187201950435737472
+; CHECK-NEXT:    [[TMP22:%.*]] = xor i64 [[SUB_SWAR]], [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i64 [[TMP22]] to <8 x i8>
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP10]]
+; CHECK-NEXT:    [[DOTSWAR12:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 0
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i8> [[TMP23]] to i64
+; CHECK-NEXT:    store i64 [[TMP25]], ptr [[DOTSWAR12]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[TMP27]], [[TMP28]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store i8 [[SUB]], ptr [[ARRAYIDX6]], align 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ;
 entry:
   %cmp11 = icmp sgt i32 %N, 0
@@ -459,26 +923,60 @@ for.body:
 define zeroext i8 @test_reduction_or(ptr readonly %src, i32 signext %N) {
 ; CHECK-LABEL: @test_reduction_or(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC1:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
 ; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[SRC1]] to i3
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP2]]
+; CHECK-NEXT:    [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[DOTSWAR2:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64 [[DOTSWAR2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[VEC_PHI]] to i64
+; CHECK-NEXT:    [[OR_SWAR:%.*]] = or i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7]] = bitcast i64 [[OR_SWAR]] to <8 x i8>
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP9:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i8 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[OR_LCSSA:%.*]] = phi i8 [ [[OR:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[OR_LCSSA:%.*]] = phi i8 [ [[OR:%.*]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[OR_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[OR_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; CHECK-NEXT:    ret i8 [[RES_LCSSA]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[RES:%.*]] = phi i8 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[OR]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[OR]] = or i8 [[TMP0]], [[RES]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[OR]] = or i8 [[TMP10]], [[RES]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ;
 entry:
   %cmp7 = icmp sgt i32 %N, 0
@@ -510,26 +1008,60 @@ for.body:
 define zeroext i8 @test_reduction_add(ptr readonly %src, i32 signext %N) {
 ; CHECK-LABEL: @test_reduction_add(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC1:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
 ; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[SRC1]] to i3
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[TMP0]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP2]]
+; CHECK-NEXT:    [[DOTSWAR:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[DOTSWAR2:%.*]] = load i64, ptr [[DOTSWAR]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64 [[DOTSWAR2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[VEC_PHI]] to i64
+; CHECK-NEXT:    [[ADD_SWAR:%.*]] = or i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7]] = bitcast i64 [[ADD_SWAR]] to <8 x i8>
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP9:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i8 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i8 [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i8 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; CHECK-NEXT:    ret i8 [[RES_LCSSA]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[RES:%.*]] = phi i8 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[ADD]] = or i8 [[TMP0]], [[RES]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ADD]] = or i8 [[TMP10]], [[RES]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ;
 entry:
   %cmp7 = icmp sgt i32 %N, 0



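For reference, the masks 0x8080808080808080 and 0x7F7F7F7F7F7F7F7F that appear in the generated vector.body of the i8 subtraction test above are the classic carry-isolating byte-subtraction trick. The following plain C++ sketch is only an illustration of that trick (it is not part of the patch, and the helper name swar_sub_i8x8 is made up here); it performs the same three masked operations on eight i8 lanes packed into a u64 and cross-checks the result against scalar subtraction:

    #include <cstdint>
    #include <cstdio>

    // Byte-wise SWAR subtraction of eight i8 lanes packed in a uint64_t,
    // mirroring the or/and/sub/xor sequence emitted in the test above.
    static uint64_t swar_sub_i8x8(uint64_t x, uint64_t y) {
      const uint64_t H = 0x8080808080808080ULL; // sign bit of every byte lane
      // Force the sign bit of every x lane to 1 and clear it in every y lane,
      // so a borrow out of bit 6 is absorbed inside the lane and never
      // propagates into the next byte.
      uint64_t low = (x | H) - (y & ~H);
      // Restore the correct per-lane sign bits.
      return low ^ ((x ^ ~y) & H);
    }

    int main() {
      uint64_t x = 0x0102030405060708ULL, y = 0x0807060504030201ULL;
      uint64_t r = swar_sub_i8x8(x, y);
      // Cross-check each lane against ordinary i8 subtraction.
      for (int i = 0; i < 8; ++i) {
        uint8_t a = uint8_t(x >> (8 * i)), b = uint8_t(y >> (8 * i));
        if (uint8_t(r >> (8 * i)) != uint8_t(a - b))
          return 1;
      }
      std::puts("swar sub matches scalar sub");
      return 0;
    }

Any C++11 compiler will build this and print the success line; the vectorized loop in the test performs exactly the same or/and/sub/xor sequence on the i64 loaded from memory, which is why no vector registers are needed.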