[llvm] [LSR] Recognize vscale-relative immediates (PR #88124)

Graham Hunter via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 21 04:04:23 PDT 2024


https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/88124

From 3a04cf771108e270105dfc2a7b4b1d7dc7b078c1 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 14 May 2024 13:08:52 +0000
Subject: [PATCH 1/4] Test cases for vscale immediates

---
 .../AArch64/vscale-fixups.ll                  | 344 ++++++++++++++++++
 1 file changed, 344 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll

diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
new file mode 100644
index 0000000000000..367806ac438b0
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -0,0 +1,344 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc %s -o - | FileCheck %s --check-prefixes=COMMON,BASE
+;; Additional run lines to exercise lsr code paths which AArch64 normally wouldn't use.
+; RUN: llc %s -o - -lsr-preferred-addressing-mode=preindexed | FileCheck %s --check-prefixes=COMMON,PREINDEX
+; RUN: llc %s -o - -lsr-preferred-addressing-mode=postindexed | FileCheck %s --check-prefixes=COMMON,POSTINDEX
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @mulvl123_addressing(ptr %src, ptr %dst, i64 %count) #0 {
+; COMMON-LABEL: mulvl123_addressing:
+; COMMON:       // %bb.0: // %entry
+; COMMON-NEXT:    ptrue p0.b
+; COMMON-NEXT:    mov x8, xzr
+; COMMON-NEXT:  .LBB0_1: // %for.body
+; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; COMMON-NEXT:    ld1b { z1.b }, p0/z, [x0, #1, mul vl]
+; COMMON-NEXT:    ld1b { z2.b }, p0/z, [x0, #2, mul vl]
+; COMMON-NEXT:    ld1b { z3.b }, p0/z, [x0, #3, mul vl]
+; COMMON-NEXT:    addvl x0, x0, #5
+; COMMON-NEXT:    umax z0.b, p0/m, z0.b, z1.b
+; COMMON-NEXT:    movprfx z1, z2
+; COMMON-NEXT:    umax z1.b, p0/m, z1.b, z3.b
+; COMMON-NEXT:    umax z0.b, p0/m, z0.b, z1.b
+; COMMON-NEXT:    st1b { z0.b }, p0, [x1, x8]
+; COMMON-NEXT:    addvl x8, x8, #1
+; COMMON-NEXT:    cmp x8, x2
+; COMMON-NEXT:    b.lo .LBB0_1
+; COMMON-NEXT:  // %bb.2: // %for.exit
+; COMMON-NEXT:    ret
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %vscale, 4
+  %mul = shl nuw nsw i64 %vscale, 6
+  br label %for.body
+
+for.body:
+  %src.addr = phi ptr [ %src, %entry ], [ %src.addr.next, %for.body ]
+  %idx = phi i64 [ 0, %entry ], [ %idx.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src.addr, i64 %idx
+  %3 = load <vscale x 16 x i8>, ptr %arrayidx
+  %4 = getelementptr <vscale x 16 x i8>, ptr %arrayidx, i64 1
+  %5 = load <vscale x 16 x i8>, ptr %4
+  %6 = getelementptr <vscale x 16 x i8>, ptr %arrayidx, i64 2
+  %7 = load <vscale x 16 x i8>, ptr %6
+  %8 = getelementptr <vscale x 16 x i8>, ptr %arrayidx, i64 3
+  %9 = load <vscale x 16 x i8>, ptr %8
+  %10 = tail call <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8> %3, <vscale x 16 x i8> %5)
+  %11 = tail call <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8> %7, <vscale x 16 x i8> %9)
+  %12 = tail call <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8> %10, <vscale x 16 x i8> %11)
+  %src.addr.next = getelementptr inbounds i8, ptr %src.addr, i64 %mul
+  %arrayidx4 = getelementptr inbounds i8, ptr %dst, i64 %idx
+  store <vscale x 16 x i8> %12, ptr %arrayidx4
+  %idx.next = add i64 %idx, %2
+  %cmp = icmp ult i64 %idx.next, %count
+  br i1 %cmp, label %for.body, label %for.exit
+
+for.exit:
+  ret void
+}
+
+define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i64 %count) #0 {
+; COMMON-LABEL: many_mulvl1_addressing:
+; COMMON:       // %bb.0: // %entry
+; COMMON-NEXT:    mov x9, x1
+; COMMON-NEXT:    ptrue p0.b
+; COMMON-NEXT:    add x10, x0, x2
+; COMMON-NEXT:    inch x9
+; COMMON-NEXT:    ptrue p1.h
+; COMMON-NEXT:    mov x8, xzr
+; COMMON-NEXT:    addvl x11, x10, #1
+; COMMON-NEXT:    addvl x12, x0, #1
+; COMMON-NEXT:  .LBB1_1: // %for.body
+; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; COMMON-NEXT:    ld1b { z1.b }, p0/z, [x10, x8]
+; COMMON-NEXT:    subs x3, x3, #1
+; COMMON-NEXT:    ld1b { z2.b }, p0/z, [x12, x8]
+; COMMON-NEXT:    ld1b { z3.b }, p0/z, [x11, x8]
+; COMMON-NEXT:    add z0.b, z0.b, z1.b
+; COMMON-NEXT:    add z1.b, z2.b, z3.b
+; COMMON-NEXT:    st1b { z0.h }, p1, [x1, x8]
+; COMMON-NEXT:    st1b { z1.h }, p1, [x9, x8]
+; COMMON-NEXT:    addvl x8, x8, #2
+; COMMON-NEXT:    b.ne .LBB1_1
+; COMMON-NEXT:  // %bb.2: // %for.exit
+; COMMON-NEXT:    ret
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %mul = shl nuw nsw i64 %vscale, 5
+  br label %for.body
+
+for.body:
+  %src_row_addr = phi ptr [ %src_rows, %entry ], [ %add_ptr_src, %for.body ]
+  %dst_row_addr = phi ptr [ %dst_rows, %entry ], [ %add_ptr_dst, %for.body ]
+  %idx = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %2 = load <vscale x 16 x i8>, ptr %src_row_addr
+  %3 = getelementptr <vscale x 16 x i8>, ptr %src_row_addr, i64 1
+  %4 = load <vscale x 16 x i8>, ptr %3
+  %arrayidx2 = getelementptr inbounds i8, ptr %src_row_addr, i64 %stride
+  %5 = load <vscale x 16 x i8>, ptr %arrayidx2
+  %6 = getelementptr <vscale x 16 x i8>, ptr %arrayidx2, i64 1
+  %7 = load <vscale x 16 x i8>, ptr %6
+  %8 = add <vscale x 16 x i8> %2, %5
+  %9 = add <vscale x 16 x i8> %4, %7
+  %10 = bitcast <vscale x 16 x i8> %8 to <vscale x 8 x i16>
+  %11 = trunc <vscale x 8 x i16> %10 to <vscale x 8 x i8>
+  store <vscale x 8 x i8> %11, ptr %dst_row_addr
+  %12 = bitcast <vscale x 16 x i8> %9 to <vscale x 8 x i16>
+  %13 = getelementptr <vscale x 8 x i8>, ptr %dst_row_addr, i64 1
+  %14 = trunc <vscale x 8 x i16> %12 to <vscale x 8 x i8>
+  store <vscale x 8 x i8> %14, ptr %13
+  %add_ptr_src = getelementptr inbounds i8, ptr %src_row_addr, i64 %mul
+  %add_ptr_dst = getelementptr inbounds i8, ptr %dst_row_addr, i64 %mul
+  %inc = add nuw i64 %idx, 1
+  %exitcond = icmp eq i64 %inc, %count
+  br i1 %exitcond, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+}
+
+define void @fixed_iv_scalable_offset(ptr %src, ptr %dst, i64 %count) #0 {
+; BASE-LABEL: fixed_iv_scalable_offset:
+; BASE:       // %bb.0: // %entry
+; BASE-NEXT:    ptrue p0.s
+; BASE-NEXT:  .LBB2_1: // %for.body
+; BASE-NEXT:    // =>This Inner Loop Header: Depth=1
+; BASE-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; BASE-NEXT:    ld1w { z1.s }, p0/z, [x0, #4, mul vl]
+; BASE-NEXT:    subs x2, x2, #4
+; BASE-NEXT:    add x0, x0, #16
+; BASE-NEXT:    add z0.s, z0.s, z1.s
+; BASE-NEXT:    st1w { z0.s }, p0, [x1]
+; BASE-NEXT:    add x1, x1, #16
+; BASE-NEXT:    b.ne .LBB2_1
+; BASE-NEXT:  // %bb.2: // %for.exit
+; BASE-NEXT:    ret
+;
+; PREINDEX-LABEL: fixed_iv_scalable_offset:
+; PREINDEX:       // %bb.0: // %entry
+; PREINDEX-NEXT:    ptrue p0.s
+; PREINDEX-NEXT:  .LBB2_1: // %for.body
+; PREINDEX-NEXT:    // =>This Inner Loop Header: Depth=1
+; PREINDEX-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; PREINDEX-NEXT:    ld1w { z1.s }, p0/z, [x0, #4, mul vl]
+; PREINDEX-NEXT:    subs x2, x2, #4
+; PREINDEX-NEXT:    add x0, x0, #16
+; PREINDEX-NEXT:    add z0.s, z0.s, z1.s
+; PREINDEX-NEXT:    st1w { z0.s }, p0, [x1]
+; PREINDEX-NEXT:    add x1, x1, #16
+; PREINDEX-NEXT:    b.ne .LBB2_1
+; PREINDEX-NEXT:  // %bb.2: // %for.exit
+; PREINDEX-NEXT:    ret
+;
+; POSTINDEX-LABEL: fixed_iv_scalable_offset:
+; POSTINDEX:       // %bb.0: // %entry
+; POSTINDEX-NEXT:    ptrue p0.s
+; POSTINDEX-NEXT:    addvl x8, x0, #4
+; POSTINDEX-NEXT:  .LBB2_1: // %for.body
+; POSTINDEX-NEXT:    // =>This Inner Loop Header: Depth=1
+; POSTINDEX-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; POSTINDEX-NEXT:    ld1w { z1.s }, p0/z, [x8]
+; POSTINDEX-NEXT:    subs x2, x2, #4
+; POSTINDEX-NEXT:    add x8, x8, #16
+; POSTINDEX-NEXT:    add x0, x0, #16
+; POSTINDEX-NEXT:    add z0.s, z0.s, z1.s
+; POSTINDEX-NEXT:    st1w { z0.s }, p0, [x1]
+; POSTINDEX-NEXT:    add x1, x1, #16
+; POSTINDEX-NEXT:    b.ne .LBB2_1
+; POSTINDEX-NEXT:  // %bb.2: // %for.exit
+; POSTINDEX-NEXT:    ret
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %mul = shl nuw nsw i64 %vscale, 4
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %src.ptr = getelementptr inbounds i32, ptr %src, i64 %iv
+  %data = load <vscale x 4 x i32>, ptr %src.ptr
+  %src.ptr.offset = getelementptr inbounds i32, ptr %src.ptr, i64 %mul
+  %data2 = load <vscale x 4 x i32>, ptr %src.ptr.offset
+  %add = add <vscale x 4 x i32> %data, %data2
+  %dst.ptr = getelementptr i32, ptr %dst, i64 %iv
+  store <vscale x 4 x i32> %add, ptr %dst.ptr
+  %inc = add nuw i64 %iv, 4
+  %exit.cond = icmp eq i64 %inc, %count
+  br i1 %exit.cond, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+}
+
+define void @mixed_offsets_scalable_then_fixed(ptr %src, ptr %dst, i64 %count) #0 {
+; COMMON-LABEL: mixed_offsets_scalable_then_fixed:
+; COMMON:       // %bb.0: // %entry
+; COMMON-NEXT:    ptrue p0.s
+; COMMON-NEXT:    mov x8, xzr
+; COMMON-NEXT:    addvl x9, x0, #4
+; COMMON-NEXT:    mov x10, #8 // =0x8
+; COMMON-NEXT:  .LBB3_1: // %for.body
+; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; COMMON-NEXT:    ld1w { z1.s }, p0/z, [x9, x8, lsl #2]
+; COMMON-NEXT:    add x11, x9, x8, lsl #2
+; COMMON-NEXT:    ld1w { z2.s }, p0/z, [x11, x10, lsl #2]
+; COMMON-NEXT:    add z0.s, z0.s, z1.s
+; COMMON-NEXT:    add z0.s, z0.s, z2.s
+; COMMON-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; COMMON-NEXT:    incw x8
+; COMMON-NEXT:    cmp x2, x8
+; COMMON-NEXT:    b.ne .LBB3_1
+; COMMON-NEXT:  // %bb.2: // %for.exit
+; COMMON-NEXT:    ret
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %mul = shl nuw nsw i64 %vscale, 4
+  %vl = shl nuw nsw i64 %vscale, 2
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %src.ptr = getelementptr inbounds i32, ptr %src, i64 %iv
+  %data = load <vscale x 4 x i32>, ptr %src.ptr
+  %src.ptr.sc_off = getelementptr inbounds i32, ptr %src.ptr, i64 %mul
+  %data2 = load <vscale x 4 x i32>, ptr %src.ptr.sc_off
+  %src.ptr.fx_off = getelementptr inbounds i32, ptr %src.ptr.sc_off, i64 8
+  %data3 = load <vscale x 4 x i32>, ptr %src.ptr.fx_off
+  %add = add <vscale x 4 x i32> %data, %data2
+  %add2 = add <vscale x 4 x i32> %add, %data3
+  %dst.ptr = getelementptr i32, ptr %dst, i64 %iv
+  store <vscale x 4 x i32> %add2, ptr %dst.ptr
+  %inc = add nuw i64 %iv, %vl
+  %exit.cond = icmp eq i64 %inc, %count
+  br i1 %exit.cond, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+}
+
+define void @mixed_offsets_fixed_then_scalable(ptr %src, ptr %dst, i64 %count) #0 {
+; COMMON-LABEL: mixed_offsets_fixed_then_scalable:
+; COMMON:       // %bb.0: // %entry
+; COMMON-NEXT:    addvl x9, x0, #4
+; COMMON-NEXT:    ptrue p0.s
+; COMMON-NEXT:    mov x8, xzr
+; COMMON-NEXT:    add x9, x9, #32
+; COMMON-NEXT:    mov x10, #8 // =0x8
+; COMMON-NEXT:  .LBB4_1: // %for.body
+; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT:    add x11, x0, x8, lsl #2
+; COMMON-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; COMMON-NEXT:    ld1w { z2.s }, p0/z, [x9, x8, lsl #2]
+; COMMON-NEXT:    ld1w { z1.s }, p0/z, [x11, x10, lsl #2]
+; COMMON-NEXT:    add z0.s, z0.s, z1.s
+; COMMON-NEXT:    add z0.s, z0.s, z2.s
+; COMMON-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; COMMON-NEXT:    incw x8
+; COMMON-NEXT:    cmp x2, x8
+; COMMON-NEXT:    b.ne .LBB4_1
+; COMMON-NEXT:  // %bb.2: // %for.exit
+; COMMON-NEXT:    ret
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %mul = shl nuw nsw i64 %vscale, 4
+  %vl = shl nuw nsw i64 %vscale, 2
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %src.ptr = getelementptr inbounds i32, ptr %src, i64 %iv
+  %data = load <vscale x 4 x i32>, ptr %src.ptr
+  %src.ptr.fx_off = getelementptr inbounds i32, ptr %src.ptr, i64 8
+  %data2 = load <vscale x 4 x i32>, ptr %src.ptr.fx_off
+  %src.ptr.sc_off = getelementptr inbounds i32, ptr %src.ptr.fx_off, i64 %mul
+  %data3 = load <vscale x 4 x i32>, ptr %src.ptr.sc_off
+  %add = add <vscale x 4 x i32> %data, %data2
+  %add2 = add <vscale x 4 x i32> %add, %data3
+  %dst.ptr = getelementptr i32, ptr %dst, i64 %iv
+  store <vscale x 4 x i32> %add2, ptr %dst.ptr
+  %inc = add nuw i64 %iv, %vl
+  %exit.cond = icmp eq i64 %inc, %count
+  br i1 %exit.cond, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+}
+
+;; FIXME: There's an opportunity here (that we currently miss) to define the phi
+;; on the middle access, and have negative and positive scalable immediates.
+;;
+;; Currently we generate a scalable offset for the load in range of the base,
+;; and a register to store the offset for the access that's out of range of the
+;; base (but in range of the other).
+;;
+define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 {
+; COMMON-LABEL: three_access_wide_gap:
+; COMMON:       // %bb.0: // %entry
+; COMMON-NEXT:    ptrue p0.s
+; COMMON-NEXT:    mov x8, xzr
+; COMMON-NEXT:    addvl x9, x0, #8
+; COMMON-NEXT:    addvl x10, x0, #4
+; COMMON-NEXT:  .LBB5_1: // %for.body
+; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; COMMON-NEXT:    ld1w { z1.s }, p0/z, [x10, x8, lsl #2]
+; COMMON-NEXT:    ld1w { z2.s }, p0/z, [x9, x8, lsl #2]
+; COMMON-NEXT:    add z0.s, z0.s, z1.s
+; COMMON-NEXT:    add z0.s, z0.s, z2.s
+; COMMON-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; COMMON-NEXT:    incw x8
+; COMMON-NEXT:    cmp x2, x8
+; COMMON-NEXT:    b.ne .LBB5_1
+; COMMON-NEXT:  // %bb.2: // %for.exit
+; COMMON-NEXT:    ret
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %mul = mul nuw nsw i64 %vscale, 16
+  %mul2 = mul nuw nsw i64 %vscale, 16
+  %vl = mul nuw nsw i64 %vscale, 4
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %src.ptr = getelementptr inbounds i32, ptr %src, i64 %iv
+  %data = load <vscale x 4 x i32>, ptr %src.ptr
+  %src.ptr.sc_off = getelementptr inbounds i32, ptr %src.ptr, i64 %mul
+  %data2 = load <vscale x 4 x i32>, ptr %src.ptr.sc_off
+  %src.ptr.sc_off2 = getelementptr inbounds i32, ptr %src.ptr.sc_off, i64 %mul2
+  %data3 = load <vscale x 4 x i32>, ptr %src.ptr.sc_off2
+  %add = add <vscale x 4 x i32> %data, %data2
+  %add2 = add <vscale x 4 x i32> %add, %data3
+  %dst.ptr = getelementptr i32, ptr %dst, i64 %iv
+  store <vscale x 4 x i32> %add2, ptr %dst.ptr
+  %inc = add nuw i64 %iv, %vl
+  %exit.cond = icmp eq i64 %inc, %count
+  br i1 %exit.cond, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve2" vscale_range(1,16) }

From 557a0970eef95b33e30264433bb5f3350f1fdf70 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 15 May 2024 11:06:24 +0000
Subject: [PATCH 2/4] Convert LSR to use possibly-scalable Immediate type

---
 .../Transforms/Scalar/LoopStrengthReduce.cpp  | 413 +++++++++++-------
 1 file changed, 257 insertions(+), 156 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 3a98e257367b2..0b77b7b5a62de 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -247,6 +247,68 @@ class RegSortData {
   void dump() const;
 };
 
+// An offset from an address that is either scalable or fixed. Used for
+// per-target optimizations of addressing modes.
+class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
+  constexpr Immediate(ScalarTy MinVal, bool Scalable)
+      : FixedOrScalableQuantity(MinVal, Scalable) {}
+
+  constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
+      : FixedOrScalableQuantity(V) {}
+
+public:
+  constexpr Immediate() : FixedOrScalableQuantity() {}
+
+  static constexpr Immediate getFixed(ScalarTy MinVal) {
+    return Immediate(MinVal, false);
+  }
+  static constexpr Immediate getScalable(ScalarTy MinVal) {
+    return Immediate(MinVal, true);
+  }
+  static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
+    return Immediate(MinVal, Scalable);
+  }
+
+  constexpr bool isLessThanZero() const { return Quantity < 0; }
+
+  constexpr bool isGreaterThanZero() const { return Quantity > 0; }
+
+  constexpr bool isMin() const {
+    return Quantity == std::numeric_limits<ScalarTy>::min();
+  }
+
+  constexpr bool isMax() const {
+    return Quantity == std::numeric_limits<ScalarTy>::max();
+  }
+};
+
+// This is needed for the Compare type of std::map when Immediate is used
+// as a key. We don't need it to be fully correct against any value of vscale,
+// just to make sure that vscale-related terms in the map are considered against
+// each other rather than being mixed up and potentially missing opportunities.
+struct KeyOrderTargetImmediate {
+  bool operator()(const Immediate &LHS, const Immediate &RHS) const {
+    if (LHS.isScalable() && !RHS.isScalable())
+      return false;
+    if (!LHS.isScalable() && RHS.isScalable())
+      return true;
+    return LHS.getKnownMinValue() < RHS.getKnownMinValue();
+  }
+};
+
+// This would be nicer if we could be generic instead of directly using size_t,
+// but there doesn't seem to be a type trait for is_orderable or
+// is_lessthan_comparable or similar.
+struct KeyOrderSizeTAndImmediate {
+  bool operator()(const std::pair<size_t, Immediate> &LHS,
+                  const std::pair<size_t, Immediate> &RHS) const {
+    size_t LSize = LHS.first;
+    size_t RSize = RHS.first;
+    if (LSize != RSize)
+      return LSize < RSize;
+    return KeyOrderTargetImmediate()(LHS.second, RHS.second);
+  }
+};
 } // end anonymous namespace
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
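
(A standalone aside, not part of the patch: the comparator above only needs to keep fixed and scalable offsets from interleaving when Immediate is used as a map key. The sketch below uses SimpleImmediate and KeyOrder as simplified, invented stand-ins for the patch's Immediate and KeyOrderTargetImmediate, just to show the ordering it produces.)

#include <cstdint>
#include <iostream>
#include <map>

// Simplified stand-in for the patch's Immediate type.
struct SimpleImmediate {
  int64_t MinVal; // known-minimum value of the offset
  bool Scalable;  // true if the real offset is MinVal * vscale
};

// Same ordering rule as KeyOrderTargetImmediate: all fixed offsets sort
// before all scalable offsets, and each group is ordered by MinVal.
struct KeyOrder {
  bool operator()(const SimpleImmediate &LHS, const SimpleImmediate &RHS) const {
    if (LHS.Scalable != RHS.Scalable)
      return !LHS.Scalable;
    return LHS.MinVal < RHS.MinVal;
  }
};

int main() {
  std::map<SimpleImmediate, const char *, KeyOrder> Offsets;
  Offsets[SimpleImmediate{16, /*Scalable=*/true}] = "16 * vscale";
  Offsets[SimpleImmediate{-8, /*Scalable=*/false}] = "-8";
  Offsets[SimpleImmediate{4, /*Scalable=*/true}] = "4 * vscale";
  Offsets[SimpleImmediate{32, /*Scalable=*/false}] = "32";

  // Prints -8, 32, 4 * vscale, 16 * vscale: fixed and scalable terms are
  // never interleaved, so related offsets stay adjacent in the map.
  for (const auto &KV : Offsets)
    std::cout << KV.second << "\n";
  return 0;
}

With that ordering, vscale-relative terms end up next to each other in the map rather than mixed in with fixed offsets, which is all the comment in the patch asks for.
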
@@ -357,7 +419,7 @@ struct Formula {
   GlobalValue *BaseGV = nullptr;
 
   /// Base offset for complex addressing.
-  int64_t BaseOffset = 0;
+  Immediate BaseOffset;
 
   /// Whether any complex addressing has a base register.
   bool HasBaseReg = false;
@@ -388,7 +450,7 @@ struct Formula {
   /// An additional constant offset which is added near the use. This requires
   /// a temporary register, but the offset itself can live in an add immediate
   /// field rather than a register.
-  int64_t UnfoldedOffset = 0;
+  Immediate UnfoldedOffset;
 
   Formula() = default;
 
@@ -628,7 +690,7 @@ void Formula::print(raw_ostream &OS) const {
     if (!First) OS << " + "; else First = false;
     BaseGV->printAsOperand(OS, /*PrintType=*/false);
   }
-  if (BaseOffset != 0) {
+  if (BaseOffset.isNonZero()) {
     if (!First) OS << " + "; else First = false;
     OS << BaseOffset;
   }
@@ -652,7 +714,7 @@ void Formula::print(raw_ostream &OS) const {
       OS << "<unknown>";
     OS << ')';
   }
-  if (UnfoldedOffset != 0) {
+  if (UnfoldedOffset.isNonZero()) {
     if (!First) OS << " + ";
     OS << "imm(" << UnfoldedOffset << ')';
   }
@@ -798,28 +860,28 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
 
 /// If S involves the addition of a constant integer value, return that integer
 /// value, and mutate S to point to a new SCEV with that value excluded.
-static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
+static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
     if (C->getAPInt().getSignificantBits() <= 64) {
       S = SE.getConstant(C->getType(), 0);
-      return C->getValue()->getSExtValue();
+      return Immediate::getFixed(C->getValue()->getSExtValue());
     }
   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
     SmallVector<const SCEV *, 8> NewOps(Add->operands());
-    int64_t Result = ExtractImmediate(NewOps.front(), SE);
-    if (Result != 0)
+    Immediate Result = ExtractImmediate(NewOps.front(), SE);
+    if (Result.isNonZero())
       S = SE.getAddExpr(NewOps);
     return Result;
   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
     SmallVector<const SCEV *, 8> NewOps(AR->operands());
-    int64_t Result = ExtractImmediate(NewOps.front(), SE);
-    if (Result != 0)
+    Immediate Result = ExtractImmediate(NewOps.front(), SE);
+    if (Result.isNonZero())
       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                            SCEV::FlagAnyWrap);
     return Result;
   }
-  return 0;
+  return Immediate();
 }
 
 /// If S involves the addition of a GlobalValue address, return that symbol, and
@@ -1134,7 +1196,7 @@ struct LSRFixup {
   /// A constant offset to be added to the LSRUse expression.  This allows
   /// multiple fixups to share the same LSRUse with different offsets, for
   /// example in an unrolled loop.
-  int64_t Offset = 0;
+  Immediate Offset;
 
   LSRFixup() = default;
 
@@ -1197,8 +1259,10 @@ class LSRUse {
   SmallVector<LSRFixup, 8> Fixups;
 
   /// Keep track of the min and max offsets of the fixups.
-  int64_t MinOffset = std::numeric_limits<int64_t>::max();
-  int64_t MaxOffset = std::numeric_limits<int64_t>::min();
+  Immediate MinOffset =
+      Immediate::getFixed(std::numeric_limits<int64_t>::max());
+  Immediate MaxOffset =
+      Immediate::getFixed(std::numeric_limits<int64_t>::min());
 
   /// This records whether all of the fixups using this LSRUse are outside of
   /// the loop, in which case some special-case heuristics may be used.
@@ -1234,9 +1298,9 @@ class LSRUse {
 
   void pushFixup(LSRFixup &f) {
     Fixups.push_back(f);
-    if (f.Offset > MaxOffset)
+    if (Immediate::isKnownGT(f.Offset, MaxOffset))
       MaxOffset = f.Offset;
-    if (f.Offset < MinOffset)
+    if (Immediate::isKnownLT(f.Offset, MinOffset))
       MinOffset = f.Offset;
   }
 
@@ -1254,7 +1318,7 @@ class LSRUse {
 
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
-                                 GlobalValue *BaseGV, int64_t BaseOffset,
+                                 GlobalValue *BaseGV, Immediate BaseOffset,
                                  bool HasBaseReg, int64_t Scale,
                                  Instruction *Fixup = nullptr,
                                  int64_t ScalableOffset = 0);
@@ -1311,7 +1375,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
       // addressing.
       if (AMK == TTI::AMK_PreIndexed) {
         if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
-          if (Step->getAPInt() == F.BaseOffset)
+          if (Step->getAPInt() == F.BaseOffset.getFixedValue())
             LoopCost = 0;
       } else if (AMK == TTI::AMK_PostIndexed) {
         const SCEV *LoopStep = AR->getStepRecurrence(*SE);
@@ -1402,24 +1466,25 @@ void Cost::RateFormula(const Formula &F,
     // allows to fold 2 registers.
     C.NumBaseAdds +=
         NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
-  C.NumBaseAdds += (F.UnfoldedOffset != 0);
+  C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
 
   // Accumulate non-free scaling amounts.
   C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue();
 
   // Tally up the non-zero immediates.
   for (const LSRFixup &Fixup : LU.Fixups) {
-    int64_t O = Fixup.Offset;
-    int64_t Offset = (uint64_t)O + F.BaseOffset;
+    Immediate O = Fixup.Offset;
+    Immediate Offset = Immediate::getFixed((uint64_t)O.getFixedValue() +
+                                           F.BaseOffset.getFixedValue());
     if (F.BaseGV)
       C.ImmCost += 64; // Handle symbolic values conservatively.
                      // TODO: This should probably be the pointer size.
-    else if (Offset != 0)
-      C.ImmCost += APInt(64, Offset, true).getSignificantBits();
+    else if (Offset.isNonZero())
+      C.ImmCost += APInt(64, Offset.getFixedValue(), true).getSignificantBits();
 
     // Check with target if this offset with this instruction is
     // specifically not supported.
-    if (LU.Kind == LSRUse::Address && Offset != 0 &&
+    if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
         !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
                               Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
       C.NumBaseAdds++;
@@ -1547,7 +1612,7 @@ void LSRFixup::print(raw_ostream &OS) const {
     PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
   }
 
-  if (Offset != 0)
+  if (Offset.isNonZero())
     OS << ", Offset=" << Offset;
 }
 
@@ -1674,15 +1739,16 @@ LLVM_DUMP_METHOD void LSRUse::dump() const {
 
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
-                                 GlobalValue *BaseGV, int64_t BaseOffset,
+                                 GlobalValue *BaseGV, Immediate BaseOffset,
                                  bool HasBaseReg, int64_t Scale,
                                  Instruction *Fixup /* = nullptr */,
                                  int64_t ScalableOffset) {
   switch (Kind) {
   case LSRUse::Address:
-    return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
-                                     HasBaseReg, Scale, AccessTy.AddrSpace,
-                                     Fixup, ScalableOffset);
+    return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV,
+                                     BaseOffset.getFixedValue(), HasBaseReg,
+                                     Scale, AccessTy.AddrSpace, Fixup,
+                                     ScalableOffset);
 
   case LSRUse::ICmpZero:
     // There's not even a target hook for querying whether it would be legal to
@@ -1691,7 +1757,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
       return false;
 
     // ICmp only has two operands; don't allow more than two non-trivial parts.
-    if (Scale != 0 && HasBaseReg && BaseOffset != 0)
+    if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
       return false;
 
     // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
@@ -1701,7 +1767,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 
     // If we have low-level target information, ask the target if it can fold an
     // integer immediate on an icmp.
-    if (BaseOffset != 0) {
+    if (BaseOffset.isNonZero()) {
       // We have one of:
       // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
       // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
@@ -1709,8 +1775,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
       if (Scale == 0)
         // The cast does the right thing with
         // std::numeric_limits<int64_t>::min().
-        BaseOffset = -(uint64_t)BaseOffset;
-      return TTI.isLegalICmpImmediate(BaseOffset);
+        BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
+      return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
     }
 
     // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
@@ -1718,31 +1784,34 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 
   case LSRUse::Basic:
     // Only handle single-register values.
-    return !BaseGV && Scale == 0 && BaseOffset == 0 && ScalableOffset == 0;
+    return !BaseGV && Scale == 0 && BaseOffset.isZero();
 
   case LSRUse::Special:
     // Special case Basic to handle -1 scales.
-    return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0 &&
-           ScalableOffset == 0;
+    return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
   }
 
   llvm_unreachable("Invalid LSRUse Kind!");
 }
 
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
-                                 int64_t MinOffset, int64_t MaxOffset,
+                                 Immediate MinOffset, Immediate MaxOffset,
                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
-                                 GlobalValue *BaseGV, int64_t BaseOffset,
+                                 GlobalValue *BaseGV, Immediate BaseOffset,
                                  bool HasBaseReg, int64_t Scale) {
+  int64_t Base = BaseOffset.getFixedValue();
+  int64_t Min = MinOffset.getFixedValue();
+  int64_t Max = MaxOffset.getFixedValue();
   // Check for overflow.
-  if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
-      (MinOffset > 0))
+  if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
     return false;
-  MinOffset = (uint64_t)BaseOffset + MinOffset;
-  if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
-      (MaxOffset > 0))
+  Min = (uint64_t)Base + Min;
+  if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
     return false;
-  MaxOffset = (uint64_t)BaseOffset + MaxOffset;
+  Max = (uint64_t)Base + Max;
+
+  MinOffset = Immediate::getFixed(Min);
+  MaxOffset = Immediate::getFixed(Max);
 
   return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
                               HasBaseReg, Scale) &&
@@ -1751,7 +1820,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 }
 
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
-                                 int64_t MinOffset, int64_t MaxOffset,
+                                 Immediate MinOffset, Immediate MaxOffset,
                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
                                  const Formula &F, const Loop &L) {
   // For the purpose of isAMCompletelyFolded either having a canonical formula
@@ -1767,10 +1836,10 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 }
 
 /// Test whether we know how to expand the current formula.
-static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
-                       int64_t MaxOffset, LSRUse::KindType Kind,
+static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
+                       Immediate MaxOffset, LSRUse::KindType Kind,
                        MemAccessTy AccessTy, GlobalValue *BaseGV,
-                       int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
+                       Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
   // We know how to expand completely foldable formulae.
   return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
                               BaseOffset, HasBaseReg, Scale) ||
@@ -1781,8 +1850,8 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
                                BaseGV, BaseOffset, true, 0));
 }
 
-static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
-                       int64_t MaxOffset, LSRUse::KindType Kind,
+static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
+                       Immediate MaxOffset, LSRUse::KindType Kind,
                        MemAccessTy AccessTy, const Formula &F) {
   return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
                     F.BaseOffset, F.HasBaseReg, F.Scale);
@@ -1819,15 +1888,15 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
 
   switch (LU.Kind) {
   case LSRUse::Address: {
+    int64_t FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
+    int64_t FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
     // Check the scaling factor cost with both the min and max offsets.
     InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
-        LU.AccessTy.MemTy, F.BaseGV,
-        StackOffset::getFixed(F.BaseOffset + LU.MinOffset), F.HasBaseReg,
-        F.Scale, LU.AccessTy.AddrSpace);
+        LU.AccessTy.MemTy, F.BaseGV, StackOffset::getFixed(FixedMin),
+        F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
     InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
-        LU.AccessTy.MemTy, F.BaseGV,
-        StackOffset::getFixed(F.BaseOffset + LU.MaxOffset), F.HasBaseReg,
-        F.Scale, LU.AccessTy.AddrSpace);
+        LU.AccessTy.MemTy, F.BaseGV, StackOffset::getFixed(FixedMax),
+        F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
 
     assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
            "Legal addressing mode has an illegal cost!");
@@ -1846,10 +1915,11 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
 
 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
                              LSRUse::KindType Kind, MemAccessTy AccessTy,
-                             GlobalValue *BaseGV, int64_t BaseOffset,
+                             GlobalValue *BaseGV, Immediate BaseOffset,
                              bool HasBaseReg, int64_t ScalableOffset = 0) {
   // Fast-path: zero is always foldable.
-  if (BaseOffset == 0 && !BaseGV) return true;
+  if (BaseOffset.isZero() && !BaseGV)
+    return true;
 
   // Conservatively, create an address with an immediate and a
   // base and a scale.
@@ -1867,8 +1937,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
 }
 
 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
-                             ScalarEvolution &SE, int64_t MinOffset,
-                             int64_t MaxOffset, LSRUse::KindType Kind,
+                             ScalarEvolution &SE, Immediate MinOffset,
+                             Immediate MaxOffset, LSRUse::KindType Kind,
                              MemAccessTy AccessTy, const SCEV *S,
                              bool HasBaseReg) {
   // Fast-path: zero is always foldable.
@@ -1876,14 +1946,15 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
 
   // Conservatively, create an address with an immediate and a
   // base and a scale.
-  int64_t BaseOffset = ExtractImmediate(S, SE);
+  Immediate BaseOffset = ExtractImmediate(S, SE);
   GlobalValue *BaseGV = ExtractSymbol(S, SE);
 
   // If there's anything else involved, it's not foldable.
   if (!S->isZero()) return false;
 
   // Fast-path: zero is always foldable.
-  if (BaseOffset == 0 && !BaseGV) return true;
+  if (BaseOffset.isZero() && !BaseGV)
+    return true;
 
   // Conservatively, create an address with an immediate and a
   // base and a scale.
@@ -2032,11 +2103,11 @@ class LSRInstance {
   using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
   UseMapTy UseMap;
 
-  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
+  bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
                           LSRUse::KindType Kind, MemAccessTy AccessTy);
 
-  std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
-                                    MemAccessTy AccessTy);
+  std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
+                                      MemAccessTy AccessTy);
 
   void DeleteUse(LSRUse &LU, size_t LUIdx);
 
@@ -2062,7 +2133,7 @@ class LSRInstance {
   void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
   void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
                                    const Formula &Base,
-                                   const SmallVectorImpl<int64_t> &Worklist,
+                                   const SmallVectorImpl<Immediate> &Worklist,
                                    size_t Idx, bool IsScaledReg = false);
   void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
   void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
@@ -2570,11 +2641,11 @@ LSRInstance::OptimizeLoopTermCond() {
 
 /// Determine if the given use can accommodate a fixup at the given offset and
 /// other details. If so, update the use and return true.
-bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
+bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
                                      bool HasBaseReg, LSRUse::KindType Kind,
                                      MemAccessTy AccessTy) {
-  int64_t NewMinOffset = LU.MinOffset;
-  int64_t NewMaxOffset = LU.MaxOffset;
+  Immediate NewMinOffset = LU.MinOffset;
+  Immediate NewMaxOffset = LU.MaxOffset;
   MemAccessTy NewAccessTy = AccessTy;
 
   // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
@@ -2594,12 +2665,12 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
   }
 
   // Conservatively assume HasBaseReg is true for now.
-  if (NewOffset < LU.MinOffset) {
+  if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
     if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
                           LU.MaxOffset - NewOffset, HasBaseReg))
       return false;
     NewMinOffset = NewOffset;
-  } else if (NewOffset > LU.MaxOffset) {
+  } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
     if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
                           NewOffset - LU.MinOffset, HasBaseReg))
       return false;
@@ -2616,17 +2687,17 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
 /// Return an LSRUse index and an offset value for a fixup which needs the given
 /// expression, with the given kind and optional access type.  Either reuse an
 /// existing use or create a new one, as needed.
-std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
-                                               LSRUse::KindType Kind,
-                                               MemAccessTy AccessTy) {
+std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
+                                                 LSRUse::KindType Kind,
+                                                 MemAccessTy AccessTy) {
   const SCEV *Copy = Expr;
-  int64_t Offset = ExtractImmediate(Expr, SE);
+  Immediate Offset = ExtractImmediate(Expr, SE);
 
   // Basic uses can't accept any offset, for example.
   if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
                         Offset, /*HasBaseReg=*/ true)) {
     Expr = Copy;
-    Offset = 0;
+    Offset = Immediate::getFixed(0);
   }
 
   std::pair<UseMapTy::iterator, bool> P =
@@ -2687,7 +2758,7 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
             F.BaseGV == OrigF.BaseGV &&
             F.Scale == OrigF.Scale &&
             F.UnfoldedOffset == OrigF.UnfoldedOffset) {
-          if (F.BaseOffset == 0)
+          if (F.BaseOffset.isZero())
             return &LU;
           // This is the formula where all the registers and symbols matched;
           // there aren't going to be any others. Since we declined it, we
@@ -3192,7 +3263,8 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
 
   MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
   if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
-                        IncOffset, /*HasBaseReg=*/false, ScalableOffset))
+                        Immediate::getFixed(IncOffset), /*HasBaseReg=*/false,
+                        ScalableOffset))
     return false;
 
   return true;
@@ -3424,9 +3496,9 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
     }
 
     // Get or create an LSRUse.
-    std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
+    std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
     size_t LUIdx = P.first;
-    int64_t Offset = P.second;
+    Immediate Offset = P.second;
     LSRUse &LU = Uses[LUIdx];
 
     // Record the fixup.
@@ -3616,10 +3688,10 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
             continue;
         }
 
-        std::pair<size_t, int64_t> P = getUse(
-            S, LSRUse::Basic, MemAccessTy());
+        std::pair<size_t, Immediate> P =
+            getUse(S, LSRUse::Basic, MemAccessTy());
         size_t LUIdx = P.first;
-        int64_t Offset = P.second;
+        Immediate Offset = P.second;
         LSRUse &LU = Uses[LUIdx];
         LSRFixup &LF = LU.getNewFixup();
         LF.UserInst = const_cast<Instruction *>(UserInst);
@@ -3778,10 +3850,11 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
     // Add the remaining pieces of the add back into the new formula.
     const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
     if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
-        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
                                 InnerSumSC->getValue()->getZExtValue())) {
       F.UnfoldedOffset =
-          (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
+          Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
+                              InnerSumSC->getValue()->getZExtValue());
       if (IsScaledReg)
         F.ScaledReg = nullptr;
       else
@@ -3794,10 +3867,11 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
     // Add J as its own register, or an unfolded immediate.
     const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
     if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
-        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
                                 SC->getValue()->getZExtValue()))
       F.UnfoldedOffset =
-          (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
+          Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
+                              SC->getValue()->getZExtValue());
     else
       F.BaseRegs.push_back(*J);
     // We may have changed the number of registers in base regs, adjust the
@@ -3838,7 +3912,8 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
                                        Formula Base) {
   // This method is only interesting on a plurality of registers.
   if (Base.BaseRegs.size() + (Base.Scale == 1) +
-      (Base.UnfoldedOffset != 0) <= 1)
+          (Base.UnfoldedOffset.isNonZero()) <=
+      1)
     return;
 
   // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
@@ -3889,9 +3964,9 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
   // registers collected.
   if (NewBase.UnfoldedOffset) {
     assert(CombinedIntegerType && "Missing a type for the unfolded offset");
-    Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset,
-                                 true));
-    NewBase.UnfoldedOffset = 0;
+    Ops.push_back(SE.getConstant(CombinedIntegerType,
+                                 NewBase.UnfoldedOffset.getFixedValue(), true));
+    NewBase.UnfoldedOffset = Immediate::getFixed(0);
     GenerateFormula(SE.getAddExpr(Ops));
   }
 }
@@ -3931,15 +4006,17 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
 /// Helper function for LSRInstance::GenerateConstantOffsets.
 void LSRInstance::GenerateConstantOffsetsImpl(
     LSRUse &LU, unsigned LUIdx, const Formula &Base,
-    const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
+    const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
 
-  auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
+  auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
     Formula F = Base;
-    F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
+    F.BaseOffset = Immediate::getFixed(
+        (uint64_t)Base.BaseOffset.getFixedValue() - Offset.getFixedValue());
 
     if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
       // Add the offset to the base register.
-      const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
+      const SCEV *NewG = SE.getAddExpr(
+          SE.getConstant(G->getType(), Offset.getFixedValue()), G);
       // If it cancelled out, drop the base register, otherwise update it.
       if (NewG->isZero()) {
         if (IsScaledReg) {
@@ -3975,21 +4052,22 @@ void LSRInstance::GenerateConstantOffsetsImpl(
         int64_t Step = StepInt.isNegative() ?
           StepInt.getSExtValue() : StepInt.getZExtValue();
 
-        for (int64_t Offset : Worklist) {
-          Offset -= Step;
+        for (Immediate Offset : Worklist) {
+          Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
           GenerateOffset(G, Offset);
         }
       }
     }
   }
-  for (int64_t Offset : Worklist)
+  for (Immediate Offset : Worklist)
     GenerateOffset(G, Offset);
 
-  int64_t Imm = ExtractImmediate(G, SE);
-  if (G->isZero() || Imm == 0)
+  Immediate Imm = ExtractImmediate(G, SE);
+  if (G->isZero() || Imm.isZero())
     return;
   Formula F = Base;
-  F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
+  F.BaseOffset = Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() +
+                                     Imm.getFixedValue());
   if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
     return;
   if (IsScaledReg) {
@@ -4008,7 +4086,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
                                           Formula Base) {
   // TODO: For now, just add the min and max offset, because it usually isn't
   // worthwhile looking at everything in between.
-  SmallVector<int64_t, 2> Worklist;
+  SmallVector<Immediate, 2> Worklist;
   Worklist.push_back(LU.MinOffset);
   if (LU.MaxOffset != LU.MinOffset)
     Worklist.push_back(LU.MaxOffset);
@@ -4048,27 +4126,29 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
     if (!ConstantInt::isValueValidForType(IntTy, Factor))
       continue;
     // Check that the multiplication doesn't overflow.
-    if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1)
+    if (Base.BaseOffset.isMin() && Factor == -1)
       continue;
-    int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
+    Immediate NewBaseOffset =
+        Immediate::getFixed((uint64_t)Base.BaseOffset.getFixedValue() * Factor);
     assert(Factor != 0 && "Zero factor not expected!");
-    if (NewBaseOffset / Factor != Base.BaseOffset)
+    if (NewBaseOffset.getFixedValue() / Factor !=
+        Base.BaseOffset.getFixedValue())
       continue;
     // If the offset will be truncated at this use, check that it is in bounds.
     if (!IntTy->isPointerTy() &&
-        !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
+        !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
       continue;
 
     // Check that multiplying with the use offset doesn't overflow.
-    int64_t Offset = LU.MinOffset;
-    if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
+    Immediate Offset = LU.MinOffset;
+    if (Offset.isMin() && Factor == -1)
       continue;
-    Offset = (uint64_t)Offset * Factor;
-    if (Offset / Factor != LU.MinOffset)
+    Offset = Immediate::getFixed((uint64_t)Offset.getFixedValue() * Factor);
+    if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
       continue;
     // If the offset will be truncated at this use, check that it is in bounds.
     if (!IntTy->isPointerTy() &&
-        !ConstantInt::isValueValidForType(IntTy, Offset))
+        !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
       continue;
 
     Formula F = Base;
@@ -4079,7 +4159,9 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
       continue;
 
     // Compensate for the use having MinOffset built into it.
-    F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
+    F.BaseOffset = Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() +
+                                       Offset.getFixedValue() -
+                                       LU.MinOffset.getFixedValue());
 
     const SCEV *FactorS = SE.getConstant(IntTy, Factor);
 
@@ -4098,16 +4180,17 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
     }
 
     // Check that multiplying with the unfolded offset doesn't overflow.
-    if (F.UnfoldedOffset != 0) {
-      if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() &&
-          Factor == -1)
+    if (F.UnfoldedOffset.isNonZero()) {
+      if (F.UnfoldedOffset.isMin() && Factor == -1)
         continue;
-      F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
-      if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
+      F.UnfoldedOffset = Immediate::getFixed(
+          (uint64_t)F.UnfoldedOffset.getFixedValue() * Factor);
+      if (F.UnfoldedOffset.getFixedValue() / Factor !=
+          Base.UnfoldedOffset.getFixedValue())
         continue;
       // If the offset will be truncated, check that it is in bounds.
-      if (!IntTy->isPointerTy() &&
-          !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
+      if (!IntTy->isPointerTy() && !ConstantInt::isValueValidForType(
+                                       IntTy, F.UnfoldedOffset.getFixedValue()))
         continue;
     }
 
@@ -4150,8 +4233,8 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
     }
     // For an ICmpZero, negating a solitary base register won't lead to
     // new solutions.
-    if (LU.Kind == LSRUse::ICmpZero &&
-        !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
+    if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
+        Base.BaseOffset.isZero() && !Base.BaseGV)
       continue;
     // For each addrec base reg, if its loop is current loop, apply the scale.
     for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
@@ -4277,10 +4360,10 @@ namespace {
 /// structures moving underneath it.
 struct WorkItem {
   size_t LUIdx;
-  int64_t Imm;
+  Immediate Imm;
   const SCEV *OrigReg;
 
-  WorkItem(size_t LI, int64_t I, const SCEV *R)
+  WorkItem(size_t LI, Immediate I, const SCEV *R)
       : LUIdx(LI), Imm(I), OrigReg(R) {}
 
   void print(raw_ostream &OS) const;
@@ -4304,14 +4387,14 @@ LLVM_DUMP_METHOD void WorkItem::dump() const {
 /// opportunities between them.
 void LSRInstance::GenerateCrossUseConstantOffsets() {
   // Group the registers by their value without any added constant offset.
-  using ImmMapTy = std::map<int64_t, const SCEV *>;
+  using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
 
   DenseMap<const SCEV *, ImmMapTy> Map;
   DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
   SmallVector<const SCEV *, 8> Sequence;
   for (const SCEV *Use : RegUses) {
     const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
-    int64_t Imm = ExtractImmediate(Reg, SE);
+    Immediate Imm = ExtractImmediate(Reg, SE);
     auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
     if (Pair.second)
       Sequence.push_back(Reg);
@@ -4323,7 +4406,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
   // a list of work to do and do the work in a separate step so that we're
   // not adding formulae and register counts while we're searching.
   SmallVector<WorkItem, 32> WorkItems;
-  SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
+  SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
+      UniqueItems;
   for (const SCEV *Reg : Sequence) {
     const ImmMapTy &Imms = Map.find(Reg)->second;
 
@@ -4342,7 +4426,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
          J != JE; ++J) {
       const SCEV *OrigReg = J->second;
 
-      int64_t JImm = J->first;
+      Immediate JImm = J->first;
       const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
 
       if (!isa<SCEVConstant>(OrigReg) &&
@@ -4354,8 +4438,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
 
       // Conservatively examine offsets between this orig reg and a few
       // selected other orig regs.
-      int64_t First = Imms.begin()->first;
-      int64_t Last = std::prev(Imms.end())->first;
+      int64_t First = Imms.begin()->first.getFixedValue();
+      int64_t Last = std::prev(Imms.end())->first.getFixedValue();
       // Compute (First + Last) / 2 without overflow using the fact that
       // First + Last = 2 * (First & Last) + (First ^ Last).
       int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
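
(Another standalone aside, not part of the patch: the identity in the comment just above can be checked in isolation. midpointNoOverflow below is a hypothetical helper that mirrors the existing Avg computation, including the round-towards-zero fixup the existing code applies right after computing Avg; like that code, it assumes arithmetic right shift of negative values.)

#include <cassert>
#include <cstdint>

// First + Last == 2*(First & Last) + (First ^ Last), so
// (First & Last) + ((First ^ Last) >> 1) is the floored average without ever
// forming the possibly-overflowing sum First + Last. The second line rounds
// negative, odd sums towards zero, matching ordinary signed division.
static int64_t midpointNoOverflow(int64_t First, int64_t Last) {
  int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
  Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
  return Avg;
}

int main() {
  assert(midpointNoOverflow(4, 10) == 7);
  assert(midpointNoOverflow(-9, 4) == -2); // (-9 + 4) / 2 rounded towards zero
  assert(midpointNoOverflow(INT64_MAX - 2, INT64_MAX) == INT64_MAX - 1);
  return 0;
}
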
@@ -4364,12 +4448,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
       Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
       ImmMapTy::const_iterator OtherImms[] = {
           Imms.begin(), std::prev(Imms.end()),
-         Imms.lower_bound(Avg)};
+          Imms.lower_bound(Immediate::getFixed(Avg))};
       for (const auto &M : OtherImms) {
         if (M == J || M == JE) continue;
 
         // Compute the difference between the two.
-        int64_t Imm = (uint64_t)JImm - M->first;
+        Immediate Imm = Immediate::getFixed((uint64_t)JImm.getFixedValue() -
+                                            M->first.getFixedValue());
         for (unsigned LUIdx : UsedByIndices.set_bits())
           // Make a memo of this use, offset, and register tuple.
           if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
@@ -4387,11 +4472,12 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
   for (const WorkItem &WI : WorkItems) {
     size_t LUIdx = WI.LUIdx;
     LSRUse &LU = Uses[LUIdx];
-    int64_t Imm = WI.Imm;
+    Immediate Imm = WI.Imm;
     const SCEV *OrigReg = WI.OrigReg;
 
     Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
-    const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
+    const SCEV *NegImmS =
+        SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm.getFixedValue()));
     unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
 
     // TODO: Use a more targeted data structure.
@@ -4404,10 +4490,12 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
       F.unscale();
       // Use the immediate in the scaled register.
       if (F.ScaledReg == OrigReg) {
-        int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
+        Immediate Offset =
+            Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() +
+                                Imm.getFixedValue() * (uint64_t)F.Scale);
         // Don't create 50 + reg(-50).
         if (F.referencesReg(SE.getSCEV(
-                   ConstantInt::get(IntTy, -(uint64_t)Offset))))
+                ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue()))))
           continue;
         Formula NewF = F;
         NewF.BaseOffset = Offset;
@@ -4420,9 +4508,10 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
         // value to the immediate would produce a value closer to zero than the
         // immediate itself, then the formula isn't worthwhile.
         if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
-          if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
+          if (C->getValue()->isNegative() !=
+                  (NewF.BaseOffset.isLessThanZero()) &&
               (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
-                  .ule(std::abs(NewF.BaseOffset)))
+                  .ule(std::abs(NewF.BaseOffset.getFixedValue())))
             continue;
 
         // OK, looks good.
@@ -4435,16 +4524,21 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
           if (BaseReg != OrigReg)
             continue;
           Formula NewF = F;
-          NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
+          NewF.BaseOffset = Immediate::getFixed(
+              (uint64_t)NewF.BaseOffset.getFixedValue() + Imm.getFixedValue());
           if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
                           LU.Kind, LU.AccessTy, NewF)) {
             if (AMK == TTI::AMK_PostIndexed &&
                 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
               continue;
-            if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+            if (!TTI.isLegalAddImmediate(
+                    (uint64_t)NewF.UnfoldedOffset.getFixedValue() +
+                    Imm.getFixedValue()))
               continue;
             NewF = F;
-            NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
+            NewF.UnfoldedOffset = Immediate::getFixed(
+                (uint64_t)NewF.UnfoldedOffset.getFixedValue() +
+                Imm.getFixedValue());
           }
           NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
 
@@ -4453,11 +4547,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
           // zero than the immediate itself, then the formula isn't worthwhile.
           for (const SCEV *NewReg : NewF.BaseRegs)
             if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
-              if ((C->getAPInt() + NewF.BaseOffset)
+              if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
                       .abs()
-                      .slt(std::abs(NewF.BaseOffset)) &&
-                  (C->getAPInt() + NewF.BaseOffset).countr_zero() >=
-                      (unsigned)llvm::countr_zero<uint64_t>(NewF.BaseOffset))
+                      .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
+                  (C->getAPInt() + NewF.BaseOffset.getFixedValue())
+                          .countr_zero() >=
+                      (unsigned)llvm::countr_zero<uint64_t>(
+                          NewF.BaseOffset.getFixedValue()))
                 goto skip_formula;
 
           // Ok, looks good.
@@ -4651,7 +4747,9 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
             Formula NewF = F;
             //FIXME: Formulas should store bitwidth to do wrapping properly.
             //       See PR41034.
-            NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue();
+            NewF.BaseOffset =
+                Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
+                                    (uint64_t)C->getValue()->getSExtValue());
             NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
                                 (I - F.BaseRegs.begin()));
             if (LU.HasFormulaWithSameRegs(NewF)) {
@@ -4707,7 +4805,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
     LSRUse &LU = Uses[LUIdx];
     for (const Formula &F : LU.Formulae) {
-      if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
+      if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
         continue;
 
       LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
@@ -5543,30 +5641,33 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
   }
 
   // Expand the immediate portion.
-  int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
-  if (Offset != 0) {
+  Immediate Offset = Immediate::getFixed(
+      (uint64_t)F.BaseOffset.getFixedValue() + LF.Offset.getFixedValue());
+  if (Offset.isNonZero()) {
     if (LU.Kind == LSRUse::ICmpZero) {
       // The other interesting way of "folding" with an ICmpZero is to use a
       // negated immediate.
       if (!ICmpScaledV)
-        ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
+        ICmpScaledV =
+            ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue());
       else {
         Ops.push_back(SE.getUnknown(ICmpScaledV));
-        ICmpScaledV = ConstantInt::get(IntTy, Offset);
+        ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue());
       }
     } else {
       // Just add the immediate values. These again are expected to be matched
       // as part of the address.
-      Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
+      Ops.push_back(
+          SE.getUnknown(ConstantInt::getSigned(IntTy, Offset.getFixedValue())));
     }
   }
 
   // Expand the unfolded offset portion.
-  int64_t UnfoldedOffset = F.UnfoldedOffset;
-  if (UnfoldedOffset != 0) {
+  Immediate UnfoldedOffset = F.UnfoldedOffset;
+  if (UnfoldedOffset.isNonZero()) {
     // Just add the immediate values.
-    Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
-                                                       UnfoldedOffset)));
+    Ops.push_back(SE.getUnknown(
+        ConstantInt::getSigned(IntTy, UnfoldedOffset.getFixedValue())));
   }
 
   // Emit instructions summing all the operands.
@@ -5602,7 +5703,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
              "ICmp does not support folding a global value and "
              "a scale at the same time!");
       Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
-                                           -(uint64_t)Offset);
+                                           -(uint64_t)Offset.getFixedValue());
       if (C->getType() != OpTy) {
         C = ConstantFoldCastOperand(
             CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,

>From 6265205f73b92bebdd6e17d43ac4ee610bd9a255 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 14 May 2024 16:04:13 +0000
Subject: [PATCH 3/4] Scalable work

---
 .../Transforms/Scalar/LoopStrengthReduce.cpp  | 245 +++++++++++++-----
 .../AArch64/vscale-fixups.ll                  | 245 ++++++++++--------
 2 files changed, 329 insertions(+), 161 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 0b77b7b5a62de..6fd1685a8fa76 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -197,6 +197,14 @@ static cl::opt<cl::boolOrDefault> AllowDropSolutionIfLessProfitable(
     "lsr-drop-solution", cl::Hidden,
     cl::desc("Attempt to drop solution if it is less profitable"));
 
+static cl::opt<bool> EnableVScaleImmediates(
+    "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
+    cl::desc("Enable analysis of vscale-relative immediates in LSR"));
+
+static cl::opt<bool> DropScaledForVScale(
+    "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
+    cl::desc("Avoid using scaled registers with vscale-relative addressing"));
+
 STATISTIC(NumTermFold,
           "Number of terminating condition fold recognized and performed");
 
@@ -273,6 +281,10 @@ class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
 
   constexpr bool isGreaterThanZero() const { return Quantity > 0; }
 
+  constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
+    return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
+  }
+
   constexpr bool isMin() const {
     return Quantity == std::numeric_limits<ScalarTy>::min();
   }
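
The compatibility rule here treats a zero immediate as matching either kind, since zero is unaffected by scaling; otherwise both offsets must agree on whether they are multiplied by vscale. A minimal standalone sketch of the same predicate, using a hypothetical Imm struct rather than the patch's Immediate class:

  #include <cassert>

  // Simplified stand-in for the patch's Immediate: a quantity plus a flag
  // saying whether it is implicitly multiplied by the runtime vscale.
  struct Imm {
    long long Quantity;
    bool Scalable;
  };

  // Mirrors isCompatibleImmediate(): two immediates may be combined if either
  // is zero, or both are scaled the same way.
  bool compatible(const Imm &A, const Imm &B) {
    return A.Quantity == 0 || B.Quantity == 0 || A.Scalable == B.Scalable;
  }

  int main() {
    assert(compatible({0, false}, {16, true}));  // zero mixes with anything
    assert(compatible({8, true}, {24, true}));   // both vscale-relative
    assert(!compatible({8, false}, {16, true})); // fixed vs. scalable: reject
  }
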
@@ -880,7 +892,13 @@ static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                            SCEV::FlagAnyWrap);
     return Result;
-  }
+  } else if (EnableVScaleImmediates)
+    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
+      if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
+        if (isa<SCEVVScale>(M->getOperand(1))) {
+          S = SE.getConstant(M->getType(), 0);
+          return Immediate::getScalable(C->getValue()->getSExtValue());
+        }
   return Immediate();
 }
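
The new case peels a (constant * vscale) product off the expression, reports the constant as a scalable immediate, and leaves a zero behind. A rough standalone model of that pattern match; the Expr node and extractScalableImm below are illustrative stand-ins, not the real SCEV classes:

  #include <cassert>

  // Tiny stand-in for the SCEV nodes involved: a constant, vscale, or a
  // two-operand multiply. Purely illustrative.
  struct Expr {
    enum Kind { Const, VScale, Mul } K;
    long long Val;             // used when K == Const
    const Expr *LHS, *RHS;     // used when K == Mul
  };

  // Mirrors the new ExtractImmediate case: (C * vscale) yields the scalable
  // immediate C and the remaining expression collapses to the constant 0.
  bool extractScalableImm(const Expr *&S, long long &ImmOut, Expr &Zero) {
    if (S->K == Expr::Mul && S->LHS && S->LHS->K == Expr::Const && S->RHS &&
        S->RHS->K == Expr::VScale) {
      ImmOut = S->LHS->Val;
      Zero = {Expr::Const, 0, nullptr, nullptr};
      S = &Zero;
      return true;
    }
    return false;
  }

  int main() {
    Expr C{Expr::Const, 16}, V{Expr::VScale}, M{Expr::Mul, 0, &C, &V}, Zero{};
    const Expr *S = &M;
    long long Imm = 0;
    assert(extractScalableImm(S, Imm, Zero) && Imm == 16 && S->Val == 0);
  }
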
 
@@ -1373,7 +1391,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
 
       // If the step size matches the base offset, we could use pre-indexed
       // addressing.
-      if (AMK == TTI::AMK_PreIndexed) {
+      if (AMK == TTI::AMK_PreIndexed && !F.BaseOffset.isScalable()) {
         if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
           if (Step->getAPInt() == F.BaseOffset.getFixedValue())
             LoopCost = 0;
@@ -1473,14 +1491,18 @@ void Cost::RateFormula(const Formula &F,
 
   // Tally up the non-zero immediates.
   for (const LSRFixup &Fixup : LU.Fixups) {
-    Immediate O = Fixup.Offset;
-    Immediate Offset = Immediate::getFixed((uint64_t)O.getFixedValue() +
-                                           F.BaseOffset.getFixedValue());
+    // FIXME: We probably want to noticeably increase the cost if the
+    // two offsets differ in scalability?
+    bool Scalable = Fixup.Offset.isScalable() || F.BaseOffset.isScalable();
+    int64_t O = Fixup.Offset.getKnownMinValue();
+    Immediate Offset = Immediate::get(
+        (uint64_t)(O) + F.BaseOffset.getKnownMinValue(), Scalable);
     if (F.BaseGV)
       C.ImmCost += 64; // Handle symbolic values conservatively.
                      // TODO: This should probably be the pointer size.
     else if (Offset.isNonZero())
-      C.ImmCost += APInt(64, Offset.getFixedValue(), true).getSignificantBits();
+      C.ImmCost +=
+          APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
 
     // Check with target if this offset with this instruction is
     // specifically not supported.
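
The immediate cost charged here is the number of significant bits of the combined offset, with the known-minimum value standing in when the offset is scalable. A plain-C++ sketch of that bit count; significantBits is my name for what APInt(64, V, true).getSignificantBits() computes for these values:

  #include <cassert>
  #include <cstdint>

  // Bits needed to hold V as a signed two's complement value, including the
  // sign bit -- the quantity the patch feeds into C.ImmCost.
  unsigned significantBits(int64_t V) {
    uint64_t Mag = (V < 0) ? ~(uint64_t)V : (uint64_t)V;
    unsigned Bits = 1; // sign bit
    while (Mag) {
      ++Bits;
      Mag >>= 1;
    }
    return Bits;
  }

  int main() {
    assert(significantBits(0) == 1);
    assert(significantBits(16) == 6);  // 010000
    assert(significantBits(-16) == 5); // 10000
    assert(significantBits(-1) == 1);  // 1
  }
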
@@ -1744,12 +1766,15 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  Instruction *Fixup /* = nullptr */,
                                  int64_t ScalableOffset) {
   switch (Kind) {
-  case LSRUse::Address:
-    return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV,
-                                     BaseOffset.getFixedValue(), HasBaseReg,
-                                     Scale, AccessTy.AddrSpace, Fixup,
-                                     ScalableOffset);
-
+  case LSRUse::Address: {
+    int64_t FixedOffset =
+        BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
+    int64_t ScalableOffset =
+        BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
+    return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
+                                     HasBaseReg, Scale, AccessTy.AddrSpace,
+                                     Fixup, ScalableOffset);
+  }
   case LSRUse::ICmpZero:
     // There's not even a target hook for querying whether it would be legal to
     // fold a GV into an ICmp.
@@ -1768,6 +1793,11 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
     // If we have low-level target information, ask the target if it can fold an
     // integer immediate on an icmp.
     if (BaseOffset.isNonZero()) {
+      // We don't have an interface to query whether the target supports
+      // icmpzero against scalable quantities yet.
+      if (BaseOffset.isScalable())
+        return false;
+
       // We have one of:
       // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
       // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
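
For the Address kind the query now passes the target either a fixed offset or a scalable one, never both at once, because an Immediate in this patch carries exactly one kind. A small sketch of that split, with a plain struct in place of the real Immediate and TTI hook (names are mine):

  #include <cassert>

  // Plain stand-in for the patch's Immediate.
  struct Imm {
    long long Quantity;
    bool Scalable;
  };

  // Splits one Immediate into the (FixedOffset, ScalableOffset) pair expected
  // by the addressing-mode legality query; the unused side is zero.
  void splitForAddressingMode(const Imm &Off, long long &Fixed,
                              long long &Scalable) {
    Fixed = Off.Scalable ? 0 : Off.Quantity;
    Scalable = Off.Scalable ? Off.Quantity : 0;
  }

  int main() {
    long long F, S;
    splitForAddressingMode({32, true}, F, S); // e.g. 32 * vscale bytes
    assert(F == 0 && S == 32);
    splitForAddressingMode({-8, false}, F, S); // plain fixed offset
    assert(F == -8 && S == 0);
  }
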
@@ -1799,19 +1829,20 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
                                  GlobalValue *BaseGV, Immediate BaseOffset,
                                  bool HasBaseReg, int64_t Scale) {
-  int64_t Base = BaseOffset.getFixedValue();
-  int64_t Min = MinOffset.getFixedValue();
-  int64_t Max = MaxOffset.getFixedValue();
+  if (BaseOffset.isNonZero() &&
+      (BaseOffset.isScalable() != MinOffset.isScalable() ||
+       BaseOffset.isScalable() != MaxOffset.isScalable()))
+    return false;
   // Check for overflow.
+  int64_t Base = BaseOffset.getKnownMinValue();
+  int64_t Min = MinOffset.getKnownMinValue();
+  int64_t Max = MaxOffset.getKnownMinValue();
   if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
     return false;
-  Min = (uint64_t)Base + Min;
+  MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
   if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
     return false;
-  Max = (uint64_t)Base + Max;
-
-  MinOffset = Immediate::getFixed(Min);
-  MaxOffset = Immediate::getFixed(Max);
+  MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
 
   return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
                               HasBaseReg, Scale) &&
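
The clamped min/max offsets keep the original overflow guard: the addition is done in uint64_t and the signed result must move in the direction implied by the addend's sign. A worked standalone version of the check, assuming offsets fit in int64_t (addOverflows is an illustrative name):

  #include <cassert>
  #include <cstdint>

  // True if Base + Delta overflows int64_t. Matches the pattern in the patch:
  // add in uint64_t, then check the signed result moved in the direction
  // implied by Delta's sign (two's complement wrap assumed, as in LLVM).
  bool addOverflows(int64_t Base, int64_t Delta) {
    int64_t Sum = (int64_t)((uint64_t)Base + (uint64_t)Delta);
    return (Sum > Base) != (Delta > 0);
  }

  int main() {
    assert(!addOverflows(100, 16));
    assert(!addOverflows(100, -200));
    assert(addOverflows(INT64_MAX, 1));  // wraps negative: rejected
    assert(addOverflows(INT64_MIN, -1)); // wraps positive: rejected
  }
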
@@ -1857,6 +1888,14 @@ static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
                     F.BaseOffset, F.HasBaseReg, F.Scale);
 }
 
+static bool isLegalAddImmediate(const TargetTransformInfo &TTI,
+                                Immediate Offset) {
+  if (Offset.isScalable())
+    return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
+
+  return TTI.isLegalAddImmediate(Offset.getFixedValue());
+}
+
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  const LSRUse &LU, const Formula &F) {
   // Target may want to look at the user instructions.
@@ -1888,14 +1927,20 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
 
   switch (LU.Kind) {
   case LSRUse::Address: {
-    int64_t FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
-    int64_t FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
     // Check the scaling factor cost with both the min and max offsets.
+    int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
+    if (F.BaseOffset.isScalable()) {
+      ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
+      ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
+    } else {
+      FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
+      FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
+    }
     InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
-        LU.AccessTy.MemTy, F.BaseGV, StackOffset::getFixed(FixedMin),
+        LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
         F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
     InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
-        LU.AccessTy.MemTy, F.BaseGV, StackOffset::getFixed(FixedMax),
+        LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
         F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
 
     assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
@@ -1932,6 +1977,15 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
     HasBaseReg = true;
   }
 
+  // FIXME: Try with and without a scale? Maybe based on TTI?
+  // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
+  // default for many architectures, not just AArch64 SVE. More investigation
+  // needed later to determine if this should be used more widely than just
+  // on scalable types.
+  if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
+      AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
+    Scale = 0;
+
   return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
                               HasBaseReg, Scale, nullptr, ScalableOffset);
 }
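
The intent of the block above is that for scalable vector accesses a formula of the form base + scaled reg + immediate rarely maps onto one addressing mode, so the scale is dropped and legality is checked as base + immediate only. A compact sketch of the decision; the Candidate struct and field names are invented for illustration:

  #include <cassert>

  enum class UseKind { Address, ICmpZero, Basic };

  // Invented stand-in for the relevant pieces of an LSR formula/use.
  struct Candidate {
    bool HasBaseReg;
    long long Offset;      // known-min value of the immediate
    bool AccessIsScalable; // true for scalable-vector memory accesses
    long long Scale;
  };

  // Mirrors the heuristic: a scalable access that already has a base register
  // and a non-zero immediate should not also request a scaled register.
  void maybeDropScale(Candidate &C, UseKind Kind, bool DropScaledForVScale) {
    if (C.HasBaseReg && C.Offset != 0 && Kind != UseKind::ICmpZero &&
        C.AccessIsScalable && DropScaledForVScale)
      C.Scale = 0;
  }

  int main() {
    Candidate C{true, 16, true, 2};
    maybeDropScale(C, UseKind::Address, /*DropScaledForVScale=*/true);
    assert(C.Scale == 0);
  }
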
@@ -1956,6 +2010,9 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
   if (BaseOffset.isZero() && !BaseGV)
     return true;
 
+  if (BaseOffset.isScalable())
+    return false;
+
   // Conservatively, create an address with an immediate and a
   // base and a scale.
   int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
@@ -2677,6 +2734,13 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
     NewMaxOffset = NewOffset;
   }
 
+  // FIXME: We should be able to handle some level of scalable offset support
+  // for 'void', but in order to get basic support up and running this is
+  // being left out.
+  if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
+      (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
+    return false;
+
   // Update the use.
   LU.MinOffset = NewMinOffset;
   LU.MaxOffset = NewMaxOffset;
@@ -4010,13 +4074,22 @@ void LSRInstance::GenerateConstantOffsetsImpl(
 
   auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
     Formula F = Base;
-    F.BaseOffset = Immediate::getFixed(
-        (uint64_t)Base.BaseOffset.getFixedValue() - Offset.getFixedValue());
+    if (Base.BaseOffset.isScalable() != Offset.isScalable() &&
+        Base.BaseOffset.isNonZero() && Offset.isNonZero())
+      return;
+    bool Scalable = Base.BaseOffset.isScalable() || Offset.isScalable();
+    F.BaseOffset = Immediate::get((uint64_t)Base.BaseOffset.getKnownMinValue() -
+                                      Offset.getKnownMinValue(),
+                                  Scalable);
 
     if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
       // Add the offset to the base register.
-      const SCEV *NewG = SE.getAddExpr(
-          SE.getConstant(G->getType(), Offset.getFixedValue()), G);
+      const SCEV *NewOffset =
+          SE.getConstant(G->getType(), Offset.getKnownMinValue());
+      if (Scalable)
+        NewOffset =
+            SE.getMulExpr(NewOffset, SE.getVScale(NewOffset->getType()));
+      const SCEV *NewG = SE.getAddExpr(NewOffset, G);
       // If it cancelled out, drop the base register, otherwise update it.
       if (NewG->isZero()) {
         if (IsScaledReg) {
@@ -4053,8 +4126,10 @@ void LSRInstance::GenerateConstantOffsetsImpl(
           StepInt.getSExtValue() : StepInt.getZExtValue();
 
         for (Immediate Offset : Worklist) {
-          Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
-          GenerateOffset(G, Offset);
+          if (!Offset.isScalable()) {
+            Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
+            GenerateOffset(G, Offset);
+          }
         }
       }
     }
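
In the GenerateOffset path above, a scalable offset is reattached to the base register as a multiply by vscale so the register expression and the immediate stay consistent. A rough textual model of the expression being built, with strings standing in for SCEV nodes (buildOffsetExpr and addToBase are illustrative helpers):

  #include <cassert>
  #include <string>

  // Builds a textual stand-in for the SCEV the patch creates: "C" for a fixed
  // offset, "(C * vscale)" for a scalable one, then adds it to the base G.
  std::string buildOffsetExpr(long long C, bool Scalable) {
    std::string Off = std::to_string(C);
    if (Scalable)
      Off = "(" + Off + " * vscale)";
    return Off;
  }

  std::string addToBase(const std::string &G, long long C, bool Scalable) {
    return "(" + buildOffsetExpr(C, Scalable) + " + " + G + ")";
  }

  int main() {
    assert(addToBase("%base", 16, true) == "((16 * vscale) + %base)");
    assert(addToBase("%base", 16, false) == "(16 + %base)");
  }
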
@@ -4063,11 +4138,13 @@ void LSRInstance::GenerateConstantOffsetsImpl(
     GenerateOffset(G, Offset);
 
   Immediate Imm = ExtractImmediate(G, SE);
-  if (G->isZero() || Imm.isZero())
+  if (G->isZero() || Imm.isZero() ||
+      Base.BaseOffset.isScalable() != Imm.isScalable())
     return;
   Formula F = Base;
-  F.BaseOffset = Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() +
-                                     Imm.getFixedValue());
+  F.BaseOffset = Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() +
+                                    Imm.getKnownMinValue(),
+                                Imm.isScalable());
   if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
     return;
   if (IsScaledReg) {
@@ -4438,23 +4515,39 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
 
       // Conservatively examine offsets between this orig reg a few selected
       // other orig regs.
-      int64_t First = Imms.begin()->first.getFixedValue();
-      int64_t Last = std::prev(Imms.end())->first.getFixedValue();
+      Immediate First = Imms.begin()->first;
+      Immediate Last = std::prev(Imms.end())->first;
+      if (First.isScalable() != Last.isScalable() && First.isNonZero() &&
+          Last.isNonZero()) {
+        LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
+                          << "\n");
+        continue;
+      }
+      // Only scalable if both terms are scalable, or if one is scalable and
+      // the other is 0.
+      bool Scalable = First.isScalable() || Last.isScalable();
+      int64_t FI = First.getKnownMinValue();
+      int64_t LI = Last.getKnownMinValue();
       // Compute (First + Last)  / 2 without overflow using the fact that
       // First + Last = 2 * (First & Last) + (First ^ Last).
-      int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
-      // If the result is negative and First is odd and Last even (or vice versa),
+      int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
+      // If the result is negative and FI is odd and LI even (or vice versa),
       // we rounded towards -inf. Add 1 in that case, to round towards 0.
-      Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
+      Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
       ImmMapTy::const_iterator OtherImms[] = {
           Imms.begin(), std::prev(Imms.end()),
-          Imms.lower_bound(Immediate::getFixed(Avg))};
+          Imms.lower_bound(Immediate::get(Avg, Scalable))};
       for (const auto &M : OtherImms) {
         if (M == J || M == JE) continue;
+        if (JImm.isScalable() != M->first.isScalable() && JImm.isNonZero() &&
+            M->first.isNonZero())
+          continue;
 
         // Compute the difference between the two.
-        Immediate Imm = Immediate::getFixed((uint64_t)JImm.getFixedValue() -
-                                            M->first.getFixedValue());
+        bool Scalable = JImm.isScalable() || M->first.isScalable();
+        Immediate Imm = Immediate::get((uint64_t)JImm.getKnownMinValue() -
+                                           M->first.getKnownMinValue(),
+                                       Scalable);
         for (unsigned LUIdx : UsedByIndices.set_bits())
           // Make a memo of this use, offset, and register tuple.
           if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
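
The average above relies on the overflow-free midpoint identity First + Last = 2 * (First & Last) + (First ^ Last), plus a parity fix-up so negative results round toward zero. A standalone worked version of the same expression:

  #include <cassert>
  #include <cstdint>

  // Overflow-free (A + B) / 2 for signed 64-bit values, rounding toward zero,
  // matching the expression used in GenerateCrossUseConstantOffsets.
  int64_t midpoint(int64_t A, int64_t B) {
    int64_t Avg = (A & B) + ((A ^ B) >> 1); // rounds toward -inf
    Avg += (A ^ B) & ((uint64_t)Avg >> 63); // add 1 back for odd negative cases
    return Avg;
  }

  int main() {
    assert(midpoint(4, 10) == 7);
    assert(midpoint(-3, 0) == -1); // toward zero, not -2
    assert(midpoint(INT64_MAX, INT64_MAX - 2) == INT64_MAX - 1);
  }
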
@@ -4477,7 +4570,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
 
     Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
     const SCEV *NegImmS =
-        SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm.getFixedValue()));
+        SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm.getKnownMinValue()));
+    if (Imm.isScalable())
+      NegImmS = SE.getMulExpr(NegImmS, SE.getVScale(NegImmS->getType()));
     unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
 
     // TODO: Use a more targeted data structure.
@@ -4490,12 +4585,20 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
       F.unscale();
       // Use the immediate in the scaled register.
       if (F.ScaledReg == OrigReg) {
+        if (F.BaseOffset.isScalable() != Imm.isScalable() &&
+            F.BaseOffset.isNonZero() && Imm.isNonZero())
+          continue;
+        bool Scalable = F.BaseOffset.isScalable() || Imm.isScalable();
         Immediate Offset =
-            Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() +
-                                Imm.getFixedValue() * (uint64_t)F.Scale);
+            Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() +
+                               Imm.getKnownMinValue() * (uint64_t)F.Scale,
+                           Scalable);
         // Don't create 50 + reg(-50).
-        if (F.referencesReg(SE.getSCEV(
-                ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue()))))
+        const SCEV *S = SE.getSCEV(
+            ConstantInt::get(IntTy, -(uint64_t)Offset.getKnownMinValue()));
+        if (Scalable)
+          S = SE.getMulExpr(S, SE.getVScale(S->getType()));
+        if (F.referencesReg(S))
           continue;
         Formula NewF = F;
         NewF.BaseOffset = Offset;
@@ -4524,21 +4627,29 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
           if (BaseReg != OrigReg)
             continue;
           Formula NewF = F;
-          NewF.BaseOffset = Immediate::getFixed(
-              (uint64_t)NewF.BaseOffset.getFixedValue() + Imm.getFixedValue());
+          if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
+              !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
+              !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
+            continue;
+          bool Scalable = NewF.BaseOffset.isScalable() || Imm.isScalable() ||
+                          NewF.UnfoldedOffset.isScalable();
+          NewF.BaseOffset =
+              Immediate::get((uint64_t)NewF.BaseOffset.getKnownMinValue() +
+                                 Imm.getKnownMinValue(),
+                             Scalable);
           if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
                           LU.Kind, LU.AccessTy, NewF)) {
             if (AMK == TTI::AMK_PostIndexed &&
                 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
               continue;
-            if (!TTI.isLegalAddImmediate(
-                    (uint64_t)NewF.UnfoldedOffset.getFixedValue() +
-                    Imm.getFixedValue()))
+            Immediate NewUnfoldedOffset = Immediate::get(
+                (uint64_t)NewF.UnfoldedOffset.getKnownMinValue() +
+                    Imm.getKnownMinValue(),
+                Scalable);
+            if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
               continue;
             NewF = F;
-            NewF.UnfoldedOffset = Immediate::getFixed(
-                (uint64_t)NewF.UnfoldedOffset.getFixedValue() +
-                Imm.getFixedValue());
+            NewF.UnfoldedOffset = NewUnfoldedOffset;
           }
           NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
 
@@ -5640,9 +5751,17 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
     Ops.push_back(SE.getUnknown(FullV));
   }
 
+  // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
+  // out at this point, or should we generate a SCEV adding together mixed
+  // offsets?
+  assert((F.BaseOffset.isScalable() == LF.Offset.isScalable() ||
+          F.BaseOffset.isZero() || LF.Offset.isZero()) &&
+         "Expanding mismatched offsets\n");
+  bool Scalable = F.BaseOffset.isScalable() || LF.Offset.isScalable();
   // Expand the immediate portion.
-  Immediate Offset = Immediate::getFixed(
-      (uint64_t)F.BaseOffset.getFixedValue() + LF.Offset.getFixedValue());
+  Immediate Offset = Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() +
+                                        LF.Offset.getKnownMinValue(),
+                                    Scalable);
   if (Offset.isNonZero()) {
     if (LU.Kind == LSRUse::ICmpZero) {
       // The other interesting way of "folding" with an ICmpZero is to use a
@@ -5657,17 +5776,23 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
     } else {
       // Just add the immediate values. These again are expected to be matched
       // as part of the address.
-      Ops.push_back(
-          SE.getUnknown(ConstantInt::getSigned(IntTy, Offset.getFixedValue())));
+      const SCEV *SU = SE.getUnknown(
+          ConstantInt::getSigned(IntTy, Offset.getKnownMinValue()));
+      if (Scalable)
+        SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
+      Ops.push_back(SU);
     }
   }
 
   // Expand the unfolded offset portion.
   Immediate UnfoldedOffset = F.UnfoldedOffset;
   if (UnfoldedOffset.isNonZero()) {
+    const SCEV *SU = SE.getUnknown(
+        ConstantInt::getSigned(IntTy, UnfoldedOffset.getKnownMinValue()));
+    if (UnfoldedOffset.isScalable())
+      SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
     // Just add the immediate values.
-    Ops.push_back(SE.getUnknown(
-        ConstantInt::getSigned(IntTy, UnfoldedOffset.getFixedValue())));
+    Ops.push_back(SU);
   }
 
   // Emit instructions summing all the operands.
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
index 367806ac438b0..483955c1c57a0 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -62,26 +62,22 @@ for.exit:
 define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i64 %count) #0 {
 ; COMMON-LABEL: many_mulvl1_addressing:
 ; COMMON:       // %bb.0: // %entry
-; COMMON-NEXT:    mov x9, x1
 ; COMMON-NEXT:    ptrue p0.b
-; COMMON-NEXT:    add x10, x0, x2
-; COMMON-NEXT:    inch x9
 ; COMMON-NEXT:    ptrue p1.h
-; COMMON-NEXT:    mov x8, xzr
-; COMMON-NEXT:    addvl x11, x10, #1
-; COMMON-NEXT:    addvl x12, x0, #1
 ; COMMON-NEXT:  .LBB1_1: // %for.body
 ; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
-; COMMON-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
-; COMMON-NEXT:    ld1b { z1.b }, p0/z, [x10, x8]
+; COMMON-NEXT:    add x8, x0, x2
+; COMMON-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; COMMON-NEXT:    ld1b { z1.b }, p0/z, [x0, x2]
+; COMMON-NEXT:    ld1b { z2.b }, p0/z, [x0, #1, mul vl]
+; COMMON-NEXT:    ld1b { z3.b }, p0/z, [x8, #1, mul vl]
 ; COMMON-NEXT:    subs x3, x3, #1
-; COMMON-NEXT:    ld1b { z2.b }, p0/z, [x12, x8]
-; COMMON-NEXT:    ld1b { z3.b }, p0/z, [x11, x8]
+; COMMON-NEXT:    addvl x0, x0, #2
 ; COMMON-NEXT:    add z0.b, z0.b, z1.b
 ; COMMON-NEXT:    add z1.b, z2.b, z3.b
-; COMMON-NEXT:    st1b { z0.h }, p1, [x1, x8]
-; COMMON-NEXT:    st1b { z1.h }, p1, [x9, x8]
-; COMMON-NEXT:    addvl x8, x8, #2
+; COMMON-NEXT:    st1b { z0.h }, p1, [x1]
+; COMMON-NEXT:    st1b { z1.h }, p1, [x1, #1, mul vl]
+; COMMON-NEXT:    addvl x1, x1, #2
 ; COMMON-NEXT:    b.ne .LBB1_1
 ; COMMON-NEXT:  // %bb.2: // %for.exit
 ; COMMON-NEXT:    ret
@@ -121,55 +117,21 @@ for.exit:
 }
 
 define void @fixed_iv_scalable_offset(ptr %src, ptr %dst, i64 %count) #0 {
-; BASE-LABEL: fixed_iv_scalable_offset:
-; BASE:       // %bb.0: // %entry
-; BASE-NEXT:    ptrue p0.s
-; BASE-NEXT:  .LBB2_1: // %for.body
-; BASE-NEXT:    // =>This Inner Loop Header: Depth=1
-; BASE-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; BASE-NEXT:    ld1w { z1.s }, p0/z, [x0, #4, mul vl]
-; BASE-NEXT:    subs x2, x2, #4
-; BASE-NEXT:    add x0, x0, #16
-; BASE-NEXT:    add z0.s, z0.s, z1.s
-; BASE-NEXT:    st1w { z0.s }, p0, [x1]
-; BASE-NEXT:    add x1, x1, #16
-; BASE-NEXT:    b.ne .LBB2_1
-; BASE-NEXT:  // %bb.2: // %for.exit
-; BASE-NEXT:    ret
-;
-; PREINDEX-LABEL: fixed_iv_scalable_offset:
-; PREINDEX:       // %bb.0: // %entry
-; PREINDEX-NEXT:    ptrue p0.s
-; PREINDEX-NEXT:  .LBB2_1: // %for.body
-; PREINDEX-NEXT:    // =>This Inner Loop Header: Depth=1
-; PREINDEX-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; PREINDEX-NEXT:    ld1w { z1.s }, p0/z, [x0, #4, mul vl]
-; PREINDEX-NEXT:    subs x2, x2, #4
-; PREINDEX-NEXT:    add x0, x0, #16
-; PREINDEX-NEXT:    add z0.s, z0.s, z1.s
-; PREINDEX-NEXT:    st1w { z0.s }, p0, [x1]
-; PREINDEX-NEXT:    add x1, x1, #16
-; PREINDEX-NEXT:    b.ne .LBB2_1
-; PREINDEX-NEXT:  // %bb.2: // %for.exit
-; PREINDEX-NEXT:    ret
-;
-; POSTINDEX-LABEL: fixed_iv_scalable_offset:
-; POSTINDEX:       // %bb.0: // %entry
-; POSTINDEX-NEXT:    ptrue p0.s
-; POSTINDEX-NEXT:    addvl x8, x0, #4
-; POSTINDEX-NEXT:  .LBB2_1: // %for.body
-; POSTINDEX-NEXT:    // =>This Inner Loop Header: Depth=1
-; POSTINDEX-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; POSTINDEX-NEXT:    ld1w { z1.s }, p0/z, [x8]
-; POSTINDEX-NEXT:    subs x2, x2, #4
-; POSTINDEX-NEXT:    add x8, x8, #16
-; POSTINDEX-NEXT:    add x0, x0, #16
-; POSTINDEX-NEXT:    add z0.s, z0.s, z1.s
-; POSTINDEX-NEXT:    st1w { z0.s }, p0, [x1]
-; POSTINDEX-NEXT:    add x1, x1, #16
-; POSTINDEX-NEXT:    b.ne .LBB2_1
-; POSTINDEX-NEXT:  // %bb.2: // %for.exit
-; POSTINDEX-NEXT:    ret
+; COMMON-LABEL: fixed_iv_scalable_offset:
+; COMMON:       // %bb.0: // %entry
+; COMMON-NEXT:    ptrue p0.s
+; COMMON-NEXT:  .LBB2_1: // %for.body
+; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; COMMON-NEXT:    ld1w { z1.s }, p0/z, [x0, #4, mul vl]
+; COMMON-NEXT:    subs x2, x2, #4
+; COMMON-NEXT:    add x0, x0, #16
+; COMMON-NEXT:    add z0.s, z0.s, z1.s
+; COMMON-NEXT:    st1w { z0.s }, p0, [x1]
+; COMMON-NEXT:    add x1, x1, #16
+; COMMON-NEXT:    b.ne .LBB2_1
+; COMMON-NEXT:  // %bb.2: // %for.exit
+; COMMON-NEXT:    ret
 entry:
   %vscale = tail call i64 @llvm.vscale.i64()
   %mul = shl nuw nsw i64 %vscale, 4
@@ -193,26 +155,66 @@ for.exit:
 }
 
 define void @mixed_offsets_scalable_then_fixed(ptr %src, ptr %dst, i64 %count) #0 {
-; COMMON-LABEL: mixed_offsets_scalable_then_fixed:
-; COMMON:       // %bb.0: // %entry
-; COMMON-NEXT:    ptrue p0.s
-; COMMON-NEXT:    mov x8, xzr
-; COMMON-NEXT:    addvl x9, x0, #4
-; COMMON-NEXT:    mov x10, #8 // =0x8
-; COMMON-NEXT:  .LBB3_1: // %for.body
-; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
-; COMMON-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; COMMON-NEXT:    ld1w { z1.s }, p0/z, [x9, x8, lsl #2]
-; COMMON-NEXT:    add x11, x9, x8, lsl #2
-; COMMON-NEXT:    ld1w { z2.s }, p0/z, [x11, x10, lsl #2]
-; COMMON-NEXT:    add z0.s, z0.s, z1.s
-; COMMON-NEXT:    add z0.s, z0.s, z2.s
-; COMMON-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
-; COMMON-NEXT:    incw x8
-; COMMON-NEXT:    cmp x2, x8
-; COMMON-NEXT:    b.ne .LBB3_1
-; COMMON-NEXT:  // %bb.2: // %for.exit
-; COMMON-NEXT:    ret
+; BASE-LABEL: mixed_offsets_scalable_then_fixed:
+; BASE:       // %bb.0: // %entry
+; BASE-NEXT:    ptrue p0.s
+; BASE-NEXT:    addvl x8, x0, #4
+; BASE-NEXT:    mov x9, #8 // =0x8
+; BASE-NEXT:  .LBB3_1: // %for.body
+; BASE-NEXT:    // =>This Inner Loop Header: Depth=1
+; BASE-NEXT:    ld1w { z0.s }, p0/z, [x8, #-4, mul vl]
+; BASE-NEXT:    ld1w { z1.s }, p0/z, [x8]
+; BASE-NEXT:    decw x2
+; BASE-NEXT:    ld1w { z2.s }, p0/z, [x8, x9, lsl #2]
+; BASE-NEXT:    addvl x8, x8, #1
+; BASE-NEXT:    add z0.s, z0.s, z1.s
+; BASE-NEXT:    add z0.s, z0.s, z2.s
+; BASE-NEXT:    st1w { z0.s }, p0, [x1]
+; BASE-NEXT:    addvl x1, x1, #1
+; BASE-NEXT:    cbnz x2, .LBB3_1
+; BASE-NEXT:  // %bb.2: // %for.exit
+; BASE-NEXT:    ret
+;
+; PREINDEX-LABEL: mixed_offsets_scalable_then_fixed:
+; PREINDEX:       // %bb.0: // %entry
+; PREINDEX-NEXT:    ptrue p0.s
+; PREINDEX-NEXT:    addvl x8, x0, #4
+; PREINDEX-NEXT:    mov x9, #8 // =0x8
+; PREINDEX-NEXT:  .LBB3_1: // %for.body
+; PREINDEX-NEXT:    // =>This Inner Loop Header: Depth=1
+; PREINDEX-NEXT:    ld1w { z0.s }, p0/z, [x8, #-4, mul vl]
+; PREINDEX-NEXT:    ld1w { z1.s }, p0/z, [x8]
+; PREINDEX-NEXT:    decw x2
+; PREINDEX-NEXT:    ld1w { z2.s }, p0/z, [x8, x9, lsl #2]
+; PREINDEX-NEXT:    addvl x8, x8, #1
+; PREINDEX-NEXT:    add z0.s, z0.s, z1.s
+; PREINDEX-NEXT:    add z0.s, z0.s, z2.s
+; PREINDEX-NEXT:    st1w { z0.s }, p0, [x1]
+; PREINDEX-NEXT:    addvl x1, x1, #1
+; PREINDEX-NEXT:    cbnz x2, .LBB3_1
+; PREINDEX-NEXT:  // %bb.2: // %for.exit
+; PREINDEX-NEXT:    ret
+;
+; POSTINDEX-LABEL: mixed_offsets_scalable_then_fixed:
+; POSTINDEX:       // %bb.0: // %entry
+; POSTINDEX-NEXT:    ptrue p0.s
+; POSTINDEX-NEXT:    mov x8, xzr
+; POSTINDEX-NEXT:    addvl x9, x0, #4
+; POSTINDEX-NEXT:    mov x10, #8 // =0x8
+; POSTINDEX-NEXT:  .LBB3_1: // %for.body
+; POSTINDEX-NEXT:    // =>This Inner Loop Header: Depth=1
+; POSTINDEX-NEXT:    ld1w { z0.s }, p0/z, [x9, #-4, mul vl]
+; POSTINDEX-NEXT:    ld1w { z1.s }, p0/z, [x9]
+; POSTINDEX-NEXT:    ld1w { z2.s }, p0/z, [x9, x10, lsl #2]
+; POSTINDEX-NEXT:    addvl x9, x9, #1
+; POSTINDEX-NEXT:    add z0.s, z0.s, z1.s
+; POSTINDEX-NEXT:    add z0.s, z0.s, z2.s
+; POSTINDEX-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; POSTINDEX-NEXT:    incw x8
+; POSTINDEX-NEXT:    cmp x2, x8
+; POSTINDEX-NEXT:    b.ne .LBB3_1
+; POSTINDEX-NEXT:  // %bb.2: // %for.exit
+; POSTINDEX-NEXT:    ret
 entry:
   %vscale = tail call i64 @llvm.vscale.i64()
   %mul = shl nuw nsw i64 %vscale, 4
@@ -295,25 +297,66 @@ for.exit:
 ;; base (but in range of the other).
 ;;
 define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 {
-; COMMON-LABEL: three_access_wide_gap:
-; COMMON:       // %bb.0: // %entry
-; COMMON-NEXT:    ptrue p0.s
-; COMMON-NEXT:    mov x8, xzr
-; COMMON-NEXT:    addvl x9, x0, #8
-; COMMON-NEXT:    addvl x10, x0, #4
-; COMMON-NEXT:  .LBB5_1: // %for.body
-; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
-; COMMON-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; COMMON-NEXT:    ld1w { z1.s }, p0/z, [x10, x8, lsl #2]
-; COMMON-NEXT:    ld1w { z2.s }, p0/z, [x9, x8, lsl #2]
-; COMMON-NEXT:    add z0.s, z0.s, z1.s
-; COMMON-NEXT:    add z0.s, z0.s, z2.s
-; COMMON-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
-; COMMON-NEXT:    incw x8
-; COMMON-NEXT:    cmp x2, x8
-; COMMON-NEXT:    b.ne .LBB5_1
-; COMMON-NEXT:  // %bb.2: // %for.exit
-; COMMON-NEXT:    ret
+; BASE-LABEL: three_access_wide_gap:
+; BASE:       // %bb.0: // %entry
+; BASE-NEXT:    ptrue p0.s
+; BASE-NEXT:    rdvl x8, #8
+; BASE-NEXT:    ptrue p1.b
+; BASE-NEXT:  .LBB5_1: // %for.body
+; BASE-NEXT:    // =>This Inner Loop Header: Depth=1
+; BASE-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; BASE-NEXT:    ld1w { z1.s }, p0/z, [x0, #4, mul vl]
+; BASE-NEXT:    decw x2
+; BASE-NEXT:    ld1b { z2.b }, p1/z, [x0, x8]
+; BASE-NEXT:    addvl x0, x0, #1
+; BASE-NEXT:    add z0.s, z0.s, z1.s
+; BASE-NEXT:    add z0.s, z0.s, z2.s
+; BASE-NEXT:    st1w { z0.s }, p0, [x1]
+; BASE-NEXT:    addvl x1, x1, #1
+; BASE-NEXT:    cbnz x2, .LBB5_1
+; BASE-NEXT:  // %bb.2: // %for.exit
+; BASE-NEXT:    ret
+;
+; PREINDEX-LABEL: three_access_wide_gap:
+; PREINDEX:       // %bb.0: // %entry
+; PREINDEX-NEXT:    ptrue p0.s
+; PREINDEX-NEXT:    rdvl x8, #8
+; PREINDEX-NEXT:    ptrue p1.b
+; PREINDEX-NEXT:  .LBB5_1: // %for.body
+; PREINDEX-NEXT:    // =>This Inner Loop Header: Depth=1
+; PREINDEX-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; PREINDEX-NEXT:    ld1w { z1.s }, p0/z, [x0, #4, mul vl]
+; PREINDEX-NEXT:    decw x2
+; PREINDEX-NEXT:    ld1b { z2.b }, p1/z, [x0, x8]
+; PREINDEX-NEXT:    addvl x0, x0, #1
+; PREINDEX-NEXT:    add z0.s, z0.s, z1.s
+; PREINDEX-NEXT:    add z0.s, z0.s, z2.s
+; PREINDEX-NEXT:    st1w { z0.s }, p0, [x1]
+; PREINDEX-NEXT:    addvl x1, x1, #1
+; PREINDEX-NEXT:    cbnz x2, .LBB5_1
+; PREINDEX-NEXT:  // %bb.2: // %for.exit
+; PREINDEX-NEXT:    ret
+;
+; POSTINDEX-LABEL: three_access_wide_gap:
+; POSTINDEX:       // %bb.0: // %entry
+; POSTINDEX-NEXT:    ptrue p0.s
+; POSTINDEX-NEXT:    mov x8, xzr
+; POSTINDEX-NEXT:    rdvl x9, #8
+; POSTINDEX-NEXT:    ptrue p1.b
+; POSTINDEX-NEXT:  .LBB5_1: // %for.body
+; POSTINDEX-NEXT:    // =>This Inner Loop Header: Depth=1
+; POSTINDEX-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; POSTINDEX-NEXT:    ld1w { z1.s }, p0/z, [x0, #4, mul vl]
+; POSTINDEX-NEXT:    ld1b { z2.b }, p1/z, [x0, x9]
+; POSTINDEX-NEXT:    addvl x0, x0, #1
+; POSTINDEX-NEXT:    add z0.s, z0.s, z1.s
+; POSTINDEX-NEXT:    add z0.s, z0.s, z2.s
+; POSTINDEX-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; POSTINDEX-NEXT:    incw x8
+; POSTINDEX-NEXT:    cmp x2, x8
+; POSTINDEX-NEXT:    b.ne .LBB5_1
+; POSTINDEX-NEXT:  // %bb.2: // %for.exit
+; POSTINDEX-NEXT:    ret
 entry:
   %vscale = tail call i64 @llvm.vscale.i64()
   %mul = mul nuw nsw i64 %vscale, 16

>From f5e855534faafb6ca48a17576ac54d3b3603af6f Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 18 Jun 2024 09:44:06 +0000
Subject: [PATCH 4/4] Add isFixed, getZero, getScalableFixedMin/Max convenience
 methods

---
 llvm/include/llvm/Support/TypeSize.h          |   3 +
 .../Transforms/Scalar/LoopStrengthReduce.cpp  | 259 +++++++++---------
 2 files changed, 140 insertions(+), 122 deletions(-)

diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h
index c6779e258be7c..bae833ecca7d4 100644
--- a/llvm/include/llvm/Support/TypeSize.h
+++ b/llvm/include/llvm/Support/TypeSize.h
@@ -170,6 +170,9 @@ template <typename LeafTy, typename ValueTy> class FixedOrScalableQuantity {
   /// Returns whether the quantity is scaled by a runtime quantity (vscale).
   constexpr bool isScalable() const { return Scalable; }
 
+  /// Returns true if the quantity is not scaled by vscale.
+  constexpr bool isFixed() const { return !Scalable; }
+
   /// A return value of true indicates we know at compile time that the number
   /// of elements (vscale * Min) is definitely even. However, returning false
   /// does not guarantee that the total number of elements is odd.
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 6fd1685a8fa76..810c6b68032fa 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -265,16 +265,29 @@ class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
       : FixedOrScalableQuantity(V) {}
 
 public:
-  constexpr Immediate() : FixedOrScalableQuantity() {}
+  constexpr Immediate() = delete;
 
   static constexpr Immediate getFixed(ScalarTy MinVal) {
-    return Immediate(MinVal, false);
+    return {MinVal, false};
   }
   static constexpr Immediate getScalable(ScalarTy MinVal) {
-    return Immediate(MinVal, true);
+    return {MinVal, true};
   }
   static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
-    return Immediate(MinVal, Scalable);
+    return {MinVal, Scalable};
+  }
+  static constexpr Immediate getZero() { return {0, false}; }
+  static constexpr Immediate getFixedMin() {
+    return {std::numeric_limits<int64_t>::min(), false};
+  }
+  static constexpr Immediate getFixedMax() {
+    return {std::numeric_limits<int64_t>::max(), false};
+  }
+  static constexpr Immediate getScalableMin() {
+    return {std::numeric_limits<int64_t>::min(), true};
+  }
+  static constexpr Immediate getScalableMax() {
+    return {std::numeric_limits<int64_t>::max(), true};
   }
 
   constexpr bool isLessThanZero() const { return Quantity < 0; }
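
Deleting the default constructor forces every Immediate to say at creation whether it is fixed or scalable, and the named factories keep that explicit without repeating numeric limits at each call site. A minimal standalone version of the pattern; SimpleImm is a cut-down model, not the patch's full class:

  #include <cassert>
  #include <limits>

  // Cut-down model of the patch's Immediate: construction always states
  // whether the value is fixed or vscale-scaled.
  class SimpleImm {
    long long Quantity;
    bool Scalable;
    constexpr SimpleImm(long long Q, bool S) : Quantity(Q), Scalable(S) {}

  public:
    SimpleImm() = delete; // no "unknown kind" immediates

    static constexpr SimpleImm getFixed(long long V) { return {V, false}; }
    static constexpr SimpleImm getScalable(long long V) { return {V, true}; }
    static constexpr SimpleImm getZero() { return {0, false}; }
    static constexpr SimpleImm getFixedMax() {
      return {std::numeric_limits<long long>::max(), false};
    }

    constexpr long long value() const { return Quantity; }
    constexpr bool isScalable() const { return Scalable; }
  };

  int main() {
    constexpr SimpleImm Off = SimpleImm::getScalable(16);
    static_assert(Off.isScalable() && Off.value() == 16, "factory check");
    assert(SimpleImm::getZero().value() == 0);
  }
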
@@ -292,6 +305,47 @@ class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
   constexpr bool isMax() const {
     return Quantity == std::numeric_limits<ScalarTy>::max();
   }
+
+  // Arithmetic 'operators' that cast to unsigned types first.
+  constexpr Immediate addUnsigned(const Immediate &RHS) const {
+    assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
+    ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
+    return {Value, Scalable || RHS.isScalable()};
+  }
+
+  constexpr Immediate subUnsigned(const Immediate &RHS) const {
+    assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
+    ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
+    return {Value, Scalable || RHS.isScalable()};
+  }
+
+  // Scale the quantity by a constant without caring about runtime scalability.
+  constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
+    ScalarTy Value = (uint64_t)Quantity * RHS;
+    return {Value, Scalable};
+  }
+
+  // Helpers for generating SCEVs with vscale terms where needed.
+  const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
+    const SCEV *S = SE.getConstant(Ty, Quantity);
+    if (Scalable)
+      S = SE.getMulExpr(S, SE.getVScale(S->getType()));
+    return S;
+  }
+
+  const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
+    const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
+    if (Scalable)
+      NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
+    return NegS;
+  }
+
+  const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
+    const SCEV *SU = SE.getUnknown(ConstantInt::getSigned(Ty, Quantity));
+    if (Scalable)
+      SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
+    return SU;
+  }
 };
 
 // This is needed for the Compare type of std::map when Immediate is used
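
The addUnsigned/subUnsigned/mulUnsigned helpers do their arithmetic through uint64_t, matching the (uint64_t) casts they replace, so wraparound stays defined behaviour rather than signed overflow. A standalone illustration of that casting idiom (function names are mine):

  #include <cassert>
  #include <cstdint>

  // Signed add/mul performed through uint64_t so that wraparound is defined
  // modular arithmetic, mirroring Immediate::addUnsigned / mulUnsigned.
  int64_t addWrapping(int64_t A, int64_t B) {
    return (int64_t)((uint64_t)A + (uint64_t)B); // two's complement wrap
  }

  int64_t mulWrapping(int64_t A, int64_t B) {
    return (int64_t)((uint64_t)A * (uint64_t)B);
  }

  int main() {
    assert(addWrapping(40, 2) == 42);
    assert(addWrapping(INT64_MAX, 1) == INT64_MIN); // wraps instead of UB
    assert(mulWrapping(-4, 3) == -12);
  }
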
@@ -431,7 +485,7 @@ struct Formula {
   GlobalValue *BaseGV = nullptr;
 
   /// Base offset for complex addressing.
-  Immediate BaseOffset;
+  Immediate BaseOffset = Immediate::getZero();
 
   /// Whether any complex addressing has a base register.
   bool HasBaseReg = false;
@@ -462,7 +516,7 @@ struct Formula {
   /// An additional constant offset which added near the use. This requires a
   /// temporary register, but the offset itself can live in an add immediate
   /// field rather than a register.
-  Immediate UnfoldedOffset;
+  Immediate UnfoldedOffset = Immediate::getZero();
 
   Formula() = default;
 
@@ -899,7 +953,7 @@ static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
           S = SE.getConstant(M->getType(), 0);
           return Immediate::getScalable(C->getValue()->getSExtValue());
         }
-  return Immediate();
+  return Immediate::getZero();
 }
 
 /// If S involves the addition of a GlobalValue address, return that symbol, and
@@ -1214,7 +1268,7 @@ struct LSRFixup {
   /// A constant offset to be added to the LSRUse expression.  This allows
   /// multiple fixups to share the same LSRUse with different offsets, for
   /// example in an unrolled loop.
-  Immediate Offset;
+  Immediate Offset = Immediate::getZero();
 
   LSRFixup() = default;
 
@@ -1277,10 +1331,8 @@ class LSRUse {
   SmallVector<LSRFixup, 8> Fixups;
 
   /// Keep track of the min and max offsets of the fixups.
-  Immediate MinOffset =
-      Immediate::getFixed(std::numeric_limits<int64_t>::max());
-  Immediate MaxOffset =
-      Immediate::getFixed(std::numeric_limits<int64_t>::min());
+  Immediate MinOffset = Immediate::getFixedMax();
+  Immediate MaxOffset = Immediate::getFixedMin();
 
   /// This records whether all of the fixups using this LSRUse are outside of
   /// the loop, in which case some special-case heuristics may be used.
@@ -1338,8 +1390,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
                                  GlobalValue *BaseGV, Immediate BaseOffset,
                                  bool HasBaseReg, int64_t Scale,
-                                 Instruction *Fixup = nullptr,
-                                 int64_t ScalableOffset = 0);
+                                 Instruction *Fixup = nullptr);
 
 static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
   if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
@@ -1391,7 +1442,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
 
       // If the step size matches the base offset, we could use pre-indexed
       // addressing.
-      if (AMK == TTI::AMK_PreIndexed && !F.BaseOffset.isScalable()) {
+      if (AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed()) {
         if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
           if (Step->getAPInt() == F.BaseOffset.getFixedValue())
             LoopCost = 0;
@@ -1491,25 +1542,25 @@ void Cost::RateFormula(const Formula &F,
 
   // Tally up the non-zero immediates.
   for (const LSRFixup &Fixup : LU.Fixups) {
-    // FIXME: We probably want to noticeably increase the cost if the
-    // two offsets differ in scalability?
-    bool Scalable = Fixup.Offset.isScalable() || F.BaseOffset.isScalable();
-    int64_t O = Fixup.Offset.getKnownMinValue();
-    Immediate Offset = Immediate::get(
-        (uint64_t)(O) + F.BaseOffset.getKnownMinValue(), Scalable);
-    if (F.BaseGV)
-      C.ImmCost += 64; // Handle symbolic values conservatively.
-                     // TODO: This should probably be the pointer size.
-    else if (Offset.isNonZero())
-      C.ImmCost +=
-          APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
-
-    // Check with target if this offset with this instruction is
-    // specifically not supported.
-    if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
-        !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
-                              Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
-      C.NumBaseAdds++;
+    if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
+      Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
+      if (F.BaseGV)
+        C.ImmCost += 64; // Handle symbolic values conservatively.
+                         // TODO: This should probably be the pointer size.
+      else if (Offset.isNonZero())
+        C.ImmCost +=
+            APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
+
+      // Check with target if this offset with this instruction is
+      // specifically not supported.
+      if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
+          !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
+                                Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
+        C.NumBaseAdds++;
+    } else {
+      // Incompatible immediate type; increase the cost to avoid using it.
+      C.ImmCost += 2048;
+    }
   }
 
   // If we don't count instruction cost exit here.
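
Rather than rejecting a formula whose fixup offset and base offset mix fixed and scalable parts, the reworked cost model makes that combination very expensive. A small sketch of this soft rejection; the 2048 figure comes from the patch, everything else is a simplified model:

  #include <cassert>

  struct Imm {
    long long Quantity;
    bool Scalable;
  };

  bool compatible(const Imm &A, const Imm &B) {
    return A.Quantity == 0 || B.Quantity == 0 || A.Scalable == B.Scalable;
  }

  // Usable combinations get a small placeholder cost; incompatible ones are
  // penalised heavily so the solver steers away from them.
  long long rateImmediate(const Imm &FixupOff, const Imm &BaseOff) {
    if (compatible(FixupOff, BaseOff))
      return 1; // stands in for the significant-bits based cost
    return 2048;
  }

  int main() {
    assert(rateImmediate({16, true}, {0, false}) == 1);
    assert(rateImmediate({16, true}, {8, false}) == 2048);
  }
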
@@ -1763,8 +1814,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
                                  GlobalValue *BaseGV, Immediate BaseOffset,
                                  bool HasBaseReg, int64_t Scale,
-                                 Instruction *Fixup /* = nullptr */,
-                                 int64_t ScalableOffset) {
+                                 Instruction *Fixup /* = nullptr */) {
   switch (Kind) {
   case LSRUse::Address: {
     int64_t FixedOffset =
@@ -1778,7 +1828,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
   case LSRUse::ICmpZero:
     // There's not even a target hook for querying whether it would be legal to
     // fold a GV into an ICmp.
-    if (BaseGV || ScalableOffset != 0)
+    if (BaseGV)
       return false;
 
     // ICmp only has two operands; don't allow more than two non-trivial parts.
@@ -1961,7 +2011,7 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
                              LSRUse::KindType Kind, MemAccessTy AccessTy,
                              GlobalValue *BaseGV, Immediate BaseOffset,
-                             bool HasBaseReg, int64_t ScalableOffset = 0) {
+                             bool HasBaseReg) {
   // Fast-path: zero is always foldable.
   if (BaseOffset.isZero() && !BaseGV)
     return true;
@@ -1987,7 +2037,7 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
     Scale = 0;
 
   return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
-                              HasBaseReg, Scale, nullptr, ScalableOffset);
+                              HasBaseReg, Scale);
 }
 
 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
@@ -3304,14 +3354,13 @@ void LSRInstance::FinalizeChain(IVChain &Chain) {
 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
                              Value *Operand, const TargetTransformInfo &TTI) {
   const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
-  int64_t IncOffset = 0;
-  int64_t ScalableOffset = 0;
+  Immediate IncOffset = Immediate::getZero();
   if (IncConst) {
     if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
       return false;
-    IncOffset = IncConst->getValue()->getSExtValue();
+    IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
   } else {
-    // Look for mul(vscale, constant), to detect ScalableOffset.
+    // Look for mul(vscale, constant), to detect a scalable offset.
     auto *IncVScale = dyn_cast<SCEVMulExpr>(IncExpr);
     if (!IncVScale || IncVScale->getNumOperands() != 2 ||
         !isa<SCEVVScale>(IncVScale->getOperand(1)))
@@ -3319,7 +3368,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
     auto *Scale = dyn_cast<SCEVConstant>(IncVScale->getOperand(0));
     if (!Scale || Scale->getType()->getScalarSizeInBits() > 64)
       return false;
-    ScalableOffset = Scale->getValue()->getSExtValue();
+    IncOffset = Immediate::getScalable(Scale->getValue()->getSExtValue());
   }
 
   if (!isAddressUse(TTI, UserInst, Operand))
@@ -3327,8 +3376,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
 
   MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
   if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
-                        Immediate::getFixed(IncOffset), /*HasBaseReg=*/false,
-                        ScalableOffset))
+                        IncOffset, /*HasBaseReg=*/false))
     return false;
 
   return true;
@@ -3911,6 +3959,9 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
       continue;
     Formula F = Base;
 
+    if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
+      continue;
+
     // Add the remaining pieces of the add back into the new formula.
     const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
     if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
@@ -4026,7 +4077,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
 
   // If we have an unfolded offset, generate a formula combining it with the
   // registers collected.
-  if (NewBase.UnfoldedOffset) {
+  if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
     assert(CombinedIntegerType && "Missing a type for the unfolded offset");
     Ops.push_back(SE.getConstant(CombinedIntegerType,
                                  NewBase.UnfoldedOffset.getFixedValue(), true));
@@ -4074,21 +4125,13 @@ void LSRInstance::GenerateConstantOffsetsImpl(
 
   auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
     Formula F = Base;
-    if (Base.BaseOffset.isScalable() != Offset.isScalable() &&
-        Base.BaseOffset.isNonZero() && Offset.isNonZero())
+    if (!Base.BaseOffset.isCompatibleImmediate(Offset))
       return;
-    bool Scalable = Base.BaseOffset.isScalable() || Offset.isScalable();
-    F.BaseOffset = Immediate::get((uint64_t)Base.BaseOffset.getKnownMinValue() -
-                                      Offset.getKnownMinValue(),
-                                  Scalable);
+    F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
 
     if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
       // Add the offset to the base register.
-      const SCEV *NewOffset =
-          SE.getConstant(G->getType(), Offset.getKnownMinValue());
-      if (Scalable)
-        NewOffset =
-            SE.getMulExpr(NewOffset, SE.getVScale(NewOffset->getType()));
+      const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
       const SCEV *NewG = SE.getAddExpr(NewOffset, G);
       // If it cancelled out, drop the base register, otherwise update it.
       if (NewG->isZero()) {
@@ -4126,7 +4169,7 @@ void LSRInstance::GenerateConstantOffsetsImpl(
           StepInt.getSExtValue() : StepInt.getZExtValue();
 
         for (Immediate Offset : Worklist) {
-          if (!Offset.isScalable()) {
+          if (Offset.isFixed()) {
             Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
             GenerateOffset(G, Offset);
           }
@@ -4139,12 +4182,10 @@ void LSRInstance::GenerateConstantOffsetsImpl(
 
   Immediate Imm = ExtractImmediate(G, SE);
   if (G->isZero() || Imm.isZero() ||
-      Base.BaseOffset.isScalable() != Imm.isScalable())
+      !Base.BaseOffset.isCompatibleImmediate(Imm))
     return;
   Formula F = Base;
-  F.BaseOffset = Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() +
-                                    Imm.getKnownMinValue(),
-                                Imm.isScalable());
+  F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
   if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
     return;
   if (IsScaledReg) {
@@ -4205,8 +4246,10 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
     // Check that the multiplication doesn't overflow.
     if (Base.BaseOffset.isMin() && Factor == -1)
       continue;
-    Immediate NewBaseOffset =
-        Immediate::getFixed((uint64_t)Base.BaseOffset.getFixedValue() * Factor);
+    // Not supporting scalable immediates.
+    if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
+      continue;
+    Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
     assert(Factor != 0 && "Zero factor not expected!");
     if (NewBaseOffset.getFixedValue() / Factor !=
         Base.BaseOffset.getFixedValue())
@@ -4220,7 +4263,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
     Immediate Offset = LU.MinOffset;
     if (Offset.isMin() && Factor == -1)
       continue;
-    Offset = Immediate::getFixed((uint64_t)Offset.getFixedValue() * Factor);
+    Offset = Offset.mulUnsigned(Factor);
     if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
       continue;
     // If the offset will be truncated at this use, check that it is in bounds.
@@ -4236,9 +4279,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
       continue;
 
     // Compensate for the use having MinOffset built into it.
-    F.BaseOffset = Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() +
-                                       Offset.getFixedValue() -
-                                       LU.MinOffset.getFixedValue());
+    F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
 
     const SCEV *FactorS = SE.getConstant(IntTy, Factor);
 
@@ -4260,8 +4301,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
     if (F.UnfoldedOffset.isNonZero()) {
       if (F.UnfoldedOffset.isMin() && Factor == -1)
         continue;
-      F.UnfoldedOffset = Immediate::getFixed(
-          (uint64_t)F.UnfoldedOffset.getFixedValue() * Factor);
+      F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
       if (F.UnfoldedOffset.getFixedValue() / Factor !=
           Base.UnfoldedOffset.getFixedValue())
         continue;
@@ -4517,8 +4557,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
       // other orig regs.
       Immediate First = Imms.begin()->first;
       Immediate Last = std::prev(Imms.end())->first;
-      if (First.isScalable() != Last.isScalable() && First.isNonZero() &&
-          Last.isNonZero()) {
+      if (!First.isCompatibleImmediate(Last)) {
         LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
                           << "\n");
         continue;
@@ -4539,15 +4578,11 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
           Imms.lower_bound(Immediate::get(Avg, Scalable))};
       for (const auto &M : OtherImms) {
         if (M == J || M == JE) continue;
-        if (JImm.isScalable() != M->first.isScalable() && JImm.isNonZero() &&
-            M->first.isNonZero())
+        if (!JImm.isCompatibleImmediate(M->first))
           continue;
 
         // Compute the difference between the two.
-        bool Scalable = JImm.isScalable() || M->first.isScalable();
-        Immediate Imm = Immediate::get((uint64_t)JImm.getKnownMinValue() -
-                                           M->first.getKnownMinValue(),
-                                       Scalable);
+        Immediate Imm = JImm.subUnsigned(M->first);
         for (unsigned LUIdx : UsedByIndices.set_bits())
           // Make a memo of this use, offset, and register tuple.
           if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
@@ -4569,10 +4604,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
     const SCEV *OrigReg = WI.OrigReg;
 
     Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
-    const SCEV *NegImmS =
-        SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm.getKnownMinValue()));
-    if (Imm.isScalable())
-      NegImmS = SE.getMulExpr(NegImmS, SE.getVScale(NegImmS->getType()));
+    const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
     unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
 
     // TODO: Use a more targeted data structure.
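
subUnsigned above presumably mirrors addUnsigned with a wrapping subtract
on the known-minimum values. getNegativeSCEV, going by the lines it
replaces, packages the negate-then-optionally-multiply-by-vscale expansion
that used to be open coded here. A sketch of the likely body (assumed; the
real definition lives in the earlier part of this patch, and it relies on
the ScalarEvolution API already available in LoopStrengthReduce.cpp):

    // Build a SCEV for the negated immediate; for vscale-relative
    // immediates, multiply the negated known-minimum value by vscale.
    const SCEV *Immediate::getNegativeSCEV(ScalarEvolution &SE,
                                           Type *IntTy) const {
      const SCEV *S = SE.getSCEV(
          ConstantInt::get(IntTy, -(uint64_t)getKnownMinValue()));
      if (isScalable())
        S = SE.getMulExpr(S, SE.getVScale(S->getType()));
      return S;
    }
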
@@ -4585,19 +4617,11 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
       F.unscale();
       // Use the immediate in the scaled register.
       if (F.ScaledReg == OrigReg) {
-        if (F.BaseOffset.isScalable() != Imm.isScalable() &&
-            F.BaseOffset.isNonZero() && Imm.isNonZero())
+        if (!F.BaseOffset.isCompatibleImmediate(Imm))
           continue;
-        bool Scalable = F.BaseOffset.isScalable() || Imm.isScalable();
-        Immediate Offset =
-            Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() +
-                               Imm.getKnownMinValue() * (uint64_t)F.Scale,
-                           Scalable);
+        Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
         // Don't create 50 + reg(-50).
-        const SCEV *S = SE.getSCEV(
-            ConstantInt::get(IntTy, -(uint64_t)Offset.getKnownMinValue()));
-        if (Scalable)
-          S = SE.getMulExpr(S, SE.getVScale(S->getType()));
+        const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
         if (F.referencesReg(S))
           continue;
         Formula NewF = F;
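
As a concrete illustration of the scaled-register path above (invented
numbers): with F.BaseOffset = 16 * vscale, Imm = 16 * vscale and
F.Scale = 3, the compatibility check passes since both are scalable,
Imm.mulUnsigned(3) gives 48 * vscale, addUnsigned yields
Offset = 64 * vscale, and getNegativeSCEV builds a (-64 * vscale)
expression so the "50 + reg(-50)" style self-reference can now be
rejected for scalable offsets as well.
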
@@ -4610,12 +4634,18 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
         // If the new scale is a constant in a register, and adding the constant
         // value to the immediate would produce a value closer to zero than the
         // immediate itself, then the formula isn't worthwhile.
-        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
+        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
+          // FIXME: Do we need to do something for scalable immediates here?
+          //        A scalable SCEV won't be constant, but we might still have
+          //        something in the offset? Bail out for now to be safe.
+          if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
+            continue;
           if (C->getValue()->isNegative() !=
                   (NewF.BaseOffset.isLessThanZero()) &&
               (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
                   .ule(std::abs(NewF.BaseOffset.getFixedValue())))
             continue;
+        }
 
         // OK, looks good.
         NewF.canonicalize(*this->L);
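
For the heuristic guarded by the FIXME above (invented numbers): with a
register constant C = -4, F.Scale = 2 and a fixed NewF.BaseOffset of 16,
the signs differ and |C| * Scale = 8 <= 16, so the formula is dropped as
not worthwhile. A vscale-relative BaseOffset such as 16 * vscale has no
fixed value to feed into that comparison, hence the early continue until
a scalable-aware version of the check exists.
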
@@ -4631,21 +4661,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
               !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
               !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
             continue;
-          bool Scalable = NewF.BaseOffset.isScalable() || Imm.isScalable() ||
-                          NewF.UnfoldedOffset.isScalable();
-          NewF.BaseOffset =
-              Immediate::get((uint64_t)NewF.BaseOffset.getKnownMinValue() +
-                                 Imm.getKnownMinValue(),
-                             Scalable);
+          NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
           if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
                           LU.Kind, LU.AccessTy, NewF)) {
             if (AMK == TTI::AMK_PostIndexed &&
                 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
               continue;
-            Immediate NewUnfoldedOffset = Immediate::get(
-                (uint64_t)NewF.UnfoldedOffset.getKnownMinValue() +
-                    Imm.getKnownMinValue(),
-                Scalable);
+            Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
             if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
               continue;
             NewF = F;
@@ -4657,7 +4679,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
           // constant value to the immediate would produce a value closer to
           // zero than the immediate itself, then the formula isn't worthwhile.
           for (const SCEV *NewReg : NewF.BaseRegs)
-            if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
+            if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
+              if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
+                goto skip_formula;
               if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
                       .abs()
                       .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
@@ -4666,6 +4690,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
                       (unsigned)llvm::countr_zero<uint64_t>(
                           NewF.BaseOffset.getFixedValue()))
                 goto skip_formula;
+            }
 
           // Ok, looks good.
           NewF.canonicalize(*this->L);
@@ -4849,6 +4874,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
       bool Any = false;
       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
         Formula &F = LU.Formulae[i];
+        if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
+          continue;
         // Look for a formula with a constant or GV in a register. If the use
         // also has a formula with that same value in an immediate field,
         // delete the one that uses a register.
@@ -5754,14 +5781,10 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
   // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
   // out at this point, or should we generate a SCEV adding together mixed
   // offsets?
-  assert((F.BaseOffset.isScalable() == LF.Offset.isScalable() ||
-          F.BaseOffset.isZero() || LF.Offset.isZero()) &&
+  assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
          "Expanding mismatched offsets\n");
-  bool Scalable = F.BaseOffset.isScalable() || LF.Offset.isScalable();
   // Expand the immediate portion.
-  Immediate Offset = Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() +
-                                        LF.Offset.getKnownMinValue(),
-                                    Scalable);
+  Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
   if (Offset.isNonZero()) {
     if (LU.Kind == LSRUse::ICmpZero) {
       // The other interesting way of "folding" with an ICmpZero is to use a
@@ -5776,23 +5799,15 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
     } else {
       // Just add the immediate values. These again are expected to be matched
       // as part of the address.
-      const SCEV *SU = SE.getUnknown(
-          ConstantInt::getSigned(IntTy, Offset.getKnownMinValue()));
-      if (Scalable)
-        SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
-      Ops.push_back(SU);
+      Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
     }
   }
 
   // Expand the unfolded offset portion.
   Immediate UnfoldedOffset = F.UnfoldedOffset;
   if (UnfoldedOffset.isNonZero()) {
-    const SCEV *SU = SE.getUnknown(
-        ConstantInt::getSigned(IntTy, UnfoldedOffset.getKnownMinValue()));
-    if (UnfoldedOffset.isScalable())
-      SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
     // Just add the immediate values.
-    Ops.push_back(SU);
+    Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
   }
 
   // Emit instructions summing all the operands.
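
For completeness, getUnknownSCEV appears to wrap the expansion shown in
the two removed blocks above in the same way getNegativeSCEV does, but
without the negation. A sketch of the presumed body (an approximation,
not the patch's actual definition):

    // Build a SCEVUnknown for the signed immediate value; for
    // vscale-relative immediates, multiply it by vscale.
    const SCEV *Immediate::getUnknownSCEV(ScalarEvolution &SE,
                                          Type *IntTy) const {
      const SCEV *SU = SE.getUnknown(
          ConstantInt::getSigned(IntTy, getKnownMinValue()));
      if (isScalable())
        SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
      return SU;
    }
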


