[clang] [llvm] [LLVM][SROA] Teach SROA how to "bitcast" between fixed and scalable vectors. (PR #130973)

Paul Walker via cfe-commits cfe-commits at lists.llvm.org
Wed Mar 12 07:57:04 PDT 2025


https://github.com/paulwalker-arm updated https://github.com/llvm/llvm-project/pull/130973

From 487a823a9ec35df1a93109ef03630738bdc39ab1 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Fri, 7 Mar 2025 11:54:20 +0000
Subject: [PATCH] [LLVM][SROA] Teach SROA how to "bitcast" between fixed and
 scalable vectors.

For functions whose vscale_range attribute pins vscale to a single
value we can compute the size of scalable vectors. This aids SROA by
allowing scalable-vector load and store operations to be considered
for replacement, whereby bitcasts through memory can be replaced by
vector insert or extract operations.
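
As a rough sketch of the effect (mirroring the new
fixed_alloca_fixed_from_scalable test added below), within a function
attributed vscale_range(1) an alloca that round-trips a scalable
vector through a fixed-width slot:

  %tmp = alloca <4 x i32>
  store <vscale x 4 x i32> %a, ptr %tmp
  %cast = load <4 x i32>, ptr %tmp

can now be promoted to a single extract with no memory traffic:

  %cast = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> %a, i64 0)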
---
 .../attr-riscv-rvv-vector-bits-less-8-call.c  |  38 ++-
 .../attr-riscv-rvv-vector-bits-less-8-cast.c  |   8 +-
 .../CodeGen/RISCV/attr-rvv-vector-bits-cast.c |  16 +-
 .../CodeGen/attr-arm-sve-vector-bits-cast.c   |  23 +-
 llvm/include/llvm/IR/Attributes.h             |   4 +
 llvm/include/llvm/IR/DerivedTypes.h           |  16 ++
 llvm/lib/IR/AttributeImpl.h                   |   1 +
 llvm/lib/IR/Attributes.cpp                    |   8 +
 llvm/lib/Transforms/Scalar/SROA.cpp           | 130 ++++++---
 .../scalable-vectors-with-known-vscale.ll     | 248 ++++++++++++++++++
 llvm/test/Transforms/SROA/scalable-vectors.ll | 142 ++++++++++
 11 files changed, 563 insertions(+), 71 deletions(-)
 create mode 100644 llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll

diff --git a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c
index e2f02dc64f766..66fd466eccfef 100644
--- a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c
+++ b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c
@@ -26,11 +26,15 @@ typedef vbool64_t fixed_bool64_t __attribute__((riscv_rvv_vector_bits(__riscv_v_
 //
 // CHECK-128-LABEL: @call_bool32_ff(
 // CHECK-128-NEXT:  entry:
+// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
+// CHECK-128-NEXT:    [[SAVED_VALUE3:%.*]] = alloca <1 x i8>, align 1
 // CHECK-128-NEXT:    [[SAVED_VALUE4:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1_COERCE:%.*]], <vscale x 2 x i1> [[OP2_COERCE:%.*]], i64 4)
-// CHECK-128-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA6:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10:![0-9]+]]
+// CHECK-128-NEXT:    [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_:%.*]] = load <vscale x 2 x i1>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6:![0-9]+]]
+// CHECK-128-NEXT:    [[SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_:%.*]] = load <vscale x 2 x i1>, ptr [[SAVED_VALUE3]], align 1, !tbaa [[TBAA6]]
+// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_]], <vscale x 2 x i1> [[SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_]], i64 4)
+// CHECK-128-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA9:![0-9]+]]
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA6]]
 // CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
 // CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
 // CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
@@ -52,11 +56,15 @@ fixed_bool32_t call_bool32_ff(fixed_bool32_t op1, fixed_bool32_t op2) {
 //
 // CHECK-128-LABEL: @call_bool64_ff(
 // CHECK-128-NEXT:  entry:
+// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
+// CHECK-128-NEXT:    [[SAVED_VALUE3:%.*]] = alloca <1 x i8>, align 1
 // CHECK-128-NEXT:    [[SAVED_VALUE4:%.*]] = alloca <vscale x 1 x i1>, align 1
 // CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1_COERCE:%.*]], <vscale x 1 x i1> [[OP2_COERCE:%.*]], i64 2)
+// CHECK-128-NEXT:    [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_:%.*]] = load <vscale x 1 x i1>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
+// CHECK-128-NEXT:    [[SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_:%.*]] = load <vscale x 1 x i1>, ptr [[SAVED_VALUE3]], align 1, !tbaa [[TBAA6]]
+// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_]], <vscale x 1 x i1> [[SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_]], i64 2)
 // CHECK-128-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA11:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10]]
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA6]]
 // CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
 // CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
 // CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
@@ -82,11 +90,13 @@ fixed_bool64_t call_bool64_ff(fixed_bool64_t op1, fixed_bool64_t op2) {
 //
 // CHECK-128-LABEL: @call_bool32_fs(
 // CHECK-128-NEXT:  entry:
+// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
 // CHECK-128-NEXT:    [[SAVED_VALUE2:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1_COERCE:%.*]], <vscale x 2 x i1> [[OP2:%.*]], i64 4)
-// CHECK-128-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]]
+// CHECK-128-NEXT:    [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_:%.*]] = load <vscale x 2 x i1>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
+// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_]], <vscale x 2 x i1> [[OP2:%.*]], i64 4)
+// CHECK-128-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA9]]
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA6]]
 // CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
 // CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
 // CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
@@ -108,11 +118,13 @@ fixed_bool32_t call_bool32_fs(fixed_bool32_t op1, vbool32_t op2) {
 //
 // CHECK-128-LABEL: @call_bool64_fs(
 // CHECK-128-NEXT:  entry:
+// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
 // CHECK-128-NEXT:    [[SAVED_VALUE2:%.*]] = alloca <vscale x 1 x i1>, align 1
 // CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1_COERCE:%.*]], <vscale x 1 x i1> [[OP2:%.*]], i64 2)
+// CHECK-128-NEXT:    [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_:%.*]] = load <vscale x 1 x i1>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
+// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_]], <vscale x 1 x i1> [[OP2:%.*]], i64 2)
 // CHECK-128-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA11]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]]
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA6]]
 // CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
 // CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
 // CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
@@ -141,8 +153,8 @@ fixed_bool64_t call_bool64_fs(fixed_bool64_t op1, vbool64_t op2) {
 // CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1:%.*]], <vscale x 2 x i1> [[OP2:%.*]], i64 4)
-// CHECK-128-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
+// CHECK-128-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA9]]
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
 // CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
 // CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
 // CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
@@ -168,7 +180,7 @@ fixed_bool32_t call_bool32_ss(vbool32_t op1, vbool32_t op2) {
 // CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
 // CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1:%.*]], <vscale x 1 x i1> [[OP2:%.*]], i64 2)
 // CHECK-128-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
 // CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
 // CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
 // CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
diff --git a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c
index f0fa7e8d07b4d..3785036380f53 100644
--- a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c
+++ b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c
@@ -80,7 +80,9 @@ fixed_bool32_t from_vbool32_t(vbool32_t type) {
 //
 // CHECK-128-LABEL: @to_vbool32_t(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
+// CHECK-128-NEXT:    [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_:%.*]] = load <vscale x 2 x i1>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
+// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_]]
 //
 vbool32_t to_vbool32_t(fixed_bool32_t type) {
   return type;
@@ -116,7 +118,9 @@ fixed_bool64_t from_vbool64_t(vbool64_t type) {
 //
 // CHECK-128-LABEL: @to_vbool64_t(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
+// CHECK-128-NEXT:    [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_:%.*]] = load <vscale x 1 x i1>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
+// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_]]
 //
 vbool64_t to_vbool64_t(fixed_bool64_t type) {
   return type;
diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c
index 7992951346d54..8764616eef232 100644
--- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c
@@ -99,8 +99,8 @@ vbool4_t to_vbool4_t(fixed_bool4_t type) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-NEXT:    store <vscale x 2 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4:![0-9]+]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA8:![0-9]+]]
+// CHECK-NEXT:    store <vscale x 2 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6:![0-9]+]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10:![0-9]+]]
 // CHECK-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
 // CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
 // CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
@@ -111,7 +111,9 @@ fixed_bool32_t from_vbool32_t(vbool32_t type) {
 
 // CHECK-LABEL: @to_vbool32_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret <vscale x 2 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
+// CHECK-NEXT:    [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_:%.*]] = load <vscale x 2 x i1>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
+// CHECK-NEXT:    ret <vscale x 2 x i1> [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_]]
 //
 vbool32_t to_vbool32_t(fixed_bool32_t type) {
   return type;
@@ -119,7 +121,7 @@ vbool32_t to_vbool32_t(fixed_bool32_t type) {
 
 // CHECK-LABEL: @to_vint32m1_t__from_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA10]]
 // CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> poison, <8 x i32> [[TYPE]], i64 0)
 // CHECK-NEXT:    ret <vscale x 2 x i32> [[CAST_SCALABLE]]
 //
@@ -130,7 +132,7 @@ vint32m1_t to_vint32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) {
 // CHECK-LABEL: @from_vint32m1_t__to_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32(<vscale x 2 x i32> [[TYPE:%.*]], i64 0)
-// CHECK-NEXT:    store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA10]]
 // CHECK-NEXT:    ret void
 //
 gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) {
@@ -139,7 +141,7 @@ gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) {
 
 // CHECK-LABEL: @to_fixed_int32m1_t__from_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA10]]
 // CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> poison, <8 x i32> [[TYPE]], i64 0)
 // CHECK-NEXT:    ret <vscale x 2 x i32> [[CAST_SCALABLE]]
 //
@@ -150,7 +152,7 @@ fixed_int32m1_t to_fixed_int32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) {
 // CHECK-LABEL: @from_fixed_int32m1_t__to_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TYPE:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32(<vscale x 2 x i32> [[TYPE_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA10]]
 // CHECK-NEXT:    ret void
 //
 gnu_int32m1_t from_fixed_int32m1_t__to_gnu_int32m1_t(fixed_int32m1_t type) {
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
index e1e2220f94d6d..fcd4314249ff8 100644
--- a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
+++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
@@ -62,10 +62,7 @@ fixed_bool_t from_svbool_t(svbool_t type) {
 
 // CHECK-LABEL: @lax_cast(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SAVED_VALUE:%.*]] = alloca <16 x i32>, align 64
-// CHECK-NEXT:    [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TYPE_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    store <16 x i32> [[TYPE]], ptr [[SAVED_VALUE]], align 64, !tbaa [[TBAA6:![0-9]+]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[SAVED_VALUE]], align 64, !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[TYPE_COERCE:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
 svint64_t lax_cast(fixed_int32_t type) {
@@ -74,9 +71,9 @@ svint64_t lax_cast(fixed_int32_t type) {
 
 // CHECK-LABEL: @to_svint32_t__from_gnu_int32_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA6]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TYPE]], i64 0)
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TYPE]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
 //
 svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) {
   return type;
@@ -84,8 +81,8 @@ svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) {
 
 // CHECK-LABEL: @from_svint32_t__to_gnu_int32_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CASTFIXEDSVE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TYPE:%.*]], i64 0)
-// CHECK-NEXT:    store <16 x i32> [[CASTFIXEDSVE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TYPE:%.*]], i64 0)
+// CHECK-NEXT:    store <16 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
 //
 gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) {
@@ -94,9 +91,9 @@ gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) {
 
 // CHECK-LABEL: @to_fixed_int32_t__from_gnu_int32_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA6]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TYPE]], i64 0)
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TYPE]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
 //
 fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) {
   return type;
@@ -105,7 +102,7 @@ fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) {
 // CHECK-LABEL: @from_fixed_int32_t__to_gnu_int32_t(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TYPE_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    store <16 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT:    store <16 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
 //
 gnu_int32_t from_fixed_int32_t__to_gnu_int32_t(fixed_int32_t type) {
diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h
index d6533b9bcbea1..efd03c6ec0e1f 100644
--- a/llvm/include/llvm/IR/Attributes.h
+++ b/llvm/include/llvm/IR/Attributes.h
@@ -276,6 +276,10 @@ class Attribute {
   /// when unknown.
   std::optional<unsigned> getVScaleRangeMax() const;
 
+  /// Return the value for vscale based on the vscale_range attribute or 0 when
+  /// unknown.
+  unsigned getVScaleValue() const;
+
   // Returns the unwind table kind.
   UWTableKind getUWTableKind() const;
 
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index 60606d34c32c3..f8b79eea19cd2 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -554,6 +554,22 @@ class VectorType : public Type {
     return VectorType::get(VTy->getElementType(), EltCnt * 2);
   }
 
+  /// This static method returns a VectorType with the same size-in-bits as
+  /// SizeTy but with an element type that matches the scalar type of EltTy.
+  static VectorType *getWithSizeAndScalar(VectorType *SizeTy, Type *EltTy) {
+    if (SizeTy->getScalarType() == EltTy->getScalarType())
+      return SizeTy;
+
+    unsigned EltSize = EltTy->getScalarSizeInBits();
+    if (!SizeTy->getPrimitiveSizeInBits().isKnownMultipleOf(EltSize))
+      return nullptr;
+
+    ElementCount EC = SizeTy->getElementCount()
+                          .multiplyCoefficientBy(SizeTy->getScalarSizeInBits())
+                          .divideCoefficientBy(EltSize);
+    return VectorType::get(EltTy->getScalarType(), EC);
+  }
+
   /// Return true if the specified type is valid as a element type.
   static bool isValidElementType(Type *ElemTy);
 
diff --git a/llvm/lib/IR/AttributeImpl.h b/llvm/lib/IR/AttributeImpl.h
index 59cc489ade40d..42a5ab4c58f97 100644
--- a/llvm/lib/IR/AttributeImpl.h
+++ b/llvm/lib/IR/AttributeImpl.h
@@ -343,6 +343,7 @@ class AttributeSetNode final
       const;
   unsigned getVScaleRangeMin() const;
   std::optional<unsigned> getVScaleRangeMax() const;
+  unsigned getVScaleValue() const;
   UWTableKind getUWTableKind() const;
   AllocFnKind getAllocKind() const;
   MemoryEffects getMemoryEffects() const;
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index 8da1dfe914818..2618e2561c02d 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -473,6 +473,14 @@ std::optional<unsigned> Attribute::getVScaleRangeMax() const {
   return unpackVScaleRangeArgs(pImpl->getValueAsInt()).second;
 }
 
+unsigned Attribute::getVScaleValue() const {
+  std::optional<unsigned> VScale = getVScaleRangeMax();
+  if (VScale && *VScale == getVScaleRangeMin())
+    return *VScale;
+
+  return 0;
+}
+
 UWTableKind Attribute::getUWTableKind() const {
   assert(hasAttribute(Attribute::UWTable) &&
          "Trying to get unwind table kind from non-uwtable attribute");
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 69e7ce83f82e4..2e758caa65c30 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1118,8 +1118,14 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
       return PI.setAborted(&LI);
 
     TypeSize Size = DL.getTypeStoreSize(LI.getType());
-    if (Size.isScalable())
-      return PI.setAborted(&LI);
+    if (Size.isScalable()) {
+      Attribute Attr = LI.getFunction()->getFnAttribute(Attribute::VScaleRange);
+      unsigned VScale = Attr.isValid() ? Attr.getVScaleValue() : 0;
+      if (!VScale)
+        return PI.setAborted(&LI);
+
+      Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale);
+    }
 
     return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
                              LI.isVolatile());
@@ -1133,8 +1139,14 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
       return PI.setAborted(&SI);
 
     TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
-    if (StoreSize.isScalable())
-      return PI.setAborted(&SI);
+    if (StoreSize.isScalable()) {
+      Attribute Attr = SI.getFunction()->getFnAttribute(Attribute::VScaleRange);
+      unsigned VScale = Attr.isValid() ? Attr.getVScaleValue() : 0;
+      if (!VScale)
+        return PI.setAborted(&SI);
+
+      StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale);
+    }
 
     uint64_t Size = StoreSize.getFixedValue();
 
@@ -1925,7 +1937,8 @@ static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
 /// ensure that we only try to convert viable values. The strategy is that we
 /// will peel off single element struct and array wrappings to get to an
 /// underlying value, and convert that value.
-static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
+static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
+                            unsigned VScale = 0) {
   if (OldTy == NewTy)
     return true;
 
@@ -1939,8 +1952,24 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
     return false;
   }
 
-  if (DL.getTypeSizeInBits(NewTy).getFixedValue() !=
-      DL.getTypeSizeInBits(OldTy).getFixedValue())
+  TypeSize NewSize = DL.getTypeSizeInBits(NewTy);
+  TypeSize OldSize = DL.getTypeSizeInBits(OldTy);
+
+  if (isa<ScalableVectorType>(NewTy) && isa<FixedVectorType>(OldTy)) {
+    if (!VScale || NewTy->isPtrOrPtrVectorTy() || OldTy->isPtrOrPtrVectorTy() ||
+        !VectorType::getWithSizeAndScalar(cast<VectorType>(NewTy), OldTy))
+      return false;
+
+    NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale);
+  } else if (isa<ScalableVectorType>(OldTy) && isa<FixedVectorType>(NewTy)) {
+    if (!VScale || NewTy->isPtrOrPtrVectorTy() || OldTy->isPtrOrPtrVectorTy() ||
+        !VectorType::getWithSizeAndScalar(cast<VectorType>(OldTy), NewTy))
+      return false;
+
+    OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale);
+  }
+
+  if (NewSize != OldSize)
     return false;
   if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
     return false;
@@ -1990,7 +2019,15 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
                            Type *NewTy) {
   Type *OldTy = V->getType();
-  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+
+#ifndef NDEBUG
+  BasicBlock *BB = IRB.GetInsertBlock();
+  assert(BB && BB->getParent() && "VScale unknown!");
+  Attribute Attr = BB->getParent()->getFnAttribute(Attribute::VScaleRange);
+  unsigned VScale = Attr.isValid() ? Attr.getVScaleValue() : 0;
+  assert(canConvertValue(DL, OldTy, NewTy, VScale) &&
+         "Value not convertable to type");
+#endif
 
   if (OldTy == NewTy)
     return V;
@@ -2034,6 +2071,18 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
     }
   }
 
+  if (isa<ScalableVectorType>(NewTy) && isa<FixedVectorType>(OldTy)) {
+    auto *Ty = VectorType::getWithSizeAndScalar(cast<VectorType>(NewTy), OldTy);
+    V = IRB.CreateInsertVector(Ty, PoisonValue::get(Ty), V, IRB.getInt64(0));
+    return IRB.CreateBitCast(V, NewTy);
+  }
+
+  if (isa<FixedVectorType>(NewTy) && isa<ScalableVectorType>(OldTy)) {
+    auto *Ty = VectorType::getWithSizeAndScalar(cast<VectorType>(OldTy), NewTy);
+    V = IRB.CreateBitCast(V, Ty);
+    return IRB.CreateExtractVector(NewTy, V, IRB.getInt64(0));
+  }
+
   return IRB.CreateBitCast(V, NewTy);
 }
 
@@ -2044,7 +2093,8 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
 static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
                                             VectorType *Ty,
                                             uint64_t ElementSize,
-                                            const DataLayout &DL) {
+                                            const DataLayout &DL,
+                                            unsigned VScale) {
   // First validate the slice offsets.
   uint64_t BeginOffset =
       std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
@@ -2088,7 +2138,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
       assert(LTy->isIntegerTy());
       LTy = SplitIntTy;
     }
-    if (!canConvertValue(DL, SliceTy, LTy))
+    if (!canConvertValue(DL, SliceTy, LTy, VScale))
       return false;
   } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
     if (SI->isVolatile())
@@ -2101,7 +2151,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
       assert(STy->isIntegerTy());
       STy = SplitIntTy;
     }
-    if (!canConvertValue(DL, STy, SliceTy))
+    if (!canConvertValue(DL, STy, SliceTy, VScale))
       return false;
   } else {
     return false;
@@ -2116,7 +2166,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
 /// (and thus isVectorPromotionViable) over all slices of the alloca for the
 /// given VectorType.
 static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
-                                        const DataLayout &DL) {
+                                        const DataLayout &DL, unsigned VScale) {
   uint64_t ElementSize =
       DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
 
@@ -2129,11 +2179,11 @@ static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
   ElementSize /= 8;
 
   for (const Slice &S : P)
-    if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+    if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
       return false;
 
   for (const Slice *S : P.splitSliceTails())
-    if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+    if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
       return false;
 
   return true;
@@ -2148,7 +2198,7 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
                              SmallVectorImpl<VectorType *> &CandidateTys,
                              bool HaveCommonEltTy, Type *CommonEltTy,
                              bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
-                             VectorType *CommonVecPtrTy) {
+                             VectorType *CommonVecPtrTy, unsigned VScale) {
   // If we didn't find a vector type, nothing to do here.
   if (CandidateTys.empty())
     return nullptr;
@@ -2224,7 +2274,7 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
   });
 
   for (VectorType *VTy : CandidateTys)
-    if (checkVectorTypeForPromotion(P, VTy, DL))
+    if (checkVectorTypeForPromotion(P, VTy, DL, VScale))
       return VTy;
 
   return nullptr;
@@ -2235,7 +2285,7 @@ static VectorType *createAndCheckVectorTypesForPromotion(
     function_ref<void(Type *)> CheckCandidateType, Partition &P,
     const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
     bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
-    bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy) {
+    bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
   [[maybe_unused]] VectorType *OriginalElt =
       CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
   // Consider additional vector types where the element type size is a
@@ -2260,9 +2310,9 @@ static VectorType *createAndCheckVectorTypesForPromotion(
     }
   }
 
-  return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy,
-                                      CommonEltTy, HaveVecPtrTy,
-                                      HaveCommonVecPtrTy, CommonVecPtrTy);
+  return checkVectorTypesForPromotion(
+      P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
+      HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
 }
 
 /// Test whether the given alloca partitioning and range of slices can be
@@ -2274,7 +2324,8 @@ static VectorType *createAndCheckVectorTypesForPromotion(
 /// SSA value. We only can ensure this for a limited set of operations, and we
 /// don't want to do the rewrites unless we are confident that the result will
 /// be promotable, so we have an early test here.
-static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
+static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
+                                           unsigned VScale) {
   // Collect the candidate types for vector-based promotion. Also track whether
   // we have different element types.
   SmallVector<VectorType *, 4> CandidateTys;
@@ -2286,7 +2337,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
   bool HaveCommonEltTy = true;
   bool HaveCommonVecPtrTy = true;
   auto CheckCandidateType = [&](Type *Ty) {
-    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+    if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
       // Return if bitcast to vectors is different for total size in bits.
       if (!CandidateTys.empty()) {
         VectorType *V = CandidateTys[0];
@@ -2341,14 +2392,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
   if (auto *VTy = createAndCheckVectorTypesForPromotion(
           LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
           CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
-          HaveCommonVecPtrTy, CommonVecPtrTy))
+          HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
     return VTy;
 
   CandidateTys.clear();
   return createAndCheckVectorTypesForPromotion(
       DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
       HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
-      CommonVecPtrTy);
+      CommonVecPtrTy, VScale);
 }
 
 /// Test whether a slice of an alloca is valid for integer widening.
@@ -2385,7 +2436,8 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
     if (LI->isVolatile())
       return false;
     // We can't handle loads that extend past the allocated memory.
-    if (DL.getTypeStoreSize(LI->getType()).getFixedValue() > Size)
+    TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
+    if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
       return false;
     // So far, AllocaSliceRewriter does not support widening split slice tails
     // in rewriteIntegerLoad.
@@ -2410,7 +2462,8 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
     if (SI->isVolatile())
       return false;
     // We can't handle stores that extend past the allocated memory.
-    if (DL.getTypeStoreSize(ValueTy).getFixedValue() > Size)
+    TypeSize StoreSize = DL.getTypeStoreSize(ValueTy);
+    if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
       return false;
     // So far, AllocaSliceRewriter does not support widening split slice tails
     // in rewriteIntegerStore.
@@ -2883,8 +2936,6 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
 
     Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
                              : LI.getType();
-    const bool IsLoadPastEnd =
-        DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize;
     bool IsPtrAdjusted = false;
     Value *V;
     if (VecTy) {
@@ -2894,8 +2945,9 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
     } else if (NewBeginOffset == NewAllocaBeginOffset &&
                NewEndOffset == NewAllocaEndOffset &&
                (canConvertValue(DL, NewAllocaTy, TargetTy) ||
-                (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
-                 TargetTy->isIntegerTy() && !LI.isVolatile()))) {
+                (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
+                 DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize &&
+                 !LI.isVolatile()))) {
       Value *NewPtr =
           getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
       LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr,
@@ -3068,7 +3120,8 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
       if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
         Pass.PostPromotionWorklist.insert(AI);
 
-    if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedValue()) {
+    TypeSize StoreSize = DL.getTypeStoreSize(V->getType());
+    if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
       assert(!SI.isVolatile());
       assert(V->getType()->isIntegerTy() &&
              "Only integer type loads and stores are split");
@@ -4844,14 +4897,19 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   Type *SliceTy = nullptr;
   VectorType *SliceVecTy = nullptr;
   const DataLayout &DL = AI.getDataLayout();
+  Attribute Attr = AI.getFunction()->getFnAttribute(Attribute::VScaleRange);
+  unsigned VScale = Attr.isValid() ? Attr.getVScaleValue() : 0;
+
   std::pair<Type *, IntegerType *> CommonUseTy =
       findCommonType(P.begin(), P.end(), P.endOffset());
   // Do all uses operate on the same type?
-  if (CommonUseTy.first)
-    if (DL.getTypeAllocSize(CommonUseTy.first).getFixedValue() >= P.size()) {
+  if (CommonUseTy.first) {
+    TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first);
+    if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
       SliceTy = CommonUseTy.first;
       SliceVecTy = dyn_cast<VectorType>(SliceTy);
     }
+  }
   // If not, can we find an appropriate subtype in the original allocated type?
   if (!SliceTy)
     if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
@@ -4872,12 +4930,12 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
 
   // If the common use types are not viable for promotion then attempt to find
   // another type that is viable.
-  if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL))
+  if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale))
     if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
                                                  P.beginOffset(), P.size())) {
       VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
       if (TypePartitionVecTy &&
-          checkVectorTypeForPromotion(P, TypePartitionVecTy, DL))
+          checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale))
         SliceTy = TypePartitionTy;
     }
 
@@ -4888,7 +4946,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
 
   VectorType *VecTy =
-      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
+      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
   if (VecTy)
     SliceTy = VecTy;
 
diff --git a/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll b/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll
new file mode 100644
index 0000000000000..b4df64a4e45c8
--- /dev/null
+++ b/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll
@@ -0,0 +1,248 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
+; RUN: opt < %s -passes='sroa<modify-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+; This test checks that SROA runs mem2reg on scalable vectors.
+
+define <vscale x 16 x i1> @alloca_nxv16i1(<vscale x 16 x i1> %pg) vscale_range(1) {
+; CHECK-LABEL: @alloca_nxv16i1(
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[PG:%.*]]
+;
+  %pg.addr = alloca <vscale x 16 x i1>
+  store <vscale x 16 x i1> %pg, ptr %pg.addr
+  %1 = load <vscale x 16 x i1>, ptr %pg.addr
+  ret <vscale x 16 x i1> %1
+}
+
+define <vscale x 16 x i8> @alloca_nxv16i8(<vscale x 16 x i8> %vec) vscale_range(1) {
+; CHECK-LABEL: @alloca_nxv16i8(
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[VEC:%.*]]
+;
+  %vec.addr = alloca <vscale x 16 x i8>
+  store <vscale x 16 x i8> %vec, ptr %vec.addr
+  %1 = load <vscale x 16 x i8>, ptr %vec.addr
+  ret <vscale x 16 x i8> %1
+}
+
+; Test scalable alloca that can't be promoted. Mem2Reg only considers
+; non-volatile loads and stores for promotion.
+define <vscale x 16 x i8> @unpromotable_alloca(<vscale x 16 x i8> %vec) vscale_range(1) {
+; CHECK-LABEL: @unpromotable_alloca(
+; CHECK-NEXT:    [[VEC_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+; CHECK-NEXT:    store volatile <vscale x 16 x i8> [[VEC:%.*]], ptr [[VEC_ADDR]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load volatile <vscale x 16 x i8>, ptr [[VEC_ADDR]], align 16
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
+;
+  %vec.addr = alloca <vscale x 16 x i8>
+  store volatile <vscale x 16 x i8> %vec, ptr %vec.addr
+  %1 = load volatile <vscale x 16 x i8>, ptr %vec.addr
+  ret <vscale x 16 x i8> %1
+}
+
+; Test we bail out when using an alloca of a fixed-length vector (VLS) that was
+; bitcasted to a scalable vector.
+define <vscale x 4 x i32> @cast_alloca_to_svint32_t(<vscale x 4 x i32> %type.coerce) vscale_range(1) {
+; CHECK-LABEL: @cast_alloca_to_svint32_t(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> [[TYPE_COERCE:%.*]], i64 0)
+; CHECK-NEXT:    [[TYPE_0_VEC_EXPAND:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TYPE_0_VECBLEND:%.*]] = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x i32> [[TYPE_0_VEC_EXPAND]], <16 x i32> undef
+; CHECK-NEXT:    [[TYPE_ADDR_0_VEC_EXTRACT:%.*]] = shufflevector <16 x i32> [[TYPE_0_VECBLEND]], <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> [[TYPE_ADDR_0_VEC_EXTRACT]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+;
+  %type = alloca <16 x i32>
+  %type.addr = alloca <16 x i32>
+  store <vscale x 4 x i32> %type.coerce, ptr %type
+  %type1 = load <16 x i32>, ptr %type
+  store <16 x i32> %type1, ptr %type.addr
+  %1 = load <16 x i32>, ptr %type.addr
+  %2 = load <vscale x 4 x i32>, ptr %type.addr
+  ret <vscale x 4 x i32> %2
+}
+
+; When casting from VLA to VLS via memory check we bail out when producing a
+; GEP where the element type is a scalable vector.
+define <vscale x 4 x i32> @cast_alloca_from_svint32_t() vscale_range(1) {
+; CHECK-LABEL: @cast_alloca_from_svint32_t(
+; CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT:    store <16 x i32> undef, ptr [[RETVAL_COERCE]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[RETVAL_COERCE]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+;
+  %retval = alloca <16 x i32>
+  %retval.coerce = alloca <vscale x 4 x i32>
+  call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval.coerce, ptr align 16 %retval, i64 64, i1 false)
+  %1 = load <vscale x 4 x i32>, ptr %retval.coerce
+  ret <vscale x 4 x i32> %1
+}
+
+; Test we bail out when using an alloca of a fixed-length vector (VLS) that was
+; bitcasted to a scalable vector.
+define void @select_load_alloca_to_svdouble_t() vscale_range(1) {
+; CHECK-LABEL: @select_load_alloca_to_svdouble_t(
+; CHECK-NEXT:    [[Z:%.*]] = alloca <16 x half>, align 32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 0, 0
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], ptr [[Z]], ptr null
+; CHECK-NEXT:    [[VAL:%.*]] = load <vscale x 2 x double>, ptr [[COND]], align 16
+; CHECK-NEXT:    ret void
+;
+  %z = alloca <16 x half>
+  %cmp = icmp eq i32 0, 0
+  %cond = select i1 %cmp, ptr %z, ptr null
+  %val = load <vscale x 2 x double>, ptr %cond, align 16
+  ret void
+}
+
+define void @select_store_alloca_to_svdouble_t(<vscale x 2 x double> %val) vscale_range(1) {
+; CHECK-LABEL: @select_store_alloca_to_svdouble_t(
+; CHECK-NEXT:    [[Z:%.*]] = alloca <16 x half>, align 32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 0, 0
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], ptr [[Z]], ptr null
+; CHECK-NEXT:    store <vscale x 2 x double> [[VAL:%.*]], ptr [[COND]], align 16
+; CHECK-NEXT:    ret void
+;
+  %z = alloca <16 x half>
+  %cmp = icmp eq i32 0, 0
+  %cond = select i1 %cmp, ptr %z, ptr null
+  store <vscale x 2 x double> %val, ptr %cond, align 16
+  ret void
+}
+
+define <4 x i32> @fixed_alloca_fixed_from_scalable(<vscale x 4 x i32> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> [[A:%.*]], i64 0)
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %tmp = alloca <4 x i32>
+  store <vscale x 4 x i32> %a, ptr %tmp
+  %cast = load <4 x i32>, ptr %tmp
+  ret <4 x i32> %cast
+}
+
+define <2 x i8> @fixed_alloca_fixed_from_scalable_requires_bitcast(<vscale x 16 x i1> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_requires_bitcast(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 16 x i1> [[A:%.*]] to <vscale x 2 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <2 x i8> [[TMP2]]
+;
+  %tmp = alloca <2 x i8>
+  store <vscale x 16 x i1> %a, ptr %tmp
+  %cast = load <2 x i8>, ptr %tmp
+  ret <2 x i8> %cast
+}
+
+define <vscale x 4 x i32> @fixed_alloca_scalable_from_fixed(<4 x i32> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> [[A:%.*]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+;
+  %tmp = alloca <4 x i32>
+  store <4 x i32> %a, ptr %tmp
+  %cast = load <vscale x 4 x i32>, ptr %tmp
+  ret <vscale x 4 x i32> %cast
+}
+
+define <vscale x 16 x i1> @fixed_alloca_scalable_from_fixed_requires_bitcast(<2 x i8> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_requires_bitcast(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> poison, <2 x i8> [[A:%.*]], i64 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i8> [[TMP1]] to <vscale x 16 x i1>
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+;
+  %tmp = alloca <2 x i8>
+  store <2 x i8> %a, ptr %tmp
+  %cast = load <vscale x 16 x i1>, ptr %tmp
+  ret <vscale x 16 x i1> %cast
+}
+
+define <4 x i32> @scalable_alloca_fixed_from_scalable(<vscale x 4 x i32> %a) vscale_range(1) {
+; CHECK-LABEL: @scalable_alloca_fixed_from_scalable(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <4 x i32> [[CAST]]
+;
+  %tmp = alloca <vscale x 4 x i32>
+  store <vscale x 4 x i32> %a, ptr %tmp
+  %cast = load <4 x i32>, ptr %tmp
+  ret <4 x i32> %cast
+}
+
+define <vscale x 4 x i32> @scalable_alloca_scalable_from_fixed(<4 x i32> %a) vscale_range(1) {
+; CHECK-LABEL: @scalable_alloca_scalable_from_fixed(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT:    store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[CAST:%.*]] = load <vscale x 4 x i32>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST]]
+;
+  %tmp = alloca <vscale x 4 x i32>
+  store <4 x i32> %a, ptr %tmp
+  %cast = load <vscale x 4 x i32>, ptr %tmp
+  ret <vscale x 4 x i32> %cast
+}
+
+define i16 @scalar_alloca_scalar_from_scalable(<vscale x 16 x i1> %a) vscale_range(1) {
+; CHECK-LABEL: @scalar_alloca_scalar_from_scalable(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i16, align 2
+; CHECK-NEXT:    store <vscale x 16 x i1> [[A:%.*]], ptr [[TMP]], align 2
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load i16, ptr [[TMP]], align 2
+; CHECK-NEXT:    ret i16 [[TMP_0_CAST]]
+;
+  %tmp = alloca i16
+  store <vscale x 16 x i1> %a, ptr %tmp
+  %cast = load i16, ptr %tmp
+  ret i16 %cast
+}
+
+define <vscale x 16 x i1> @scalar_alloca_scalable_from_scalar(i16 %a) vscale_range(1) {
+; CHECK-LABEL: @scalar_alloca_scalable_from_scalar(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i16, align 2
+; CHECK-NEXT:    store i16 [[A:%.*]], ptr [[TMP]], align 2
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load <vscale x 16 x i1>, ptr [[TMP]], align 2
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP_0_CAST]]
+;
+  %tmp = alloca i16
+  store i16 %a, ptr %tmp
+  %cast = load <vscale x 16 x i1>, ptr %tmp
+  ret <vscale x 16 x i1> %cast
+}
+
+define { <2 x i32>, <2 x i32> } @fixed_struct_alloca_fixed_from_scalable(<vscale x 4 x i32> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_struct_alloca_fixed_from_scalable(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca { <2 x i32>, <2 x i32> }, align 8
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A:%.*]], ptr [[TMP]], align 8
+; CHECK-NEXT:    [[TMP_0_CAST_FCA_0_LOAD:%.*]] = load <2 x i32>, ptr [[TMP]], align 8
+; CHECK-NEXT:    [[CAST_FCA_0_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP_0_CAST_FCA_0_LOAD]], 0
+; CHECK-NEXT:    [[TMP_8_CAST_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 8
+; CHECK-NEXT:    [[TMP_8_CAST_FCA_1_LOAD:%.*]] = load <2 x i32>, ptr [[TMP_8_CAST_FCA_1_GEP_SROA_IDX]], align 8
+; CHECK-NEXT:    [[CAST_FCA_1_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[CAST_FCA_0_INSERT]], <2 x i32> [[TMP_8_CAST_FCA_1_LOAD]], 1
+; CHECK-NEXT:    ret { <2 x i32>, <2 x i32> } [[CAST_FCA_1_INSERT]]
+;
+  %tmp = alloca { <2 x i32>, <2 x i32> }
+  store <vscale x 4 x i32> %a, ptr %tmp
+  %cast = load { <2 x i32>, <2 x i32> }, ptr %tmp
+  ret { <2 x i32>, <2 x i32> } %cast
+}
+
+define <vscale x 4 x i64> @fixed_struct_alloca_scalable_from_fixed({ <2 x ptr>, <2 x ptr> } %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_struct_alloca_scalable_from_fixed(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca { <2 x ptr>, <2 x ptr> }, align 16
+; CHECK-NEXT:    [[A_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A:%.*]], 0
+; CHECK-NEXT:    store <2 x ptr> [[A_FCA_0_EXTRACT]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[A_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A]], 1
+; CHECK-NEXT:    [[TMP_16_A_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 16
+; CHECK-NEXT:    store <2 x ptr> [[A_FCA_1_EXTRACT]], ptr [[TMP_16_A_FCA_1_GEP_SROA_IDX]], align 16
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load <vscale x 4 x i64>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP_0_CAST]]
+;
+  %tmp = alloca { <2 x ptr>, <2 x ptr> }
+  store { <2 x ptr>, <2 x ptr> } %a, ptr %tmp
+  %cast = load <vscale x 4 x i64>, ptr %tmp
+  ret <vscale x 4 x i64> %cast
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-MODIFY-CFG: {{.*}}
+; CHECK-PRESERVE-CFG: {{.*}}
diff --git a/llvm/test/Transforms/SROA/scalable-vectors.ll b/llvm/test/Transforms/SROA/scalable-vectors.ll
index d892883ce9dc3..9d6dec34b35bd 100644
--- a/llvm/test/Transforms/SROA/scalable-vectors.ll
+++ b/llvm/test/Transforms/SROA/scalable-vectors.ll
@@ -2,6 +2,8 @@
 ; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
 ; RUN: opt < %s -passes='sroa<modify-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
 
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
 ; This test checks that SROA runs mem2reg on scalable vectors.
 
 define <vscale x 16 x i1> @alloca_nxv16i1(<vscale x 16 x i1> %pg) {
@@ -110,6 +112,146 @@ define void @select_store_alloca_to_svdouble_t(<vscale x 2 x double> %val) {
   ret void
 }
 
+define <4 x i32> @fixed_alloca_fixed_from_scalable(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %tmp = alloca <4 x i32>
+  store <vscale x 4 x i32> %a, ptr %tmp
+  %cast = load <4 x i32>, ptr %tmp
+  ret <4 x i32> %cast
+}
+
+define <2 x i8> @fixed_alloca_fixed_from_scalable_requires_bitcast(<vscale x 16 x i1> %a) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_requires_bitcast(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i8>, align 2
+; CHECK-NEXT:    store <vscale x 16 x i1> [[A:%.*]], ptr [[TMP]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP]], align 2
+; CHECK-NEXT:    ret <2 x i8> [[TMP2]]
+;
+  %tmp = alloca <2 x i8>
+  store <vscale x 16 x i1> %a, ptr %tmp
+  %cast = load <2 x i8>, ptr %tmp
+  ret <2 x i8> %cast
+}
+
+define <vscale x 4 x i32> @fixed_alloca_scalable_from_fixed(<4 x i32> %a) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT:    store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+;
+  %tmp = alloca <4 x i32>
+  store <4 x i32> %a, ptr %tmp
+  %cast = load <vscale x 4 x i32>, ptr %tmp
+  ret <vscale x 4 x i32> %cast
+}
+
+define <vscale x 16 x i1> @fixed_alloca_scalable_from_fixed_requires_bitcast(<2 x i8> %a) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_requires_bitcast(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i8>, align 2
+; CHECK-NEXT:    store <2 x i8> [[A:%.*]], ptr [[TMP]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, ptr [[TMP]], align 2
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+;
+  %tmp = alloca <2 x i8>
+  store <2 x i8> %a, ptr %tmp
+  %cast = load <vscale x 16 x i1>, ptr %tmp
+  ret <vscale x 16 x i1> %cast
+}
+
+define <4 x i32> @scalable_alloca_fixed_from_scalable(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @scalable_alloca_fixed_from_scalable(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <4 x i32> [[CAST]]
+;
+  %tmp = alloca <vscale x 4 x i32>
+  store <vscale x 4 x i32> %a, ptr %tmp
+  %cast = load <4 x i32>, ptr %tmp
+  ret <4 x i32> %cast
+}
+
+define <vscale x 4 x i32> @scalable_alloca_scalable_from_fixed(<4 x i32> %a) {
+; CHECK-LABEL: @scalable_alloca_scalable_from_fixed(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT:    store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[CAST:%.*]] = load <vscale x 4 x i32>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST]]
+;
+  %tmp = alloca <vscale x 4 x i32>
+  store <4 x i32> %a, ptr %tmp
+  %cast = load <vscale x 4 x i32>, ptr %tmp
+  ret <vscale x 4 x i32> %cast
+}
+
+define i16 @scalar_alloca_scalar_from_scalable(<vscale x 16 x i1> %a) {
+; CHECK-LABEL: @scalar_alloca_scalar_from_scalable(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i16, align 2
+; CHECK-NEXT:    store <vscale x 16 x i1> [[A:%.*]], ptr [[TMP]], align 2
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load i16, ptr [[TMP]], align 2
+; CHECK-NEXT:    ret i16 [[TMP_0_CAST]]
+;
+  %tmp = alloca i16
+  store <vscale x 16 x i1> %a, ptr %tmp
+  %cast = load i16, ptr %tmp
+  ret i16 %cast
+}
+
+define <vscale x 16 x i1> @scalar_alloca_scalable_from_scalar(i16 %a) {
+; CHECK-LABEL: @scalar_alloca_scalable_from_scalar(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i16, align 2
+; CHECK-NEXT:    store i16 [[A:%.*]], ptr [[TMP]], align 2
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load <vscale x 16 x i1>, ptr [[TMP]], align 2
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP_0_CAST]]
+;
+  %tmp = alloca i16
+  store i16 %a, ptr %tmp
+  %cast = load <vscale x 16 x i1>, ptr %tmp
+  ret <vscale x 16 x i1> %cast
+}
+
+define { <2 x i32>, <2 x i32> } @fixed_struct_alloca_fixed_from_scalable(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @fixed_struct_alloca_fixed_from_scalable(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca { <2 x i32>, <2 x i32> }, align 8
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[CAST_FCA_0_GEP:%.*]] = getelementptr inbounds { <2 x i32>, <2 x i32> }, ptr [[TMP]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP_0_CAST_FCA_0_LOAD:%.*]] = load <2 x i32>, ptr [[CAST_FCA_0_GEP]], align 8
+; CHECK-NEXT:    [[CAST_FCA_0_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP_0_CAST_FCA_0_LOAD]], 0
+; CHECK-NEXT:    [[TMP_8_CAST_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds { <2 x i32>, <2 x i32> }, ptr [[TMP]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP_8_CAST_FCA_1_LOAD:%.*]] = load <2 x i32>, ptr [[TMP_8_CAST_FCA_1_GEP_SROA_IDX]], align 8
+; CHECK-NEXT:    [[CAST_FCA_1_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[CAST_FCA_0_INSERT]], <2 x i32> [[TMP_8_CAST_FCA_1_LOAD]], 1
+; CHECK-NEXT:    ret { <2 x i32>, <2 x i32> } [[CAST_FCA_1_INSERT]]
+;
+  %tmp = alloca { <2 x i32>, <2 x i32> }
+  store <vscale x 4 x i32> %a, ptr %tmp
+  %cast = load { <2 x i32>, <2 x i32> }, ptr %tmp
+  ret { <2 x i32>, <2 x i32> } %cast
+}
+
+define <vscale x 4 x i64> @fixed_struct_alloca_scalable_from_fixed({ <2 x ptr>, <2 x ptr> } %a) {
+; CHECK-LABEL: @fixed_struct_alloca_scalable_from_fixed(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca { <2 x ptr>, <2 x ptr> }, align 16
+; CHECK-NEXT:    [[A_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A:%.*]], 0
+; CHECK-NEXT:    [[A_FCA_0_GEP:%.*]] = getelementptr inbounds { <2 x ptr>, <2 x ptr> }, ptr [[TMP]], i32 0, i32 0
+; CHECK-NEXT:    store <2 x ptr> [[A_FCA_0_EXTRACT]], ptr [[A_FCA_0_GEP]], align 16
+; CHECK-NEXT:    [[A_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A]], 1
+; CHECK-NEXT:    [[TMP_16_A_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds { <2 x ptr>, <2 x ptr> }, ptr [[TMP]], i32 0, i32 1
+; CHECK-NEXT:    store <2 x ptr> [[A_FCA_1_EXTRACT]], ptr [[TMP_16_A_FCA_1_GEP_SROA_IDX]], align 16
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load <vscale x 4 x i64>, ptr [[TMP]], align 32
+; CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP_0_CAST]]
+;
+  %tmp = alloca { <2 x ptr>, <2 x ptr> }
+  store { <2 x ptr>, <2 x ptr> } %a, ptr %tmp
+  %cast = load <vscale x 4 x i64>, ptr %tmp
+  ret <vscale x 4 x i64> %cast
+}
+
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK-MODIFY-CFG: {{.*}}


