[clang] [CodeGen][RISCV] Use vscale_range to handle more fixed<->scalable casts of i1 vectors. (PR #138378)

Craig Topper via cfe-commits cfe-commits at lists.llvm.org
Fri May 2 22:04:38 PDT 2025


https://github.com/topperc created https://github.com/llvm/llvm-project/pull/138378

RISC-V with -mrvv-vector-bits-min supports giving a size to our scalable vector types. To do this, we represent the vector as a fixed vector in memory and need to cast back and forth to scalable vectors.

For i1 vectors, we use an i8 vector in memory. If there are fewer than 8 bits, we use a <1 x i8> vector with some undefined bits.
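
To make the representation concrete, here is a minimal C sketch of the kind of fixed-size mask type the tests below use. It assumes a build where the V extension is enabled and __riscv_v_fixed_vlen is 64 (e.g. via clang's -mrvv-vector-bits option); the typedef name is only illustrative.

```c
// Minimal sketch, assuming the V extension and __riscv_v_fixed_vlen == 64.
#include <riscv_vector.h>

// With a 64-bit fixed vector length, vbool32_t carries 64/32 = 2
// significant bits -- fewer than 8 -- so the fixed-size form is stored
// in memory as a <1 x i8> whose upper 6 bits are undefined.
typedef vbool32_t fixed_bool32_t
    __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 32)));

// The whole fixed-size mask occupies a single byte of storage.
_Static_assert(sizeof(fixed_bool32_t) == 1, "stored as <1 x i8>");
```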

The cast code previously fell back to copying through memory if the known minimum size of the scalable i1 vector was not divisible by 8. This used a <vscale x X x i1> load or store from a fixed vector alloca. If X is less than 8, DataLayout indicates that the load/store reads/writes vscale bytes, even if vscale is known and vscale*X is less than or equal to 8. For example, with a known vscale of 2, a <vscale x 2 x i1> store is considered to write 2 bytes into the 1-byte <1 x i8> alloca. This means the load or store is outside the bounds of the fixed-size alloca as far as DataLayout is concerned, leading to undefined behavior.

This patch makes use of the known value of vscale (when vscale_range has equal minimum and maximum) to avoid casting through memory.
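
The user-visible conversions affected are plain value casts like the ones below, which mirror the functions in the updated attr-riscv-rvv-vector-bits-less-8-cast.c test. A sketch under the same __riscv_v_fixed_vlen == 64 assumption as above; the comments summarize the before/after lowering rather than exact IR.

```c
// Same assumptions as the sketch above.
fixed_bool32_t from_vbool32_t(vbool32_t type) {
  // Before: the <vscale x 2 x i1> value was stored to a 1-byte alloca,
  // reloaded as <1 x i8>, and reloaded again as <vscale x 2 x i1>; per
  // DataLayout the scalable store/load covers vscale bytes, which can
  // exceed the 1-byte alloca.
  // After: with vscale known from vscale_range, the value is moved with
  // llvm.vector.extract / llvm.vector.insert and a bitcast between the
  // fixed i1 and i8 vectors, with no trip through memory.
  return type;
}

vbool32_t to_vbool32_t(fixed_bool32_t type) {
  return type;
}
```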

Hopefully this allows #130973 to proceed.

From e86986fcfcd473e97addee85c708954b466fa7c4 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 2 May 2025 12:35:26 -0700
Subject: [PATCH] [CodeGen][RISCV] Use vscale_range to handle more
 fixed<->scalable casts of i1 vectors.

RISC-V with -mrvv-vector-bits-min supports giving a size to our
scalable vector types. To do this, we represent the vector as a fixed
vector in memory and need to cast back and forth to scalable vectors.

For i1 vectors, we use an i8 vector in memory. If there are fewer
than 8 bits, we use a <1 x i8> vector with some undefined bits.

The cast code previously fell back to copying through memory if the
known minimum size of the scalable i1 vector was not divisible by 8.
This used a <vscale x X x i1> load or store from a fixed vector alloca.
If X is less than 8, DataLayout indicates that the load/store
reads/writes vscale bytes, even if vscale is known and vscale*X is less
than or equal to 8. This means the load or store is outside the bounds
of the fixed-size alloca as far as DataLayout is concerned, leading to
undefined behavior.

This patch makes use of the known value of vscale_range to avoid
casting through memory.

Hopefully this allows #130973 to proceed.
---
 clang/lib/CodeGen/CGCall.cpp                  |  55 +++++++++
 clang/lib/CodeGen/CGExprScalar.cpp            |  58 ++++++++++
 .../attr-riscv-rvv-vector-bits-less-8-call.c  | 104 +++---------------
 .../attr-riscv-rvv-vector-bits-less-8-cast.c  |  56 ++--------
 .../attr-rvv-vector-bits-bitcast-less-8.c     |  32 +++---
 .../CodeGen/RISCV/attr-rvv-vector-bits-cast.c |  18 +--
 .../RISCV/attr-rvv-vector-bits-codegen.c      |  33 +++---
 .../RISCV/attr-rvv-vector-bits-globals.c      |  12 +-
 8 files changed, 179 insertions(+), 189 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 82a24f7c295a2..2b8f46bad2ffe 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1378,6 +1378,35 @@ static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty,
           Result = CGF.Builder.CreateBitCast(Result, Ty);
         return Result;
       }
+
+      // If we are casting a fixed i8 vector to a scalable i1 predicate
+      // vector, and we weren't able to handle it above, try using what we know
+      // about vscale to insert a fixed i1 vector into the scalable vector.
+      if (ScalableDstTy->getElementType()->isIntegerTy(1) &&
+          FixedSrcTy->getElementType()->isIntegerTy(8)) {
+        std::optional<std::pair<unsigned, unsigned>> VScaleRange =
+            CGF.getContext().getTargetInfo().getVScaleRange(CGF.getLangOpts(),
+                                                            false);
+        if (VScaleRange && VScaleRange->first == VScaleRange->second &&
+            VScaleRange->first <= FixedSrcTy->getNumElements() * 8) {
+          llvm::Value *Load = CGF.Builder.CreateLoad(Src);
+          unsigned VScale = VScaleRange->first;
+          llvm::Type *WideFixedTy =
+              llvm::FixedVectorType::get(ScalableDstTy->getElementType(),
+                                         FixedSrcTy->getNumElements() * 8);
+          Load = CGF.Builder.CreateBitCast(Load, WideFixedTy);
+          llvm::Type *FixedTy = llvm::FixedVectorType::get(
+              ScalableDstTy->getElementType(),
+              ScalableDstTy->getElementCount().getKnownMinValue() * VScale);
+          // If the fixed i8 vector is larger than the i1 vector, we need to
+          // extract.
+          if (FixedTy != WideFixedTy)
+            Load = CGF.Builder.CreateExtractVector(FixedTy, Load, uint64_t(0));
+          return CGF.Builder.CreateInsertVector(
+              ScalableDstTy, llvm::PoisonValue::get(ScalableDstTy), Load,
+              uint64_t(0));
+        }
+      }
     }
   }
 
@@ -1485,6 +1514,32 @@ CoerceScalableToFixed(CodeGenFunction &CGF, llvm::FixedVectorType *ToTy,
     V = CGF.Builder.CreateExtractVector(ToTy, V, uint64_t(0), "cast.fixed");
     return {V, true};
   }
+
+  // If we are casting a scalable i1 predicate vector to a fixed i8
+  // vector, and we weren't able to handle it above, try using what we know
+  // about vscale to extract a fixed i1 vector from the scalable vector.
+  if (FromTy->getElementType()->isIntegerTy(1) &&
+      ToTy->getElementType() == CGF.Builder.getInt8Ty()) {
+    std::optional<std::pair<unsigned, unsigned>> VScaleRange =
+        CGF.getContext().getTargetInfo().getVScaleRange(CGF.getLangOpts(),
+                                                        false);
+    if (VScaleRange && VScaleRange->first == VScaleRange->second &&
+        VScaleRange->first <= ToTy->getNumElements() * 8) {
+      unsigned VScale = VScaleRange->first;
+      llvm::Type *FixedTy = llvm::FixedVectorType::get(
+          FromTy->getElementType(),
+          FromTy->getElementCount().getKnownMinValue() * VScale);
+      V = CGF.Builder.CreateExtractVector(FixedTy, V, uint64_t(0));
+      llvm::Type *WideFixedTy = llvm::FixedVectorType::get(
+          FromTy->getElementType(), ToTy->getNumElements() * 8);
+      if (FixedTy != WideFixedTy)
+        V = CGF.Builder.CreateInsertVector(
+            WideFixedTy, llvm::PoisonValue::get(WideFixedTy), V, uint64_t(0));
+      V = CGF.Builder.CreateBitCast(V, ToTy);
+      return {V, true};
+    }
+  }
+
   return {V, false};
 }
 
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 15a6177746403..5db74628a1743 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2493,6 +2493,35 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
             Result = Builder.CreateBitCast(Result, DstTy);
           return Result;
         }
+
+        // If we are casting a fixed i8 vector to a scalable i1 predicate
+        // vector, and we weren't able to handle it above, try using what we
+        // know about vscale to insert a fixed i1 vector into the scalable
+        // vector.
+        if (ScalableDstTy->getElementType()->isIntegerTy(1) &&
+            FixedSrcTy->getElementType()->isIntegerTy(8)) {
+          std::optional<std::pair<unsigned, unsigned>> VScaleRange =
+              CGF.getContext().getTargetInfo().getVScaleRange(CGF.getLangOpts(),
+                                                              false);
+          if (VScaleRange && VScaleRange->first == VScaleRange->second &&
+              VScaleRange->first <= FixedSrcTy->getNumElements() * 8) {
+            unsigned VScale = VScaleRange->first;
+            llvm::Type *WideFixedTy =
+                llvm::FixedVectorType::get(ScalableDstTy->getElementType(),
+                                           FixedSrcTy->getNumElements() * 8);
+            Src = Builder.CreateBitCast(Src, WideFixedTy);
+            llvm::Type *FixedTy = llvm::FixedVectorType::get(
+                ScalableDstTy->getElementType(),
+                ScalableDstTy->getElementCount().getKnownMinValue() * VScale);
+            // If the fixed i8 vector is larger than the i1 vector, we need to
+            // extract.
+            if (FixedTy != WideFixedTy)
+              Src = Builder.CreateExtractVector(FixedTy, Src, uint64_t(0));
+            return Builder.CreateInsertVector(
+                ScalableDstTy, llvm::PoisonValue::get(ScalableDstTy), Src,
+                uint64_t(0));
+          }
+        }
       }
     }
 
@@ -2514,6 +2543,35 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
         if (ScalableSrcTy->getElementType() == FixedDstTy->getElementType())
           return Builder.CreateExtractVector(DstTy, Src, uint64_t(0),
                                              "cast.fixed");
+
+        // If we are casting a scalable i1 predicate vector to a fixed i8
+        // vector, and we weren't able to handle it above, try using what we
+        // know about vscale to extract a fixed i1 vector from the scalable
+        // vector.
+        if (ScalableSrcTy->getElementType()->isIntegerTy(1) &&
+            FixedDstTy->getElementType()->isIntegerTy(8)) {
+          std::optional<std::pair<unsigned, unsigned>> VScaleRange =
+              CGF.getContext().getTargetInfo().getVScaleRange(CGF.getLangOpts(),
+                                                              false);
+          if (VScaleRange && VScaleRange->first == VScaleRange->second &&
+              VScaleRange->first <= FixedDstTy->getNumElements() * 8) {
+            unsigned VScale = VScaleRange->first;
+            llvm::Type *FixedTy = llvm::FixedVectorType::get(
+                ScalableSrcTy->getElementType(),
+                ScalableSrcTy->getElementCount().getKnownMinValue() * VScale);
+            Src = Builder.CreateExtractVector(FixedTy, Src, uint64_t(0));
+            llvm::Type *WideFixedTy =
+                llvm::FixedVectorType::get(ScalableSrcTy->getElementType(),
+                                           FixedDstTy->getNumElements() * 8);
+            // If the fixed i8 vector is larger than the i1 vector, we need to
+            // widen the i1 vector.
+            if (FixedTy != WideFixedTy)
+              Src = Builder.CreateInsertVector(
+                  WideFixedTy, llvm::PoisonValue::get(WideFixedTy), Src,
+                  uint64_t(0));
+            return Builder.CreateBitCast(Src, FixedDstTy);
+          }
+        }
       }
     }
 
diff --git a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c
index e2f02dc64f766..3ab065d34bcfb 100644
--- a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c
+++ b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c
@@ -15,24 +15,12 @@ typedef vbool64_t fixed_bool64_t __attribute__((riscv_rvv_vector_bits(__riscv_v_
 
 // CHECK-64-LABEL: @call_bool32_ff(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE4:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-64-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1_COERCE:%.*]], <vscale x 2 x i1> [[OP2_COERCE:%.*]], i64 2)
-// CHECK-64-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA6:![0-9]+]]
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10:![0-9]+]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
+// CHECK-64-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[TMP0:%.*]], <vscale x 2 x i1> [[TMP1:%.*]], i64 2)
 // CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
 //
 // CHECK-128-LABEL: @call_bool32_ff(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE4:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1_COERCE:%.*]], <vscale x 2 x i1> [[OP2_COERCE:%.*]], i64 4)
-// CHECK-128-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA6:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10:![0-9]+]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
+// CHECK-128-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[TMP0:%.*]], <vscale x 2 x i1> [[TMP1:%.*]], i64 4)
 // CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
 //
 fixed_bool32_t call_bool32_ff(fixed_bool32_t op1, fixed_bool32_t op2) {
@@ -41,24 +29,12 @@ fixed_bool32_t call_bool32_ff(fixed_bool32_t op1, fixed_bool32_t op2) {
 
 // CHECK-64-LABEL: @call_bool64_ff(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE4:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-64-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1_COERCE:%.*]], <vscale x 1 x i1> [[OP2_COERCE:%.*]], i64 1)
-// CHECK-64-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA11:![0-9]+]]
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
+// CHECK-64-NEXT:    [[TMP2:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[TMP0:%.*]], <vscale x 1 x i1> [[TMP1:%.*]], i64 1)
 // CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
 //
 // CHECK-128-LABEL: @call_bool64_ff(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE4:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1_COERCE:%.*]], <vscale x 1 x i1> [[OP2_COERCE:%.*]], i64 2)
-// CHECK-128-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA11:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
+// CHECK-128-NEXT:    [[TMP2:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[TMP0:%.*]], <vscale x 1 x i1> [[TMP1:%.*]], i64 2)
 // CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
 //
 fixed_bool64_t call_bool64_ff(fixed_bool64_t op1, fixed_bool64_t op2) {
@@ -71,25 +47,13 @@ fixed_bool64_t call_bool64_ff(fixed_bool64_t op1, fixed_bool64_t op2) {
 
 // CHECK-64-LABEL: @call_bool32_fs(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE2:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-64-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1_COERCE:%.*]], <vscale x 2 x i1> [[OP2:%.*]], i64 2)
-// CHECK-64-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA6]]
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
+// CHECK-64-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[TMP0:%.*]], <vscale x 2 x i1> [[OP2:%.*]], i64 2)
+// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
 //
 // CHECK-128-LABEL: @call_bool32_fs(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE2:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1_COERCE:%.*]], <vscale x 2 x i1> [[OP2:%.*]], i64 4)
-// CHECK-128-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
+// CHECK-128-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[TMP0:%.*]], <vscale x 2 x i1> [[OP2:%.*]], i64 4)
+// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
 //
 fixed_bool32_t call_bool32_fs(fixed_bool32_t op1, vbool32_t op2) {
   return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 32);
@@ -97,25 +61,13 @@ fixed_bool32_t call_bool32_fs(fixed_bool32_t op1, vbool32_t op2) {
 
 // CHECK-64-LABEL: @call_bool64_fs(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE2:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-64-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1_COERCE:%.*]], <vscale x 1 x i1> [[OP2:%.*]], i64 1)
-// CHECK-64-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA11]]
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
+// CHECK-64-NEXT:    [[TMP1:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[TMP0:%.*]], <vscale x 1 x i1> [[OP2:%.*]], i64 1)
+// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TMP1]]
 //
 // CHECK-128-LABEL: @call_bool64_fs(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE2:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1_COERCE:%.*]], <vscale x 1 x i1> [[OP2:%.*]], i64 2)
-// CHECK-128-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA11]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
+// CHECK-128-NEXT:    [[TMP1:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[TMP0:%.*]], <vscale x 1 x i1> [[OP2:%.*]], i64 2)
+// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP1]]
 //
 fixed_bool64_t call_bool64_fs(fixed_bool64_t op1, vbool64_t op2) {
   return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 64);
@@ -127,25 +79,13 @@ fixed_bool64_t call_bool64_fs(fixed_bool64_t op1, vbool64_t op2) {
 
 // CHECK-64-LABEL: @call_bool32_ss(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-64-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1:%.*]], <vscale x 2 x i1> [[OP2:%.*]], i64 2)
-// CHECK-64-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
+// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
 //
 // CHECK-128-LABEL: @call_bool32_ss(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1:%.*]], <vscale x 2 x i1> [[OP2:%.*]], i64 4)
-// CHECK-128-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
+// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
 //
 fixed_bool32_t call_bool32_ss(vbool32_t op1, vbool32_t op2) {
   return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 32);
@@ -153,25 +93,13 @@ fixed_bool32_t call_bool32_ss(vbool32_t op1, vbool32_t op2) {
 
 // CHECK-64-LABEL: @call_bool64_ss(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
 // CHECK-64-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1:%.*]], <vscale x 1 x i1> [[OP2:%.*]], i64 1)
-// CHECK-64-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11]]
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
+// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
 //
 // CHECK-128-LABEL: @call_bool64_ss(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
 // CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1:%.*]], <vscale x 1 x i1> [[OP2:%.*]], i64 2)
-// CHECK-128-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
+// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
 //
 fixed_bool64_t call_bool64_ss(vbool64_t op1, vbool64_t op2) {
   return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 64);
diff --git a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c
index f0fa7e8d07b4d..8407c065adb21 100644
--- a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c
+++ b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c
@@ -29,46 +29,22 @@ fixed_bool8_t from_vbool8_t(vbool8_t type) {
 
 // CHECK-64-LABEL: @from_vbool16_t(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 4 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i1>, align 1
-// CHECK-64-NEXT:    store <vscale x 4 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6:![0-9]+]]
-// CHECK-64-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10:![0-9]+]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    ret <vscale x 4 x i1> [[TMP1]]
+// CHECK-64-NEXT:    ret <vscale x 4 x i1> [[TYPE:%.*]]
 //
 // CHECK-128-LABEL: @from_vbool16_t(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 4 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i1>, align 1
-// CHECK-128-NEXT:    store <vscale x 4 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10:![0-9]+]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    ret <vscale x 4 x i1> [[TMP1]]
+// CHECK-128-NEXT:    ret <vscale x 4 x i1> [[TYPE:%.*]]
 //
 fixed_bool16_t from_vbool16_t(vbool16_t type) {
   return type;
 }
 // CHECK-64-LABEL: @from_vbool32_t(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-64-NEXT:    store <vscale x 2 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11:![0-9]+]]
-// CHECK-64-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TYPE:%.*]]
 //
 // CHECK-128-LABEL: @from_vbool32_t(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    store <vscale x 2 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TYPE:%.*]]
 //
 fixed_bool32_t from_vbool32_t(vbool32_t type) {
   return type;
@@ -76,11 +52,11 @@ fixed_bool32_t from_vbool32_t(vbool32_t type) {
 
 // CHECK-64-LABEL: @to_vbool32_t(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TMP0:%.*]]
 //
 // CHECK-128-LABEL: @to_vbool32_t(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP0:%.*]]
 //
 vbool32_t to_vbool32_t(fixed_bool32_t type) {
   return type;
@@ -88,23 +64,11 @@ vbool32_t to_vbool32_t(fixed_bool32_t type) {
 
 // CHECK-64-LABEL: @from_vbool64_t(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-64-NEXT:    store <vscale x 1 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA13:![0-9]+]]
-// CHECK-64-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TMP1]]
+// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TYPE:%.*]]
 //
 // CHECK-128-LABEL: @from_vbool64_t(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    store <vscale x 1 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA13:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP1]]
+// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TYPE:%.*]]
 //
 fixed_bool64_t from_vbool64_t(vbool64_t type) {
   return type;
@@ -112,11 +76,11 @@ fixed_bool64_t from_vbool64_t(vbool64_t type) {
 
 // CHECK-64-LABEL: @to_vbool64_t(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TMP0:%.*]]
 //
 // CHECK-128-LABEL: @to_vbool64_t(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP0:%.*]]
 //
 vbool64_t to_vbool64_t(fixed_bool64_t type) {
   return type;
diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c
index 058ec49b77881..d1e586ba2b634 100644
--- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c
@@ -55,12 +55,11 @@ DEFINE_STRUCT(bool64)
 
 // CHECK-128-LABEL: @read_bool32(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
 // CHECK-128-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[Y]], align 1, !tbaa [[TBAA6:![0-9]+]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i1>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+// CHECK-128-NEXT:    [[TMP0:%.*]] = load <8 x i1>, ptr [[Y]], align 1, !tbaa [[TBAA6:![0-9]+]]
+// CHECK-128-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-128-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.v4i1(<vscale x 2 x i1> poison, <4 x i1> [[TMP1]], i64 0)
+// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
 //
 vbool32_t read_bool32(struct struct_bool32 *s) {
   return s->y[0];
@@ -68,11 +67,10 @@ vbool32_t read_bool32(struct struct_bool32 *s) {
 
 // CHECK-128-LABEL: @write_bool32(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    store <vscale x 2 x i1> [[X:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA9:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
+// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <4 x i1> @llvm.vector.extract.v4i1.nxv2i1(<vscale x 2 x i1> [[X:%.*]], i64 0)
+// CHECK-128-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 // CHECK-128-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1
-// CHECK-128-NEXT:    store <1 x i8> [[TMP0]], ptr [[Y]], align 1, !tbaa [[TBAA6]]
+// CHECK-128-NEXT:    store <8 x i1> [[TMP1]], ptr [[Y]], align 1, !tbaa [[TBAA6]]
 // CHECK-128-NEXT:    ret void
 //
 void write_bool32(struct struct_bool32 *s, vbool32_t x) {
@@ -81,12 +79,11 @@ void write_bool32(struct struct_bool32 *s, vbool32_t x) {
 
 // CHECK-128-LABEL: @read_bool64(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
 // CHECK-128-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[Y]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 1 x i1>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP1]]
+// CHECK-128-NEXT:    [[TMP0:%.*]] = load <8 x i1>, ptr [[Y]], align 1, !tbaa [[TBAA6]]
+// CHECK-128-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <2 x i32> <i32 0, i32 1>
+// CHECK-128-NEXT:    [[TMP2:%.*]] = tail call <vscale x 1 x i1> @llvm.vector.insert.nxv1i1.v2i1(<vscale x 1 x i1> poison, <2 x i1> [[TMP1]], i64 0)
+// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
 //
 vbool64_t read_bool64(struct struct_bool64 *s) {
   return s->y[0];
@@ -94,11 +91,10 @@ vbool64_t read_bool64(struct struct_bool64 *s) {
 
 // CHECK-128-LABEL: @write_bool64(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    store <vscale x 1 x i1> [[X:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
+// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <2 x i1> @llvm.vector.extract.v2i1.nxv1i1(<vscale x 1 x i1> [[X:%.*]], i64 0)
+// CHECK-128-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i1> [[TMP0]], <2 x i1> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 // CHECK-128-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1
-// CHECK-128-NEXT:    store <1 x i8> [[TMP0]], ptr [[Y]], align 1, !tbaa [[TBAA6]]
+// CHECK-128-NEXT:    store <8 x i1> [[TMP1]], ptr [[Y]], align 1, !tbaa [[TBAA6]]
 // CHECK-128-NEXT:    ret void
 //
 void write_bool64(struct struct_bool64 *s, vbool64_t x) {
diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c
index 7992951346d54..0a50e41dda7e1 100644
--- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c
@@ -97,13 +97,7 @@ vbool4_t to_vbool4_t(fixed_bool4_t type) {
 
 // CHECK-LABEL: @from_vbool32_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-NEXT:    store <vscale x 2 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4:![0-9]+]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA8:![0-9]+]]
-// CHECK-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+// CHECK-NEXT:    ret <vscale x 2 x i1> [[TYPE:%.*]]
 //
 fixed_bool32_t from_vbool32_t(vbool32_t type) {
   return type;
@@ -111,7 +105,7 @@ fixed_bool32_t from_vbool32_t(vbool32_t type) {
 
 // CHECK-LABEL: @to_vbool32_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret <vscale x 2 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP0:%.*]]
 //
 vbool32_t to_vbool32_t(fixed_bool32_t type) {
   return type;
@@ -119,7 +113,7 @@ vbool32_t to_vbool32_t(fixed_bool32_t type) {
 
 // CHECK-LABEL: @to_vint32m1_t__from_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA6:![0-9]+]]
 // CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> poison, <8 x i32> [[TYPE]], i64 0)
 // CHECK-NEXT:    ret <vscale x 2 x i32> [[CAST_SCALABLE]]
 //
@@ -130,7 +124,7 @@ vint32m1_t to_vint32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) {
 // CHECK-LABEL: @from_vint32m1_t__to_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32(<vscale x 2 x i32> [[TYPE:%.*]], i64 0)
-// CHECK-NEXT:    store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA6]]
 // CHECK-NEXT:    ret void
 //
 gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) {
@@ -139,7 +133,7 @@ gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) {
 
 // CHECK-LABEL: @to_fixed_int32m1_t__from_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA6]]
 // CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> poison, <8 x i32> [[TYPE]], i64 0)
 // CHECK-NEXT:    ret <vscale x 2 x i32> [[CAST_SCALABLE]]
 //
@@ -150,7 +144,7 @@ fixed_int32m1_t to_fixed_int32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) {
 // CHECK-LABEL: @from_fixed_int32m1_t__to_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TYPE:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32(<vscale x 2 x i32> [[TYPE_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA6]]
 // CHECK-NEXT:    ret void
 //
 gnu_int32m1_t from_fixed_int32m1_t__to_gnu_int32m1_t(fixed_int32m1_t type) {
diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-codegen.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-codegen.c
index d81855aea2e5e..047bb31a1297b 100644
--- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-codegen.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-codegen.c
@@ -113,24 +113,23 @@ fixed_int16m4_t test_bool4(vbool4_t m, vint16m4_t vec) {
 // CHECK-NEXT:    [[M_ADDR:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-NEXT:    [[VEC_ADDR:%.*]] = alloca <vscale x 2 x i32>, align 4
 // CHECK-NEXT:    [[MASK:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
 // CHECK-NEXT:    store <vscale x 2 x i1> [[M:%.*]], ptr [[M_ADDR]], align 1
 // CHECK-NEXT:    store <vscale x 2 x i32> [[VEC:%.*]], ptr [[VEC_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i1>, ptr [[M_ADDR]], align 1
 // CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr @global_bool32, align 1
-// CHECK-NEXT:    store <1 x i8> [[TMP1]], ptr [[SAVED_VALUE]], align 1
-// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[SAVED_VALUE]], align 1
-// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP2]], i64 8)
-// CHECK-NEXT:    store <vscale x 2 x i1> [[TMP3]], ptr [[MASK]], align 1
-// CHECK-NEXT:    [[TMP4:%.*]] = load <vscale x 2 x i1>, ptr [[MASK]], align 1
-// CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 2 x i32>, ptr [[VEC_ADDR]], align 4
-// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @global_vec, align 8
-// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> poison, <8 x i32> [[TMP6]], i64 0)
-// CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vadd.mask.nxv2i32.nxv2i32.i64(<vscale x 2 x i32> poison, <vscale x 2 x i32> [[TMP5]], <vscale x 2 x i32> [[CAST_SCALABLE]], <vscale x 2 x i1> [[TMP4]], i64 8, i64 3)
-// CHECK-NEXT:    [[CAST_FIXED:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32(<vscale x 2 x i32> [[TMP7]], i64 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i8> [[TMP1]] to <8 x i1>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.v8i1(<vscale x 2 x i1> poison, <8 x i1> [[TMP2]], i64 0)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP3]], i64 8)
+// CHECK-NEXT:    store <vscale x 2 x i1> [[TMP4]], ptr [[MASK]], align 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 2 x i1>, ptr [[MASK]], align 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <vscale x 2 x i32>, ptr [[VEC_ADDR]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @global_vec, align 8
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> poison, <8 x i32> [[TMP7]], i64 0)
+// CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vadd.mask.nxv2i32.nxv2i32.i64(<vscale x 2 x i32> poison, <vscale x 2 x i32> [[TMP6]], <vscale x 2 x i32> [[CAST_SCALABLE]], <vscale x 2 x i1> [[TMP5]], i64 8, i64 3)
+// CHECK-NEXT:    [[CAST_FIXED:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32(<vscale x 2 x i32> [[TMP8]], i64 0)
 // CHECK-NEXT:    store <8 x i32> [[CAST_FIXED]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[CAST_SCALABLE1:%.*]] = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> poison, <8 x i32> [[TMP8]], i64 0)
+// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[CAST_SCALABLE1:%.*]] = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> poison, <8 x i32> [[TMP9]], i64 0)
 // CHECK-NEXT:    ret <vscale x 2 x i32> [[CAST_SCALABLE1]]
 //
 fixed_int32m1_t test_bool32(vbool32_t m, vint32m1_t vec) {
@@ -224,15 +223,15 @@ fixed_bool4_t address_of_array_idx_bool4() {
 // CHECK-NEXT:    [[RETVAL:%.*]] = alloca <1 x i8>, align 1
 // CHECK-NEXT:    [[ARR:%.*]] = alloca [3 x <1 x i8>], align 1
 // CHECK-NEXT:    [[PARR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i8>], ptr [[ARR]], i64 0, i64 0
 // CHECK-NEXT:    store ptr [[ARRAYIDX]], ptr [[PARR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PARR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[TMP0]], align 1
 // CHECK-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL]], align 1
-// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RETVAL_COERCE]], ptr align 1 [[RETVAL]], i64 1, i1 false)
-// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i8> [[TMP2]] to <8 x i1>
+// CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.v8i1(<vscale x 2 x i1> poison, <8 x i1> [[TMP3]], i64 0)
+// CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP4]]
 //
 fixed_bool32_t address_of_array_idx_bool32() {
   fixed_bool32_t arr[3];
diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c
index 4bd6311e05b03..dc2a10e725c64 100644
--- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c
@@ -89,10 +89,8 @@ void write_global_bool4(vbool4_t v) { global_bool4 = v; }
 #if __riscv_v_fixed_vlen >= 256
 // CHECK-256-LABEL: @write_global_bool32(
 // CHECK-256-NEXT:  entry:
-// CHECK-256-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-256-NEXT:    store <vscale x 2 x i1> [[V:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA9:![0-9]+]]
-// CHECK-256-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-256-NEXT:    store <1 x i8> [[TMP0]], ptr @global_bool32, align 1, !tbaa [[TBAA6]]
+// CHECK-256-NEXT:    [[TMP0:%.*]] = tail call <8 x i1> @llvm.vector.extract.v8i1.nxv2i1(<vscale x 2 x i1> [[V:%.*]], i64 0)
+// CHECK-256-NEXT:    store <8 x i1> [[TMP0]], ptr @global_bool32, align 1, !tbaa [[TBAA6]]
 // CHECK-256-NEXT:    ret void
 //
 void write_global_bool32(vbool32_t v) { global_bool32 = v; }
@@ -151,10 +149,8 @@ vbool4_t read_global_bool4() { return global_bool4; }
 #if __riscv_v_fixed_vlen >= 256
 // CHECK-256-LABEL: @read_global_bool32(
 // CHECK-256-NEXT:  entry:
-// CHECK-256-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
-// CHECK-256-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr @global_bool32, align 1, !tbaa [[TBAA6]]
-// CHECK-256-NEXT:    store <1 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-256-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i1>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
+// CHECK-256-NEXT:    [[TMP0:%.*]] = load <8 x i1>, ptr @global_bool32, align 1, !tbaa [[TBAA6]]
+// CHECK-256-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.v8i1(<vscale x 2 x i1> poison, <8 x i1> [[TMP0]], i64 0)
 // CHECK-256-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
 //
 vbool32_t read_global_bool32() { return global_bool32; }



More information about the cfe-commits mailing list