[clang] [IRGen][AArch64][RISCV] Generalize bitcast between i1 predicate vector and i8 fixed vector. (PR #76548)

Craig Topper via cfe-commits cfe-commits at lists.llvm.org
Fri Feb 9 09:55:04 PST 2024


https://github.com/topperc updated https://github.com/llvm/llvm-project/pull/76548

From 3dfa00b0dab1820d1d8692ea91e98b29c9f8b627 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Thu, 28 Dec 2023 16:49:03 -0800
Subject: [PATCH 1/5] [IRGen][AArch64][RISCV] Generalize bitcast between i1
 predicate vector and i8 fixed vector.

Instead of only handling vscale x 16 x i1 predicate vectors, handle
any scalable i1 vector whose known minimum element count is divisible by 8.

This will be used on RISC-V where we have multiple sizes of predicate
types.

Though I do wonder if we have the bitcast on the wrong side. Maybe
we should be casting the fixed type to an i1 fixed vector? That would
better handle cases where the minimum element count isn't a multiple of 8,
but vscale*min_elements is.
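
For illustration, here is a minimal sketch (not part of the patch; it
assumes the RVV intrinsics header, -mrvv-vector-bits=256 so that
__riscv_v_fixed_vlen is defined, and it mirrors the fixed_bool1_t
typedef used by the tests below) of the conversions this generalizes:

  #include <riscv_vector.h>

  // Fixed-length view of vbool1_t at the selected vector length.
  typedef vbool1_t fixed_bool1_t
      __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));

  // With this change these conversions lower through <vscale x 8 x i8>
  // (bitcast plus llvm.vector.insert/extract) instead of a store/load
  // round trip through a stack temporary; see the updated CHECK lines.
  fixed_bool1_t from_vbool1_t(vbool1_t m) { return m; }
  vbool1_t to_vbool1_t(fixed_bool1_t m) { return m; }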
---
 clang/lib/CodeGen/CGExprScalar.cpp | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 5502f685f64743..871714d62d91d2 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2139,14 +2139,16 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
     // bitcast.
     if (const auto *FixedSrc = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
       if (const auto *ScalableDst = dyn_cast<llvm::ScalableVectorType>(DstTy)) {
-        // If we are casting a fixed i8 vector to a scalable 16 x i1 predicate
+        // If we are casting a fixed i8 vector to a scalable i1 predicate
         // vector, use a vector insert and bitcast the result.
         bool NeedsBitCast = false;
-        auto PredType = llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
         llvm::Type *OrigType = DstTy;
-        if (ScalableDst == PredType &&
-            FixedSrc->getElementType() == Builder.getInt8Ty()) {
-          DstTy = llvm::ScalableVectorType::get(Builder.getInt8Ty(), 2);
+        if (ScalableDst->getElementType()->isIntegerTy(1) &&
+            ScalableDst->getElementCount().isKnownMultipleOf(8) &&
+            FixedSrc->getElementType()->isIntegerTy(8)) {
+          DstTy = llvm::VectorType::get(
+              FixedSrc->getElementType(),
+              ScalableDst->getElementCount().divideCoefficientBy(8));
           ScalableDst = cast<llvm::ScalableVectorType>(DstTy);
           NeedsBitCast = true;
         }
@@ -2167,12 +2169,14 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
     // bitcast.
     if (const auto *ScalableSrc = dyn_cast<llvm::ScalableVectorType>(SrcTy)) {
       if (const auto *FixedDst = dyn_cast<llvm::FixedVectorType>(DstTy)) {
-        // If we are casting a scalable 16 x i1 predicate vector to a fixed i8
+        // If we are casting a scalable i1 predicate vector to a fixed i8
         // vector, bitcast the source and use a vector extract.
-        auto PredType = llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
-        if (ScalableSrc == PredType &&
-            FixedDst->getElementType() == Builder.getInt8Ty()) {
-          SrcTy = llvm::ScalableVectorType::get(Builder.getInt8Ty(), 2);
+        if (ScalableSrc->getElementType()->isIntegerTy(1) &&
+            ScalableSrc->getElementCount().isKnownMultipleOf(8) &&
+            FixedDst->getElementType()->isIntegerTy(8)) {
+          SrcTy = llvm::VectorType::get(
+              FixedDst->getElementType(),
+              ScalableSrc->getElementCount().divideCoefficientBy(8));
           ScalableSrc = cast<llvm::ScalableVectorType>(SrcTy);
           Src = Builder.CreateBitCast(Src, SrcTy);
         }

From ce55e64e919ebbedc701ff5c1c76780ca38d2c16 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 2 Feb 2024 14:20:52 -0800
Subject: [PATCH 2/5] fixup! Update tests after rebase.

---
 .../attr-riscv-rvv-vector-bits-bitcast.c      | 36 ++++++--------
 .../CodeGen/attr-riscv-rvv-vector-bits-call.c | 48 ++++++++++++-------
 .../CodeGen/attr-riscv-rvv-vector-bits-cast.c | 26 +++++-----
 .../attr-riscv-rvv-vector-bits-codegen.c      | 13 +++--
 .../attr-riscv-rvv-vector-bits-globals.c      | 26 +++++-----
 5 files changed, 77 insertions(+), 72 deletions(-)

diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c
index a7b3123e61cd52..20fb4a04564c75 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c
@@ -177,29 +177,26 @@ void write_float64m1(struct struct_float64m1 *s, vfloat64m1_t x) {
 
 // CHECK-64-LABEL: @read_bool1(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8
 // CHECK-64-NEXT:    [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 8
 // CHECK-64-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA4]]
-// CHECK-64-NEXT:    store <8 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <vscale x 64 x i1>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
+// CHECK-64-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8> undef, <8 x i8> [[TMP0]], i64 0)
+// CHECK-64-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
 // CHECK-64-NEXT:    ret <vscale x 64 x i1> [[TMP1]]
 //
 // CHECK-128-LABEL: @read_bool1(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <16 x i8>, align 16
 // CHECK-128-NEXT:    [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 16
 // CHECK-128-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA4]]
-// CHECK-128-NEXT:    store <16 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA4]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 64 x i1>, ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA4]]
+// CHECK-128-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v16i8(<vscale x 8 x i8> undef, <16 x i8> [[TMP0]], i64 0)
+// CHECK-128-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
 // CHECK-128-NEXT:    ret <vscale x 64 x i1> [[TMP1]]
 //
 // CHECK-256-LABEL: @read_bool1(
 // CHECK-256-NEXT:  entry:
-// CHECK-256-NEXT:    [[SAVED_VALUE:%.*]] = alloca <32 x i8>, align 32
 // CHECK-256-NEXT:    [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 32
 // CHECK-256-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA4]]
-// CHECK-256-NEXT:    store <32 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 32, !tbaa [[TBAA4]]
-// CHECK-256-NEXT:    [[TMP1:%.*]] = load <vscale x 64 x i1>, ptr [[SAVED_VALUE]], align 32, !tbaa [[TBAA4]]
+// CHECK-256-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[TMP0]], i64 0)
+// CHECK-256-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
 // CHECK-256-NEXT:    ret <vscale x 64 x i1> [[TMP1]]
 //
 vbool1_t read_bool1(struct struct_bool1 *s) {
@@ -208,29 +205,26 @@ vbool1_t read_bool1(struct struct_bool1 *s) {
 
 // CHECK-64-LABEL: @write_bool1(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-64-NEXT:    store <vscale x 64 x i1> [[X:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]]
-// CHECK-64-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
+// CHECK-64-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i1> [[X:%.*]] to <vscale x 8 x i8>
+// CHECK-64-NEXT:    [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv8i8(<vscale x 8 x i8> [[TMP0]], i64 0)
 // CHECK-64-NEXT:    [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 8
-// CHECK-64-NEXT:    store <8 x i8> [[TMP0]], ptr [[Y]], align 8, !tbaa [[TBAA4]]
+// CHECK-64-NEXT:    store <8 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA4]]
 // CHECK-64-NEXT:    ret void
 //
 // CHECK-128-LABEL: @write_bool1(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 64 x i1>, align 16
-// CHECK-128-NEXT:    store <vscale x 64 x i1> [[X:%.*]], ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA7:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA4]]
+// CHECK-128-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i1> [[X:%.*]] to <vscale x 8 x i8>
+// CHECK-128-NEXT:    [[CAST_FIXED:%.*]] = tail call <16 x i8> @llvm.vector.extract.v16i8.nxv8i8(<vscale x 8 x i8> [[TMP0]], i64 0)
 // CHECK-128-NEXT:    [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 16
-// CHECK-128-NEXT:    store <16 x i8> [[TMP0]], ptr [[Y]], align 8, !tbaa [[TBAA4]]
+// CHECK-128-NEXT:    store <16 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA4]]
 // CHECK-128-NEXT:    ret void
 //
 // CHECK-256-LABEL: @write_bool1(
 // CHECK-256-NEXT:  entry:
-// CHECK-256-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-256-NEXT:    store <vscale x 64 x i1> [[X:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]]
-// CHECK-256-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
+// CHECK-256-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i1> [[X:%.*]] to <vscale x 8 x i8>
+// CHECK-256-NEXT:    [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8(<vscale x 8 x i8> [[TMP0]], i64 0)
 // CHECK-256-NEXT:    [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 32
-// CHECK-256-NEXT:    store <32 x i8> [[TMP0]], ptr [[Y]], align 8, !tbaa [[TBAA4]]
+// CHECK-256-NEXT:    store <32 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA4]]
 // CHECK-256-NEXT:    ret void
 //
 void write_bool1(struct struct_bool1 *s, vbool1_t x) {
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c
index 888abe1a7bc3fb..82d320debfa4de 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c
@@ -70,14 +70,23 @@ fixed_float64m1_t call_float64_ff(fixed_float64m1_t op1, fixed_float64m1_t op2)
 
 // CHECK-LABEL: @call_bool1_ff(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SAVED_VALUE4:%.*]] = alloca <vscale x 64 x i1>, align 8
+// CHECK-NEXT:    [[OP1:%.*]] = alloca <32 x i8>, align 8
+// CHECK-NEXT:    [[OP2:%.*]] = alloca <32 x i8>, align 8
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[OP1_COERCE:%.*]], <vscale x 64 x i1> [[OP2_COERCE:%.*]], i64 256)
-// CHECK-NEXT:    store <vscale x 64 x i1> [[TMP0]], ptr [[SAVED_VALUE4]], align 8, !tbaa [[TBAA4:![0-9]+]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE4]], align 8, !tbaa [[TBAA8:![0-9]+]]
-// CHECK-NEXT:    store <32 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP2]]
+// CHECK-NEXT:    store <vscale x 64 x i1> [[OP1_COERCE:%.*]], ptr [[OP1]], align 8
+// CHECK-NEXT:    [[OP11:%.*]] = load <32 x i8>, ptr [[OP1]], align 8, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-NEXT:    store <vscale x 64 x i1> [[OP2_COERCE:%.*]], ptr [[OP2]], align 8
+// CHECK-NEXT:    [[OP22:%.*]] = load <32 x i8>, ptr [[OP2]], align 8, !tbaa [[TBAA4]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[OP11]], i64 0)
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
+// CHECK-NEXT:    [[CAST_SCALABLE3:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[OP22]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE3]] to <vscale x 64 x i1>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[TMP0]], <vscale x 64 x i1> [[TMP1]], i64 256)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <vscale x 64 x i1> [[TMP2]] to <vscale x 8 x i8>
+// CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8(<vscale x 8 x i8> [[TMP3]], i64 0)
+// CHECK-NEXT:    store <32 x i8> [[CAST_FIXED]], ptr [[RETVAL_COERCE]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
+// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP4]]
 //
 fixed_bool1_t call_bool1_ff(fixed_bool1_t op1, fixed_bool1_t op2) {
   return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen);
@@ -116,14 +125,18 @@ fixed_float64m1_t call_float64_fs(fixed_float64m1_t op1, vfloat64m1_t op2) {
 
 // CHECK-LABEL: @call_bool1_fs(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SAVED_VALUE2:%.*]] = alloca <vscale x 64 x i1>, align 8
+// CHECK-NEXT:    [[OP1:%.*]] = alloca <32 x i8>, align 8
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[OP1_COERCE:%.*]], <vscale x 64 x i1> [[OP2:%.*]], i64 256)
-// CHECK-NEXT:    store <vscale x 64 x i1> [[TMP0]], ptr [[SAVED_VALUE2]], align 8, !tbaa [[TBAA4]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE2]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT:    store <32 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP2]]
+// CHECK-NEXT:    store <vscale x 64 x i1> [[OP1_COERCE:%.*]], ptr [[OP1]], align 8
+// CHECK-NEXT:    [[OP11:%.*]] = load <32 x i8>, ptr [[OP1]], align 8, !tbaa [[TBAA4]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[OP11]], i64 0)
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[TMP0]], <vscale x 64 x i1> [[OP2:%.*]], i64 256)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 64 x i1> [[TMP1]] to <vscale x 8 x i8>
+// CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8(<vscale x 8 x i8> [[TMP2]], i64 0)
+// CHECK-NEXT:    store <32 x i8> [[CAST_FIXED]], ptr [[RETVAL_COERCE]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
+// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP3]]
 //
 fixed_bool1_t call_bool1_fs(fixed_bool1_t op1, vbool1_t op2) {
   return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen);
@@ -162,12 +175,11 @@ fixed_float64m1_t call_float64_ss(vfloat64m1_t op1, vfloat64m1_t op2) {
 
 // CHECK-LABEL: @call_bool1_ss(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 64 x i1>, align 8
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 64 x i1>, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[OP1:%.*]], <vscale x 64 x i1> [[OP2:%.*]], i64 256)
-// CHECK-NEXT:    store <vscale x 64 x i1> [[TMP0]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT:    store <32 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 64 x i1> [[TMP0]] to <vscale x 8 x i8>
+// CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8(<vscale x 8 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    store <32 x i8> [[CAST_FIXED]], ptr [[RETVAL_COERCE]], align 8
 // CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
 // CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP2]]
 //
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c
index fe278174bf6817..063e786766e13f 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c
@@ -65,11 +65,10 @@ fixed_float64m1_t from_vfloat64m1_t(vfloat64m1_t type) {
 
 // CHECK-LABEL: @from_vbool1_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 64 x i1>, align 8
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-NEXT:    store <vscale x 64 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4:![0-9]+]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA8:![0-9]+]]
-// CHECK-NEXT:    store <32 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i1> [[TYPE:%.*]] to <vscale x 8 x i8>
+// CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8(<vscale x 8 x i8> [[TMP0]], i64 0)
+// CHECK-NEXT:    store <32 x i8> [[CAST_FIXED]], ptr [[RETVAL_COERCE]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
 // CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP1]]
 //
@@ -79,7 +78,12 @@ fixed_bool1_t from_vbool1_t(vbool1_t type) {
 
 // CHECK-LABEL: @to_vbool1_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret <vscale x 64 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-NEXT:    [[TYPE:%.*]] = alloca <32 x i8>, align 8
+// CHECK-NEXT:    store <vscale x 64 x i1> [[TYPE_COERCE:%.*]], ptr [[TYPE]], align 8
+// CHECK-NEXT:    [[TYPE1:%.*]] = load <32 x i8>, ptr [[TYPE]], align 8, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[TYPE1]], i64 0)
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
+// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP0]]
 //
 vbool1_t to_vbool1_t(fixed_bool1_t type) {
   return type;
@@ -105,8 +109,8 @@ vbool4_t to_vbool4_t(fixed_bool4_t type) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-NEXT:    store <vscale x 2 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA9:![0-9]+]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA8]]
+// CHECK-NEXT:    store <vscale x 2 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA7:![0-9]+]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4]]
 // CHECK-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
 // CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
 // CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
@@ -125,7 +129,7 @@ vbool32_t to_vbool32_t(fixed_bool32_t type) {
 
 // CHECK-LABEL: @to_vint32m1_t__from_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA4]]
 // CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> undef, <8 x i32> [[TYPE]], i64 0)
 // CHECK-NEXT:    ret <vscale x 2 x i32> [[CAST_SCALABLE]]
 //
@@ -136,7 +140,7 @@ vint32m1_t to_vint32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) {
 // CHECK-LABEL: @from_vint32m1_t__to_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32(<vscale x 2 x i32> [[TYPE:%.*]], i64 0)
-// CHECK-NEXT:    store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA4]]
 // CHECK-NEXT:    ret void
 //
 gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) {
@@ -145,7 +149,7 @@ gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) {
 
 // CHECK-LABEL: @to_fixed_int32m1_t__from_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA4]]
 // CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> undef, <8 x i32> [[TYPE]], i64 0)
 // CHECK-NEXT:    ret <vscale x 2 x i32> [[CAST_SCALABLE]]
 //
@@ -156,7 +160,7 @@ fixed_int32m1_t to_fixed_int32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) {
 // CHECK-LABEL: @from_fixed_int32m1_t__to_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TYPE:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32(<vscale x 2 x i32> [[TYPE_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA4]]
 // CHECK-NEXT:    ret void
 //
 gnu_int32m1_t from_fixed_int32m1_t__to_gnu_int32m1_t(fixed_int32m1_t type) {
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c
index ac22bdce0da3e5..4292867aee5537 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c
@@ -53,25 +53,24 @@ fixed_bool32_t global_bool32;
 // CHECK-NEXT:    [[M_ADDR:%.*]] = alloca <vscale x 64 x i1>, align 1
 // CHECK-NEXT:    [[VEC_ADDR:%.*]] = alloca <vscale x 64 x i8>, align 1
 // CHECK-NEXT:    [[MASK:%.*]] = alloca <vscale x 64 x i1>, align 1
-// CHECK-NEXT:    [[SAVED_VALUE:%.*]] = alloca <32 x i8>, align 32
 // CHECK-NEXT:    store <vscale x 64 x i1> [[M:%.*]], ptr [[M_ADDR]], align 1
 // CHECK-NEXT:    store <vscale x 64 x i8> [[VEC:%.*]], ptr [[VEC_ADDR]], align 1
 // CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 64 x i1>, ptr [[M_ADDR]], align 1
 // CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @global_bool1, align 8
-// CHECK-NEXT:    store <32 x i8> [[TMP1]], ptr [[SAVED_VALUE]], align 32
-// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 64 x i1>, ptr [[SAVED_VALUE]], align 32
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
 // CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[TMP0]], <vscale x 64 x i1> [[TMP2]], i64 256)
 // CHECK-NEXT:    store <vscale x 64 x i1> [[TMP3]], ptr [[MASK]], align 1
 // CHECK-NEXT:    [[TMP4:%.*]] = load <vscale x 64 x i1>, ptr [[MASK]], align 1
 // CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 64 x i8>, ptr [[VEC_ADDR]], align 1
 // CHECK-NEXT:    [[TMP6:%.*]] = load <256 x i8>, ptr @global_vec_int8m8, align 8
-// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.v256i8(<vscale x 64 x i8> undef, <256 x i8> [[TMP6]], i64 0)
-// CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vadd.mask.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> [[TMP5]], <vscale x 64 x i8> [[CAST_SCALABLE]], <vscale x 64 x i1> [[TMP4]], i64 256, i64 3)
+// CHECK-NEXT:    [[CAST_SCALABLE1:%.*]] = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.v256i8(<vscale x 64 x i8> undef, <256 x i8> [[TMP6]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vadd.mask.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> [[TMP5]], <vscale x 64 x i8> [[CAST_SCALABLE1]], <vscale x 64 x i1> [[TMP4]], i64 256, i64 3)
 // CHECK-NEXT:    [[CAST_FIXED:%.*]] = call <256 x i8> @llvm.vector.extract.v256i8.nxv64i8(<vscale x 64 x i8> [[TMP7]], i64 0)
 // CHECK-NEXT:    store <256 x i8> [[CAST_FIXED]], ptr [[RETVAL]], align 8
 // CHECK-NEXT:    [[TMP8:%.*]] = load <256 x i8>, ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[CAST_SCALABLE1:%.*]] = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.v256i8(<vscale x 64 x i8> undef, <256 x i8> [[TMP8]], i64 0)
-// CHECK-NEXT:    ret <vscale x 64 x i8> [[CAST_SCALABLE1]]
+// CHECK-NEXT:    [[CAST_SCALABLE2:%.*]] = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.v256i8(<vscale x 64 x i8> undef, <256 x i8> [[TMP8]], i64 0)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[CAST_SCALABLE2]]
 //
 fixed_int8m8_t test_bool1(vbool1_t m, vint8m8_t vec) {
   vbool1_t mask = __riscv_vmand(m, global_bool1, __riscv_v_fixed_vlen);
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c
index d7df1a24bbfb00..31a245dcb22405 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c
@@ -56,18 +56,16 @@ void write_global_i64(vint64m1_t v) { global_i64 = v; }
 
 // CHECK-64-LABEL: @write_global_bool1(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-64-NEXT:    store <vscale x 64 x i1> [[V:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]]
-// CHECK-64-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
-// CHECK-64-NEXT:    store <8 x i8> [[TMP0]], ptr @global_bool1, align 8, !tbaa [[TBAA4]]
+// CHECK-64-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i1> [[V:%.*]] to <vscale x 8 x i8>
+// CHECK-64-NEXT:    [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv8i8(<vscale x 8 x i8> [[TMP0]], i64 0)
+// CHECK-64-NEXT:    store <8 x i8> [[CAST_FIXED]], ptr @global_bool1, align 8, !tbaa [[TBAA4]]
 // CHECK-64-NEXT:    ret void
 //
 // CHECK-256-LABEL: @write_global_bool1(
 // CHECK-256-NEXT:  entry:
-// CHECK-256-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-256-NEXT:    store <vscale x 64 x i1> [[V:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]]
-// CHECK-256-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
-// CHECK-256-NEXT:    store <32 x i8> [[TMP0]], ptr @global_bool1, align 8, !tbaa [[TBAA4]]
+// CHECK-256-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i1> [[V:%.*]] to <vscale x 8 x i8>
+// CHECK-256-NEXT:    [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8(<vscale x 8 x i8> [[TMP0]], i64 0)
+// CHECK-256-NEXT:    store <32 x i8> [[CAST_FIXED]], ptr @global_bool1, align 8, !tbaa [[TBAA4]]
 // CHECK-256-NEXT:    ret void
 //
 void write_global_bool1(vbool1_t v) { global_bool1 = v; }
@@ -92,7 +90,7 @@ void write_global_bool4(vbool4_t v) { global_bool4 = v; }
 // CHECK-256-LABEL: @write_global_bool32(
 // CHECK-256-NEXT:  entry:
 // CHECK-256-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-256-NEXT:    store <vscale x 2 x i1> [[V:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA9:![0-9]+]]
+// CHECK-256-NEXT:    store <vscale x 2 x i1> [[V:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA7:![0-9]+]]
 // CHECK-256-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4]]
 // CHECK-256-NEXT:    store <1 x i8> [[TMP0]], ptr @global_bool32, align 1, !tbaa [[TBAA4]]
 // CHECK-256-NEXT:    ret void
@@ -120,18 +118,16 @@ vint64m1_t read_global_i64() { return global_i64; }
 
 // CHECK-64-LABEL: @read_global_bool1(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8
 // CHECK-64-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool1, align 8, !tbaa [[TBAA4]]
-// CHECK-64-NEXT:    store <8 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <vscale x 64 x i1>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
+// CHECK-64-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8> undef, <8 x i8> [[TMP0]], i64 0)
+// CHECK-64-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
 // CHECK-64-NEXT:    ret <vscale x 64 x i1> [[TMP1]]
 //
 // CHECK-256-LABEL: @read_global_bool1(
 // CHECK-256-NEXT:  entry:
-// CHECK-256-NEXT:    [[SAVED_VALUE:%.*]] = alloca <32 x i8>, align 32
 // CHECK-256-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr @global_bool1, align 8, !tbaa [[TBAA4]]
-// CHECK-256-NEXT:    store <32 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 32, !tbaa [[TBAA4]]
-// CHECK-256-NEXT:    [[TMP1:%.*]] = load <vscale x 64 x i1>, ptr [[SAVED_VALUE]], align 32, !tbaa [[TBAA4]]
+// CHECK-256-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[TMP0]], i64 0)
+// CHECK-256-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
 // CHECK-256-NEXT:    ret <vscale x 64 x i1> [[TMP1]]
 //
 vbool1_t read_global_bool1() { return global_bool1; }

From 2547fde5cb08b3fde7cd9f8297c45d850ac2d797 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 2 Feb 2024 14:58:51 -0800
Subject: [PATCH 3/5] fixup! Update CGCall.cpp too.

---
 clang/lib/CodeGen/CGCall.cpp                  | 13 ++++++------
 .../CodeGen/attr-riscv-rvv-vector-bits-call.c | 21 +++----------------
 .../CodeGen/attr-riscv-rvv-vector-bits-cast.c |  7 +------
 .../attr-riscv-rvv-vector-bits-codegen.c      |  8 +++----
 4 files changed, 15 insertions(+), 34 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 657666c9bda4ec..45c87728aafd5e 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1299,15 +1299,16 @@ static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty,
   // conversion.
   if (auto *ScalableDst = dyn_cast<llvm::ScalableVectorType>(Ty)) {
     if (auto *FixedSrc = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
-      // If we are casting a fixed i8 vector to a scalable 16 x i1 predicate
+      // If we are casting a fixed i8 vector to a scalable i1 predicate
       // vector, use a vector insert and bitcast the result.
       bool NeedsBitcast = false;
-      auto PredType =
-          llvm::ScalableVectorType::get(CGF.Builder.getInt1Ty(), 16);
       llvm::Type *OrigType = Ty;
-      if (ScalableDst == PredType &&
-          FixedSrc->getElementType() == CGF.Builder.getInt8Ty()) {
-        ScalableDst = llvm::ScalableVectorType::get(CGF.Builder.getInt8Ty(), 2);
+      if (ScalableDst->getElementType()->isIntegerTy(1) &&
+          ScalableDst->getElementCount().isKnownMultipleOf(8) &&
+          FixedSrc->getElementType()->isIntegerTy(8)) {
+        ScalableDst = llvm::ScalableVectorType::get(
+            FixedSrc->getElementType(),
+            ScalableDst->getElementCount().getKnownMinValue() / 8);
         NeedsBitcast = true;
       }
       if (ScalableDst->getElementType() == FixedSrc->getElementType()) {
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c
index 82d320debfa4de..c91049ba0798ac 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c
@@ -72,7 +72,6 @@ fixed_float64m1_t call_float64_ff(fixed_float64m1_t op1, fixed_float64m1_t op2)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OP1:%.*]] = alloca <32 x i8>, align 8
 // CHECK-NEXT:    [[OP2:%.*]] = alloca <32 x i8>, align 8
-// CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 64 x i1>, align 8
 // CHECK-NEXT:    store <vscale x 64 x i1> [[OP1_COERCE:%.*]], ptr [[OP1]], align 8
 // CHECK-NEXT:    [[OP11:%.*]] = load <32 x i8>, ptr [[OP1]], align 8, !tbaa [[TBAA4:![0-9]+]]
 // CHECK-NEXT:    store <vscale x 64 x i1> [[OP2_COERCE:%.*]], ptr [[OP2]], align 8
@@ -82,11 +81,7 @@ fixed_float64m1_t call_float64_ff(fixed_float64m1_t op1, fixed_float64m1_t op2)
 // CHECK-NEXT:    [[CAST_SCALABLE3:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[OP22]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE3]] to <vscale x 64 x i1>
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[TMP0]], <vscale x 64 x i1> [[TMP1]], i64 256)
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <vscale x 64 x i1> [[TMP2]] to <vscale x 8 x i8>
-// CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8(<vscale x 8 x i8> [[TMP3]], i64 0)
-// CHECK-NEXT:    store <32 x i8> [[CAST_FIXED]], ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP4]]
+// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP2]]
 //
 fixed_bool1_t call_bool1_ff(fixed_bool1_t op1, fixed_bool1_t op2) {
   return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen);
@@ -126,17 +121,12 @@ fixed_float64m1_t call_float64_fs(fixed_float64m1_t op1, vfloat64m1_t op2) {
 // CHECK-LABEL: @call_bool1_fs(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OP1:%.*]] = alloca <32 x i8>, align 8
-// CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 64 x i1>, align 8
 // CHECK-NEXT:    store <vscale x 64 x i1> [[OP1_COERCE:%.*]], ptr [[OP1]], align 8
 // CHECK-NEXT:    [[OP11:%.*]] = load <32 x i8>, ptr [[OP1]], align 8, !tbaa [[TBAA4]]
 // CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[OP11]], i64 0)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[TMP0]], <vscale x 64 x i1> [[OP2:%.*]], i64 256)
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 64 x i1> [[TMP1]] to <vscale x 8 x i8>
-// CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8(<vscale x 8 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    store <32 x i8> [[CAST_FIXED]], ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP3]]
+// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP1]]
 //
 fixed_bool1_t call_bool1_fs(fixed_bool1_t op1, vbool1_t op2) {
   return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen);
@@ -175,13 +165,8 @@ fixed_float64m1_t call_float64_ss(vfloat64m1_t op1, vfloat64m1_t op2) {
 
 // CHECK-LABEL: @call_bool1_ss(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 64 x i1>, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[OP1:%.*]], <vscale x 64 x i1> [[OP2:%.*]], i64 256)
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 64 x i1> [[TMP0]] to <vscale x 8 x i8>
-// CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8(<vscale x 8 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT:    store <32 x i8> [[CAST_FIXED]], ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP2]]
+// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP0]]
 //
 fixed_bool1_t call_bool1_ss(vbool1_t op1, vbool1_t op2) {
   return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen);
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c
index 063e786766e13f..f2b90b1a5c6ca2 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c
@@ -65,12 +65,7 @@ fixed_float64m1_t from_vfloat64m1_t(vfloat64m1_t type) {
 
 // CHECK-LABEL: @from_vbool1_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i1> [[TYPE:%.*]] to <vscale x 8 x i8>
-// CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8(<vscale x 8 x i8> [[TMP0]], i64 0)
-// CHECK-NEXT:    store <32 x i8> [[CAST_FIXED]], ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP1]]
+// CHECK-NEXT:    ret <vscale x 64 x i1> [[TYPE:%.*]]
 //
 fixed_bool1_t from_vbool1_t(vbool1_t type) {
   return type;
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c
index 4292867aee5537..eb769fadda9a85 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c
@@ -180,15 +180,15 @@ fixed_int32m1_t array_arg(fixed_int32m1_t arr[]) {
 // CHECK-NEXT:    [[RETVAL:%.*]] = alloca <32 x i8>, align 8
 // CHECK-NEXT:    [[ARR:%.*]] = alloca [3 x <32 x i8>], align 8
 // CHECK-NEXT:    [[PARR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 64 x i1>, align 8
 // CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <32 x i8>], ptr [[ARR]], i64 0, i64 0
 // CHECK-NEXT:    store ptr [[ARRAYIDX]], ptr [[PARR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PARR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 8
 // CHECK-NEXT:    store <32 x i8> [[TMP1]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL_COERCE]], ptr align 8 [[RETVAL]], i64 32, i1 false)
-// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[TMP2]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
+// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP3]]
 //
 fixed_bool1_t address_of_array_idx_bool1() {
   fixed_bool1_t arr[3];

From 23e1d0dc4b1f3b6cfa2010e8d2afe4c6fe4ff160 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 9 Feb 2024 09:16:48 -0800
Subject: [PATCH 4/5] fixup! Fix another piece of code I missed.

---
 clang/lib/CodeGen/CGCall.cpp                  | 11 ++++++-----
 .../CodeGen/attr-riscv-rvv-vector-bits-call.c | 19 ++-----------------
 .../CodeGen/attr-riscv-rvv-vector-bits-cast.c | 19 +++++++------------
 3 files changed, 15 insertions(+), 34 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 45c87728aafd5e..5088118bd21cde 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -3196,13 +3196,14 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
         llvm::Value *Coerced = Fn->getArg(FirstIRArg);
         if (auto *VecTyFrom =
                 dyn_cast<llvm::ScalableVectorType>(Coerced->getType())) {
-          // If we are casting a scalable 16 x i1 predicate vector to a fixed i8
+          // If we are casting a scalable i1 predicate vector to a fixed i8
           // vector, bitcast the source and use a vector extract.
-          auto PredType =
-              llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
-          if (VecTyFrom == PredType &&
+          if (VecTyFrom->getElementType()->isIntegerTy(1) &&
+              VecTyFrom->getElementCount().isKnownMultipleOf(8) &&
               VecTyTo->getElementType() == Builder.getInt8Ty()) {
-            VecTyFrom = llvm::ScalableVectorType::get(Builder.getInt8Ty(), 2);
+            VecTyFrom = llvm::ScalableVectorType::get(
+                VecTyTo->getElementType(),
+                VecTyFrom->getElementCount().getKnownMinValue() / 8);
             Coerced = Builder.CreateBitCast(Coerced, VecTyFrom);
           }
           if (VecTyFrom->getElementType() == VecTyTo->getElementType()) {
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c
index c91049ba0798ac..1824d97d04dda8 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c
@@ -70,17 +70,7 @@ fixed_float64m1_t call_float64_ff(fixed_float64m1_t op1, fixed_float64m1_t op2)
 
 // CHECK-LABEL: @call_bool1_ff(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[OP1:%.*]] = alloca <32 x i8>, align 8
-// CHECK-NEXT:    [[OP2:%.*]] = alloca <32 x i8>, align 8
-// CHECK-NEXT:    store <vscale x 64 x i1> [[OP1_COERCE:%.*]], ptr [[OP1]], align 8
-// CHECK-NEXT:    [[OP11:%.*]] = load <32 x i8>, ptr [[OP1]], align 8, !tbaa [[TBAA4:![0-9]+]]
-// CHECK-NEXT:    store <vscale x 64 x i1> [[OP2_COERCE:%.*]], ptr [[OP2]], align 8
-// CHECK-NEXT:    [[OP22:%.*]] = load <32 x i8>, ptr [[OP2]], align 8, !tbaa [[TBAA4]]
-// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[OP11]], i64 0)
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
-// CHECK-NEXT:    [[CAST_SCALABLE3:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[OP22]], i64 0)
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE3]] to <vscale x 64 x i1>
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[TMP0]], <vscale x 64 x i1> [[TMP1]], i64 256)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[TMP0:%.*]], <vscale x 64 x i1> [[TMP1:%.*]], i64 256)
 // CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP2]]
 //
 fixed_bool1_t call_bool1_ff(fixed_bool1_t op1, fixed_bool1_t op2) {
@@ -120,12 +110,7 @@ fixed_float64m1_t call_float64_fs(fixed_float64m1_t op1, vfloat64m1_t op2) {
 
 // CHECK-LABEL: @call_bool1_fs(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[OP1:%.*]] = alloca <32 x i8>, align 8
-// CHECK-NEXT:    store <vscale x 64 x i1> [[OP1_COERCE:%.*]], ptr [[OP1]], align 8
-// CHECK-NEXT:    [[OP11:%.*]] = load <32 x i8>, ptr [[OP1]], align 8, !tbaa [[TBAA4]]
-// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[OP11]], i64 0)
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[TMP0]], <vscale x 64 x i1> [[OP2:%.*]], i64 256)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[TMP0:%.*]], <vscale x 64 x i1> [[OP2:%.*]], i64 256)
 // CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP1]]
 //
 fixed_bool1_t call_bool1_fs(fixed_bool1_t op1, vbool1_t op2) {
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c
index f2b90b1a5c6ca2..3806c3e1b30bbf 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c
@@ -73,12 +73,7 @@ fixed_bool1_t from_vbool1_t(vbool1_t type) {
 
 // CHECK-LABEL: @to_vbool1_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = alloca <32 x i8>, align 8
-// CHECK-NEXT:    store <vscale x 64 x i1> [[TYPE_COERCE:%.*]], ptr [[TYPE]], align 8
-// CHECK-NEXT:    [[TYPE1:%.*]] = load <32 x i8>, ptr [[TYPE]], align 8, !tbaa [[TBAA4:![0-9]+]]
-// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[TYPE1]], i64 0)
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
-// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP0]]
+// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP0:%.*]]
 //
 vbool1_t to_vbool1_t(fixed_bool1_t type) {
   return type;
@@ -104,8 +99,8 @@ vbool4_t to_vbool4_t(fixed_bool4_t type) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-NEXT:    store <vscale x 2 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA7:![0-9]+]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4]]
+// CHECK-NEXT:    store <vscale x 2 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA8:![0-9]+]]
 // CHECK-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
 // CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
 // CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
@@ -124,7 +119,7 @@ vbool32_t to_vbool32_t(fixed_bool32_t type) {
 
 // CHECK-LABEL: @to_vint32m1_t__from_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]]
 // CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> undef, <8 x i32> [[TYPE]], i64 0)
 // CHECK-NEXT:    ret <vscale x 2 x i32> [[CAST_SCALABLE]]
 //
@@ -135,7 +130,7 @@ vint32m1_t to_vint32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) {
 // CHECK-LABEL: @from_vint32m1_t__to_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32(<vscale x 2 x i32> [[TYPE:%.*]], i64 0)
-// CHECK-NEXT:    store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-NEXT:    store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
 gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) {
@@ -144,7 +139,7 @@ gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) {
 
 // CHECK-LABEL: @to_fixed_int32m1_t__from_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]]
 // CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> undef, <8 x i32> [[TYPE]], i64 0)
 // CHECK-NEXT:    ret <vscale x 2 x i32> [[CAST_SCALABLE]]
 //
@@ -155,7 +150,7 @@ fixed_int32m1_t to_fixed_int32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) {
 // CHECK-LABEL: @from_fixed_int32m1_t__to_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TYPE:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32(<vscale x 2 x i32> [[TYPE_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-NEXT:    store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
 gnu_int32m1_t from_fixed_int32m1_t__to_gnu_int32m1_t(fixed_int32m1_t type) {

From 97f7db1ea4432297a6d1c5ad77c3f9d441789e73 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 9 Feb 2024 09:50:38 -0800
Subject: [PATCH 5/5] fixup! Remove NeedsBitcast. Add 'Ty' to end of Type
 variable names.

---
 clang/lib/CodeGen/CGCall.cpp       | 38 +++++++++++-----------
 clang/lib/CodeGen/CGExprScalar.cpp | 51 ++++++++++++++----------------
 2 files changed, 41 insertions(+), 48 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 5088118bd21cde..4d4f1b6678d01d 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1297,28 +1297,25 @@ static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty,
   // If coercing a fixed vector to a scalable vector for ABI compatibility, and
   // the types match, use the llvm.vector.insert intrinsic to perform the
   // conversion.
-  if (auto *ScalableDst = dyn_cast<llvm::ScalableVectorType>(Ty)) {
-    if (auto *FixedSrc = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
+  if (auto *ScalableDstTy = dyn_cast<llvm::ScalableVectorType>(Ty)) {
+    if (auto *FixedSrcTy = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
       // If we are casting a fixed i8 vector to a scalable i1 predicate
       // vector, use a vector insert and bitcast the result.
-      bool NeedsBitcast = false;
-      llvm::Type *OrigType = Ty;
-      if (ScalableDst->getElementType()->isIntegerTy(1) &&
-          ScalableDst->getElementCount().isKnownMultipleOf(8) &&
-          FixedSrc->getElementType()->isIntegerTy(8)) {
-        ScalableDst = llvm::ScalableVectorType::get(
-            FixedSrc->getElementType(),
-            ScalableDst->getElementCount().getKnownMinValue() / 8);
-        NeedsBitcast = true;
+      if (ScalableDstTy->getElementType()->isIntegerTy(1) &&
+          ScalableDstTy->getElementCount().isKnownMultipleOf(8) &&
+          FixedSrcTy->getElementType()->isIntegerTy(8)) {
+        ScalableDstTy = llvm::ScalableVectorType::get(
+            FixedSrcTy->getElementType(),
+            ScalableDstTy->getElementCount().getKnownMinValue() / 8);
       }
-      if (ScalableDst->getElementType() == FixedSrc->getElementType()) {
+      if (ScalableDstTy->getElementType() == FixedSrcTy->getElementType()) {
         auto *Load = CGF.Builder.CreateLoad(Src);
-        auto *UndefVec = llvm::UndefValue::get(ScalableDst);
+        auto *UndefVec = llvm::UndefValue::get(ScalableDstTy);
         auto *Zero = llvm::Constant::getNullValue(CGF.CGM.Int64Ty);
         llvm::Value *Result = CGF.Builder.CreateInsertVector(
-            ScalableDst, UndefVec, Load, Zero, "cast.scalable");
-        if (NeedsBitcast)
-          Result = CGF.Builder.CreateBitCast(Result, OrigType);
+            ScalableDstTy, UndefVec, Load, Zero, "cast.scalable");
+        if (ScalableDstTy != Ty)
+          Result = CGF.Builder.CreateBitCast(Result, Ty);
         return Result;
       }
     }
@@ -5841,12 +5838,13 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
       // If coercing a fixed vector from a scalable vector for ABI
       // compatibility, and the types match, use the llvm.vector.extract
       // intrinsic to perform the conversion.
-      if (auto *FixedDst = dyn_cast<llvm::FixedVectorType>(RetIRTy)) {
+      if (auto *FixedDstTy = dyn_cast<llvm::FixedVectorType>(RetIRTy)) {
         llvm::Value *V = CI;
-        if (auto *ScalableSrc = dyn_cast<llvm::ScalableVectorType>(V->getType())) {
-          if (FixedDst->getElementType() == ScalableSrc->getElementType()) {
+        if (auto *ScalableSrcTy =
+                dyn_cast<llvm::ScalableVectorType>(V->getType())) {
+          if (FixedDstTy->getElementType() == ScalableSrcTy->getElementType()) {
             llvm::Value *Zero = llvm::Constant::getNullValue(CGM.Int64Ty);
-            V = Builder.CreateExtractVector(FixedDst, V, Zero, "cast.fixed");
+            V = Builder.CreateExtractVector(FixedDstTy, V, Zero, "cast.fixed");
             return RValue::get(V);
           }
         }
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 871714d62d91d2..d4d9bd6109dc0f 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2137,28 +2137,24 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
     // If Src is a fixed vector and Dst is a scalable vector, and both have the
     // same element type, use the llvm.vector.insert intrinsic to perform the
     // bitcast.
-    if (const auto *FixedSrc = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
-      if (const auto *ScalableDst = dyn_cast<llvm::ScalableVectorType>(DstTy)) {
+    if (auto *FixedSrcTy = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
+      if (auto *ScalableDstTy = dyn_cast<llvm::ScalableVectorType>(DstTy)) {
         // If we are casting a fixed i8 vector to a scalable i1 predicate
         // vector, use a vector insert and bitcast the result.
-        bool NeedsBitCast = false;
-        llvm::Type *OrigType = DstTy;
-        if (ScalableDst->getElementType()->isIntegerTy(1) &&
-            ScalableDst->getElementCount().isKnownMultipleOf(8) &&
-            FixedSrc->getElementType()->isIntegerTy(8)) {
-          DstTy = llvm::VectorType::get(
-              FixedSrc->getElementType(),
-              ScalableDst->getElementCount().divideCoefficientBy(8));
-          ScalableDst = cast<llvm::ScalableVectorType>(DstTy);
-          NeedsBitCast = true;
+        if (ScalableDstTy->getElementType()->isIntegerTy(1) &&
+            ScalableDstTy->getElementCount().isKnownMultipleOf(8) &&
+            FixedSrcTy->getElementType()->isIntegerTy(8)) {
+          ScalableDstTy = llvm::ScalableVectorType::get(
+              FixedSrcTy->getElementType(),
+              ScalableDstTy->getElementCount().getKnownMinValue() / 8);
         }
-        if (FixedSrc->getElementType() == ScalableDst->getElementType()) {
-          llvm::Value *UndefVec = llvm::UndefValue::get(DstTy);
+        if (FixedSrcTy->getElementType() == ScalableDstTy->getElementType()) {
+          llvm::Value *UndefVec = llvm::UndefValue::get(ScalableDstTy);
           llvm::Value *Zero = llvm::Constant::getNullValue(CGF.CGM.Int64Ty);
           llvm::Value *Result = Builder.CreateInsertVector(
-              DstTy, UndefVec, Src, Zero, "cast.scalable");
-          if (NeedsBitCast)
-            Result = Builder.CreateBitCast(Result, OrigType);
+              ScalableDstTy, UndefVec, Src, Zero, "cast.scalable");
+          if (Result->getType() != DstTy)
+            Result = Builder.CreateBitCast(Result, DstTy);
           return Result;
         }
       }
@@ -2167,20 +2163,19 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
     // If Src is a scalable vector and Dst is a fixed vector, and both have the
     // same element type, use the llvm.vector.extract intrinsic to perform the
     // bitcast.
-    if (const auto *ScalableSrc = dyn_cast<llvm::ScalableVectorType>(SrcTy)) {
-      if (const auto *FixedDst = dyn_cast<llvm::FixedVectorType>(DstTy)) {
+    if (auto *ScalableSrcTy = dyn_cast<llvm::ScalableVectorType>(SrcTy)) {
+      if (auto *FixedDstTy = dyn_cast<llvm::FixedVectorType>(DstTy)) {
         // If we are casting a scalable i1 predicate vector to a fixed i8
         // vector, bitcast the source and use a vector extract.
-        if (ScalableSrc->getElementType()->isIntegerTy(1) &&
-            ScalableSrc->getElementCount().isKnownMultipleOf(8) &&
-            FixedDst->getElementType()->isIntegerTy(8)) {
-          SrcTy = llvm::VectorType::get(
-              FixedDst->getElementType(),
-              ScalableSrc->getElementCount().divideCoefficientBy(8));
-          ScalableSrc = cast<llvm::ScalableVectorType>(SrcTy);
-          Src = Builder.CreateBitCast(Src, SrcTy);
+        if (ScalableSrcTy->getElementType()->isIntegerTy(1) &&
+            ScalableSrcTy->getElementCount().isKnownMultipleOf(8) &&
+            FixedDstTy->getElementType()->isIntegerTy(8)) {
+          ScalableSrcTy = llvm::ScalableVectorType::get(
+              FixedDstTy->getElementType(),
+              ScalableSrcTy->getElementCount().getKnownMinValue() / 8);
+          Src = Builder.CreateBitCast(Src, ScalableSrcTy);
         }
-        if (ScalableSrc->getElementType() == FixedDst->getElementType()) {
+        if (ScalableSrcTy->getElementType() == FixedDstTy->getElementType()) {
           llvm::Value *Zero = llvm::Constant::getNullValue(CGF.CGM.Int64Ty);
           return Builder.CreateExtractVector(DstTy, Src, Zero, "cast.fixed");
         }


