[clang] [clang][RISCV] Fix crash on VLS calling convention (PR #145489)
Brandon Wu via cfe-commits
cfe-commits at lists.llvm.org
Sun Jul 6 15:28:32 PDT 2025
https://github.com/4vtomat updated https://github.com/llvm/llvm-project/pull/145489
>From 8b90033dd07ec2256f26670a43a0f713cc7b84ef Mon Sep 17 00:00:00 2001
From: Brandon Wu <songwu0813 at gmail.com>
Date: Sun, 6 Jul 2025 00:42:02 -0700
Subject: [PATCH] [clang][RISCV] Fix crash on VLS calling convention
This patch handles structs of fixed-length vectors and structs of arrays of
fixed-length vectors correctly for the VLS calling convention in
EmitFunctionProlog, EmitFunctionEpilog and EmitCall.
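For reference, a minimal C sketch of the struct shapes affected, assuming
ABI_VLEN = 128 (the typedef, member names, and the passthrough function are
illustrative; the exact declarations are in
clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c):

typedef int fixed_i32x4 __attribute__((vector_size(16)));

// With __attribute__((riscv_vls_cc)) and ABI_VLEN = 128:
struct st_i32x4      { fixed_i32x4 v;    }; // coerced to <vscale x 2 x i32>
struct st_i32x4_arr4 { fixed_i32x4 v[4]; }; // coerced to
                                            // target("riscv.vector.tuple", <vscale x 8 x i8>, 4)

// Same shape as the new test_function_prolog_epilog test: returning the
// argument makes the prolog unpack the incoming tuple into the in-memory
// struct and the epilog rebuild the tuple from that struct.
struct st_i32x4_arr4 __attribute__((riscv_vls_cc))
passthrough(struct st_i32x4_arr4 arg) {
  return arg;
}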
---
clang/lib/CodeGen/CGCall.cpp | 167 ++++++++++++++++--
.../RISCV/riscv-vector-callingconv-llvm-ir.c | 93 ++++++++--
.../riscv-vector-callingconv-llvm-ir.cpp | 32 ++--
3 files changed, 246 insertions(+), 46 deletions(-)
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index c8c3d6b20c496..bc7698e648027 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -41,6 +41,7 @@
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/IR/Type.h"
#include "llvm/Transforms/Utils/Local.h"
#include <optional>
@@ -1329,9 +1330,77 @@ static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty,
llvm::TypeSize DstSize = CGF.CGM.getDataLayout().getTypeAllocSize(Ty);
if (llvm::StructType *SrcSTy = dyn_cast<llvm::StructType>(SrcTy)) {
- Src = EnterStructPointerForCoercedAccess(Src, SrcSTy,
- DstSize.getFixedValue(), CGF);
- SrcTy = Src.getElementType();
+ if (llvm::TargetExtType *TupTy = dyn_cast<llvm::TargetExtType>(Ty)) {
+ // In the RISC-V VLS calling convention, a struct of fixed-length vectors,
+ // or a struct containing an array of more than one fixed-length vector,
+ // may be lowered to a vector tuple type; we treat it as a valid load, e.g.
+ // struct i32x4x2 {
+ // __attribute__((vector_size(16))) int i;
+ // __attribute__((vector_size(16))) int j;
+ // };
+ // or
+ // struct i32x4_arr2 {
+ // __attribute__((vector_size(16))) int i[2];
+ // };
+ // is lowered to target("riscv.vector.tuple", <vscale x 8 x i8>, 2)
+ // when ABI_VLEN = 128 bits; see
+ // clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c
+ // for more information.
+ assert(TupTy->getName() == "riscv.vector.tuple");
+ llvm::Type *EltTy = TupTy->getTypeParameter(0);
+ unsigned NumElts = TupTy->getIntParameter(0);
+
+ if (auto *ArrayTy = dyn_cast<llvm::ArrayType>(SrcSTy->getElementType(0)))
+ Src = Src.withElementType(ArrayTy);
+
+ // Load the source struct, then build the vector tuple element by element.
+ llvm::Value *PoisonTuple = llvm::PoisonValue::get(Ty);
+ auto *Load = CGF.Builder.CreateLoad(Src);
+ for (unsigned i = 0; i < NumElts; ++i) {
+ // Extract from struct
+ llvm::Value *ExtractFromLoad = CGF.Builder.CreateExtractValue(Load, i);
+ // Elements of a vector tuple type are always i8, so we need to cast back
+ // to the original element type.
+ EltTy = cast<llvm::ScalableVectorType>(
+ llvm::VectorType::getWithSizeAndScalar(
+ cast<llvm::VectorType>(EltTy), ExtractFromLoad->getType()));
+ llvm::Value *PoisonVec = llvm::PoisonValue::get(EltTy);
+ // Insert to scalable vector
+ PoisonVec = CGF.Builder.CreateInsertVector(
+ EltTy, PoisonVec, ExtractFromLoad, uint64_t(0), "cast.scalable");
+ // Insert scalable vector to vector tuple
+ llvm::Value *Idx = llvm::ConstantInt::get(CGF.Builder.getInt32Ty(), i);
+ PoisonTuple = CGF.Builder.CreateIntrinsic(
+ llvm::Intrinsic::riscv_tuple_insert, {Ty, EltTy},
+ {PoisonTuple, PoisonVec, Idx});
+ }
+ return PoisonTuple;
+ }
+
+ if (Ty->isScalableTy()) {
+ // In the RISC-V VLS calling convention, a struct containing a single
+ // fixed-length vector, or an array of one fixed-length vector, may be
+ // lowered to a scalable vector; we treat it as a valid load, e.g.
+ // struct i32x4 {
+ // __attribute__((vector_size(16))) int i;
+ // };
+ // or
+ // struct i32x4 {
+ // __attribute__((vector_size(16))) int i[1];
+ // };
+ // is lowered to <vscale x 2 x i32>
+ // when ABI_VLEN = 128 bits; see
+ // clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c
+ // for more information.
+ SrcTy = SrcSTy->getElementType(0);
+ if (auto *ArrayTy = dyn_cast<llvm::ArrayType>(SrcTy))
+ SrcTy = ArrayTy->getElementType();
+ Src = Src.withElementType(SrcTy);
+ } else {
+ Src = EnterStructPointerForCoercedAccess(Src, SrcSTy,
+ DstSize.getFixedValue(), CGF);
+ SrcTy = Src.getElementType();
+ }
}
llvm::TypeSize SrcSize = CGF.CGM.getDataLayout().getTypeAllocSize(SrcTy);
@@ -1412,6 +1481,87 @@ void CodeGenFunction::CreateCoercedStore(llvm::Value *Src, Address Dst,
if (SrcTy != Dst.getElementType()) {
if (llvm::StructType *DstSTy =
dyn_cast<llvm::StructType>(Dst.getElementType())) {
+ if (llvm::TargetExtType *TupTy = dyn_cast<llvm::TargetExtType>(SrcTy)) {
+ // In the RISC-V VLS calling convention, a struct of fixed-length vectors,
+ // or a struct containing an array of more than one fixed-length vector,
+ // may be lowered to a vector tuple type; we treat it as a valid store, e.g.
+ // struct i32x4x2 {
+ // __attribute__((vector_size(16))) int i;
+ // __attribute__((vector_size(16))) int j;
+ // };
+ // or
+ // struct i32x4_arr2 {
+ // __attribute__((vector_size(16))) int i[2];
+ // };
+ // is lowered to target("riscv.vector.tuple", <vscale x 8 x i8>, 2)
+ // when ABI_VLEN = 128 bits; see
+ // clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c
+ // for more information.
+ assert(TupTy->getName() == "riscv.vector.tuple");
+ llvm::Type *EltTy = TupTy->getTypeParameter(0);
+ unsigned NumElts = TupTy->getIntParameter(0);
+
+ llvm::Type *FixedVecTy = DstSTy->getElementType(0);
+ if (auto *ArrayTy =
+ dyn_cast<llvm::ArrayType>(DstSTy->getElementType(0))) {
+ Dst = Dst.withElementType(ArrayTy);
+ FixedVecTy = ArrayTy->getArrayElementType();
+ }
+
+ // Extract each element from the vector tuple and store it to the struct.
+ for (unsigned i = 0; i < NumElts; ++i) {
+ // Elements of a vector tuple type are always i8, so we need to cast back
+ // to the original element type.
+ EltTy = cast<llvm::ScalableVectorType>(
+ llvm::VectorType::getWithSizeAndScalar(
+ cast<llvm::VectorType>(EltTy), FixedVecTy));
+ // Extract scalable vector from tuple
+ llvm::Value *Idx = llvm::ConstantInt::get(Builder.getInt32Ty(), i);
+ auto *TupleElement = Builder.CreateIntrinsic(
+ llvm::Intrinsic::riscv_tuple_extract, {EltTy, TupTy}, {Src, Idx});
+
+ // Extract fixed vector from scalable vector
+ auto *ExtractVec = Builder.CreateExtractVector(
+ FixedVecTy, TupleElement, uint64_t(0));
+ // Store fixed vector to corresponding address
+ Address EltPtr = Address::invalid();
+ if (Dst.getElementType()->isStructTy())
+ EltPtr = Builder.CreateStructGEP(Dst, i);
+ else
+ EltPtr = Builder.CreateConstArrayGEP(Dst, i);
+ auto *I = Builder.CreateStore(ExtractVec, EltPtr, DstIsVolatile);
+ addInstToCurrentSourceAtom(I, ExtractVec);
+ }
+ return;
+ }
+
+ if (SrcTy->isScalableTy()) {
+ // In the RISC-V VLS calling convention, a struct containing a single
+ // fixed-length vector, or an array of one fixed-length vector, may be
+ // lowered to a scalable vector; we treat it as a valid store, e.g.
+ // struct i32x4 {
+ // __attribute__((vector_size(16))) int i;
+ // };
+ // or
+ // struct i32x4 {
+ // __attribute__((vector_size(16))) int i[1];
+ // };
+ // is lowered to <vscale x 2 x i32>
+ // when ABI_VLEN = 128 bits; see
+ // clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c
+ // for more information.
+ llvm::Type *EltTy = DstSTy->getElementType(0);
+ if (auto *ArrayTy = dyn_cast<llvm::ArrayType>(EltTy)) {
+ assert(ArrayTy->getNumElements() == 1);
+ EltTy = ArrayTy->getElementType();
+ }
+ auto *Coerced = Builder.CreateExtractVector(
+ cast<llvm::FixedVectorType>(EltTy), Src, uint64_t(0));
+ auto *I = Builder.CreateStore(Coerced, Dst, DstIsVolatile);
+ addInstToCurrentSourceAtom(I, Src);
+ return;
+ }
+
assert(!SrcSize.isScalable());
Dst = EnterStructPointerForCoercedAccess(Dst, DstSTy,
SrcSize.getFixedValue(), *this);
@@ -3335,17 +3485,6 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
}
}
- // Struct of fixed-length vectors and struct of array of fixed-length
- // vector in VLS calling convention are coerced to vector tuple
- // type(represented as TargetExtType) and scalable vector type
- // respectively, they're no longer handled as struct.
- if (ArgI.isDirect() && isa<llvm::StructType>(ConvertType(Ty)) &&
- (isa<llvm::TargetExtType>(ArgI.getCoerceToType()) ||
- isa<llvm::ScalableVectorType>(ArgI.getCoerceToType()))) {
- ArgVals.push_back(ParamValue::forDirect(AI));
- break;
- }
-
llvm::StructType *STy =
dyn_cast<llvm::StructType>(ArgI.getCoerceToType());
Address Alloca =
diff --git a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c
index 3044d91f1c31c..83253a92ec789 100644
--- a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c
+++ b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c
@@ -143,34 +143,34 @@ void __attribute__((riscv_vls_cc)) test_too_large(int32x64_t arg) {}
// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_too_large_256(<vscale x 16 x i32> noundef %arg.coerce)
void __attribute__((riscv_vls_cc(256))) test_too_large_256(int32x64_t arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4(<vscale x 2 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4(<vscale x 2 x i32> %arg.coerce)
void __attribute__((riscv_vls_cc)) test_st_i32x4(struct st_i32x4 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4_256(<vscale x 1 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4_256(<vscale x 1 x i32> %arg.coerce)
void __attribute__((riscv_vls_cc(256))) test_st_i32x4_256(struct st_i32x4 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4_arr1(<vscale x 2 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4_arr1(<vscale x 2 x i32> %arg.coerce)
void __attribute__((riscv_vls_cc)) test_st_i32x4_arr1(struct st_i32x4_arr1 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4_arr1_256(<vscale x 1 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4_arr1_256(<vscale x 1 x i32> %arg.coerce)
void __attribute__((riscv_vls_cc(256))) test_st_i32x4_arr1_256(struct st_i32x4_arr1 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4_arr4(<vscale x 8 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4_arr4(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %arg.coerce)
void __attribute__((riscv_vls_cc)) test_st_i32x4_arr4(struct st_i32x4_arr4 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4_arr4_256(<vscale x 4 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4_arr4_256(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %arg.coerce)
void __attribute__((riscv_vls_cc(256))) test_st_i32x4_arr4_256(struct st_i32x4_arr4 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4_arr8(<vscale x 16 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4_arr8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %arg.coerce)
void __attribute__((riscv_vls_cc)) test_st_i32x4_arr8(struct st_i32x4_arr8 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4_arr8_256(<vscale x 8 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4_arr8_256(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %arg.coerce)
void __attribute__((riscv_vls_cc(256))) test_st_i32x4_arr8_256(struct st_i32x4_arr8 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4x2(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4x2(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %arg.coerce)
void __attribute__((riscv_vls_cc)) test_st_i32x4x2(struct st_i32x4x2 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4x2_256(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4x2_256(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %arg.coerce)
void __attribute__((riscv_vls_cc(256))) test_st_i32x4x2_256(struct st_i32x4x2 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x8x2(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x8x2(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %arg.coerce)
void __attribute__((riscv_vls_cc)) test_st_i32x8x2(struct st_i32x8x2 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x8x2_256(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x8x2_256(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %arg.coerce)
void __attribute__((riscv_vls_cc(256))) test_st_i32x8x2_256(struct st_i32x8x2 arg) {}
// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x64x2(ptr noundef %arg)
@@ -178,17 +178,78 @@ void __attribute__((riscv_vls_cc)) test_st_i32x64x2(struct st_i32x64x2 arg) {}
// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x64x2_256(ptr noundef %arg)
void __attribute__((riscv_vls_cc(256))) test_st_i32x64x2_256(struct st_i32x64x2 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4x3(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4x3(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %arg.coerce)
void __attribute__((riscv_vls_cc)) test_st_i32x4x3(struct st_i32x4x3 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4x3_256(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4x3_256(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %arg.coerce)
void __attribute__((riscv_vls_cc(256))) test_st_i32x4x3_256(struct st_i32x4x3 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4x8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4x8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %arg.coerce)
void __attribute__((riscv_vls_cc)) test_st_i32x4x8(struct st_i32x4x8 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4x8_256(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4x8_256(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %arg.coerce)
void __attribute__((riscv_vls_cc(256))) test_st_i32x4x8_256(struct st_i32x4x8 arg) {}
// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4x9(ptr noundef %arg)
void __attribute__((riscv_vls_cc)) test_st_i32x4x9(struct st_i32x4x9 arg) {}
// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4x9_256(ptr noundef %arg)
void __attribute__((riscv_vls_cc(256))) test_st_i32x4x9_256(struct st_i32x4x9 arg) {}
+
+// CHECK-LLVM-LABEL: define dso_local riscv_vls_cc(128) target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_function_prolog_epilog(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %arg.coerce) #0 {
+// CHECK-LLVM-NEXT: entry:
+// CHECK-LLVM-NEXT: %retval = alloca %struct.st_i32x4_arr4, align 16
+// CHECK-LLVM-NEXT: %arg = alloca %struct.st_i32x4_arr4, align 16
+// CHECK-LLVM-NEXT: %0 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %arg.coerce, i32 0)
+// CHECK-LLVM-NEXT: %1 = call <4 x i32> @llvm.vector.extract.v4i32.nxv2i32(<vscale x 2 x i32> %0, i64 0)
+// CHECK-LLVM-NEXT: %2 = getelementptr inbounds [4 x <4 x i32>], ptr %arg, i64 0, i64 0
+// CHECK-LLVM-NEXT: store <4 x i32> %1, ptr %2, align 16
+// CHECK-LLVM-NEXT: %3 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %arg.coerce, i32 1)
+// CHECK-LLVM-NEXT: %4 = call <4 x i32> @llvm.vector.extract.v4i32.nxv2i32(<vscale x 2 x i32> %3, i64 0)
+// CHECK-LLVM-NEXT: %5 = getelementptr inbounds [4 x <4 x i32>], ptr %arg, i64 0, i64 1
+// CHECK-LLVM-NEXT: store <4 x i32> %4, ptr %5, align 16
+// CHECK-LLVM-NEXT: %6 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %arg.coerce, i32 2)
+// CHECK-LLVM-NEXT: %7 = call <4 x i32> @llvm.vector.extract.v4i32.nxv2i32(<vscale x 2 x i32> %6, i64 0)
+// CHECK-LLVM-NEXT: %8 = getelementptr inbounds [4 x <4 x i32>], ptr %arg, i64 0, i64 2
+// CHECK-LLVM-NEXT: store <4 x i32> %7, ptr %8, align 16
+// CHECK-LLVM-NEXT: %9 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %arg.coerce, i32 3)
+// CHECK-LLVM-NEXT: %10 = call <4 x i32> @llvm.vector.extract.v4i32.nxv2i32(<vscale x 2 x i32> %9, i64 0)
+// CHECK-LLVM-NEXT: %11 = getelementptr inbounds [4 x <4 x i32>], ptr %arg, i64 0, i64 3
+// CHECK-LLVM-NEXT: store <4 x i32> %10, ptr %11, align 16
+// CHECK-LLVM-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval, ptr align 16 %arg, i64 64, i1 false)
+// CHECK-LLVM-NEXT: %12 = load [4 x <4 x i32>], ptr %retval, align 16
+// CHECK-LLVM-NEXT: %13 = extractvalue [4 x <4 x i32>] %12, 0
+// CHECK-LLVM-NEXT: %cast.scalable = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v4i32(<vscale x 2 x i32> poison, <4 x i32> %13, i64 0)
+// CHECK-LLVM-NEXT: %14 = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv2i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) poison, <vscale x 2 x i32> %cast.scalable, i32 0)
+// CHECK-LLVM-NEXT: %15 = extractvalue [4 x <4 x i32>] %12, 1
+// CHECK-LLVM-NEXT: %cast.scalable1 = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v4i32(<vscale x 2 x i32> poison, <4 x i32> %15, i64 0)
+// CHECK-LLVM-NEXT: %16 = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv2i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %14, <vscale x 2 x i32> %cast.scalable1, i32 1)
+// CHECK-LLVM-NEXT: %17 = extractvalue [4 x <4 x i32>] %12, 2
+// CHECK-LLVM-NEXT: %cast.scalable2 = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v4i32(<vscale x 2 x i32> poison, <4 x i32> %17, i64 0)
+// CHECK-LLVM-NEXT: %18 = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv2i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %16, <vscale x 2 x i32> %cast.scalable2, i32 2)
+// CHECK-LLVM-NEXT: %19 = extractvalue [4 x <4 x i32>] %12, 3
+// CHECK-LLVM-NEXT: %cast.scalable3 = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v4i32(<vscale x 2 x i32> poison, <4 x i32> %19, i64 0)
+// CHECK-LLVM-NEXT: %20 = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv2i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %18, <vscale x 2 x i32> %cast.scalable3, i32 3)
+// CHECK-LLVM-NEXT: ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %20
+// CHECK-LLVM-NEXT: }
+struct st_i32x4_arr4 __attribute__((riscv_vls_cc)) test_function_prolog_epilog(struct st_i32x4_arr4 arg) {
+ return arg;
+}
+
+struct st_i32x4 __attribute__((riscv_vls_cc)) dummy(struct st_i32x4);
+// CHECK-LLVM-LABEL: define dso_local riscv_vls_cc(128) <vscale x 2 x i32> @test_call(<vscale x 2 x i32> %arg.coerce) #0 {
+// CHECK-LLVM-NEXT: entry:
+// CHECK-LLVM-NEXT: %retval = alloca %struct.st_i32x4, align 16
+// CHECK-LLVM-NEXT: %arg = alloca %struct.st_i32x4, align 16
+// CHECK-LLVM-NEXT: %0 = call <4 x i32> @llvm.vector.extract.v4i32.nxv2i32(<vscale x 2 x i32> %arg.coerce, i64 0)
+// CHECK-LLVM-NEXT: store <4 x i32> %0, ptr %arg, align 16
+// CHECK-LLVM-NEXT: %1 = load <4 x i32>, ptr %arg, align 16
+// CHECK-LLVM-NEXT: %cast.scalable = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v4i32(<vscale x 2 x i32> poison, <4 x i32> %1, i64 0)
+// CHECK-LLVM-NEXT: %call = call riscv_vls_cc(128) <vscale x 2 x i32> @dummy(<vscale x 2 x i32> %cast.scalable)
+// CHECK-LLVM-NEXT: %2 = call <4 x i32> @llvm.vector.extract.v4i32.nxv2i32(<vscale x 2 x i32> %call, i64 0)
+// CHECK-LLVM-NEXT: store <4 x i32> %2, ptr %retval, align 16
+// CHECK-LLVM-NEXT: %3 = load <4 x i32>, ptr %retval, align 16
+// CHECK-LLVM-NEXT: %cast.scalable1 = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v4i32(<vscale x 2 x i32> poison, <4 x i32> %3, i64 0)
+// CHECK-LLVM-NEXT: ret <vscale x 2 x i32> %cast.scalable1
+// CHECK-LLVM-NEXT: }
+struct st_i32x4 __attribute__((riscv_vls_cc)) test_call(struct st_i32x4 arg) {
+ struct st_i32x4 abc = dummy(arg);
+ return abc;
+}
diff --git a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp
index 594bfe159b28c..876fc1aebcec1 100644
--- a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp
+++ b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp
@@ -123,34 +123,34 @@ typedef int __attribute__((vector_size(256))) int32x64_t;
// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z18test_too_large_256Dv64_i(<vscale x 16 x i32> noundef %arg.coerce)
[[riscv::vls_cc(256)]] void test_too_large_256(int32x64_t arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z13test_st_i32x48st_i32x4(<vscale x 2 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z13test_st_i32x48st_i32x4(<vscale x 2 x i32> %arg.coerce)
[[riscv::vls_cc]] void test_st_i32x4(struct st_i32x4 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z17test_st_i32x4_2568st_i32x4(<vscale x 1 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z17test_st_i32x4_2568st_i32x4(<vscale x 1 x i32> %arg.coerce)
[[riscv::vls_cc(256)]] void test_st_i32x4_256(struct st_i32x4 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z18test_st_i32x4_arr113st_i32x4_arr1(<vscale x 2 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z18test_st_i32x4_arr113st_i32x4_arr1(<vscale x 2 x i32> %arg.coerce)
[[riscv::vls_cc]] void test_st_i32x4_arr1(struct st_i32x4_arr1 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z22test_st_i32x4_arr1_25613st_i32x4_arr1(<vscale x 1 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z22test_st_i32x4_arr1_25613st_i32x4_arr1(<vscale x 1 x i32> %arg.coerce)
[[riscv::vls_cc(256)]] void test_st_i32x4_arr1_256(struct st_i32x4_arr1 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z18test_st_i32x4_arr413st_i32x4_arr4(<vscale x 8 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z18test_st_i32x4_arr413st_i32x4_arr4(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %arg.coerce)
[[riscv::vls_cc]] void test_st_i32x4_arr4(struct st_i32x4_arr4 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z22test_st_i32x4_arr4_25613st_i32x4_arr4(<vscale x 4 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z22test_st_i32x4_arr4_25613st_i32x4_arr4(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %arg.coerce)
[[riscv::vls_cc(256)]] void test_st_i32x4_arr4_256(struct st_i32x4_arr4 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z18test_st_i32x4_arr813st_i32x4_arr8(<vscale x 16 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z18test_st_i32x4_arr813st_i32x4_arr8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %arg.coerce)
[[riscv::vls_cc]] void test_st_i32x4_arr8(struct st_i32x4_arr8 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z22test_st_i32x4_arr8_25613st_i32x4_arr8(<vscale x 8 x i32> %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z22test_st_i32x4_arr8_25613st_i32x4_arr8(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %arg.coerce)
[[riscv::vls_cc(256)]] void test_st_i32x4_arr8_256(struct st_i32x4_arr8 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x4x210st_i32x4x2(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x4x210st_i32x4x2(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %arg.coerce)
[[riscv::vls_cc]] void test_st_i32x4x2(struct st_i32x4x2 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x4x2_25610st_i32x4x2(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x4x2_25610st_i32x4x2(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %arg.coerce)
[[riscv::vls_cc(256)]] void test_st_i32x4x2_256(struct st_i32x4x2 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x8x210st_i32x8x2(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x8x210st_i32x8x2(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %arg.coerce)
[[riscv::vls_cc]] void test_st_i32x8x2(struct st_i32x8x2 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x8x2_25610st_i32x8x2(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x8x2_25610st_i32x8x2(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %arg.coerce)
[[riscv::vls_cc(256)]] void test_st_i32x8x2_256(struct st_i32x8x2 arg) {}
// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z16test_st_i32x64x211st_i32x64x2(ptr noundef %arg)
@@ -158,14 +158,14 @@ typedef int __attribute__((vector_size(256))) int32x64_t;
// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z20test_st_i32x64x2_25611st_i32x64x2(ptr noundef %arg)
[[riscv::vls_cc(256)]] void test_st_i32x64x2_256(struct st_i32x64x2 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x4x310st_i32x4x3(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x4x310st_i32x4x3(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %arg.coerce)
[[riscv::vls_cc]] void test_st_i32x4x3(struct st_i32x4x3 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x4x3_25610st_i32x4x3(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x4x3_25610st_i32x4x3(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %arg.coerce)
[[riscv::vls_cc(256)]] void test_st_i32x4x3_256(struct st_i32x4x3 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x4x810st_i32x4x8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x4x810st_i32x4x8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %arg.coerce)
[[riscv::vls_cc]] void test_st_i32x4x8(struct st_i32x4x8 arg) {}
-// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x4x8_25610st_i32x4x8(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %arg)
+// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x4x8_25610st_i32x4x8(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %arg.coerce)
[[riscv::vls_cc(256)]] void test_st_i32x4x8_256(struct st_i32x4x8 arg) {}
// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x4x910st_i32x4x9(ptr noundef %arg)