[clang] [Clang] Remove 3-element vector load and store special handling (PR #104661)
Shilei Tian via cfe-commits
cfe-commits at lists.llvm.org
Sun Dec 29 19:37:54 PST 2024
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/104661
>From 9dfb68ddc02bb70b3c9ff9d5aeedabbc2feed258 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Tue, 17 Dec 2024 23:50:06 -0500
Subject: [PATCH] [Clang] Remove 3-element vector load and store special
handling
Clang uses a long-time special handling of the case where 3 element vector loads
and stores are performed as 4 element, and then a shufflevector is used to extract
the used elements. Odd sized vector codegen should now work reasonably well.
This patch removes this special handling, as well as the compiler argument
`-fpreserve-vec3-type`.
---
clang/include/clang/Basic/CodeGenOptions.def | 3 -
clang/include/clang/Basic/LangOptions.def | 2 +
clang/include/clang/Driver/Options.td | 4 -
clang/lib/Basic/LangOptions.cpp | 2 +
clang/lib/CodeGen/ABIInfo.cpp | 8 ++
clang/lib/CodeGen/ABIInfo.h | 7 ++
clang/lib/CodeGen/CGExpr.cpp | 48 ++++++------
clang/lib/CodeGen/Targets/AMDGPU.cpp | 6 ++
.../test/CodeGenCXX/matrix-vector-bit-int.cpp | 32 ++++----
clang/test/CodeGenOpenCL/amdgpu-alignment.cl | 28 +++----
clang/test/CodeGenOpenCL/preserve_vec3.cl | 77 -------------------
11 files changed, 78 insertions(+), 139 deletions(-)
delete mode 100644 clang/test/CodeGenOpenCL/preserve_vec3.cl
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 0f4ed13d5f3d8c..1ab8c7fb4d3c33 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -413,9 +413,6 @@ CODEGENOPT(StrictReturn, 1, 1)
/// Whether emit pseudo probes for sample pgo profile collection.
CODEGENOPT(PseudoProbeForProfiling, 1, 0)
-/// Whether 3-component vector type is preserved.
-CODEGENOPT(PreserveVec3Type, 1, 0)
-
CODEGENOPT(NoPLT, 1, 0)
/// Whether to emit all vtables
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 3b833240e5b68c..a980be853d53e6 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -532,6 +532,8 @@ BENIGN_LANGOPT(CheckConstexprFunctionBodies, 1, 1,
LANGOPT(BoundsSafety, 1, 0, "Bounds safety extension for C")
+LANGOPT(PreserveVec3Type, 1, 0, "Preserve 3-component vector type")
+
#undef LANGOPT
#undef COMPATIBLE_LANGOPT
#undef BENIGN_LANGOPT
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d922709db17786..38d8a255fedec4 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -8215,10 +8215,6 @@ def fhlsl_strict_availability : Flag<["-"], "fhlsl-strict-availability">,
Group<hlsl_Group>,
MarshallingInfoFlag<LangOpts<"HLSLStrictAvailability">>;
-def fpreserve_vec3_type : Flag<["-"], "fpreserve-vec3-type">,
- HelpText<"Preserve 3-component vector type">,
- MarshallingInfoFlag<CodeGenOpts<"PreserveVec3Type">>,
- ImpliedByAnyOf<[hlsl.KeyPath]>;
def fwchar_type_EQ : Joined<["-"], "fwchar-type=">,
HelpText<"Select underlying type for wchar_t">,
Values<"char,short,int">,
diff --git a/clang/lib/Basic/LangOptions.cpp b/clang/lib/Basic/LangOptions.cpp
index 94caf6a3897bc1..e3037ec819add2 100644
--- a/clang/lib/Basic/LangOptions.cpp
+++ b/clang/lib/Basic/LangOptions.cpp
@@ -208,6 +208,8 @@ void LangOptions::setLangDefaults(LangOptions &Opts, Language Lang,
// OpenCL and HLSL have half keyword
Opts.Half = Opts.OpenCL || Opts.HLSL;
+
+ Opts.PreserveVec3Type = Opts.HLSL;
}
FPOptions FPOptions::defaultWithoutTrailingStorage(const LangOptions &LO) {
diff --git a/clang/lib/CodeGen/ABIInfo.cpp b/clang/lib/CodeGen/ABIInfo.cpp
index 8e76cf15b642c6..3d6c793d2b6759 100644
--- a/clang/lib/CodeGen/ABIInfo.cpp
+++ b/clang/lib/CodeGen/ABIInfo.cpp
@@ -236,6 +236,14 @@ void ABIInfo::appendAttributeMangling(StringRef AttrStr,
}
}
+llvm::FixedVectorType *
+ABIInfo::getOptimalVectorType(llvm::FixedVectorType *T,
+ const LangOptions &Opt) const {
+ if (T->getNumElements() == 3 && !Opt.PreserveVec3Type)
+ return llvm::FixedVectorType::get(T->getElementType(), 4);
+ return T;
+}
+
// Pin the vtable to this file.
SwiftABIInfo::~SwiftABIInfo() = default;
diff --git a/clang/lib/CodeGen/ABIInfo.h b/clang/lib/CodeGen/ABIInfo.h
index b8a8de57e5b971..13842fe43eb65f 100644
--- a/clang/lib/CodeGen/ABIInfo.h
+++ b/clang/lib/CodeGen/ABIInfo.h
@@ -20,6 +20,7 @@ class Value;
class LLVMContext;
class DataLayout;
class Type;
+class FixedVectorType;
} // namespace llvm
namespace clang {
@@ -123,6 +124,12 @@ class ABIInfo {
raw_ostream &Out) const;
virtual void appendAttributeMangling(StringRef AttrStr,
raw_ostream &Out) const;
+
+ /// Returns the optimal vector type based on the given vector type. For
+ /// example, on certain targets, a vector with 3 elements might be promoted to
+ /// one with 4 elements to improve performance.
+ virtual llvm::FixedVectorType *
+ getOptimalVectorType(llvm::FixedVectorType *T, const LangOptions &Opt) const;
};
/// Target specific hooks for defining how a type should be passed or returned
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index ba1cba291553b0..c4a816e41f28c5 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -1998,20 +1998,18 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile,
return EmitFromMemory(V, Ty);
}
- // Handle vectors of size 3 like size 4 for better performance.
- const llvm::Type *EltTy = Addr.getElementType();
- const auto *VTy = cast<llvm::FixedVectorType>(EltTy);
-
- if (!CGM.getCodeGenOpts().PreserveVec3Type && VTy->getNumElements() == 3) {
-
- llvm::VectorType *vec4Ty =
- llvm::FixedVectorType::get(VTy->getElementType(), 4);
- Address Cast = Addr.withElementType(vec4Ty);
- // Now load value.
- llvm::Value *V = Builder.CreateLoad(Cast, Volatile, "loadVec4");
-
- // Shuffle vector to get vec3.
- V = Builder.CreateShuffleVector(V, ArrayRef<int>{0, 1, 2}, "extractVec");
+ // Handles vectors of sizes that are likely to be expanded to a larger size
+ // to optimize performance.
+ auto *VTy = cast<llvm::FixedVectorType>(Addr.getElementType());
+ auto *NewVecTy = CGM.getABIInfo().getOptimalVectorType(VTy, getLangOpts());
+
+ if (VTy != NewVecTy) {
+ Address Cast = Addr.withElementType(NewVecTy);
+ llvm::Value *V = Builder.CreateLoad(Cast, Volatile, "loadVecN");
+ unsigned OldNumElements = VTy->getNumElements();
+ SmallVector<int, 4> Mask(OldNumElements);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ V = Builder.CreateShuffleVector(V, Mask, "extractVec");
return EmitFromMemory(V, Ty);
}
}
@@ -2141,21 +2139,21 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr,
Addr = Addr.withPointer(Builder.CreateThreadLocalAddress(GV),
NotKnownNonNull);
+ // Handles vectors of sizes that are likely to be expanded to a larger size
+ // to optimize performance.
llvm::Type *SrcTy = Value->getType();
if (const auto *ClangVecTy = Ty->getAs<VectorType>()) {
- auto *VecTy = dyn_cast<llvm::FixedVectorType>(SrcTy);
- if (!CGM.getCodeGenOpts().PreserveVec3Type) {
- // Handle vec3 special.
- if (VecTy && !ClangVecTy->isExtVectorBoolType() &&
- cast<llvm::FixedVectorType>(VecTy)->getNumElements() == 3) {
- // Our source is a vec3, do a shuffle vector to make it a vec4.
- Value = Builder.CreateShuffleVector(Value, ArrayRef<int>{0, 1, 2, -1},
- "extractVec");
- SrcTy = llvm::FixedVectorType::get(VecTy->getElementType(), 4);
+ if (auto *VecTy = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
+ auto *NewVecTy =
+ CGM.getABIInfo().getOptimalVectorType(VecTy, getLangOpts());
+ if (!ClangVecTy->isExtVectorBoolType() && VecTy != NewVecTy) {
+ SmallVector<int, 4> Mask(NewVecTy->getNumElements(), -1);
+ std::iota(Mask.begin(), Mask.begin() + VecTy->getNumElements(), 0);
+ Value = Builder.CreateShuffleVector(Value, Mask, "extractVec");
+ SrcTy = NewVecTy;
}
- if (Addr.getElementType() != SrcTy) {
+ if (Addr.getElementType() != SrcTy)
Addr = Addr.withElementType(SrcTy);
- }
}
}
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 56ad0503a11ab2..52f0881dbdd8d1 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -52,6 +52,12 @@ class AMDGPUABIInfo final : public DefaultABIInfo {
void computeInfo(CGFunctionInfo &FI) const override;
RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
AggValueSlot Slot) const override;
+
+ llvm::FixedVectorType *
+ getOptimalVectorType(llvm::FixedVectorType *T,
+ const LangOptions &) const override {
+ return T;
+ }
};
bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
diff --git a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
index ffbce9ff8d6f4c..7dc3b6bd598221 100644
--- a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
+++ b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
@@ -15,14 +15,14 @@ using i512x3x3 = _BitInt(512) __attribute__((matrix_type(3, 3)));
// CHECK-NEXT: [[A:%.*]] = alloca <3 x i8>, align 4
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i8>, align 4
// CHECK-NEXT: store i32 [[A_COERCE]], ptr [[A]], align 4
-// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i8>, ptr [[A]], align 4
-// CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i8> [[LOADVEC4]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i8>, ptr [[A]], align 4
+// CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i8> [[LOADVECN]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4
-// CHECK-NEXT: [[LOADVEC42:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
-// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVEC42]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
-// CHECK-NEXT: [[LOADVEC44:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
-// CHECK-NEXT: [[EXTRACTVEC5:%.*]] = shufflevector <4 x i8> [[LOADVEC44]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVECN2]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT: [[LOADVECN4:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[EXTRACTVEC5:%.*]] = shufflevector <4 x i8> [[LOADVECN4]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[ADD:%.*]] = add <3 x i8> [[EXTRACTVEC3]], [[EXTRACTVEC5]]
// CHECK-NEXT: store <3 x i8> [[ADD]], ptr [[RETVAL]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4
@@ -38,10 +38,10 @@ i8x3 v1(i8x3 a) {
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i32>, align 16
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[A]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i32> [[EXTRACTVEC]], ptr [[A_ADDR]], align 16
-// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
-// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVEC4]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-// CHECK-NEXT: [[LOADVEC42:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
-// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i32> [[LOADVEC42]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVECN]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i32> [[LOADVECN2]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[ADD:%.*]] = add <3 x i32> [[EXTRACTVEC1]], [[EXTRACTVEC3]]
// CHECK-NEXT: ret <3 x i32> [[ADD]]
//
@@ -53,14 +53,14 @@ i32x3 v2(i32x3 a) {
// CHECK-SAME: ptr noundef byval(<3 x i512>) align 256 [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i512>, align 256
-// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i512>, ptr [[TMP0]], align 256
-// CHECK-NEXT: [[A:%.*]] = shufflevector <4 x i512> [[LOADVEC4]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i512>, ptr [[TMP0]], align 256
+// CHECK-NEXT: [[A:%.*]] = shufflevector <4 x i512> [[LOADVECN]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i512> [[A]], <3 x i512> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i512> [[EXTRACTVEC]], ptr [[A_ADDR]], align 256
-// CHECK-NEXT: [[LOADVEC41:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
-// CHECK-NEXT: [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVEC41]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
-// CHECK-NEXT: [[LOADVEC43:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
-// CHECK-NEXT: [[EXTRACTVEC4:%.*]] = shufflevector <4 x i512> [[LOADVEC43]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT: [[LOADVECN1:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
+// CHECK-NEXT: [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVECN1]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT: [[LOADVECN3:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
+// CHECK-NEXT: [[EXTRACTVEC4:%.*]] = shufflevector <4 x i512> [[LOADVECN3]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[ADD:%.*]] = add <3 x i512> [[EXTRACTVEC2]], [[EXTRACTVEC4]]
// CHECK-NEXT: ret <3 x i512> [[ADD]]
//
diff --git a/clang/test/CodeGenOpenCL/amdgpu-alignment.cl b/clang/test/CodeGenOpenCL/amdgpu-alignment.cl
index 8f57713fe1f041..52a5d01588875f 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-alignment.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-alignment.cl
@@ -94,43 +94,43 @@ typedef double __attribute__((ext_vector_type(16))) double16;
// CHECK-LABEL: @local_memory_alignment_global(
// CHECK: store volatile i8 0, ptr addrspace(3) @local_memory_alignment_global.lds_i8, align 1
// CHECK: store volatile <2 x i8> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2i8, align 2
-// CHECK: store volatile <4 x i8> <i8 0, i8 0, i8 0, i8 undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3i8, align 4
+// CHECK: store volatile <3 x i8> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3i8, align 4
// CHECK: store volatile <4 x i8> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4i8, align 4
// CHECK: store volatile <8 x i8> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8i8, align 8
// CHECK: store volatile <16 x i8> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i8, align 16
// CHECK: store volatile i16 0, ptr addrspace(3) @local_memory_alignment_global.lds_i16, align 2
// CHECK: store volatile <2 x i16> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2i16, align 4
-// CHECK: store volatile <4 x i16> <i16 0, i16 0, i16 0, i16 undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3i16, align 8
+// CHECK: store volatile <3 x i16> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3i16, align 8
// CHECK: store volatile <4 x i16> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4i16, align 8
// CHECK: store volatile <8 x i16> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8i16, align 16
// CHECK: store volatile <16 x i16> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i16, align 32
// CHECK: store volatile i32 0, ptr addrspace(3) @local_memory_alignment_global.lds_i32, align 4
// CHECK: store volatile <2 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2i32, align 8
-// CHECK: store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3i32, align 16
+// CHECK: store volatile <3 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3i32, align 16
// CHECK: store volatile <4 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4i32, align 16
// CHECK: store volatile <8 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8i32, align 32
// CHECK: store volatile <16 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i32, align 64
// CHECK: store volatile i64 0, ptr addrspace(3) @local_memory_alignment_global.lds_i64, align 8
// CHECK: store volatile <2 x i64> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2i64, align 16
-// CHECK: store volatile <4 x i64> <i64 0, i64 0, i64 0, i64 undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3i64, align 32
+// CHECK: store volatile <3 x i64> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3i64, align 32
// CHECK: store volatile <4 x i64> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4i64, align 32
// CHECK: store volatile <8 x i64> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8i64, align 64
// CHECK: store volatile <16 x i64> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i64, align 128
// CHECK: store volatile half 0xH0000, ptr addrspace(3) @local_memory_alignment_global.lds_f16, align 2
// CHECK: store volatile <2 x half> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2f16, align 4
-// CHECK: store volatile <4 x half> <half 0xH0000, half 0xH0000, half 0xH0000, half undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3f16, align 8
+// CHECK: store volatile <3 x half> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3f16, align 8
// CHECK: store volatile <4 x half> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4f16, align 8
// CHECK: store volatile <8 x half> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8f16, align 16
// CHECK: store volatile <16 x half> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16f16, align 32
// CHECK: store volatile float 0.000000e+00, ptr addrspace(3) @local_memory_alignment_global.lds_f32, align 4
// CHECK: store volatile <2 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2f32, align 8
-// CHECK: store volatile <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3f32, align 16
+// CHECK: store volatile <3 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3f32, align 16
// CHECK: store volatile <4 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4f32, align 16
// CHECK: store volatile <8 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8f32, align 32
// CHECK: store volatile <16 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16f32, align 64
// CHECK: store volatile double 0.000000e+00, ptr addrspace(3) @local_memory_alignment_global.lds_f64, align 8
// CHECK: store volatile <2 x double> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2f64, align 16
-// CHECK: store volatile <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3f64, align 32
+// CHECK: store volatile <3 x double> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3f64, align 32
// CHECK: store volatile <4 x double> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4f64, align 32
// CHECK: store volatile <8 x double> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8f64, align 64
// CHECK: store volatile <16 x double> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16f64, align 128
@@ -381,43 +381,43 @@ kernel void local_memory_alignment_arg(
// CHECK: store volatile i8 0, ptr addrspace(5) %arraydecay, align 1
// CHECK: store volatile <2 x i8> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 2
-// CHECK: store volatile <4 x i8> <i8 0, i8 0, i8 0, i8 undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
+// CHECK: store volatile <3 x i8> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
// CHECK: store volatile <4 x i8> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
// CHECK: store volatile <8 x i8> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
// CHECK: store volatile <16 x i8> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
// CHECK: store volatile i16 0, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 2
// CHECK: store volatile <2 x i16> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
-// CHECK: store volatile <4 x i16> <i16 0, i16 0, i16 0, i16 undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
+// CHECK: store volatile <3 x i16> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
// CHECK: store volatile <4 x i16> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
// CHECK: store volatile <8 x i16> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
// CHECK: store volatile <16 x i16> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
// CHECK: store volatile i32 0, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
// CHECK: store volatile <2 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
-// CHECK: store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
+// CHECK: store volatile <3 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
// CHECK: store volatile <4 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
// CHECK: store volatile <8 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
// CHECK: store volatile <16 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64
// CHECK: store volatile i64 0, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
// CHECK: store volatile <2 x i64> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
-// CHECK: store volatile <4 x i64> <i64 0, i64 0, i64 0, i64 undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
+// CHECK: store volatile <3 x i64> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
// CHECK: store volatile <4 x i64> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
// CHECK: store volatile <8 x i64> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64
// CHECK: store volatile <16 x i64> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 128
// CHECK: store volatile half 0xH0000, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 2
// CHECK: store volatile <2 x half> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
-// CHECK: store volatile <4 x half> <half 0xH0000, half 0xH0000, half 0xH0000, half undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
+// CHECK: store volatile <3 x half> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
// CHECK: store volatile <4 x half> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
// CHECK: store volatile <8 x half> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
// CHECK: store volatile <16 x half> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
// CHECK: store volatile float 0.000000e+00, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
// CHECK: store volatile <2 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
-// CHECK: store volatile <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
+// CHECK: store volatile <3 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
// CHECK: store volatile <4 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
// CHECK: store volatile <8 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
// CHECK: store volatile <16 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64
// CHECK: store volatile double 0.000000e+00, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
// CHECK: store volatile <2 x double> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
-// CHECK: store volatile <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
+// CHECK: store volatile <3 x double> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
// CHECK: store volatile <4 x double> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
// CHECK: store volatile <8 x double> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64
// CHECK: store volatile <16 x double> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 128
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
deleted file mode 100644
index 19f0cdff60a9d6..00000000000000
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ /dev/null
@@ -1,77 +0,0 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown -fpreserve-vec3-type | FileCheck %s
-
-typedef char char3 __attribute__((ext_vector_type(3)));
-typedef char char8 __attribute__((ext_vector_type(8)));
-typedef short short3 __attribute__((ext_vector_type(3)));
-typedef double double2 __attribute__((ext_vector_type(2)));
-typedef float float3 __attribute__((ext_vector_type(3)));
-typedef float float4 __attribute__((ext_vector_type(4)));
-
-void kernel foo(global float3 *a, global float3 *b) {
- // CHECK-LABEL: spir_kernel void @foo
- // CHECK: %[[LOAD_A:.*]] = load <3 x float>, ptr addrspace(1) %a
- // CHECK: store <3 x float> %[[LOAD_A]], ptr addrspace(1) %b
- *b = *a;
-}
-
-void kernel float4_to_float3(global float3 *a, global float4 *b) {
- // CHECK-LABEL: spir_kernel void @float4_to_float3
- // CHECK: %[[LOAD_A:.*]] = load <4 x float>, ptr addrspace(1) %b, align 16
- // CHECK: %[[ASTYPE:.*]] = shufflevector <4 x float> %[[LOAD_A]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
- // CHECK: store <3 x float> %[[ASTYPE]], ptr addrspace(1) %a, align 16
- *a = __builtin_astype(*b, float3);
-}
-
-void kernel float3_to_float4(global float3 *a, global float4 *b) {
- // CHECK-LABEL: spir_kernel void @float3_to_float4
- // CHECK: %[[LOAD_A:.*]] = load <3 x float>, ptr addrspace(1) %a, align 16
- // CHECK: %[[ASTYPE:.*]] = shufflevector <3 x float> %[[LOAD_A]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
- // CHECK: store <4 x float> %[[ASTYPE]], ptr addrspace(1) %b, align 16
- *b = __builtin_astype(*a, float4);
-}
-
-void kernel float3_to_double2(global float3 *a, global double2 *b) {
- // CHECK-LABEL: spir_kernel void @float3_to_double2
- // CHECK: %[[LOAD_A:.*]] = load <3 x float>, ptr addrspace(1) %a, align 16
- // CHECK: %[[ASTYPE:.*]] = shufflevector <3 x float> %[[LOAD_A]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
- // CHECK: store <4 x float> %[[ASTYPE]], ptr addrspace(1) %b, align 16
- *b = __builtin_astype(*a, double2);
-}
-
-void kernel char8_to_short3(global short3 *a, global char8 *b) {
- // CHECK-LABEL: spir_kernel void @char8_to_short3
- // CHECK: %[[LOAD_B:.*]] = load <4 x i16>, ptr addrspace(1) %b
- // CHECK: %[[ASTYPE:.*]] = shufflevector <4 x i16> %[[LOAD_B]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
- // CHECK: store <3 x i16> %[[ASTYPE]], ptr addrspace(1) %a, align 8
- *a = __builtin_astype(*b, short3);
-}
-
-void from_char3(char3 a, global int *out) {
- // CHECK-LABEL: void @from_char3
- // CHECK: %[[ASTYPE:.*]] = shufflevector <3 x i8> %a, <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
- // CHECK: store <4 x i8> %[[ASTYPE]], ptr addrspace(1) %out
- *out = __builtin_astype(a, int);
-}
-
-void from_short3(short3 a, global long *out) {
- // CHECK-LABEL: void @from_short3
- // CHECK: %[[ASTYPE:.*]] = shufflevector <3 x i16> %a, <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
- // CHECK: store <4 x i16> %[[ASTYPE]], ptr addrspace(1) %out
- *out = __builtin_astype(a, long);
-}
-
-void scalar_to_char3(int a, global char3 *out) {
- // CHECK-LABEL: void @scalar_to_char3
- // CHECK: %[[IN_BC:.*]] = bitcast i32 %a to <4 x i8>
- // CHECK: %[[ASTYPE:.*]] = shufflevector <4 x i8> %[[IN_BC]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
- // CHECK: store <3 x i8> %[[ASTYPE]], ptr addrspace(1) %out
- *out = __builtin_astype(a, char3);
-}
-
-void scalar_to_short3(long a, global short3 *out) {
- // CHECK-LABEL: void @scalar_to_short3
- // CHECK: %[[IN_BC:.*]] = bitcast i64 %a to <4 x i16>
- // CHECK: %[[ASTYPE:.*]] = shufflevector <4 x i16> %[[IN_BC]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
- // CHECK: store <3 x i16> %[[ASTYPE]], ptr addrspace(1) %out
- *out = __builtin_astype(a, short3);
-}
More information about the cfe-commits
mailing list