[llvm] [AMDGPU] Handle vector types for reqd_work_group_size constant folding. (PR #179551)
Marcos Maronas via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 4 13:50:14 PST 2026
https://github.com/maarquitos14 updated https://github.com/llvm/llvm-project/pull/179551
From 4a17438af66e7f9f4eaaff64a3a13c4a34edc5dc Mon Sep 17 00:00:00 2001
From: Marcos Maronas <mmaronas at amd.com>
Date: Tue, 3 Feb 2026 15:23:59 -0600
Subject: [PATCH 1/5] [AMDGPU] Handle vector types for reqd_work_group_size
constant folding.
---
.../AMDGPU/AMDGPULowerKernelAttributes.cpp | 23 +++-
...gpu-lower-kernel-attributes-vector-load.ll | 124 ++++++++++++++++++
2 files changed, 144 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-lower-kernel-attributes-vector-load.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index fbfb71059b6b1..d8c8972be4235 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -378,10 +378,27 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
if (!GroupSize)
continue;
+ Type *GroupSizeType = GroupSize->getType();
ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
- GroupSize->replaceAllUsesWith(
- ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
- MadeChange = true;
+ Constant *Replacement = nullptr;
+
+ // Handle scalar integer types
+ if (GroupSizeType->isIntegerTy(16)) {
+ Replacement = ConstantFoldIntegerCast(KnownSize, GroupSizeType, false, DL);
+ }
+ else if (auto *VecTy = dyn_cast<VectorType>(GroupSizeType)) {
+ if (VecTy->getElementCount().isScalar()) {
+ Constant *CastElt = ConstantFoldIntegerCast(
+ KnownSize, VecTy->getElementType(), false, DL);
+ Replacement = ConstantDataVector::getSplat(
+ VecTy->getElementCount().getKnownMinValue(), CastElt);
+ }
+ }
+
+ if (Replacement) {
+ GroupSize->replaceAllUsesWith(Replacement);
+ MadeChange = true;
+ }
}
return MadeChange;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-kernel-attributes-vector-load.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-kernel-attributes-vector-load.ll
new file mode 100644
index 0000000000000..2a8059e43d5c0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-kernel-attributes-vector-load.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes %s | FileCheck %s
+
+; Test that we correctly handle vector load types when folding with reqd_work_group_size.
+; This tests the fix for a crash where <1 x i16> vector types would cause an assertion
+; failure in ConstantExpr::getCast because we tried to cast a scalar constant to a vector type.
+
+; Single-element vector <1 x i16> should be folded.
+; CHECK-LABEL: @load_group_size_x_v1i16(
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GEP_GROUP_SIZE_X:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12
+; CHECK-NEXT: [[GROUP_SIZE_X:%.*]] = load <1 x i16>, ptr addrspace(4) [[GEP_GROUP_SIZE_X]], align 2
+; CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds <1 x i16>, ptr addrspace(1) [[OUT:%.*]], i64 0
+; CHECK-NEXT: store <1 x i16> splat (i16 8), ptr addrspace(1) [[OUT_PTR]], align 2
+; CHECK-NEXT: ret void
+;
+define amdgpu_kernel void @load_group_size_x_v1i16(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 12
+ %group.size.x = load <1 x i16>, ptr addrspace(4) %gep.group.size.x, align 2
+ %out.ptr = getelementptr inbounds <1 x i16>, ptr addrspace(1) %out, i64 0
+ store <1 x i16> %group.size.x, ptr addrspace(1) %out.ptr, align 2
+ ret void
+}
+
+; Multi-element vector <2 x i16> should NOT be folded (semantically incorrect - can't cast one int to two).
+; CHECK-LABEL: @load_group_size_x_v2i16(
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GEP_GROUP_SIZE_X:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12
+; CHECK-NEXT: [[GROUP_SIZE_X:%.*]] = load <2 x i16>, ptr addrspace(4) [[GEP_GROUP_SIZE_X]], align 2
+; CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(1) [[OUT:%.*]], i64 0
+; CHECK-NEXT: store <2 x i16> [[GROUP_SIZE_X]], ptr addrspace(1) [[OUT_PTR]], align 4
+; CHECK-NEXT: ret void
+;
+define amdgpu_kernel void @load_group_size_x_v2i16(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 12
+ %group.size.x = load <2 x i16>, ptr addrspace(4) %gep.group.size.x, align 2
+ %out.ptr = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 0
+ store <2 x i16> %group.size.x, ptr addrspace(1) %out.ptr, align 4
+ ret void
+}
+
+; Single-element vector <1 x i32> load from i16 position - should NOT be folded (wrong size).
+; CHECK-LABEL: @load_group_size_y_v1i32(
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GEP_GROUP_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 14
+; CHECK-NEXT: [[GROUP_SIZE_Y:%.*]] = load <1 x i32>, ptr addrspace(4) [[GEP_GROUP_SIZE_Y]], align 4
+; CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds <1 x i32>, ptr addrspace(1) [[OUT:%.*]], i64 0
+; CHECK-NEXT: store <1 x i32> [[GROUP_SIZE_Y]], ptr addrspace(1) [[OUT_PTR]], align 4
+; CHECK-NEXT: ret void
+;
+define amdgpu_kernel void @load_group_size_y_v1i32(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.group.size.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 14
+ %group.size.y = load <1 x i32>, ptr addrspace(4) %gep.group.size.y, align 4
+ %out.ptr = getelementptr inbounds <1 x i32>, ptr addrspace(1) %out, i64 0
+ store <1 x i32> %group.size.y, ptr addrspace(1) %out.ptr, align 4
+ ret void
+}
+
+; Multi-element vector <2 x i32> should NOT be folded.
+; CHECK-LABEL: @load_group_size_y_v2i32(
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GEP_GROUP_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 14
+; CHECK-NEXT: [[GROUP_SIZE_Y:%.*]] = load <2 x i32>, ptr addrspace(4) [[GEP_GROUP_SIZE_Y]], align 4
+; CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(1) [[OUT:%.*]], i64 0
+; CHECK-NEXT: store <2 x i32> [[GROUP_SIZE_Y]], ptr addrspace(1) [[OUT_PTR]], align 8
+; CHECK-NEXT: ret void
+;
+define amdgpu_kernel void @load_group_size_y_v2i32(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.group.size.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 14
+ %group.size.y = load <2 x i32>, ptr addrspace(4) %gep.group.size.y, align 4
+ %out.ptr = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i64 0
+ store <2 x i32> %group.size.y, ptr addrspace(1) %out.ptr, align 8
+ ret void
+}
+
+; Single-element vector <1 x i64> load from i16 position - should NOT be folded (wrong size).
+; CHECK-LABEL: @load_group_size_z_v1i64(
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GEP_GROUP_SIZE_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16
+; CHECK-NEXT: [[GROUP_SIZE_Z:%.*]] = load <1 x i64>, ptr addrspace(4) [[GEP_GROUP_SIZE_Z]], align 8
+; CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds <1 x i64>, ptr addrspace(1) [[OUT:%.*]], i64 0
+; CHECK-NEXT: store <1 x i64> [[GROUP_SIZE_Z]], ptr addrspace(1) [[OUT_PTR]], align 8
+; CHECK-NEXT: ret void
+;
+define amdgpu_kernel void @load_group_size_z_v1i64(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.group.size.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 16
+ %group.size.z = load <1 x i64>, ptr addrspace(4) %gep.group.size.z, align 8
+ %out.ptr = getelementptr inbounds <1 x i64>, ptr addrspace(1) %out, i64 0
+ store <1 x i64> %group.size.z, ptr addrspace(1) %out.ptr, align 8
+ ret void
+}
+
+; Multi-element vector <2 x i64> should NOT be folded.
+; CHECK-LABEL: @load_group_size_z_v2i64(
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GEP_GROUP_SIZE_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16
+; CHECK-NEXT: [[GROUP_SIZE_Z:%.*]] = load <2 x i64>, ptr addrspace(4) [[GEP_GROUP_SIZE_Z]], align 8
+; CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds <2 x i64>, ptr addrspace(1) [[OUT:%.*]], i64 0
+; CHECK-NEXT: store <2 x i64> [[GROUP_SIZE_Z]], ptr addrspace(1) [[OUT_PTR]], align 16
+; CHECK-NEXT: ret void
+;
+define amdgpu_kernel void @load_group_size_z_v2i64(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.group.size.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 16
+ %group.size.z = load <2 x i64>, ptr addrspace(4) %gep.group.size.z, align 8
+ %out.ptr = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i64 0
+ store <2 x i64> %group.size.z, ptr addrspace(1) %out.ptr, align 16
+ ret void
+}
+
+declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1
+
+attributes #0 = { nounwind "uniform-work-group-size"="true" }
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.module.flags = !{!1}
+
+!0 = !{i32 8, i32 16, i32 2}
+!1 = !{i32 1, !"amdgpu_code_object_version", i32 500}
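
For reference, a minimal sketch (not part of the patch) of the pre-existing fold that the new <1 x i16> test exercises. KnownSize is the i32 constant extracted from !reqd_work_group_size and GroupSize is the load feeding the use, as in processUse; when GroupSize has a vector type, the direct integer cast below ends up in ConstantExpr::getCast with a scalar source and a vector destination, which is the assertion described in the test comment above.

    // Pre-patch fold (sketch): cast the metadata constant straight to the
    // load's type.  This is fine for the usual i16 load, but asserts for a
    // vector-typed load such as <1 x i16>.
    Constant *Folded = ConstantFoldIntegerCast(KnownSize, GroupSize->getType(),
                                               /*IsSigned=*/false, DL);
    GroupSize->replaceAllUsesWith(Folded);
    MadeChange = true;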
From 9934f00a82517cb4bf67cec093ce6c08a8830b03 Mon Sep 17 00:00:00 2001
From: Marcos Maronas <mmaronas at amd.com>
Date: Wed, 4 Feb 2026 07:51:15 -0600
Subject: [PATCH 2/5] Simplification.
---
.../AMDGPU/AMDGPULowerKernelAttributes.cpp | 18 +++++++-----------
1 file changed, 7 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index d8c8972be4235..be5ef8cd1e05f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -382,17 +382,13 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
Constant *Replacement = nullptr;
- // Handle scalar integer types
- if (GroupSizeType->isIntegerTy(16)) {
- Replacement = ConstantFoldIntegerCast(KnownSize, GroupSizeType, false, DL);
- }
- else if (auto *VecTy = dyn_cast<VectorType>(GroupSizeType)) {
- if (VecTy->getElementCount().isScalar()) {
- Constant *CastElt = ConstantFoldIntegerCast(
- KnownSize, VecTy->getElementType(), false, DL);
- Replacement = ConstantDataVector::getSplat(
- VecTy->getElementCount().getKnownMinValue(), CastElt);
- }
+ if (auto *VecTy = dyn_cast<VectorType>(GroupSizeType)) {
+ Constant *CastElt = ConstantFoldIntegerCast(
+ KnownSize, VecTy->getElementType(), false, DL);
+ Replacement = ConstantVector::getSplat(VecTy->getElementCount(), CastElt);
+ } else {
+ Replacement =
+ ConstantFoldIntegerCast(KnownSize, GroupSizeType, false, DL);
}
if (Replacement) {
From a3243cb17a2a316c5ed5ee88b6b2534526820e4b Mon Sep 17 00:00:00 2001
From: Marcos Maronas <mmaronas at amd.com>
Date: Wed, 4 Feb 2026 09:17:29 -0600
Subject: [PATCH 3/5] Add new tests and fix for them to pass.
---
.../AMDGPU/AMDGPULowerKernelAttributes.cpp | 8 +++--
...gpu-lower-kernel-attributes-vector-load.ll | 36 +++++++++++++++++++
2 files changed, 41 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index be5ef8cd1e05f..99b23936db669 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -383,9 +383,11 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
Constant *Replacement = nullptr;
if (auto *VecTy = dyn_cast<VectorType>(GroupSizeType)) {
- Constant *CastElt = ConstantFoldIntegerCast(
- KnownSize, VecTy->getElementType(), false, DL);
- Replacement = ConstantVector::getSplat(VecTy->getElementCount(), CastElt);
+ if (VecTy->getElementCount().isScalar()) {
+ Constant *CastElt = ConstantFoldIntegerCast(
+ KnownSize, VecTy->getElementType(), false, DL);
+ Replacement = ConstantVector::getSplat(VecTy->getElementCount(), CastElt);
+ }
} else {
Replacement =
ConstantFoldIntegerCast(KnownSize, GroupSizeType, false, DL);
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-kernel-attributes-vector-load.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-kernel-attributes-vector-load.ll
index 2a8059e43d5c0..2339b03b34f46 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-kernel-attributes-vector-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-kernel-attributes-vector-load.ll
@@ -113,6 +113,42 @@ define amdgpu_kernel void @load_group_size_z_v2i64(ptr addrspace(1) %out) #0 !re
ret void
}
+; Multi-element vector <2 x i8> (16 bits total) should NOT be folded.
+; CHECK-LABEL: @load_group_size_x_v2i8(
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GEP_GROUP_SIZE_X:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12
+; CHECK-NEXT: [[GROUP_SIZE_X:%.*]] = load <2 x i8>, ptr addrspace(4) [[GEP_GROUP_SIZE_X]], align 2
+; CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds <2 x i8>, ptr addrspace(1) [[OUT:%.*]], i64 0
+; CHECK-NEXT: store <2 x i8> [[GROUP_SIZE_X]], ptr addrspace(1) [[OUT_PTR]], align 2
+; CHECK-NEXT: ret void
+;
+define amdgpu_kernel void @load_group_size_x_v2i8(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 12
+ %group.size.x = load <2 x i8>, ptr addrspace(4) %gep.group.size.x, align 2
+ %out.ptr = getelementptr inbounds <2 x i8>, ptr addrspace(1) %out, i64 0
+ store <2 x i8> %group.size.x, ptr addrspace(1) %out.ptr, align 2
+ ret void
+}
+
+; Multi-element vector <4 x i4> (16 bits total) should NOT be folded.
+; CHECK-LABEL: @load_group_size_y_v4i4(
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GEP_GROUP_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 14
+; CHECK-NEXT: [[GROUP_SIZE_Y:%.*]] = load <4 x i4>, ptr addrspace(4) [[GEP_GROUP_SIZE_Y]], align 2
+; CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds <4 x i4>, ptr addrspace(1) [[OUT:%.*]], i64 0
+; CHECK-NEXT: store <4 x i4> [[GROUP_SIZE_Y]], ptr addrspace(1) [[OUT_PTR]], align 2
+; CHECK-NEXT: ret void
+;
+define amdgpu_kernel void @load_group_size_y_v4i4(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.group.size.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 14
+ %group.size.y = load <4 x i4>, ptr addrspace(4) %gep.group.size.y, align 2
+ %out.ptr = getelementptr inbounds <4 x i4>, ptr addrspace(1) %out, i64 0
+ store <4 x i4> %group.size.y, ptr addrspace(1) %out.ptr, align 2
+ ret void
+}
+
declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1
attributes #0 = { nounwind "uniform-work-group-size"="true" }
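
Taken together, patches 2 and 3 amount to the shape below (a sketch only; the wrapper function is hypothetical, while KnownSize, GroupSizeType and DL are the values used in processUse): cast the metadata constant to the vector's element type and splat it, but only when the vector has a single lane, since one reqd_work_group_size entry cannot populate several lanes.

    // Sketch of the folding logic after patches 2 and 3.
    static Constant *buildReplacement(ConstantInt *KnownSize,
                                      Type *GroupSizeType,
                                      const DataLayout &DL) {
      if (auto *VecTy = dyn_cast<VectorType>(GroupSizeType)) {
        // Only a single-element vector can be filled from one metadata entry.
        if (!VecTy->getElementCount().isScalar())
          return nullptr;
        Constant *CastElt = ConstantFoldIntegerCast(
            KnownSize, VecTy->getElementType(), /*IsSigned=*/false, DL);
        return ConstantVector::getSplat(VecTy->getElementCount(), CastElt);
      }
      // Scalar loads keep the original behaviour.
      return ConstantFoldIntegerCast(KnownSize, GroupSizeType,
                                     /*IsSigned=*/false, DL);
    }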
From 0e3fb1ca3e4268c7a14169bd3f8f7f041998d02c Mon Sep 17 00:00:00 2001
From: Marcos Maronas <mmaronas at amd.com>
Date: Wed, 4 Feb 2026 09:20:50 -0600
Subject: [PATCH 4/5] Fix clang-format issue.
---
llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 99b23936db669..3f1b3c8322b50 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -386,7 +386,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
if (VecTy->getElementCount().isScalar()) {
Constant *CastElt = ConstantFoldIntegerCast(
KnownSize, VecTy->getElementType(), false, DL);
- Replacement = ConstantVector::getSplat(VecTy->getElementCount(), CastElt);
+ Replacement =
+ ConstantVector::getSplat(VecTy->getElementCount(), CastElt);
}
} else {
Replacement =
From 3f6bec0b72b20f6aaa5de46e8de7f5aa9cd32d4d Mon Sep 17 00:00:00 2001
From: Marcos Maronas <mmaronas at amd.com>
Date: Wed, 4 Feb 2026 15:49:56 -0600
Subject: [PATCH 5/5] Sequence of cast+bitcast.
---
.../AMDGPU/AMDGPULowerKernelAttributes.cpp | 16 ++++++----------
1 file changed, 6 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 3f1b3c8322b50..7a8adc69e8f88 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -380,18 +380,14 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
Type *GroupSizeType = GroupSize->getType();
ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
- Constant *Replacement = nullptr;
+ Type *ScalarType = GroupSizeType->getScalarType();
+ Constant *Replacement =
+ ConstantFoldIntegerCast(KnownSize, ScalarType, false, DL);
if (auto *VecTy = dyn_cast<VectorType>(GroupSizeType)) {
- if (VecTy->getElementCount().isScalar()) {
- Constant *CastElt = ConstantFoldIntegerCast(
- KnownSize, VecTy->getElementType(), false, DL);
- Replacement =
- ConstantVector::getSplat(VecTy->getElementCount(), CastElt);
- }
- } else {
- Replacement =
- ConstantFoldIntegerCast(KnownSize, GroupSizeType, false, DL);
+ Replacement = VecTy->getElementCount().isScalar()
+ ? ConstantExpr::getBitCast(Replacement, GroupSizeType)
+ : nullptr;
}
if (Replacement) {
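
Read without the diff markers, the final revision folds to the scalar type first and then bitcasts single-element vectors, producing the same splat constant as the earlier revisions while dropping the separate splat path. A sketch, using the names from processUse:

    // Final form (sketch): fold to the scalar type, then bitcast to the
    // vector type when the load is a single-element vector; multi-element
    // vectors are left alone.
    Type *ScalarType = GroupSizeType->getScalarType();
    Constant *Replacement =
        ConstantFoldIntegerCast(KnownSize, ScalarType, /*IsSigned=*/false, DL);
    if (auto *VecTy = dyn_cast<VectorType>(GroupSizeType))
      Replacement = VecTy->getElementCount().isScalar()
                        ? ConstantExpr::getBitCast(Replacement, GroupSizeType)
                        : nullptr;

The bitcast is valid for the <1 x i16> case because an i16 constant and a <1 x i16> vector have the same bit width.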