[llvm] [AMDGPU] Add folding ISD::SELECT from vXiY into vZi32 with X * Y = Z * 32 (PR #173328)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 20 20:09:15 PST 2026
https://github.com/Shoreshen updated https://github.com/llvm/llvm-project/pull/173328
>From f4b15af87917f8fbfa91698fbef33e4497305670 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 23 Dec 2025 10:05:31 +0800
Subject: [PATCH 1/9] Try to simplify select v32i4 case by legalizing v16i4
---
llvm/include/llvm/CodeGen/ValueTypes.td | 2 ++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
llvm/test/CodeGen/AMDGPU/select-vectors.ll | 15 +++++++++++++++
3 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index 74ea86774a8ee..3ef87dc38a85d 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -113,6 +113,8 @@ def v4096i1 : VTVec<4096, i1>; // 4096 x i1 vector value
def v128i2 : VTVec<128, i2>; // 128 x i2 vector value
def v256i2 : VTVec<256, i2>; // 256 x i2 vector value
+def v16i4 : VTVec<16, i4>; // 16 x i4 vector value
+def v32i4 : VTVec<32, i4>; // 32 x i4 vector value
def v64i4 : VTVec<64, i4>; // 64 x i4 vector value
def v128i4 : VTVec<128, i4>; // 128 x i4 vector value
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 102ca92856bae..03d4f9c09dc2a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -876,7 +876,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
{MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
- MVT::v32f16, MVT::v32bf16},
+ MVT::v32f16, MVT::v32bf16, MVT::v16i4},
Custom);
setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index e754f665c5f43..5e52b2fca32c8 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -65,6 +65,21 @@ define amdgpu_kernel void @v_select_v16i8(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
+; GCN-LABEL: {{^}}v_select_v32i4:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN-NOT: cndmask
+define amdgpu_kernel void @v_select_v32i4(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
+ %a = load <32 x i4>, ptr addrspace(1) %a.ptr, align 2
+ %b = load <32 x i4>, ptr addrspace(4) %b.ptr, align 2
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <32 x i4> %a, <32 x i4> %b
+ store <32 x i4> %select, ptr addrspace(1) %out, align 2
+ ret void
+}
+
; GCN-LABEL: {{^}}select_v4i8:
; GFX89: s_cselect_b32
; GFX89-NOT: s_cselect_b32
>From 8652612ed2be9a2a48de01d81a27d785aeb8bd12 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Thu, 25 Dec 2025 17:35:35 +0800
Subject: [PATCH 2/9] fix part of tests
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +-
llvm/test/CodeGen/X86/pr15267.ll | 64 ++-----------------------
llvm/test/TableGen/CPtrWildcard.td | 4 +-
3 files changed, 7 insertions(+), 64 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 811ffb090d751..2f1bab8411ad1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2789,7 +2789,8 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
return TypeSplitVector;
if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
- VT.getVectorElementType() != MVT::i1)
+ VT.getVectorElementType() != MVT::i1 &&
+ VT.getVectorElementType() != MVT::i4)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
diff --git a/llvm/test/CodeGen/X86/pr15267.ll b/llvm/test/CodeGen/X86/pr15267.ll
index 5083eac71dce0..7292e24df9d82 100644
--- a/llvm/test/CodeGen/X86/pr15267.ll
+++ b/llvm/test/CodeGen/X86/pr15267.ll
@@ -82,67 +82,9 @@ define <4 x i64> @test3(ptr %in) nounwind {
define <16 x i4> @test4(ptr %in) nounwind {
; CHECK-LABEL: test4:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl $4, %ecx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: andl $15, %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl $8, %ecx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl $12, %ecx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl $16, %ecx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl $20, %ecx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl $24, %ecx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl $28, %ecx
-; CHECK-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: movq %rax, %rcx
-; CHECK-NEXT: shrq $32, %rcx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: movq %rax, %rcx
-; CHECK-NEXT: shrq $36, %rcx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: movq %rax, %rcx
-; CHECK-NEXT: shrq $40, %rcx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: movq %rax, %rcx
-; CHECK-NEXT: shrq $44, %rcx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: movq %rax, %rcx
-; CHECK-NEXT: shrq $48, %rcx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: movq %rax, %rcx
-; CHECK-NEXT: shrq $52, %rcx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: movq %rax, %rcx
-; CHECK-NEXT: shrq $56, %rcx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: shrq $60, %rax
-; CHECK-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movq (%rsi), %rcx
+; CHECK-NEXT: movq %rcx, (%rdi)
; CHECK-NEXT: retq
%ret = load <16 x i4>, ptr %in, align 1
ret <16 x i4> %ret
diff --git a/llvm/test/TableGen/CPtrWildcard.td b/llvm/test/TableGen/CPtrWildcard.td
index 867d6f85bdecb..358af86e2d2ef 100644
--- a/llvm/test/TableGen/CPtrWildcard.td
+++ b/llvm/test/TableGen/CPtrWildcard.td
@@ -8,13 +8,13 @@
// CHECK-NEXT:/* 3*/ OPC_CheckChild0Integer, [[#]],
// CHECK-NEXT:/* 5*/ OPC_RecordChild1, // #0 = $src
// CHECK-NEXT:/* 6*/ OPC_Scope, 9, /*->17*/ // 2 children in Scope
-// CHECK-NEXT:/* 8*/ OPC_CheckChild1Type, /*MVT::c64*/0|128,2/*256*/,
+// CHECK-NEXT:/* 8*/ OPC_CheckChild1Type, /*MVT::c64*/2|128,2/*258*/,
// CHECK-NEXT:/* 11*/ OPC_MorphNodeTo1None, TARGET_VAL(MyTarget::C64_TO_I64),
// CHECK-NEXT: /*MVT::i64*/8, 1/*#Ops*/, 0,
// CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } [[#]]:{ *:[iPTR] }, c64:{ *:[c64] }:$src) - Complexity = 8
// CHECK-NEXT: // Dst: (C64_TO_I64:{ *:[i64] } ?:{ *:[c64] }:$src)
// CHECK-NEXT:/* 17*/ /*Scope*/ 9, /*->27*/
-// CHECK-NEXT:/* 18*/ OPC_CheckChild1Type, /*MVT::c128*/1|128,2/*257*/,
+// CHECK-NEXT:/* 18*/ OPC_CheckChild1Type, /*MVT::c128*/3|128,2/*259*/,
// CHECK-NEXT:/* 21*/ OPC_MorphNodeTo1None, TARGET_VAL(MyTarget::C128_TO_I64),
// CHECK-NEXT: /*MVT::i64*/8, 1/*#Ops*/, 0,
// CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } [[#]]:{ *:[iPTR] }, c128:{ *:[c128] }:$src) - Complexity = 8
>From 836edd720245914767488f9deb4fb8863d122c0d Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Fri, 26 Dec 2025 10:10:06 +0800
Subject: [PATCH 3/9] fix x86 test case
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 3 ++
llvm/test/CodeGen/X86/pr15267.ll | 64 +++++++++++++++++++++++--
2 files changed, 64 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2f1bab8411ad1..0edbffbfbeb3c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1041,6 +1041,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (VT.getVectorElementType() == MVT::f16 ||
VT.getVectorElementType() == MVT::bf16)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+
+ if (VT.getVectorElementType() == MVT::i4)
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
}
}
diff --git a/llvm/test/CodeGen/X86/pr15267.ll b/llvm/test/CodeGen/X86/pr15267.ll
index 7292e24df9d82..5083eac71dce0 100644
--- a/llvm/test/CodeGen/X86/pr15267.ll
+++ b/llvm/test/CodeGen/X86/pr15267.ll
@@ -82,9 +82,67 @@ define <4 x i64> @test3(ptr %in) nounwind {
define <16 x i4> @test4(ptr %in) nounwind {
; CHECK-LABEL: test4:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movq (%rsi), %rcx
-; CHECK-NEXT: movq %rcx, (%rdi)
+; CHECK-NEXT: movq (%rdi), %rax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shrl $4, %ecx
+; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: andl $15, %edx
+; CHECK-NEXT: vmovd %edx, %xmm0
+; CHECK-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shrl $8, %ecx
+; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shrl $12, %ecx
+; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shrl $16, %ecx
+; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shrl $20, %ecx
+; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shrl $24, %ecx
+; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shrl $28, %ecx
+; CHECK-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrq $32, %rcx
+; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrq $36, %rcx
+; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrq $40, %rcx
+; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrq $44, %rcx
+; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrq $48, %rcx
+; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrq $52, %rcx
+; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrq $56, %rcx
+; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: shrq $60, %rax
+; CHECK-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%ret = load <16 x i4>, ptr %in, align 1
ret <16 x i4> %ret
>From 2cd9a5d0d4dc922368865f32863b5c38e6535291 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Wed, 7 Jan 2026 10:29:29 +0800
Subject: [PATCH 4/9] fix test case
---
llvm/test/TableGen/CPtrWildcard.td | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/TableGen/CPtrWildcard.td b/llvm/test/TableGen/CPtrWildcard.td
index 79efc42af9634..e1a7a6e97c76a 100644
--- a/llvm/test/TableGen/CPtrWildcard.td
+++ b/llvm/test/TableGen/CPtrWildcard.td
@@ -8,13 +8,13 @@
// CHECK-NEXT:/* 3*/ OPC_CheckChild0Integer, [[#]],
// CHECK-NEXT:/* 5*/ OPC_RecordChild1, // #0 = $src
// CHECK-NEXT:/* 6*/ OPC_Scope, 9, /*->17*/ // 2 children in Scope
-// CHECK-NEXT:/* 8*/ OPC_CheckChild1Type, /*MVT::c64*/2|128,2/*258*/,
+// CHECK-NEXT:/* 8*/ OPC_CheckChild1Type, /*MVT::c64*/4|128,2/*260*/,
// CHECK-NEXT:/* 11*/ OPC_MorphNodeTo1None, TARGET_VAL(MyTarget::C64_TO_I64),
// CHECK-NEXT: /*MVT::i64*/8, 1/*#Ops*/, 0,
// CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } [[#]]:{ *:[iPTR] }, c64:{ *:[c64] }:$src) - Complexity = 8
// CHECK-NEXT: // Dst: (C64_TO_I64:{ *:[i64] } ?:{ *:[c64] }:$src)
// CHECK-NEXT:/* 17*/ /*Scope*/ 9, /*->27*/
-// CHECK-NEXT:/* 18*/ OPC_CheckChild1Type, /*MVT::c128*/3|128,2/*259*/,
+// CHECK-NEXT:/* 18*/ OPC_CheckChild1Type, /*MVT::c128*/5|128,2/*261*/,
// CHECK-NEXT:/* 21*/ OPC_MorphNodeTo1None, TARGET_VAL(MyTarget::C128_TO_I64),
// CHECK-NEXT: /*MVT::i64*/8, 1/*#Ops*/, 0,
// CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } [[#]]:{ *:[iPTR] }, c128:{ *:[c128] }:$src) - Complexity = 8
>From 3cbfe5649bdd5dbf98ff68330dc7ae5bf5ed7709 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 13 Jan 2026 09:28:08 +0800
Subject: [PATCH 5/9] fix test case
---
llvm/test/TableGen/CPtrWildcard.td | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/TableGen/CPtrWildcard.td b/llvm/test/TableGen/CPtrWildcard.td
index 163d150f88ab0..a50cd4e133d0e 100644
--- a/llvm/test/TableGen/CPtrWildcard.td
+++ b/llvm/test/TableGen/CPtrWildcard.td
@@ -7,13 +7,13 @@
// CHECK-NEXT: /* 0*/ OPC_CheckOpcode, TARGET_VAL(ISD::INTRINSIC_WO_CHAIN),
// CHECK-NEXT:/* 3*/ OPC_CheckChild0Integer, [[#]],
// CHECK-NEXT:/* 5*/ OPC_RecordChild1, // #0 = $src
-// CHECK-NEXT:/* 6*/ OPC_Scope, 9, /*->17*/ // 2 children in Scope
+// CHECK-NEXT:/* 6*/ OPC_Scope /*2 children */, 9, // ->17
// CHECK-NEXT:/* 8*/ OPC_CheckChild1Type, /*MVT::c64*/4|128,2/*260*/,
// CHECK-NEXT:/* 11*/ OPC_MorphNodeTo1None, TARGET_VAL(MyTarget::C64_TO_I64),
// CHECK-NEXT: MVT::i64, 1/*#Ops*/, 0,
// CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } [[#]]:{ *:[iPTR] }, c64:{ *:[c64] }:$src) - Complexity = 8
// CHECK-NEXT: // Dst: (C64_TO_I64:{ *:[i64] } ?:{ *:[c64] }:$src)
-// CHECK-NEXT:/* 17*/ /*Scope*/ 9, /*->27*/
+// CHECK-NEXT:/* 17*/ /*Scope*/ 9, // ->27
// CHECK-NEXT:/* 18*/ OPC_CheckChild1Type, /*MVT::c128*/5|128,2/*261*/,
// CHECK-NEXT:/* 21*/ OPC_MorphNodeTo1None, TARGET_VAL(MyTarget::C128_TO_I64),
// CHECK-NEXT: MVT::i64, 1/*#Ops*/, 0,
>From 3ae9fa5e4c52ca56c5e086e25a6ffe6ba0afa060 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 13 Jan 2026 17:47:20 +0800
Subject: [PATCH 6/9] pre convert select to vZi32
---
llvm/include/llvm/CodeGen/ValueTypes.td | 2 -
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 73 ++++++++++++++++++-----
llvm/lib/Target/AMDGPU/SIISelLowering.h | 4 ++
llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +-
llvm/test/TableGen/CPtrWildcard.td | 8 +--
5 files changed, 67 insertions(+), 26 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index 9ca535d9af21d..d7f3c683e2fa2 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -113,8 +113,6 @@ def v4096i1 : VTVec<4096, i1>; // 4096 x i1 vector value
def v128i2 : VTVec<128, i2>; // 128 x i2 vector value
def v256i2 : VTVec<256, i2>; // 256 x i2 vector value
-def v16i4 : VTVec<16, i4>; // 16 x i4 vector value
-def v32i4 : VTVec<32, i4>; // 32 x i4 vector value
def v64i4 : VTVec<64, i4>; // 64 x i4 vector value
def v128i4 : VTVec<128, i4>; // 128 x i4 vector value
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7fde4ad7497e9..74314a895d7f7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -877,7 +877,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
{MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
- MVT::v32f16, MVT::v32bf16, MVT::v16i4},
+ MVT::v32f16, MVT::v32bf16},
Custom);
setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
@@ -16986,20 +16986,16 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
return SDValue(CSrc, 0);
}
-SDValue SITargetLowering::performSelectCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
-
- // Try to fold CMP + SELECT patterns with shared constants (both FP and
- // integer).
- // Detect when CMP and SELECT use the same constant and fold them to avoid
- // loading the constant twice. Specifically handles patterns like:
- // %cmp = icmp eq i32 %val, 4242
- // %sel = select i1 %cmp, i32 4242, i32 %other
- // It can be optimized to reuse %val instead of 4242 in select.
- SDValue Cond = N->getOperand(0);
- SDValue TrueVal = N->getOperand(1);
- SDValue FalseVal = N->getOperand(2);
-
+// Try to fold CMP + SELECT patterns with shared constants (both FP and
+// integer).
+// Detect when CMP and SELECT use the same constant and fold them to avoid
+// loading the constant twice. Specifically handles patterns like:
+// %cmp = icmp eq i32 %val, 4242
+// %sel = select i1 %cmp, i32 4242, i32 %other
+// It can be optimized to reuse %val instead of 4242 in select.
+SDValue SITargetLowering::foldShareConstSelect(SDNode *N, DAGCombinerInfo &DCI,
+ SDValue &Cond, SDValue &TrueVal,
+ SDValue &FalseVal) const {
// Check if condition is a comparison.
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
@@ -17058,6 +17054,53 @@ SDValue SITargetLowering::performSelectCombine(SDNode *N,
SelectLHS, SelectRHS);
}
+// Try to convert vXiY into vZi32 with X * Y = Z * 32
+SDValue SITargetLowering::castTypeSelect(SDNode *N, DAGCombinerInfo &DCI,
+ SDValue &Cond, SDValue &TrueVal,
+ SDValue &FalseVal) const {
+ if (N->getNumValues() != 1)
+ return SDValue();
+
+ EVT ResultVT = N->getValueType(0);
+ if (ResultVT.isSimple() || !ResultVT.isVector() ||
+ !ResultVT.isPow2VectorType())
+ return SDValue();
+
+ EVT EltVT = ResultVT.getVectorElementType();
+ unsigned EltBitSize = EltVT.getSizeInBits();
+ ElementCount NumElts = ResultVT.getVectorElementCount();
+ if (!EltVT.isInteger() || !isPowerOf2_32(EltBitSize) || NumElts.isScalar())
+ return SDValue();
+
+ unsigned NewNumElts = ResultVT.getVectorNumElements() / (32 / EltBitSize);
+ if (TrueVal.getValueType() == ResultVT &&
+ FalseVal.getValueType() == ResultVT) {
+ EVT NewVT = EVT::getVectorVT(*DCI.DAG.getContext(), MVT::i32, NewNumElts);
+ SDValue NewTrue =
+ DCI.DAG.getNode(ISD::BITCAST, SDLoc(TrueVal), NewVT, TrueVal);
+ SDValue NewFalse =
+ DCI.DAG.getNode(ISD::BITCAST, SDLoc(FalseVal), NewVT, FalseVal);
+ SDValue NewSelect =
+ DCI.DAG.getNode(ISD::SELECT, SDLoc(N), NewVT, Cond, NewTrue, NewFalse);
+ return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), ResultVT, NewSelect);
+ }
+ return SDValue();
+}
+
+SDValue SITargetLowering::performSelectCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+
+ SDValue Cond = N->getOperand(0);
+ SDValue TrueVal = N->getOperand(1);
+ SDValue FalseVal = N->getOperand(2);
+
+ SDValue Res = foldShareConstSelect(N, DCI, Cond, TrueVal, FalseVal);
+ if (Res)
+ return Res;
+ else
+ return castTypeSelect(N, DCI, Cond, TrueVal, FalseVal);
+}
+
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index e82f4528fcd09..31ba200b45831 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -227,6 +227,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue foldShareConstSelect(SDNode *N, DAGCombinerInfo &DCI, SDValue &Cond,
+ SDValue &TrueVal, SDValue &FalseVal) const;
+ SDValue castTypeSelect(SDNode *N, DAGCombinerInfo &DCI, SDValue &Cond,
+ SDValue &TrueVal, SDValue &FalseVal) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 13c508d599620..a354704c5958b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1050,9 +1050,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (VT.getVectorElementType() == MVT::f16 ||
VT.getVectorElementType() == MVT::bf16)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
-
- if (VT.getVectorElementType() == MVT::i4)
- setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
}
}
@@ -2800,8 +2797,7 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
return TypeSplitVector;
if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
- VT.getVectorElementType() != MVT::i1 &&
- VT.getVectorElementType() != MVT::i4)
+ VT.getVectorElementType() != MVT::i1)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
diff --git a/llvm/test/TableGen/CPtrWildcard.td b/llvm/test/TableGen/CPtrWildcard.td
index a50cd4e133d0e..41373b2c2fe4c 100644
--- a/llvm/test/TableGen/CPtrWildcard.td
+++ b/llvm/test/TableGen/CPtrWildcard.td
@@ -7,14 +7,14 @@
// CHECK-NEXT: /* 0*/ OPC_CheckOpcode, TARGET_VAL(ISD::INTRINSIC_WO_CHAIN),
// CHECK-NEXT:/* 3*/ OPC_CheckChild0Integer, [[#]],
// CHECK-NEXT:/* 5*/ OPC_RecordChild1, // #0 = $src
-// CHECK-NEXT:/* 6*/ OPC_Scope /*2 children */, 9, // ->17
-// CHECK-NEXT:/* 8*/ OPC_CheckChild1Type, /*MVT::c64*/4|128,2/*260*/,
+// CHECK-NEXT:/* 6*/ OPC_Scope /*2 children */, 9, // ->17
+// CHECK-NEXT:/* 8*/ OPC_CheckChild1Type, /*MVT::c64*/2|128,2/*258*/,
// CHECK-NEXT:/* 11*/ OPC_MorphNodeTo1None, TARGET_VAL(MyTarget::C64_TO_I64),
// CHECK-NEXT: MVT::i64, 1/*#Ops*/, 0,
// CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } [[#]]:{ *:[iPTR] }, c64:{ *:[c64] }:$src) - Complexity = 8
// CHECK-NEXT: // Dst: (C64_TO_I64:{ *:[i64] } ?:{ *:[c64] }:$src)
-// CHECK-NEXT:/* 17*/ /*Scope*/ 9, // ->27
-// CHECK-NEXT:/* 18*/ OPC_CheckChild1Type, /*MVT::c128*/5|128,2/*261*/,
+// CHECK-NEXT:/* 17*/ /*Scope*/ 9, // ->27
+// CHECK-NEXT:/* 18*/ OPC_CheckChild1Type, /*MVT::c128*/3|128,2/*259*/,
// CHECK-NEXT:/* 21*/ OPC_MorphNodeTo1None, TARGET_VAL(MyTarget::C128_TO_I64),
// CHECK-NEXT: MVT::i64, 1/*#Ops*/, 0,
// CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } [[#]]:{ *:[iPTR] }, c128:{ *:[c128] }:$src) - Complexity = 8
>From d4df9639d6e7a0e7e164ea0da04dd9f1babc7408 Mon Sep 17 00:00:00 2001
From: Shoreshen <372660931 at qq.com>
Date: Wed, 21 Jan 2026 09:41:12 +0800
Subject: [PATCH 7/9] Update llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Co-authored-by: Craig Topper <craig.topper at sifive.com>
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9c859c33d89c0..0b8625fd6ce17 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17084,8 +17084,7 @@ SDValue SITargetLowering::performSelectCombine(SDNode *N,
SDValue TrueVal = N->getOperand(1);
SDValue FalseVal = N->getOperand(2);
- SDValue Res = foldShareConstSelect(N, DCI, Cond, TrueVal, FalseVal);
- if (Res)
+ if (SDValue Res = foldShareConstSelect(N, DCI, Cond, TrueVal, FalseVal))
return Res;
else
return castTypeSelect(N, DCI, Cond, TrueVal, FalseVal);
>From 5f98054adbdd69a1de21c818a8b5eac25f008d1d Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Wed, 21 Jan 2026 10:12:20 +0800
Subject: [PATCH 8/9] fix comments
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 36 ++++++++++-------------
llvm/lib/Target/AMDGPU/SIISelLowering.h | 8 ++---
2 files changed, 19 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fb9c1ac874baa..3585836cdbaac 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17007,8 +17007,8 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
// %sel = select i1 %cmp, i32 4242, i32 %other
// It can be optimized to reuse %val instead of 4242 in select.
SDValue SITargetLowering::foldShareConstSelect(SDNode *N, DAGCombinerInfo &DCI,
- SDValue &Cond, SDValue &TrueVal,
- SDValue &FalseVal) const {
+ SDValue Cond, SDValue TrueVal,
+ SDValue FalseVal) const {
// Check if condition is a comparison.
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
@@ -17069,11 +17069,8 @@ SDValue SITargetLowering::foldShareConstSelect(SDNode *N, DAGCombinerInfo &DCI,
// Try to convert vXiY into vZi32 with X * Y = Z * 32
SDValue SITargetLowering::castTypeSelect(SDNode *N, DAGCombinerInfo &DCI,
- SDValue &Cond, SDValue &TrueVal,
- SDValue &FalseVal) const {
- if (N->getNumValues() != 1)
- return SDValue();
-
+ SDValue Cond, SDValue TrueVal,
+ SDValue FalseVal) const {
EVT ResultVT = N->getValueType(0);
if (ResultVT.isSimple() || !ResultVT.isVector() ||
!ResultVT.isPow2VectorType())
@@ -17086,18 +17083,15 @@ SDValue SITargetLowering::castTypeSelect(SDNode *N, DAGCombinerInfo &DCI,
return SDValue();
unsigned NewNumElts = ResultVT.getVectorNumElements() / (32 / EltBitSize);
- if (TrueVal.getValueType() == ResultVT &&
- FalseVal.getValueType() == ResultVT) {
- EVT NewVT = EVT::getVectorVT(*DCI.DAG.getContext(), MVT::i32, NewNumElts);
- SDValue NewTrue =
- DCI.DAG.getNode(ISD::BITCAST, SDLoc(TrueVal), NewVT, TrueVal);
- SDValue NewFalse =
- DCI.DAG.getNode(ISD::BITCAST, SDLoc(FalseVal), NewVT, FalseVal);
- SDValue NewSelect =
- DCI.DAG.getNode(ISD::SELECT, SDLoc(N), NewVT, Cond, NewTrue, NewFalse);
- return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), ResultVT, NewSelect);
- }
- return SDValue();
+
+ EVT NewVT = EVT::getVectorVT(*DCI.DAG.getContext(), MVT::i32, NewNumElts);
+ SDValue NewTrue =
+ DCI.DAG.getNode(ISD::BITCAST, SDLoc(TrueVal), NewVT, TrueVal);
+ SDValue NewFalse =
+ DCI.DAG.getNode(ISD::BITCAST, SDLoc(FalseVal), NewVT, FalseVal);
+ SDValue NewSelect =
+ DCI.DAG.getNode(ISD::SELECT, SDLoc(N), NewVT, Cond, NewTrue, NewFalse);
+ return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), ResultVT, NewSelect);
}
SDValue SITargetLowering::performSelectCombine(SDNode *N,
@@ -17109,8 +17103,8 @@ SDValue SITargetLowering::performSelectCombine(SDNode *N,
if (SDValue Res = foldShareConstSelect(N, DCI, Cond, TrueVal, FalseVal))
return Res;
- else
- return castTypeSelect(N, DCI, Cond, TrueVal, FalseVal);
+
+ return castTypeSelect(N, DCI, Cond, TrueVal, FalseVal);
}
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 0885c12f7ff9e..f47068fb72ecb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -227,10 +227,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue foldShareConstSelect(SDNode *N, DAGCombinerInfo &DCI, SDValue &Cond,
- SDValue &TrueVal, SDValue &FalseVal) const;
- SDValue castTypeSelect(SDNode *N, DAGCombinerInfo &DCI, SDValue &Cond,
- SDValue &TrueVal, SDValue &FalseVal) const;
+ SDValue foldShareConstSelect(SDNode *N, DAGCombinerInfo &DCI, SDValue Cond,
+ SDValue TrueVal, SDValue FalseVal) const;
+ SDValue castTypeSelect(SDNode *N, DAGCombinerInfo &DCI, SDValue Cond,
+ SDValue TrueVal, SDValue FalseVal) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
>From 95aa98207859429b88b315c0be5e9d42aea6c341 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Wed, 21 Jan 2026 12:08:56 +0800
Subject: [PATCH 9/9] fix comment
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3585836cdbaac..8fddca4cad6b6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17078,12 +17078,14 @@ SDValue SITargetLowering::castTypeSelect(SDNode *N, DAGCombinerInfo &DCI,
EVT EltVT = ResultVT.getVectorElementType();
unsigned EltBitSize = EltVT.getSizeInBits();
- ElementCount NumElts = ResultVT.getVectorElementCount();
- if (!EltVT.isInteger() || !isPowerOf2_32(EltBitSize) || NumElts.isScalar())
+ if (!EltVT.isInteger() || ResultVT.getVectorElementCount().isScalar())
return SDValue();
unsigned NewNumElts = ResultVT.getVectorNumElements() / (32 / EltBitSize);
+ if (NewNumElts * 32 != EltBitSize * ResultVT.getVectorNumElements())
+ return SDValue();
+
EVT NewVT = EVT::getVectorVT(*DCI.DAG.getContext(), MVT::i32, NewNumElts);
SDValue NewTrue =
DCI.DAG.getNode(ISD::BITCAST, SDLoc(TrueVal), NewVT, TrueVal);
More information about the llvm-commits
mailing list