[llvm] Do not use PerformEXTRACTCombine for v8i8 types (PR #81242)
Adrian Kuegel via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 9 02:31:57 PST 2024
https://github.com/akuegel created https://github.com/llvm/llvm-project/pull/81242
As with v4i8 types, PerformEXTRACTCombine should not be applied to v8i8 types; with this change the combine returns early for v8i8 vectors, just as it already does for v4i8.
>From 3014feecf6aa5b572063187c79f9204762fdd026 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel at google.com>
Date: Fri, 9 Feb 2024 11:20:57 +0100
Subject: [PATCH 1/2] Update extractelement.ll
---
llvm/test/CodeGen/NVPTX/extractelement.ll | 92 ++++++++++++-----------
1 file changed, 47 insertions(+), 45 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll
index ed7dd45ab7b450..92d00f881cc261 100644
--- a/llvm/test/CodeGen/NVPTX/extractelement.ll
+++ b/llvm/test/CodeGen/NVPTX/extractelement.ll
@@ -3,9 +3,9 @@
; CHECK-LABEL: test_v2i8
-; CHECK-DAG: ld.param.u16 [[A:%rs[0-9+]]], [test_v2i8_param_0];
-; CHECK-DAG: cvt.s16.s8 [[E0:%rs[0-9+]]], [[A]];
-; CHECK-DAG: shr.s16 [[E1:%rs[0-9+]]], [[A]], 8;
+; CHECK-DAG: ld.param.u16 [[A:%rs[0-9]+]], [test_v2i8_param_0];
+; CHECK-DAG: cvt.s16.s8 [[E0:%rs[0-9]+]], [[A]];
+; CHECK-DAG: shr.s16 [[E1:%rs[0-9]+]], [[A]], 8;
define i16 @test_v2i8(i16 %a) {
%v = bitcast i16 %a to <2 x i8>
%r0 = extractelement <2 x i8> %v, i64 0
@@ -17,15 +17,15 @@ define i16 @test_v2i8(i16 %a) {
}
; CHECK-LABEL: test_v4i8
-; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_param_0];
-; CHECK-DAG: bfe.s32 [[R0:%r[0-9+]]], [[R]], 0, 8;
-; CHECK-DAG: cvt.s8.s32 [[E0:%rs[0-9+]]], [[R0]];
-; CHECK-DAG: bfe.s32 [[R1:%r[0-9+]]], [[R]], 8, 8;
-; CHECK-DAG: cvt.s8.s32 [[E1:%rs[0-9+]]], [[R1]];
-; CHECK-DAG: bfe.s32 [[R2:%r[0-9+]]], [[R]], 16, 8;
-; CHECK-DAG: cvt.s8.s32 [[E2:%rs[0-9+]]], [[R2]];
-; CHECK-DAG: bfe.s32 [[R3:%r[0-9+]]], [[R]], 24, 8;
-; CHECK-DAG: cvt.s8.s32 [[E3:%rs[0-9+]]], [[R3]];
+; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_param_0];
+; CHECK-DAG: bfe.s32 [[R0:%r[0-9]+]], [[R]], 0, 8;
+; CHECK-DAG: cvt.s8.s32 [[E0:%rs[0-9]+]], [[R0]];
+; CHECK-DAG: bfe.s32 [[R1:%r[0-9]+]], [[R]], 8, 8;
+; CHECK-DAG: cvt.s8.s32 [[E1:%rs[0-9]+]], [[R1]];
+; CHECK-DAG: bfe.s32 [[R2:%r[0-9]+]], [[R]], 16, 8;
+; CHECK-DAG: cvt.s8.s32 [[E2:%rs[0-9]+]], [[R2]];
+; CHECK-DAG: bfe.s32 [[R3:%r[0-9]+]], [[R]], 24, 8;
+; CHECK-DAG: cvt.s8.s32 [[E3:%rs[0-9]+]], [[R3]];
define i16 @test_v4i8(i32 %a) {
%v = bitcast i32 %a to <4 x i8>
%r0 = extractelement <4 x i8> %v, i64 0
@@ -43,14 +43,14 @@ define i16 @test_v4i8(i32 %a) {
}
; CHECK-LABEL: test_v4i8_s32
-; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_s32_param_0];
-; CHECK-DAG: bfe.s32 [[R0:%r[0-9+]]], [[R]], 0, 8;
-; CHECK-DAG: bfe.s32 [[R1:%r[0-9+]]], [[R]], 8, 8;
-; CHECK-DAG: bfe.s32 [[R2:%r[0-9+]]], [[R]], 16, 8;
-; CHECK-DAG: bfe.s32 [[R3:%r[0-9+]]], [[R]], 24, 8;
-; CHECK-DAG: add.s32 [[R01:%r[0-9+]]], [[R0]], [[R1]]
-; CHECK-DAG: add.s32 [[R23:%r[0-9+]]], [[R2]], [[R3]]
-; CHECK-DAG: add.s32 [[R0123:%r[0-9+]]], [[R01]], [[R23]]
+; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_s32_param_0];
+; CHECK-DAG: bfe.s32 [[R0:%r[0-9]+]], [[R]], 0, 8;
+; CHECK-DAG: bfe.s32 [[R1:%r[0-9]+]], [[R]], 8, 8;
+; CHECK-DAG: bfe.s32 [[R2:%r[0-9]+]], [[R]], 16, 8;
+; CHECK-DAG: bfe.s32 [[R3:%r[0-9]+]], [[R]], 24, 8;
+; CHECK-DAG: add.s32 [[R01:%r[0-9]+]], [[R0]], [[R1]]
+; CHECK-DAG: add.s32 [[R23:%r[0-9]+]], [[R2]], [[R3]]
+; CHECK-DAG: add.s32 [[R0123:%r[0-9]+]], [[R01]], [[R23]]
define i32 @test_v4i8_s32(i32 %a) {
%v = bitcast i32 %a to <4 x i8>
%r0 = extractelement <4 x i8> %v, i64 0
@@ -68,14 +68,14 @@ define i32 @test_v4i8_s32(i32 %a) {
}
; CHECK-LABEL: test_v4i8_u32
-; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_u32_param_0];
-; CHECK-DAG: bfe.u32 [[R0:%r[0-9+]]], [[R]], 0, 8;
-; CHECK-DAG: bfe.u32 [[R1:%r[0-9+]]], [[R]], 8, 8;
-; CHECK-DAG: bfe.u32 [[R2:%r[0-9+]]], [[R]], 16, 8;
-; CHECK-DAG: bfe.u32 [[R3:%r[0-9+]]], [[R]], 24, 8;
-; CHECK-DAG: add.s32 [[R01:%r[0-9+]]], [[R0]], [[R1]]
-; CHECK-DAG: add.s32 [[R23:%r[0-9+]]], [[R2]], [[R3]]
-; CHECK-DAG: add.s32 [[R0123:%r[0-9+]]], [[R01]], [[R23]]
+; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_u32_param_0];
+; CHECK-DAG: bfe.u32 [[R0:%r[0-9]+]], [[R]], 0, 8;
+; CHECK-DAG: bfe.u32 [[R1:%r[0-9]+]], [[R]], 8, 8;
+; CHECK-DAG: bfe.u32 [[R2:%r[0-9]+]], [[R]], 16, 8;
+; CHECK-DAG: bfe.u32 [[R3:%r[0-9]+]], [[R]], 24, 8;
+; CHECK-DAG: add.s32 [[R01:%r[0-9]+]], [[R0]], [[R1]]
+; CHECK-DAG: add.s32 [[R23:%r[0-9]+]], [[R2]], [[R3]]
+; CHECK-DAG: add.s32 [[R0123:%r[0-9]+]], [[R01]], [[R23]]
define i32 @test_v4i8_u32(i32 %a) {
%v = bitcast i32 %a to <4 x i8>
%r0 = extractelement <4 x i8> %v, i64 0
@@ -95,23 +95,25 @@ define i32 @test_v4i8_u32(i32 %a) {
; CHECK-LABEL: test_v8i8
-; CHECK: ld.param.u64 [[R:%rd[0-9+]]], [test_v8i8_param_0];
-; CHECK-DAG: cvt.s8.s64 [[E0:%rs[0-9+]]], [[R]];
-; Element 1 is still extracted by trunc, shr 8, not sure why.
-; CHECK-DAG: cvt.u16.u64 [[R01:%rs[0-9+]]], [[R]];
-; CHECK-DAG: shr.s16 [[E1:%rs[0-9+]]], [[R01]], 8;
-; CHECK-DAG: bfe.s64 [[RD2:%rd[0-9+]]], [[R]], 16, 8;
-; CHECK-DAG: cvt.s8.s64 [[E2:%rs[0-9+]]], [[RD2]];
-; CHECK-DAG: bfe.s64 [[RD3:%rd[0-9+]]], [[R]], 24, 8;
-; CHECK-DAG: cvt.s8.s64 [[E3:%rs[0-9+]]], [[RD3]];
-; CHECK-DAG: bfe.s64 [[RD4:%rd[0-9+]]], [[R]], 32, 8;
-; CHECK-DAG: cvt.s8.s64 [[E4:%rs[0-9+]]], [[RD4]];
-; CHECK-DAG: bfe.s64 [[RD5:%rd[0-9+]]], [[R]], 40, 8;
-; CHECK-DAG: cvt.s8.s64 [[E5:%rs[0-9+]]], [[RD5]];
-; CHECK-DAG: bfe.s64 [[RD6:%rd[0-9+]]], [[R]], 48, 8;
-; CHECK-DAG: cvt.s8.s64 [[E6:%rs[0-9+]]], [[RD6]];
-; CHECK-DAG: bfe.s64 [[RD7:%rd[0-9+]]], [[R]], 56, 8;
-; CHECK-DAG: cvt.s8.s64 [[E7:%rs[0-9+]]], [[RD7]];
+; CHECK: ld.param.u64 [[R:%rd[0-9]+]], [test_v8i8_param_0];
+; CHECK-DAG: cvt.u32.u64 [[R00:%r[0-9]+]], [[R]];
+; CHECK-DAG: { .reg .b32 tmp; mov.b64 {tmp, [[R01:%r[0-9]+]]}, [[R]]; }
+; CHECK-DAG: bfe.s32 [[R1:%r[0-9]+]], [[R00]], 0, 8;
+; CHECK-DAG: cvt.s8.s32 [[E1:%rs[0-9]+]], [[R1]];
+; CHECK-DAG: bfe.s32 [[R2:%r[0-9]+]], [[R00]], 8, 8;
+; CHECK-DAG: cvt.s8.s32 [[E2:%rs[0-9]+]], [[R2]];
+; CHECK-DAG: bfe.s32 [[R3:%r[0-9]+]], [[R00]], 16, 8;
+; CHECK-DAG: cvt.s8.s32 [[E3:%rs[0-9]+]], [[R3]];
+; CHECK-DAG: bfe.s32 [[R4:%r[0-9]+]], [[R00]], 24, 8;
+; CHECK-DAG: cvt.s8.s32 [[E4:%rs[0-9]+]], [[R4]];
+; CHECK-DAG: bfe.s32 [[R5:%r[0-9]+]], [[R01]], 0, 8;
+; CHECK-DAG: cvt.s8.s32 [[E5:%rs[0-9]+]], [[R5]];
+; CHECK-DAG: bfe.s32 [[R6:%r[0-9]+]], [[R01]], 8, 8;
+; CHECK-DAG: cvt.s8.s32 [[E6:%rs[0-9]+]], [[R6]];
+; CHECK-DAG: bfe.s32 [[R7:%r[0-9]+]], [[R01]], 16, 8;
+; CHECK-DAG: cvt.s8.s32 [[E7:%rs[0-9]+]], [[R7]];
+; CHECK-DAG: bfe.s32 [[R8:%r[0-9]+]], [[R01]], 24, 8;
+; CHECK-DAG: cvt.s8.s32 [[E8:%rs[0-9]+]], [[R8]];
define i16 @test_v8i8(i64 %a) {
%v = bitcast i64 %a to <8 x i8>
>From 58c8700d2b4586a8b244dc9ecfa9dacd3f9f95b5 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel at google.com>
Date: Fri, 9 Feb 2024 11:30:29 +0100
Subject: [PATCH 2/2] Do not use PerformEXTRACTCombine for v8i8 types
Same as with v4i8, we should not be using PerformEXTRACTCombine for v8i8 types.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index c7bc623a88e1b9..5c24f00dbca0e2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5354,10 +5354,11 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
IsPTXVectorType(VectorVT.getSimpleVT()))
return SDValue(); // Native vector loads already combine nicely w/
- // extract_vector_elt, except for v4i8.
- // Don't mess with singletons or v2*16 types, we already handle them OK.
+ // extract_vector_elt.
+ // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already
+ // handle them OK.
if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
- VectorVT == MVT::v4i8)
+ VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
return SDValue();
uint64_t VectorBits = VectorVT.getSizeInBits();
More information about the llvm-commits mailing list