[llvm] [AArch64][SVE] Avoid redundant extend of unsigned i8/i16 extracts. (PR #165863)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 7 08:28:31 PST 2025
https://github.com/rj-jesus updated https://github.com/llvm/llvm-project/pull/165863
>From 2fcfa822789d6f605bbbedc2bcb5af03c11c092f Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Wed, 1 Oct 2025 04:16:27 -0700
Subject: [PATCH 1/3] Add tests.
---
.../CodeGen/AArch64/sve-extract-element.ll | 140 ++++++++++++++++++
1 file changed, 140 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
index c340df1385124..f91fbf11223c2 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
@@ -12,6 +12,28 @@ define i8 @test_lane0_16xi8(<vscale x 16 x i8> %a) #0 {
ret i8 %b
}
+define i32 @test_lane0_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane0_16xi8_zext_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 0
+ %c = zext i8 %b to i32
+ ret i32 %c
+}
+
+define i64 @test_lane0_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane0_16xi8_zext_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and x0, x8, #0xff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 0
+ %c = zext i8 %b to i64
+ ret i64 %c
+}
+
define i8 @test_lane15_16xi8(<vscale x 16 x i8> %a) #0 {
; CHECK-LABEL: test_lane15_16xi8:
; CHECK: // %bb.0:
@@ -21,6 +43,28 @@ define i8 @test_lane15_16xi8(<vscale x 16 x i8> %a) #0 {
ret i8 %b
}
+define i32 @test_lane15_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane15_16xi8_zext_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umov w8, v0.b[15]
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 15
+ %c = zext i8 %b to i32
+ ret i32 %c
+}
+
+define i64 @test_lane15_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane15_16xi8_zext_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umov w8, v0.b[15]
+; CHECK-NEXT: and x0, x8, #0xff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 15
+ %c = zext i8 %b to i64
+ ret i64 %c
+}
+
define i8 @test_lane16_16xi8(<vscale x 16 x i8> %a) #0 {
; CHECK-LABEL: test_lane16_16xi8:
; CHECK: // %bb.0:
@@ -31,6 +75,32 @@ define i8 @test_lane16_16xi8(<vscale x 16 x i8> %a) #0 {
ret i8 %b
}
+; FIXME: FMOV+AND -> UMOV.
+define i32 @test_lane16_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane16_16xi8_zext_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.b, z0.b[16]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 16
+ %c = zext i8 %b to i32
+ ret i32 %c
+}
+
+; FIXME: FMOV+AND -> UMOV.
+define i64 @test_lane16_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane16_16xi8_zext_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.b, z0.b[16]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and x0, x8, #0xff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 16
+ %c = zext i8 %b to i64
+ ret i64 %c
+}
+
define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: test_lane0_8xi16:
; CHECK: // %bb.0:
@@ -40,6 +110,28 @@ define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) #0 {
ret i16 %b
}
+define i32 @test_lane0_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane0_8xi16_zext_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w0, w8, #0xffff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 0
+ %c = zext i16 %b to i32
+ ret i32 %c
+}
+
+define i64 @test_lane0_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane0_8xi16_zext_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and x0, x8, #0xffff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 0
+ %c = zext i16 %b to i64
+ ret i64 %c
+}
+
define i16 @test_lane7_8xi16(<vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: test_lane7_8xi16:
; CHECK: // %bb.0:
@@ -49,6 +141,28 @@ define i16 @test_lane7_8xi16(<vscale x 8 x i16> %a) #0 {
ret i16 %b
}
+define i32 @test_lane7_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane7_8xi16_zext_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umov w8, v0.h[7]
+; CHECK-NEXT: and w0, w8, #0xffff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 7
+ %c = zext i16 %b to i32
+ ret i32 %c
+}
+
+define i64 @test_lane7_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane7_8xi16_zext_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umov w8, v0.h[7]
+; CHECK-NEXT: and x0, x8, #0xffff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 7
+ %c = zext i16 %b to i64
+ ret i64 %c
+}
+
define i16 @test_lane8_8xi16(<vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: test_lane8_8xi16:
; CHECK: // %bb.0:
@@ -59,6 +173,32 @@ define i16 @test_lane8_8xi16(<vscale x 8 x i16> %a) #0 {
ret i16 %b
}
+; FIXME: FMOV+AND -> UMOV.
+define i32 @test_lane8_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane8_8xi16_zext_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, z0.h[8]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w0, w8, #0xffff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 8
+ %c = zext i16 %b to i32
+ ret i32 %c
+}
+
+; FIXME: FMOV+AND -> UMOV.
+define i64 @test_lane8_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane8_8xi16_zext_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, z0.h[8]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and x0, x8, #0xffff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 8
+ %c = zext i16 %b to i64
+ ret i64 %c
+}
+
define i32 @test_lane0_4xi32(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: test_lane0_4xi32:
; CHECK: // %bb.0:
>From 718ce42a5da9a97fa04ce07f932834e89e607677 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Wed, 1 Oct 2025 04:16:43 -0700
Subject: [PATCH 2/3] [AArch64][SVE] Avoid redundant extend for unsigned i8/i16
extracts.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Extracts of unsigned i8 or i16 elements from the bottom 128 bits of
a scalable register lead to the zero-extend being transformed into an AND
mask. The mask is redundant since UMOV already zeroes the high bits of
the destination register.
For example:
```c
int foo(svuint8_t x) {
return x[3];
}
```
Currently:
```gas
foo:
umov w8, v0.b[3]
and w0, w8, #0xff
ret
```
Becomes:
```gas
foo:
umov w0, v0.b[3]
ret
```
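For reference, the LLVM IR for the C example is roughly the following
(a hand-written sketch for illustration; it is not part of the patch):
```llvm
define i32 @foo(<vscale x 16 x i8> %x) {
  ; Extract lane 3 and zero-extend it. During lowering the zext turns
  ; into an AND of the extracted element, which the new patterns fold
  ; into the UMOV itself.
  %elt = extractelement <vscale x 16 x i8> %x, i64 3
  %ext = zext i8 %elt to i32
  ret i32 %ext
}
```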
---
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 13 ++++++++++
.../CodeGen/AArch64/sve-extract-element.ll | 24 +++++++------------
2 files changed, 21 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3b268dcbca600..6933303037716 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3592,6 +3592,19 @@ let Predicates = [HasSVE_or_SME] in {
def : Pat<(sext (i32 (vector_extract nxv4i32:$vec, VectorIndexS:$index))),
(SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
+
+ // Extracts of ``unsigned'' i8 or i16 elements lead to the zero-extend being
+ // transformed to an AND mask. The mask is redundant since UMOV already zeroes
+ // the high bits of the destination register.
+ // We do something similar in the Neon versions of these patterns.
+ def : Pat<(i32 (and (vector_extract nxv16i8:$vec, VectorIndexB:$index), 0xff)),
+ (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
+ def : Pat<(i32 (and (vector_extract nxv8i16:$vec, VectorIndexH:$index), 0xffff)),
+ (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)>;
+ def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index)))), (i64 0xff))),
+ (SUBREG_TO_REG (i64 0), (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)), sub_32)>;
+ def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract nxv8i16:$vec, VectorIndexH:$index)))), (i64 0xffff))),
+ (SUBREG_TO_REG (i64 0), (i32 (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)), sub_32)>;
} // End HasNEON
// Extract first element from vector.
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
index f91fbf11223c2..0cc2e04bfb315 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
@@ -15,8 +15,7 @@ define i8 @test_lane0_16xi8(<vscale x 16 x i8> %a) #0 {
define i32 @test_lane0_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 {
; CHECK-LABEL: test_lane0_16xi8_zext_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: umov w0, v0.b[0]
; CHECK-NEXT: ret
%b = extractelement <vscale x 16 x i8> %a, i32 0
%c = zext i8 %b to i32
@@ -26,8 +25,7 @@ define i32 @test_lane0_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 {
define i64 @test_lane0_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 {
; CHECK-LABEL: test_lane0_16xi8_zext_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and x0, x8, #0xff
+; CHECK-NEXT: umov w0, v0.b[0]
; CHECK-NEXT: ret
%b = extractelement <vscale x 16 x i8> %a, i32 0
%c = zext i8 %b to i64
@@ -46,8 +44,7 @@ define i8 @test_lane15_16xi8(<vscale x 16 x i8> %a) #0 {
define i32 @test_lane15_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 {
; CHECK-LABEL: test_lane15_16xi8_zext_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: umov w8, v0.b[15]
-; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: umov w0, v0.b[15]
; CHECK-NEXT: ret
%b = extractelement <vscale x 16 x i8> %a, i32 15
%c = zext i8 %b to i32
@@ -57,8 +54,7 @@ define i32 @test_lane15_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 {
define i64 @test_lane15_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 {
; CHECK-LABEL: test_lane15_16xi8_zext_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: umov w8, v0.b[15]
-; CHECK-NEXT: and x0, x8, #0xff
+; CHECK-NEXT: umov w0, v0.b[15]
; CHECK-NEXT: ret
%b = extractelement <vscale x 16 x i8> %a, i32 15
%c = zext i8 %b to i64
@@ -113,8 +109,7 @@ define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) #0 {
define i32 @test_lane0_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: test_lane0_8xi16_zext_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0xffff
+; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: ret
%b = extractelement <vscale x 8 x i16> %a, i32 0
%c = zext i16 %b to i32
@@ -124,8 +119,7 @@ define i32 @test_lane0_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 {
define i64 @test_lane0_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: test_lane0_8xi16_zext_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and x0, x8, #0xffff
+; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: ret
%b = extractelement <vscale x 8 x i16> %a, i32 0
%c = zext i16 %b to i64
@@ -144,8 +138,7 @@ define i16 @test_lane7_8xi16(<vscale x 8 x i16> %a) #0 {
define i32 @test_lane7_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: test_lane7_8xi16_zext_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: umov w8, v0.h[7]
-; CHECK-NEXT: and w0, w8, #0xffff
+; CHECK-NEXT: umov w0, v0.h[7]
; CHECK-NEXT: ret
%b = extractelement <vscale x 8 x i16> %a, i32 7
%c = zext i16 %b to i32
@@ -155,8 +148,7 @@ define i32 @test_lane7_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 {
define i64 @test_lane7_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: test_lane7_8xi16_zext_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: umov w8, v0.h[7]
-; CHECK-NEXT: and x0, x8, #0xffff
+; CHECK-NEXT: umov w0, v0.h[7]
; CHECK-NEXT: ret
%b = extractelement <vscale x 8 x i16> %a, i32 7
%c = zext i16 %b to i64
>From 107cffb2d36cc39a5178bb3486bfe181a29c2bc4 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Fri, 7 Nov 2025 08:24:13 -0800
Subject: [PATCH 3/3] Remove unnecessary comment.
---
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 6933303037716..1dc33e9f65142 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3596,7 +3596,6 @@ let Predicates = [HasSVE_or_SME] in {
// Extracts of ``unsigned'' i8 or i16 elements lead to the zero-extend being
// transformed to an AND mask. The mask is redundant since UMOV already zeroes
// the high bits of the destination register.
- // We do something similar in the Neon versions of these patterns.
def : Pat<(i32 (and (vector_extract nxv16i8:$vec, VectorIndexB:$index), 0xff)),
(UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
def : Pat<(i32 (and (vector_extract nxv8i16:$vec, VectorIndexH:$index), 0xffff)),