[llvm] [LLVM][DAGCombiner] Port calculateByteProvider to TypeSize. (PR #148425)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 14 04:53:57 PDT 2025
https://github.com/paulwalker-arm updated https://github.com/llvm/llvm-project/pull/148425
From 1336f121260688d4263093e2a39d7ee65c33d181 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Sun, 13 Jul 2025 12:12:51 +0100
Subject: [PATCH 1/2] Add scalable vector tests to exercise
calculateByteProvider.
NOTE: Prior to this PR these tests would trigger "Invalid size request on
a scalable vector" errors. The output shows the result of simply
bailing out for scalable vectors.
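For reference, a minimal sketch of what that interim bail-out could look
like near the top of calculateByteProvider (illustrative only, not part of
this patch; PATCH 2/2 instead ports the size handling to TypeSize):

  // Hypothetical guard: give up for scalable vector values so the
  // "Invalid size request on a scalable vector" assertion is never hit.
  if (Op.getValueType().isScalableVector())
    return std::nullopt;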
---
llvm/test/CodeGen/AArch64/load-combine.ll | 203 ++++++++++++++++++++++
1 file changed, 203 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/load-combine.ll b/llvm/test/CodeGen/AArch64/load-combine.ll
index 57f61e5303ecf..4cf007318806c 100644
--- a/llvm/test/CodeGen/AArch64/load-combine.ll
+++ b/llvm/test/CodeGen/AArch64/load-combine.ll
@@ -713,3 +713,206 @@ define void @short_vector_to_i64(ptr %in, ptr %out, ptr %p) {
store i64 %i3, ptr %out
ret void
}
+
+; x1 = x0
+define void @scalable_vector_to_i32(ptr %in, ptr %out, ptr %p) #0 {
+; CHECK-LABEL: scalable_vector_to_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: mov w9, v0.s[2]
+; CHECK-NEXT: mov w11, v0.s[3]
+; CHECK-NEXT: orr w8, w10, w8, lsl #8
+; CHECK-NEXT: orr w8, w8, w9, lsl #16
+; CHECK-NEXT: orr w8, w8, w11, lsl #24
+; CHECK-NEXT: str w8, [x1]
+; CHECK-NEXT: ret
+ %ld = load <vscale x 4 x i8>, ptr %in, align 4
+
+ %e1 = extractelement <vscale x 4 x i8> %ld, i32 0
+ %e2 = extractelement <vscale x 4 x i8> %ld, i32 1
+ %e3 = extractelement <vscale x 4 x i8> %ld, i32 2
+ %e4 = extractelement <vscale x 4 x i8> %ld, i32 3
+
+ %z0 = zext i8 %e1 to i32
+ %z1 = zext i8 %e2 to i32
+ %z2 = zext i8 %e3 to i32
+ %z3 = zext i8 %e4 to i32
+
+ %s1 = shl nuw nsw i32 %z1, 8
+ %s2 = shl nuw nsw i32 %z2, 16
+ %s3 = shl nuw i32 %z3, 24
+
+ %i1 = or i32 %s1, %z0
+ %i2 = or i32 %i1, %s2
+ %i3 = or i32 %i2, %s3
+
+ store i32 %i3, ptr %out
+ ret void
+}
+
+define void @scalable_vector_to_i32_unused_low_i8(ptr %in, ptr %out, ptr %p) #0 {
+; CHECK-LABEL: scalable_vector_to_i32_unused_low_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: mov w9, v0.s[2]
+; CHECK-NEXT: mov w10, v0.s[3]
+; CHECK-NEXT: lsl w8, w8, #8
+; CHECK-NEXT: orr w8, w8, w9, lsl #16
+; CHECK-NEXT: orr w8, w8, w10, lsl #24
+; CHECK-NEXT: str w8, [x1]
+; CHECK-NEXT: ret
+ %ld = load <vscale x 4 x i8>, ptr %in, align 4
+
+ %e2 = extractelement <vscale x 4 x i8> %ld, i32 1
+ %e3 = extractelement <vscale x 4 x i8> %ld, i32 2
+ %e4 = extractelement <vscale x 4 x i8> %ld, i32 3
+
+ %z1 = zext i8 %e2 to i32
+ %z2 = zext i8 %e3 to i32
+ %z3 = zext i8 %e4 to i32
+
+ %s1 = shl nuw nsw i32 %z1, 8
+ %s2 = shl nuw nsw i32 %z2, 16
+ %s3 = shl nuw i32 %z3, 24
+
+ %i2 = or i32 %s1, %s2
+ %i3 = or i32 %i2, %s3
+
+ store i32 %i3, ptr %out
+ ret void
+}
+
+define void @scalable_vector_to_i32_unused_high_i8(ptr %in, ptr %out, ptr %p) #0 {
+; CHECK-LABEL: scalable_vector_to_i32_unused_high_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: mov w9, v0.s[2]
+; CHECK-NEXT: orr w8, w10, w8, lsl #8
+; CHECK-NEXT: orr w8, w8, w9, lsl #16
+; CHECK-NEXT: str w8, [x1]
+; CHECK-NEXT: ret
+ %ld = load <vscale x 4 x i8>, ptr %in, align 4
+
+ %e1 = extractelement <vscale x 4 x i8> %ld, i32 0
+ %e2 = extractelement <vscale x 4 x i8> %ld, i32 1
+ %e3 = extractelement <vscale x 4 x i8> %ld, i32 2
+
+ %z0 = zext i8 %e1 to i32
+ %z1 = zext i8 %e2 to i32
+ %z2 = zext i8 %e3 to i32
+
+ %s1 = shl nuw nsw i32 %z1, 8
+ %s2 = shl nuw nsw i32 %z2, 16
+
+ %i1 = or i32 %s1, %z0
+ %i2 = or i32 %i1, %s2
+
+ store i32 %i2, ptr %out
+ ret void
+}
+
+define void @scalable_vector_to_i32_unused_low_i16(ptr %in, ptr %out, ptr %p) #0 {
+; CHECK-LABEL: scalable_vector_to_i32_unused_low_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: mov w8, v0.s[2]
+; CHECK-NEXT: mov w9, v0.s[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: orr w8, w8, w9, lsl #24
+; CHECK-NEXT: str w8, [x1]
+; CHECK-NEXT: ret
+ %ld = load <vscale x 4 x i8>, ptr %in, align 4
+
+ %e3 = extractelement <vscale x 4 x i8> %ld, i32 2
+ %e4 = extractelement <vscale x 4 x i8> %ld, i32 3
+
+ %z2 = zext i8 %e3 to i32
+ %z3 = zext i8 %e4 to i32
+
+ %s2 = shl nuw nsw i32 %z2, 16
+ %s3 = shl nuw i32 %z3, 24
+
+ %i3 = or i32 %s2, %s3
+
+ store i32 %i3, ptr %out
+ ret void
+}
+
+; x1 = x0[0:1]
+define void @scalable_vector_to_i32_unused_high_i16(ptr %in, ptr %out, ptr %p) #0 {
+; CHECK-LABEL: scalable_vector_to_i32_unused_high_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: orr w8, w9, w8, lsl #8
+; CHECK-NEXT: str w8, [x1]
+; CHECK-NEXT: ret
+ %ld = load <vscale x 4 x i8>, ptr %in, align 4
+
+ %e1 = extractelement <vscale x 4 x i8> %ld, i32 0
+ %e2 = extractelement <vscale x 4 x i8> %ld, i32 1
+
+ %z0 = zext i8 %e1 to i32
+ %z1 = zext i8 %e2 to i32
+
+ %s1 = shl nuw nsw i32 %z1, 8
+
+ %i1 = or i32 %s1, %z0
+
+ store i32 %i1, ptr %out
+ ret void
+}
+
+; x1 = x0
+define void @scalable_vector_to_i64(ptr %in, ptr %out, ptr %p) #0 {
+; CHECK-LABEL: scalable_vector_to_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w11, s0
+; CHECK-NEXT: mov w9, v0.s[2]
+; CHECK-NEXT: mov w10, v0.s[3]
+; CHECK-NEXT: and x11, x11, #0xff
+; CHECK-NEXT: bfi x11, x8, #8, #8
+; CHECK-NEXT: lsl w8, w10, #24
+; CHECK-NEXT: bfi x11, x9, #16, #8
+; CHECK-NEXT: orr x8, x11, x8
+; CHECK-NEXT: str x8, [x1]
+; CHECK-NEXT: ret
+ %ld = load <vscale x 4 x i8>, ptr %in, align 4
+
+ %e1 = extractelement <vscale x 4 x i8> %ld, i32 0
+ %e2 = extractelement <vscale x 4 x i8> %ld, i32 1
+ %e3 = extractelement <vscale x 4 x i8> %ld, i32 2
+ %e4 = extractelement <vscale x 4 x i8> %ld, i32 3
+
+ %z0 = zext i8 %e1 to i64
+ %z1 = zext i8 %e2 to i64
+ %z2 = zext i8 %e3 to i64
+ %z3 = zext i8 %e4 to i64
+
+ %s1 = shl nuw nsw i64 %z1, 8
+ %s2 = shl nuw nsw i64 %z2, 16
+ %s3 = shl nuw i64 %z3, 24
+
+ %i1 = or i64 %s1, %z0
+ %i2 = or i64 %i1, %s2
+ %i3 = or i64 %i2, %s3
+
+ store i64 %i3, ptr %out
+ ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
From e280b6bb27470275aa730cfa1979b80305d78e42 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Sun, 13 Jul 2025 12:41:18 +0100
Subject: [PATCH 2/2] [LLVM][DAGCombiner] Port calculateByteProvider to
TypeSize.
Fixes https://github.com/llvm/llvm-project/issues/148387
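The gist of the change is to keep the value size as a TypeSize instead of
forcing it into an unsigned, and to use the scalable-aware helpers for the
multiple-of-8 check and the index assertion. A standalone sketch of the
TypeSize semantics being relied on, assuming the usual LLVM headers (the
function name typeSizeSemantics is purely illustrative):

  #include "llvm/Support/TypeSize.h"
  #include <cassert>

  using llvm::TypeSize;

  void typeSizeSemantics() {
    // <vscale x 4 x i8> has a value size of "vscale x 32" bits.
    TypeSize BitWidth = TypeSize::getScalable(32);

    // A multiple is only "known" if it holds for every possible vscale.
    assert(BitWidth.isKnownMultipleOf(8));

    // Dividing the coefficient keeps the scalable flag: vscale x 4 bytes.
    TypeSize ByteWidth = BitWidth.divideCoefficientBy(8);

    // A fixed byte index is known to be in range when it is below the
    // minimum byte count, regardless of the runtime vscale.
    assert(TypeSize::isKnownLT(TypeSize::getFixed(3), ByteWidth));
  }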
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 13 +++----
llvm/test/CodeGen/AArch64/load-combine.ll | 36 ++++---------------
2 files changed, 13 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 231184587d682..c93f79bfc6dee 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9149,11 +9149,12 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
if (Op.getOpcode() != ISD::LOAD && VectorIndex.has_value())
return std::nullopt;
- unsigned BitWidth = Op.getValueSizeInBits();
- if (BitWidth % 8 != 0)
+ TypeSize BitWidth = Op.getValueSizeInBits();
+ if (!BitWidth.isKnownMultipleOf(8))
return std::nullopt;
- unsigned ByteWidth = BitWidth / 8;
- assert(Index < ByteWidth && "invalid index requested");
+ TypeSize ByteWidth = BitWidth.divideCoefficientBy(8);
+ assert(TypeSize::isKnownLT(TypeSize::getFixed(Index), ByteWidth) &&
+ "invalid index requested");
(void) ByteWidth;
switch (Op.getOpcode()) {
@@ -9220,7 +9221,7 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
VectorIndex = OffsetOp->getZExtValue();
SDValue NarrowOp = Op->getOperand(0);
- unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
+ uint64_t NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
if (NarrowBitWidth % 8 != 0)
return std::nullopt;
uint64_t NarrowByteWidth = NarrowBitWidth / 8;
@@ -9248,7 +9249,7 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
if (!L->isSimple() || L->isIndexed())
return std::nullopt;
- unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
+ uint64_t NarrowBitWidth = L->getMemoryVT().getScalarSizeInBits();
if (NarrowBitWidth % 8 != 0)
return std::nullopt;
uint64_t NarrowByteWidth = NarrowBitWidth / 8;
diff --git a/llvm/test/CodeGen/AArch64/load-combine.ll b/llvm/test/CodeGen/AArch64/load-combine.ll
index 4cf007318806c..0bf4fb1063025 100644
--- a/llvm/test/CodeGen/AArch64/load-combine.ll
+++ b/llvm/test/CodeGen/AArch64/load-combine.ll
@@ -718,15 +718,7 @@ define void @short_vector_to_i64(ptr %in, ptr %out, ptr %p) {
define void @scalable_vector_to_i32(ptr %in, ptr %out, ptr %p) #0 {
; CHECK-LABEL: scalable_vector_to_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w10, s0
-; CHECK-NEXT: mov w9, v0.s[2]
-; CHECK-NEXT: mov w11, v0.s[3]
-; CHECK-NEXT: orr w8, w10, w8, lsl #8
-; CHECK-NEXT: orr w8, w8, w9, lsl #16
-; CHECK-NEXT: orr w8, w8, w11, lsl #24
+; CHECK-NEXT: ldr w8, [x0]
; CHECK-NEXT: str w8, [x1]
; CHECK-NEXT: ret
%ld = load <vscale x 4 x i8>, ptr %in, align 4
@@ -791,12 +783,10 @@ define void @scalable_vector_to_i32_unused_high_i8(ptr %in, ptr %out, ptr %p) #0
; CHECK-LABEL: scalable_vector_to_i32_unused_high_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w10, s0
-; CHECK-NEXT: mov w9, v0.s[2]
-; CHECK-NEXT: orr w8, w10, w8, lsl #8
-; CHECK-NEXT: orr w8, w8, w9, lsl #16
+; CHECK-NEXT: mov w8, v0.s[2]
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: str w8, [x1]
; CHECK-NEXT: ret
%ld = load <vscale x 4 x i8>, ptr %in, align 4
@@ -851,11 +841,7 @@ define void @scalable_vector_to_i32_unused_low_i16(ptr %in, ptr %out, ptr %p) #0
define void @scalable_vector_to_i32_unused_high_i16(ptr %in, ptr %out, ptr %p) #0 {
; CHECK-LABEL: scalable_vector_to_i32_unused_high_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: orr w8, w9, w8, lsl #8
+; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: str w8, [x1]
; CHECK-NEXT: ret
%ld = load <vscale x 4 x i8>, ptr %in, align 4
@@ -878,17 +864,7 @@ define void @scalable_vector_to_i32_unused_high_i16(ptr %in, ptr %out, ptr %p) #
define void @scalable_vector_to_i64(ptr %in, ptr %out, ptr %p) #0 {
; CHECK-LABEL: scalable_vector_to_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w11, s0
-; CHECK-NEXT: mov w9, v0.s[2]
-; CHECK-NEXT: mov w10, v0.s[3]
-; CHECK-NEXT: and x11, x11, #0xff
-; CHECK-NEXT: bfi x11, x8, #8, #8
-; CHECK-NEXT: lsl w8, w10, #24
-; CHECK-NEXT: bfi x11, x9, #16, #8
-; CHECK-NEXT: orr x8, x11, x8
+; CHECK-NEXT: ldr w8, [x0]
; CHECK-NEXT: str x8, [x1]
; CHECK-NEXT: ret
%ld = load <vscale x 4 x i8>, ptr %in, align 4