[llvm] [RISCV] Add packw+packh isel pattern for unaligned loads on RV64. (PR #152159)

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 5 11:37:54 PDT 2025


https://github.com/topperc updated https://github.com/llvm/llvm-project/pull/152159

>From c579d41876651d2e09ad0d5a94db7575b42e485c Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Tue, 5 Aug 2025 10:45:48 -0700
Subject: [PATCH 1/2] Pre-commit tests.

---
 llvm/test/CodeGen/RISCV/rv64zbkb.ll | 126 ++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
index 818ea723ca2e1..6584588e68f13 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
@@ -392,3 +392,129 @@ define i64 @zext_i16_to_i64(i16 %a) nounwind {
   %1 = zext i16 %a to i64
   ret i64 %1
 }
+
+define void @pack_lo_packh_hi_packh(i8 zeroext %0, i8 zeroext %1, i8 zeroext %2, i8 zeroext %3, ptr %p) nounwind {
+; RV64I-LABEL: pack_lo_packh_hi_packh:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    slli a2, a2, 16
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    or a2, a2, a3
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    sw a0, 0(a4)
+; RV64I-NEXT:    ret
+;
+; RV64ZBKB-LABEL: pack_lo_packh_hi_packh:
+; RV64ZBKB:       # %bb.0:
+; RV64ZBKB-NEXT:    slli a3, a3, 24
+; RV64ZBKB-NEXT:    packh a0, a0, a1
+; RV64ZBKB-NEXT:    packw a0, a0, a2
+; RV64ZBKB-NEXT:    or a0, a0, a3
+; RV64ZBKB-NEXT:    sw a0, 0(a4)
+; RV64ZBKB-NEXT:    ret
+  %a = zext i8 %0 to i32
+  %b = zext i8 %1 to i32
+  %c = zext i8 %2 to i32
+  %d = zext i8 %3 to i32
+  %e = shl i32 %b, 8
+  %f = shl i32 %c, 16
+  %g = shl i32 %d, 24
+  %h = or i32 %a, %e
+  %i = or i32 %h, %f
+  %j = or i32 %i, %g
+  store i32 %j, ptr %p
+  ret void
+}
+
+define void @pack_lo_packh_hi_packh_2(i8 zeroext %0, i8 zeroext %1, i8 zeroext %2, i8 zeroext %3, ptr %p) nounwind {
+; RV64I-LABEL: pack_lo_packh_hi_packh_2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    slli a2, a2, 16
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    or a2, a2, a3
+; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    sw a0, 0(a4)
+; RV64I-NEXT:    ret
+;
+; RV64ZBKB-LABEL: pack_lo_packh_hi_packh_2:
+; RV64ZBKB:       # %bb.0:
+; RV64ZBKB-NEXT:    slli a2, a2, 16
+; RV64ZBKB-NEXT:    slli a3, a3, 24
+; RV64ZBKB-NEXT:    packh a0, a0, a1
+; RV64ZBKB-NEXT:    or a0, a3, a0
+; RV64ZBKB-NEXT:    or a0, a2, a0
+; RV64ZBKB-NEXT:    sw a0, 0(a4)
+; RV64ZBKB-NEXT:    ret
+  %a = zext i8 %0 to i32
+  %b = zext i8 %1 to i32
+  %c = zext i8 %2 to i32
+  %d = zext i8 %3 to i32
+  %e = shl i32 %b, 8
+  %f = shl i32 %c, 16
+  %g = shl i32 %d, 24
+  %h = or i32 %a, %e
+  %i = or i32 %g, %h
+  %j = or i32 %f, %i
+  store i32 %j, ptr %p
+  ret void
+}
+
+define void @pack_lo_zext_hi_packh(i16 zeroext %0, i8 zeroext %1, i8 zeroext %2, ptr %p) nounwind {
+; RV64I-LABEL: pack_lo_zext_hi_packh:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a2, 16
+; RV64I-NEXT:    slli a2, a2, 24
+; RV64I-NEXT:    or a1, a2, a1
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    sw a0, 0(a3)
+; RV64I-NEXT:    ret
+;
+; RV64ZBKB-LABEL: pack_lo_zext_hi_packh:
+; RV64ZBKB:       # %bb.0:
+; RV64ZBKB-NEXT:    packh a1, a2, a2
+; RV64ZBKB-NEXT:    slli a1, a1, 16
+; RV64ZBKB-NEXT:    or a0, a1, a0
+; RV64ZBKB-NEXT:    sw a0, 0(a3)
+; RV64ZBKB-NEXT:    ret
+  %a = zext i16 %0 to i32
+  %b = zext i8 %1 to i32
+  %c = zext i8 %2 to i32
+  %d = shl i32 %c, 8
+  %e = or i32 %c, %d
+  %f = shl i32 %e, 16
+  %g = or i32 %f, %a
+  store i32 %g, ptr %p
+  ret void
+}
+
+; Negative test: %a isn't zero extended, so we can't use packw for the outer
+; or, but we can still use packh for the high half.
+define void @pack_lo_noext_hi_packh(i32 %a, i8 zeroext %1, i8 zeroext %2, ptr %p) nounwind {
+; RV64I-LABEL: pack_lo_noext_hi_packh:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a2, 16
+; RV64I-NEXT:    slli a2, a2, 24
+; RV64I-NEXT:    or a1, a2, a1
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    sw a0, 0(a3)
+; RV64I-NEXT:    ret
+;
+; RV64ZBKB-LABEL: pack_lo_noext_hi_packh:
+; RV64ZBKB:       # %bb.0:
+; RV64ZBKB-NEXT:    packh a1, a2, a2
+; RV64ZBKB-NEXT:    slli a1, a1, 16
+; RV64ZBKB-NEXT:    or a0, a1, a0
+; RV64ZBKB-NEXT:    sw a0, 0(a3)
+; RV64ZBKB-NEXT:    ret
+  %b = zext i8 %1 to i32
+  %c = zext i8 %2 to i32
+  %d = shl i32 %c, 8
+  %e = or i32 %c, %d
+  %f = shl i32 %e, 16
+  %g = or i32 %f, %a
+  store i32 %g, ptr %p
+  ret void
+}

>From cd8f2b4ab1e7deb3d1aeea9d101bd9674f3f5147 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Mon, 4 Aug 2025 23:40:11 -0700
Subject: [PATCH 2/2] [RISCV] Add packw+packh isel pattern for unaligned loads
 on RV64.

This is similar to an existing RV32 pattern, with the simplification
proposed by #152045. Instead of pack we need to use packw, and we need
to know that the upper 32 bits are ignored, since packw sign extends
from bit 31.
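
For illustration only (the function below is not from the patch), here is a
minimal IR sketch of the kind of DAG these patterns target: an unaligned i32
load expanded into four byte loads on RV64, with the result stored so the
upper 32 bits of the or tree are ignored. With the new patterns, each pair of
adjacent bytes becomes a packh and a single packw merges the two halves,
instead of shifts and ors.

  define void @load_unaligned_i32(ptr %p, ptr %q) {
    %p1 = getelementptr i8, ptr %p, i64 1
    %p2 = getelementptr i8, ptr %p, i64 2
    %p3 = getelementptr i8, ptr %p, i64 3
    %b0 = load i8, ptr %p              ; bits [7:0]
    %b1 = load i8, ptr %p1             ; bits [15:8]
    %b2 = load i8, ptr %p2             ; bits [23:16]
    %b3 = load i8, ptr %p3             ; bits [31:24]
    %z0 = zext i8 %b0 to i32
    %z1 = zext i8 %b1 to i32
    %z2 = zext i8 %b2 to i32
    %z3 = zext i8 %b3 to i32
    %s1 = shl i32 %z1, 8
    %s2 = shl i32 %z2, 16
    %s3 = shl i32 %z3, 24
    %lo = or i32 %z0, %s1              ; packh candidate for bits [15:0]
    %hi = or i32 %s2, %s3              ; packh candidate for bits [31:16]
    %r = or i32 %lo, %hi               ; packw candidate for the final merge
    store i32 %r, ptr %q
    ret void
  }
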
---
 llvm/lib/Target/RISCV/RISCVInstrInfoZb.td     | 21 +++++++++++++++++++
 llvm/test/CodeGen/RISCV/rv64zbkb.ll           | 14 +++++--------
 .../CodeGen/RISCV/unaligned-load-store.ll     | 20 ++++++++----------
 3 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index d2a651444169c..a37dcde68aa38 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -661,6 +661,27 @@ def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (i64 16)),
 def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32),
                    (zexti16 (i64 GPR:$rs1)))),
           (PACKW GPR:$rs1, GPR:$rs2)>;
+
+// Match a pattern of 2 bytes being inserted into bits [31:16], with bits
+// [15:0] coming from a zero extended value and bits [63:32] being ignored.
+// We can use packh to form bits [31:16] and packw to merge in bits [15:0].
+// If bits [15:0] can also be a packh, it can be matched separately.
+def : Pat<(binop_allwusers<or>
+               (or (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24)),
+                   (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
+               (zexti16 (XLenVT GPR:$rs1))),
+          (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+// We need to manually reassociate the patterns because of the binop_allwusers.
+def : Pat<(binop_allwusers<or>
+               (or (zexti16 (XLenVT GPR:$rs1)),
+                   (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
+               (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24))),
+          (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+def : Pat<(binop_allwusers<or>
+               (or (zexti16 (XLenVT GPR:$rs1)),
+                   (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 24))),
+               (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))),
+          (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
 } // Predicates = [HasStdExtZbkb, IsRV64]
 
 let Predicates = [HasStdExtZbb, IsRV32] in
diff --git a/llvm/test/CodeGen/RISCV/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
index 6584588e68f13..37c9eaea6f70b 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
@@ -407,10 +407,9 @@ define void @pack_lo_packh_hi_packh(i8 zeroext %0, i8 zeroext %1, i8 zeroext %2,
 ;
 ; RV64ZBKB-LABEL: pack_lo_packh_hi_packh:
 ; RV64ZBKB:       # %bb.0:
-; RV64ZBKB-NEXT:    slli a3, a3, 24
 ; RV64ZBKB-NEXT:    packh a0, a0, a1
-; RV64ZBKB-NEXT:    packw a0, a0, a2
-; RV64ZBKB-NEXT:    or a0, a0, a3
+; RV64ZBKB-NEXT:    packh a1, a2, a3
+; RV64ZBKB-NEXT:    packw a0, a0, a1
 ; RV64ZBKB-NEXT:    sw a0, 0(a4)
 ; RV64ZBKB-NEXT:    ret
   %a = zext i8 %0 to i32
@@ -441,11 +440,9 @@ define void @pack_lo_packh_hi_packh_2(i8 zeroext %0, i8 zeroext %1, i8 zeroext %
 ;
 ; RV64ZBKB-LABEL: pack_lo_packh_hi_packh_2:
 ; RV64ZBKB:       # %bb.0:
-; RV64ZBKB-NEXT:    slli a2, a2, 16
-; RV64ZBKB-NEXT:    slli a3, a3, 24
 ; RV64ZBKB-NEXT:    packh a0, a0, a1
-; RV64ZBKB-NEXT:    or a0, a3, a0
-; RV64ZBKB-NEXT:    or a0, a2, a0
+; RV64ZBKB-NEXT:    packh a1, a3, a2
+; RV64ZBKB-NEXT:    packw a0, a0, a1
 ; RV64ZBKB-NEXT:    sw a0, 0(a4)
 ; RV64ZBKB-NEXT:    ret
   %a = zext i8 %0 to i32
@@ -475,8 +472,7 @@ define void @pack_lo_zext_hi_packh(i16 zeroext %0, i8 zeroext %1, i8 zeroext %2,
 ; RV64ZBKB-LABEL: pack_lo_zext_hi_packh:
 ; RV64ZBKB:       # %bb.0:
 ; RV64ZBKB-NEXT:    packh a1, a2, a2
-; RV64ZBKB-NEXT:    slli a1, a1, 16
-; RV64ZBKB-NEXT:    or a0, a1, a0
+; RV64ZBKB-NEXT:    packw a0, a0, a1
 ; RV64ZBKB-NEXT:    sw a0, 0(a3)
 ; RV64ZBKB-NEXT:    ret
   %a = zext i16 %0 to i32
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index c9c49e8f7f532..cb046cdaae75c 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -204,18 +204,16 @@ define i64 @load_i64(ptr %p) {
 ; RV64IZBKB-NEXT:    lbu a2, 5(a0)
 ; RV64IZBKB-NEXT:    lbu a3, 6(a0)
 ; RV64IZBKB-NEXT:    lbu a4, 7(a0)
-; RV64IZBKB-NEXT:    lbu a5, 0(a0)
-; RV64IZBKB-NEXT:    lbu a6, 1(a0)
-; RV64IZBKB-NEXT:    lbu a7, 2(a0)
-; RV64IZBKB-NEXT:    lbu a0, 3(a0)
+; RV64IZBKB-NEXT:    lbu a5, 1(a0)
+; RV64IZBKB-NEXT:    lbu a6, 2(a0)
+; RV64IZBKB-NEXT:    lbu a7, 3(a0)
+; RV64IZBKB-NEXT:    lbu a0, 0(a0)
+; RV64IZBKB-NEXT:    packh a3, a3, a4
 ; RV64IZBKB-NEXT:    packh a1, a1, a2
-; RV64IZBKB-NEXT:    packh a2, a3, a4
-; RV64IZBKB-NEXT:    packh a3, a5, a6
-; RV64IZBKB-NEXT:    packh a0, a7, a0
-; RV64IZBKB-NEXT:    slli a2, a2, 16
-; RV64IZBKB-NEXT:    slli a0, a0, 16
-; RV64IZBKB-NEXT:    or a1, a2, a1
-; RV64IZBKB-NEXT:    or a0, a0, a3
+; RV64IZBKB-NEXT:    packh a2, a6, a7
+; RV64IZBKB-NEXT:    packh a0, a0, a5
+; RV64IZBKB-NEXT:    packw a1, a1, a3
+; RV64IZBKB-NEXT:    packw a0, a0, a2
 ; RV64IZBKB-NEXT:    pack a0, a0, a1
 ; RV64IZBKB-NEXT:    ret
 ;
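
A side note on the binop_allwusers requirement in the new patterns: packw
writes sign_extend(result[31:0]) to its destination register, so the match is
only safe when no user of the or looks at bits [63:32]. A rough IR-level model
of packw's semantics (illustrative only, not part of the patch):

  define i64 @packw_model(i64 %rs1, i64 %rs2) {
    %lo = and i64 %rs1, 65535          ; keep rs1[15:0]
    %hi = shl i64 %rs2, 16             ; move rs2 up by 16
    %merged = or i64 %hi, %lo
    %low32 = trunc i64 %merged to i32  ; result[31:16] = rs2[15:0], result[15:0] = rs1[15:0]
    %res = sext i32 %low32 to i64      ; packw sign extends from bit 31
    ret i64 %res
  }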


