[llvm-branch-commits] [llvm] release/22.x: [CodeGen] Preserve big-endian trunc in concat_vectors (#190701) (PR #190823)

Tue Apr 7 10:08:58 PDT 2026

https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/190823

Backport 5df89ae3da8b24804c17479ce74a930783db045e

Requested by: @tuliom

>From ee3c9688bf4439f7f410ae87b352fe1c3c358b9e Mon Sep 17 00:00:00 2001
From: Josh Stone <jistone at redhat.com>
Date: Tue, 7 Apr 2026 09:57:57 -0700
Subject: [PATCH] [CodeGen] Preserve big-endian trunc in concat_vectors
 (#190701)

A transform from `concat_vectors(trunc(scalar), undef)` to
`scalar_to_vector(scalar)` is only equivalent on little-endian targets.
On big-endian targets, the wider scalar's extra upper bytes would land
ahead of the desired truncated bytes. This problem was seen on Rust
s390x in [RHEL-147748].

[RHEL-147748]: https://redhat.atlassian.net/browse/RHEL-147748

Assisted-by: Claude Code
(cherry picked from commit 5df89ae3da8b24804c17479ce74a930783db045e)
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  4 +-
 llvm/test/CodeGen/SystemZ/vec-trunc-to-i16.ll | 45 +++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/SystemZ/vec-trunc-to-i16.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2f8fe09c3dc98..3fdb9bf7e5171 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -26068,9 +26068,11 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
       // If the bitcast type isn't legal, it might be a trunc of a legal type;
       // look through the trunc so we can still do the transform:
       //   concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar)
+      // However, this is only equivalent on little-endian targets.
       if (Scalar->getOpcode() == ISD::TRUNCATE &&
           !TLI.isTypeLegal(Scalar.getValueType()) &&
-          TLI.isTypeLegal(Scalar->getOperand(0).getValueType()))
+          TLI.isTypeLegal(Scalar->getOperand(0).getValueType()) &&
+          DAG.getDataLayout().isLittleEndian())
         Scalar = Scalar->getOperand(0);
 
       EVT SclTy = Scalar.getValueType();
diff --git a/llvm/test/CodeGen/SystemZ/vec-trunc-to-i16.ll b/llvm/test/CodeGen/SystemZ/vec-trunc-to-i16.ll
new file mode 100644
index 0000000000000..42d787d945145
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-trunc-to-i16.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test that truncated scalars use the correct vector insert instruction.
+; On big-endian targets, concat_vectors should not skip truncates when
+; creating scalar_to_vector, as the extra upper bytes would come first.
+
+; This truncated i16 should use vlvgh (insert halfword), not vlvgf (insert fullword).
+define <16 x i8> @test_concat_trunc_i16(i32 %x) {
+; CHECK-LABEL: test_concat_trunc_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlvgh %v24, %r2, 0
+; CHECK-NEXT:    br %r14
+  %t = trunc i32 %x to i16
+  %vec = bitcast i16 %t to <2 x i8>
+  %result = shufflevector <2 x i8> %vec, <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i8> %result
+}
+
+; Test with a more complex shuffle pattern, reduced from a Rust bug report.
+define fastcc void @test_shuffle_with_trunc() {
+; CHECK-LABEL: test_shuffle_with_trunc:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lh %r1, 0
+; CHECK-NEXT:    l %r0, 0
+; CHECK-NEXT:    vlvgh %v1, %r1, 0
+; CHECK-NEXT:    larl %r1, .LCPI1_0
+; CHECK-NEXT:    vl %v2, 0(%r1), 3
+; CHECK-NEXT:    vlvgf %v0, %r0, 0
+; CHECK-NEXT:    vperm %v0, %v0, %v1, %v2
+; CHECK-NEXT:    vst %v0, 0, 3
+; CHECK-NEXT:    br %r14
+  %1 = load i32, ptr null, align 8
+  %2 = load i16, ptr null, align 1
+  br label %3
+
+3:
+  %4 = bitcast i32 %1 to <4 x i8>
+  %5 = shufflevector <4 x i8> %4, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = bitcast i16 %2 to <2 x i8>
+  %7 = shufflevector <2 x i8> %6, <2 x i8> zeroinitializer, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %8 = shufflevector <16 x i8> %5, <16 x i8> %7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  store <16 x i8> %8, ptr null, align 8
+  ret void
+}