[llvm-branch-commits] [llvm] [AArch64] Combine undef UZP and NVCAST away. (PR #204623)

Thu Jun 18 08:33:22 PDT 2026

https://github.com/gbossu created https://github.com/llvm/llvm-project/pull/204623

These are used to lower insert_subvec nodes quite early in SDAG. After DAG combines run, it's possible that the inputs to these AArch64 nodes become UNDEF.

>From 9e9f16ab37d68ae2f3551ddc0cd0489fe6241d1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= <gaetan.bossu at arm.com>
Date: Thu, 18 Jun 2026 15:10:59 +0000
Subject: [PATCH] [AArch64] Combine undef UZP and NVCAST away.

These are used to lower insert_subvec nodes quite early in SDAG. After
DAG combines run, it's possible that the inputs to these AArch64 nodes
become UNDEF.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 22 ++++++++++++++-----
 .../AArch64/sve-masked-gather-64b-unscaled.ll |  9 +++-----
 .../AArch64/sve-masked-gather-legalize.ll     |  3 +--
 .../test/CodeGen/AArch64/sve-masked-gather.ll |  9 +++-----
 4 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 67ef911117eff..e5d9a79ccbacc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -25641,6 +25641,12 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
   SDValue Op1 = N->getOperand(1);
   EVT ResVT = N->getValueType(0);
 
+  // UZP is used to lower insert_subvector quite early. When later DAG combines
+  // run, it's possible to actually end up with an insert of UNDEF into UNDEF,
+  // i.e. UZP1 UNDEF, UNDEF.
+  if (Op0.isUndef() && Op1.isUndef())
+    return DAG.getUNDEF(ResVT);
+
   // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
   if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
       Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
@@ -28710,11 +28716,17 @@ static SDValue performDUPCombine(SDNode *N,
 
 /// Get rid of unnecessary NVCASTs (that don't change the type).
 static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
-  if (N->getValueType(0) == N->getOperand(0).getValueType())
-    return N->getOperand(0);
-  if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
-    return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
-                       N->getOperand(0).getOperand(0));
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+
+  if (Op.isUndef())
+    return DAG.getUNDEF(VT);
+
+  if (VT == Op.getValueType())
+    return Op;
+
+  if (Op.getOpcode() == AArch64ISD::NVCAST)
+    return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), VT, Op.getOperand(0));
 
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
index b6552968a97f3..03e4d9361076d 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
@@ -134,10 +134,8 @@ define <vscale x 16 x i8> @masked_gather_nxv1i8(ptr %base, <vscale x 2 x i64> %w
 ; CHECK-NEXT:    uzp1 p0.d, p0.d, p1.d
 ; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    ret
   %offsets = call <vscale x 1 x i64> @llvm.vector.extract.nxv1i64.nxv2i64(
       <vscale x 2 x i64> %wide.offsets, i64 0)
@@ -156,8 +154,7 @@ define <vscale x 8 x i16> @masked_gather_nxv1i16(ptr %base, <vscale x 2 x i64> %
 ; CHECK-NEXT:    uzp1 p0.d, p0.d, p1.d
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ret
   %offsets = call <vscale x 1 x i64> @llvm.vector.extract.nxv1i64.nxv2i64(
       <vscale x 2 x i64> %wide.offsets, i64 0)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
index dfe1ba0a0f05c..ff3cdddce5dda 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
@@ -210,8 +210,7 @@ define <vscale x 16 x i8> @masked_gather_nxv16i8_undef_hi_mask(ptr %base, <vscal
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z1.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i8, ptr %base, <vscale x 16 x i8> %indices
   %mask.false.hi = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> splat (i1 false), <vscale x 4 x i1> %mask, i64 0)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather.ll
index 77329371c8e2c..fa671e9d5df9e 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather.ll
@@ -103,10 +103,8 @@ define <vscale x 16 x i8> @masked_gather_nxv1i8(<vscale x 2 x ptr> %wide.ptrs, <
 ; CHECK-NEXT:    uzp1 p0.d, p0.d, p1.d
 ; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    ret
   %ptrs = call <vscale x 1 x ptr> @llvm.vector.extract.nxv1p0.nxv2p0(
       <vscale x 2 x ptr> %wide.ptrs, i64 0)
@@ -124,8 +122,7 @@ define <vscale x 8 x i16> @masked_gather_nxv1i16(<vscale x 2 x ptr> %wide.ptrs,
 ; CHECK-NEXT:    uzp1 p0.d, p0.d, p1.d
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ret
   %ptrs = call <vscale x 1 x ptr> @llvm.vector.extract.nxv1p0.nxv2p0(
       <vscale x 2 x ptr> %wide.ptrs, i64 0)