[llvm] [DAG] FoldConstantArithmetic - allow binop folding to work with differing bitcasted constants (PR #94863)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 9 02:55:26 PDT 2024
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/94863
>From 205a10cbe23b342719829b9a0218a9f03a96c51e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Sat, 8 Jun 2024 17:52:39 +0100
Subject: [PATCH] [DAG] FoldConstantArithmetic - allow binop folding to work
with differing bitcasted constants
We currently only constant fold binop(bitcast(c1),bitcast(c2)) if both c1 and c2 are bitcast from the same type.
This patch relaxes that requirement, allowing the constant build vectors to originate from different types (and allowing cases where only one operand is bitcast).
We still make sure we bitcast back to one of the original types if both operands were bitcast (if one operand was not bitcast, we assume its type is legal and keep using it).
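As a rough illustration (hypothetical IR, not taken from this patch or its tests), the fold now applies to a pattern of the following shape, where only one operand reaches the DAG as a bitcast of a differently typed constant build vector:

define <2 x i32> @fold_mixed_constants() {
  ; <8 x i8> constant bitcast to the binop type <2 x i32>
  %c = bitcast <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8> to <2 x i32>
  ; AND of the bitcast constant with a plain <2 x i32> constant can now be
  ; constant folded even though the build vectors use different element types
  %r = and <2 x i32> %c, <i32 255, i32 255>
  ret <2 x i32> %r
}

(Whether such IR survives unsimplified to ISel depends on earlier passes; the snippet is only meant to show the shape of the operands involved.)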
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 25 ++++++++++++-------
llvm/test/CodeGen/ARM/vector-store.ll | 17 ++++++-------
llvm/test/CodeGen/X86/vshift-6.ll | 4 +--
3 files changed, 25 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index ddb8a0ee8179f..523d3aea66225 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6551,17 +6551,17 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
ElementCount NumElts = VT.getVectorElementCount();
- // See if we can fold through bitcasted integer ops.
+ // See if we can fold through any bitcasted integer ops.
if (NumOps == 2 && VT.isFixedLengthVector() && VT.isInteger() &&
Ops[0].getValueType() == VT && Ops[1].getValueType() == VT &&
- Ops[0].getOpcode() == ISD::BITCAST &&
- Ops[1].getOpcode() == ISD::BITCAST) {
+ (Ops[0].getOpcode() == ISD::BITCAST ||
+ Ops[1].getOpcode() == ISD::BITCAST)) {
SDValue N1 = peekThroughBitcasts(Ops[0]);
SDValue N2 = peekThroughBitcasts(Ops[1]);
auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
auto *BV2 = dyn_cast<BuildVectorSDNode>(N2);
- EVT BVVT = N1.getValueType();
- if (BV1 && BV2 && BVVT.isInteger() && BVVT == N2.getValueType()) {
+ if (BV1 && BV2 && N1.getValueType().isInteger() &&
+ N2.getValueType().isInteger()) {
bool IsLE = getDataLayout().isLittleEndian();
unsigned EltBits = VT.getScalarSizeInBits();
SmallVector<APInt> RawBits1, RawBits2;
@@ -6577,15 +6577,22 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
RawBits.push_back(*Fold);
}
if (RawBits.size() == NumElts.getFixedValue()) {
- // We have constant folded, but we need to cast this again back to
- // the original (possibly legalized) type.
+ // We have constant folded, but we might need to cast this again back
+ // to the original (possibly legalized) type.
+ EVT BVVT, BVEltVT;
+ if (N1.getValueType() == VT) {
+ BVVT = N1.getValueType();
+ BVEltVT = BV1->getOperand(0).getValueType();
+ } else {
+ BVVT = N2.getValueType();
+ BVEltVT = BV2->getOperand(0).getValueType();
+ }
+ unsigned BVEltBits = BVEltVT.getSizeInBits();
SmallVector<APInt> DstBits;
BitVector DstUndefs;
BuildVectorSDNode::recastRawBits(IsLE, BVVT.getScalarSizeInBits(),
DstBits, RawBits, DstUndefs,
BitVector(RawBits.size(), false));
- EVT BVEltVT = BV1->getOperand(0).getValueType();
- unsigned BVEltBits = BVEltVT.getSizeInBits();
SmallVector<SDValue> Ops(DstBits.size(), getUNDEF(BVEltVT));
for (unsigned I = 0, E = DstBits.size(); I != E; ++I) {
if (DstUndefs[I])
diff --git a/llvm/test/CodeGen/ARM/vector-store.ll b/llvm/test/CodeGen/ARM/vector-store.ll
index 2f27786a0bf93..a0a801d2b6d2c 100644
--- a/llvm/test/CodeGen/ARM/vector-store.ll
+++ b/llvm/test/CodeGen/ARM/vector-store.ll
@@ -617,17 +617,14 @@ define void @v3i8store(ptr %p) {
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: .pad #4
; CHECK-LE-NEXT: sub sp, #4
-; CHECK-LE-NEXT: vmov.i32 d16, #0xff
-; CHECK-LE-NEXT: mov r1, sp
-; CHECK-LE-NEXT: vmov.i32 d17, #0x0
-; CHECK-LE-NEXT: movs r2, #0
-; CHECK-LE-NEXT: vand d16, d17, d16
-; CHECK-LE-NEXT: vst1.32 {d16[0]}, [r1:32]
-; CHECK-LE-NEXT: vld1.32 {d16[0]}, [r1:32]
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: mov r2, sp
+; CHECK-LE-NEXT: str r1, [sp]
+; CHECK-LE-NEXT: vld1.32 {d16[0]}, [r2:32]
+; CHECK-LE-NEXT: strb r1, [r0, #2]
; CHECK-LE-NEXT: vmovl.u16 q8, d16
-; CHECK-LE-NEXT: strb r2, [r0, #2]
-; CHECK-LE-NEXT: vmov.32 r1, d16[0]
-; CHECK-LE-NEXT: strh r1, [r0]
+; CHECK-LE-NEXT: vmov.32 r2, d16[0]
+; CHECK-LE-NEXT: strh r2, [r0]
; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
diff --git a/llvm/test/CodeGen/X86/vshift-6.ll b/llvm/test/CodeGen/X86/vshift-6.ll
index 21c8e8d3ee5d2..912ff750d9e91 100644
--- a/llvm/test/CodeGen/X86/vshift-6.ll
+++ b/llvm/test/CodeGen/X86/vshift-6.ll
@@ -32,9 +32,9 @@ define <16 x i8> @do_not_crash(ptr, ptr, ptr, i32, i64, i8) {
; X86-NEXT: movb %al, (%ecx)
; X86-NEXT: movd %eax, %xmm1
; X86-NEXT: psllq $56, %xmm1
-; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-NEXT: pcmpeqd %xmm3, %xmm3
; X86-NEXT: psllw $5, %xmm1
+; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-NEXT: pxor %xmm2, %xmm2
; X86-NEXT: pxor %xmm0, %xmm0
; X86-NEXT: pcmpgtb %xmm1, %xmm0
@@ -64,9 +64,9 @@ define <16 x i8> @do_not_crash(ptr, ptr, ptr, i32, i64, i8) {
; X64-NEXT: movb %r9b, (%rdi)
; X64-NEXT: movd %r9d, %xmm1
; X64-NEXT: psllq $56, %xmm1
-; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT: pcmpeqd %xmm2, %xmm2
; X64-NEXT: psllw $5, %xmm1
+; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT: pxor %xmm3, %xmm3
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pcmpgtb %xmm1, %xmm0