[llvm] [DAG] FoldConstantArithmetic - allow binop folding to work with differing bitcasted constants (PR #94863)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 8 09:55:29 PDT 2024
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/94863
We currently only constant fold binop(bitcast(c1),bitcast(c2)) if c1 and c2 have both been bitcast from the same type.
This patch relaxes this assumption to allow the constant build vector to originate from different types (and allow cases where only one operand was bitcasted).
We still ensure we bitcast back to one of the original types if both operands were bitcasted (we assume that if we have a non-bitcasted constant then it's legal to keep using that type).
From 09aad160ff594c8772829f23c5d3dfca2b4b2c6d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Sat, 8 Jun 2024 17:52:39 +0100
Subject: [PATCH] [DAG] FoldConstantArithmetic - allow binop folding to work
with differing bitcasted constants
We currently only constant fold binop(bitcast(c1),bitcast(c2)) if c1 and c2 have both been bitcast from the same type.
This patch relaxes this assumption to allow the constant build vector to originate from different types (and allow cases where only one operand was bitcasted).
We still ensure we bitcast back to one of the original types if both operands were bitcasted (we assume that if we have a non-bitcasted constant then it's legal to keep using that type).
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 25 ++++++++++++-------
llvm/test/CodeGen/ARM/vector-store.ll | 17 ++++++-------
llvm/test/CodeGen/X86/vshift-6.ll | 4 +--
3 files changed, 25 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index ddb8a0ee8179f..855cbd46042b7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6551,17 +6551,17 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
ElementCount NumElts = VT.getVectorElementCount();
- // See if we can fold through bitcasted integer ops.
+ // See if we can fold through any bitcasted integer ops.
if (NumOps == 2 && VT.isFixedLengthVector() && VT.isInteger() &&
Ops[0].getValueType() == VT && Ops[1].getValueType() == VT &&
- Ops[0].getOpcode() == ISD::BITCAST &&
- Ops[1].getOpcode() == ISD::BITCAST) {
+ (Ops[0].getOpcode() == ISD::BITCAST ||
+ Ops[1].getOpcode() == ISD::BITCAST)) {
SDValue N1 = peekThroughBitcasts(Ops[0]);
SDValue N2 = peekThroughBitcasts(Ops[1]);
auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
auto *BV2 = dyn_cast<BuildVectorSDNode>(N2);
- EVT BVVT = N1.getValueType();
- if (BV1 && BV2 && BVVT.isInteger() && BVVT == N2.getValueType()) {
+ if (BV1 && BV2 && N1.getValueType().isInteger() &&
+ N2.getValueType().isInteger()) {
bool IsLE = getDataLayout().isLittleEndian();
unsigned EltBits = VT.getScalarSizeInBits();
SmallVector<APInt> RawBits1, RawBits2;
@@ -6577,15 +6577,22 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
RawBits.push_back(*Fold);
}
if (RawBits.size() == NumElts.getFixedValue()) {
- // We have constant folded, but we need to cast this again back to
- // the original (possibly legalized) type.
+ // We have constant folded, but we might need to cast this again back
+ // to the original (possibly legalized) type.
+ EVT BVVT, BVEltVT;
+ if (N1.getValueType() == VT) {
+ BVVT = N1.getValueType();
+ BVEltVT = BV1->getOperand(0).getValueType();
+ } else {
+ BVVT = N2.getValueType();
+ BVEltVT = BV2->getOperand(0).getValueType();
+ }
+ unsigned BVEltBits = BVEltVT.getSizeInBits();
SmallVector<APInt> DstBits;
BitVector DstUndefs;
BuildVectorSDNode::recastRawBits(IsLE, BVVT.getScalarSizeInBits(),
DstBits, RawBits, DstUndefs,
BitVector(RawBits.size(), false));
- EVT BVEltVT = BV1->getOperand(0).getValueType();
- unsigned BVEltBits = BVEltVT.getSizeInBits();
SmallVector<SDValue> Ops(DstBits.size(), getUNDEF(BVEltVT));
for (unsigned I = 0, E = DstBits.size(); I != E; ++I) {
if (DstUndefs[I])
diff --git a/llvm/test/CodeGen/ARM/vector-store.ll b/llvm/test/CodeGen/ARM/vector-store.ll
index a8a1031637afc..9c8ea7a2c440c 100644
--- a/llvm/test/CodeGen/ARM/vector-store.ll
+++ b/llvm/test/CodeGen/ARM/vector-store.ll
@@ -403,17 +403,14 @@ define void @v3i8store(ptr %p) {
; CHECK-LABEL: v3i8store:
; CHECK: @ %bb.0:
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: vmov.i32 d16, #0xff
-; CHECK-NEXT: mov r1, sp
-; CHECK-NEXT: vmov.i32 d17, #0x0
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: vand d16, d17, d16
-; CHECK-NEXT: vst1.32 {d16[0]}, [r1:32]
-; CHECK-NEXT: vld1.32 {d16[0]}, [r1:32]
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: mov r2, sp
+; CHECK-NEXT: str r1, [sp]
+; CHECK-NEXT: vld1.32 {d16[0]}, [r2:32]
+; CHECK-NEXT: strb r1, [r0, #2]
; CHECK-NEXT: vmovl.u16 q8, d16
-; CHECK-NEXT: strb r2, [r0, #2]
-; CHECK-NEXT: vmov.32 r1, d16[0]
-; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov.32 r2, d16[0]
+; CHECK-NEXT: strh r2, [r0]
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
store <3 x i8> zeroinitializer, ptr %p, align 4
diff --git a/llvm/test/CodeGen/X86/vshift-6.ll b/llvm/test/CodeGen/X86/vshift-6.ll
index 21c8e8d3ee5d2..912ff750d9e91 100644
--- a/llvm/test/CodeGen/X86/vshift-6.ll
+++ b/llvm/test/CodeGen/X86/vshift-6.ll
@@ -32,9 +32,9 @@ define <16 x i8> @do_not_crash(ptr, ptr, ptr, i32, i64, i8) {
; X86-NEXT: movb %al, (%ecx)
; X86-NEXT: movd %eax, %xmm1
; X86-NEXT: psllq $56, %xmm1
-; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-NEXT: pcmpeqd %xmm3, %xmm3
; X86-NEXT: psllw $5, %xmm1
+; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-NEXT: pxor %xmm2, %xmm2
; X86-NEXT: pxor %xmm0, %xmm0
; X86-NEXT: pcmpgtb %xmm1, %xmm0
@@ -64,9 +64,9 @@ define <16 x i8> @do_not_crash(ptr, ptr, ptr, i32, i64, i8) {
; X64-NEXT: movb %r9b, (%rdi)
; X64-NEXT: movd %r9d, %xmm1
; X64-NEXT: psllq $56, %xmm1
-; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT: pcmpeqd %xmm2, %xmm2
; X64-NEXT: psllw $5, %xmm1
+; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT: pxor %xmm3, %xmm3
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pcmpgtb %xmm1, %xmm0
More information about the llvm-commits
mailing list