[llvm] 259742a - [SelectionDAG] Use unaligned store/load to move AVX registers onto stack for `insertelement` (#82130)

Fri Aug 9 07:39:57 PDT 2024

Author: Manish Kausik H
Date: 2024-08-09T15:39:54+01:00
New Revision: 259742a88558325d76d9ec7e71e11a785e499af9

URL: https://github.com/llvm/llvm-project/commit/259742a88558325d76d9ec7e71e11a785e499af9
DIFF: https://github.com/llvm/llvm-project/commit/259742a88558325d76d9ec7e71e11a785e499af9.diff

LOG: [SelectionDAG] Use unaligned store/load to move AVX registers onto stack for `insertelement` (#82130)

Prior to this patch, SelectionDAG generated aligned moves onto the stack
for AVX registers even when the function was marked as a no-realign-stack
function. This led to a mismatch between the actual alignment of the stack
slot and the alignment assumed by the generated instruction. This patch
fixes the issue. A similar issue reported for `extractelement` was fixed in
a6614ec5b7c1dbfc4b847884c5de780cf75e8e9c.

Co-authored-by: Manish Kausik H <hmamishkausik at gmail.com>

Added: 
    llvm/test/CodeGen/X86/insert-into-vector-through-stack-no-stack-realign.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index bdb7917073020b..f2d40d28b3100c 100644

--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1474,11 +1474,17 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
 
   // First store the whole vector.
-  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo);
+  Align BaseVecAlignment =
+      DAG.getMachineFunction().getFrameInfo().getObjectAlign(FI);
+  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+                            BaseVecAlignment);
 
   // Freeze the index so we don't poison the clamping code we're about to emit.
   Idx = DAG.getFreeze(Idx);
 
+  Type *PartTy = PartVT.getTypeForEVT(*DAG.getContext());
+  Align PartAlignment = DAG.getDataLayout().getPrefTypeAlign(PartTy);
+
   // Then store the inserted part.
   if (PartVT.isVector()) {
     SDValue SubStackPtr =
@@ -1487,7 +1493,8 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
     // Store the subvector.
     Ch = DAG.getStore(
         Ch, dl, Part, SubStackPtr,
-        MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+        MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()),
+        PartAlignment);
   } else {
     SDValue SubStackPtr =
         TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
@@ -1496,11 +1503,15 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
     Ch = DAG.getTruncStore(
         Ch, dl, Part, SubStackPtr,
         MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()),
-        VecVT.getVectorElementType());
+        VecVT.getVectorElementType(), PartAlignment);
   }
 
+  assert(cast<StoreSDNode>(Ch)->getAlign() == PartAlignment &&
+         "ElementAlignment does not match!");
+
   // Finally, load the updated vector.
-  return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, PtrInfo);
+  return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, PtrInfo,
+                     BaseVecAlignment);
 }
 
 SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {

diff  --git a/llvm/test/CodeGen/X86/insert-into-vector-through-stack-no-stack-realign.ll b/llvm/test/CodeGen/X86/insert-into-vector-through-stack-no-stack-realign.ll
new file mode 100644
index 00000000000000..8006263762152d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/insert-into-vector-through-stack-no-stack-realign.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s
+
+define <8 x i32> @foo(<8 x i32> %arg1, i32 %n) #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    andl $7, %edi
+; CHECK-NEXT:    movl $42, -40(%rsp,%rdi,4)
+; CHECK-NEXT:    vmovups -{{[0-9]+}}(%rsp), %ymm0
+; CHECK-NEXT:    retq
+entry:
+  %a = insertelement <8 x i32> %arg1, i32 42, i32 %n
+  ret <8 x i32> %a
+}
+
+define <8 x i32> @foo2(<8 x i32> %arg1, i32 %n) alignstack(8) #0 {
+; CHECK-LABEL: foo2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    andl $7, %edi
+; CHECK-NEXT:    movl $42, -32(%rsp,%rdi,4)
+; CHECK-NEXT:    vmovups -{{[0-9]+}}(%rsp), %ymm0
+; CHECK-NEXT:    retq
+entry:
+  %a = insertelement <8 x i32> %arg1, i32 42, i32 %n
+  ret <8 x i32> %a
+}
+
+define <8 x i32> @foo3(<8 x i32> %arg1, i32 %n) alignstack(16) #0 {
+; CHECK-LABEL: foo3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    andl $7, %edi
+; CHECK-NEXT:    movl $42, -40(%rsp,%rdi,4)
+; CHECK-NEXT:    vmovups -{{[0-9]+}}(%rsp), %ymm0
+; CHECK-NEXT:    retq
+entry:
+  %a = insertelement <8 x i32> %arg1, i32 42, i32 %n
+  ret <8 x i32> %a
+}
+
+define <8 x i32> @foo4(<8 x i32> %arg1, i32 %n) alignstack(64) #0 {
+; CHECK-LABEL: foo4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    vmovaps %ymm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    andl $7, %edi
+; CHECK-NEXT:    movl $42, -56(%rsp,%rdi,4)
+; CHECK-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %ymm0
+; CHECK-NEXT:    retq
+entry:
+  %a = insertelement <8 x i32> %arg1, i32 42, i32 %n
+  ret <8 x i32> %a
+}
+
+define <8 x i32> @foo5(<8 x i32> %arg1, i32 %n) alignstack(256) #0 {
+; CHECK-LABEL: foo5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $120, %rsp
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    andl $7, %edi
+; CHECK-NEXT:    movl $42, 64(%rsp,%rdi,4)
+; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
+; CHECK-NEXT:    addq $120, %rsp
+; CHECK-NEXT:    retq
+entry:
+  %a = insertelement <8 x i32> %arg1, i32 42, i32 %n
+  ret <8 x i32> %a
+}
+
+define <8 x i16> @foo6(<8 x i16> %arg1, i32 %n) #0 {
+; CHECK-LABEL: foo6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    andl $7, %edi
+; CHECK-NEXT:    movw $42, -24(%rsp,%rdi,2)
+; CHECK-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    retq
+entry:
+  %a = insertelement <8 x i16> %arg1, i16 42, i32 %n
+  ret <8 x i16> %a
+}
+
+define <8 x i8> @foo7(<8 x i8> %arg1, i32 %n) #0 {
+; CHECK-LABEL: foo7:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    andl $15, %edi
+; CHECK-NEXT:    movb $42, -24(%rsp,%rdi)
+; CHECK-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    retq
+entry:
+  %a = insertelement <8 x i8> %arg1, i8 42, i32 %n
+  ret <8 x i8> %a
+}
+
+define <8 x i64> @foo8(<8 x i64> %arg1, i32 %n) #0 {
+; CHECK-LABEL: foo8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    andl $7, %edi
+; CHECK-NEXT:    movq $42, -72(%rsp,%rdi,8)
+; CHECK-NEXT:    vmovups -{{[0-9]+}}(%rsp), %ymm0
+; CHECK-NEXT:    vmovups -{{[0-9]+}}(%rsp), %ymm1
+; CHECK-NEXT:    retq
+entry:
+  %a = insertelement <8 x i64> %arg1, i64 42, i32 %n
+  ret <8 x i64> %a
+}
+
+attributes #0 = { "no-realign-stack" nounwind }