[llvm] [X86] mayFoldIntoVector - relax load alignment requirements (PR #171830)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 11 06:11:57 PST 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/171830
From b1eb31580a52343fe09b8933bf9975dc996da6b9 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 11 Dec 2025 13:54:06 +0000
Subject: [PATCH] [X86] mayFoldIntoVector - relax load alignment requirements
If we're trying to move big integer values into vector types, relax the SSE load alignment requirements: unlike regular uses of mayFoldLoad, we aren't checking that every load will fold directly into a vector op, only that the value can be moved to the FPU.
Fixes #144861
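As a rough sketch of the motivating pattern (the function name and exact IR shape below are illustrative, not copied from ptest.ll): a wide integer loaded with less than 16-byte alignment and compared against zero. On SSE targets without fast unaligned memory access this previously lowered to a chain of scalar movq/orq instructions; with this change it can instead use movdqu loads feeding por and pcmpeqd/ptest, as shown in the updated ptest.ll checks.

  ; Illustrative sketch only - not taken from the test file.
  define i1 @cmp256_eq_zero(ptr %p0) {
    %v = load i256, ptr %p0, align 1
    %c = icmp eq i256 %v, 0
    ret i1 %c
  }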
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 10 +-
llvm/lib/Target/X86/X86ISelLowering.h | 3 +-
llvm/test/CodeGen/X86/ptest.ll | 190 +++++++++++++++---------
3 files changed, 130 insertions(+), 73 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f8730a3de11c5..6e0bd4de31328 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2805,7 +2805,7 @@ X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
//===----------------------------------------------------------------------===//
bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
- bool AssumeSingleUse) {
+ bool AssumeSingleUse, bool IgnoreAlignment) {
if (!AssumeSingleUse && !Op.hasOneUse())
return false;
if (!ISD::isNormalLoad(Op.getNode()))
@@ -2813,8 +2813,9 @@ bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
// If this is an unaligned vector, make sure the target supports folding it.
auto *Ld = cast<LoadSDNode>(Op.getNode());
- if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
- Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
+ if (!IgnoreAlignment && !Subtarget.hasAVX() &&
+ !Subtarget.hasSSEUnalignedMem() && Ld->getValueSizeInBits(0) == 128 &&
+ Ld->getAlign() < Align(16))
return false;
// TODO: If this is a non-temporal load and the target has an instruction
@@ -2864,7 +2865,8 @@ static bool mayFoldIntoVector(SDValue Op, const X86Subtarget &Subtarget) {
return true;
if (isa<ConstantSDNode>(Op) || isa<ConstantFPSDNode>(Op))
return true;
- return X86::mayFoldLoad(Op, Subtarget);
+ return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/false,
+ /*IgnoreAlignment=*/true);
}
static bool isLogicOp(unsigned Opcode) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 848fe4bf86d2c..a528c311975d8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1042,7 +1042,8 @@ namespace llvm {
/// Check if Op is a load operation that could be folded into some other x86
/// instruction as a memory operand. Example: vpaddd (%rdi), %xmm0, %xmm0.
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
- bool AssumeSingleUse = false);
+ bool AssumeSingleUse = false,
+ bool IgnoreAlignment = false);
/// Check if Op is a load operation that could be folded into a vector splat
/// instruction as a memory operand. Example: vbroadcastss 16(%rdi), %xmm2.
diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll
index 166b7abc9e053..9c8fc4f3896b6 100644
--- a/llvm/test/CodeGen/X86/ptest.ll
+++ b/llvm/test/CodeGen/X86/ptest.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2,-avx | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2,-avx | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx | FileCheck %s --check-prefixes=CHECK,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
@@ -433,15 +433,26 @@ define i1 @vecmp_load64x4(ptr %p0) {
}
define i1 @vecmp_load128x2(ptr %p0) {
-; SSE-LABEL: vecmp_load128x2:
-; SSE: # %bb.0:
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rcx
-; SSE-NEXT: orq 24(%rdi), %rcx
-; SSE-NEXT: orq 16(%rdi), %rax
-; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: sete %al
-; SSE-NEXT: retq
+; SSE2-LABEL: vecmp_load128x2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movdqu 16(%rdi), %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: movmskps %xmm0, %eax
+; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: vecmp_load128x2:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqu (%rdi), %xmm0
+; SSE41-NEXT: movdqu 16(%rdi), %xmm1
+; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: ptest %xmm1, %xmm1
+; SSE41-NEXT: sete %al
+; SSE41-NEXT: retq
;
; AVX-LABEL: vecmp_load128x2:
; AVX: # %bb.0:
@@ -461,21 +472,34 @@ define i1 @vecmp_load128x2(ptr %p0) {
}
define i1 @vecmp_load128x4(ptr %p0) {
-; SSE-LABEL: vecmp_load128x4:
-; SSE: # %bb.0:
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rcx
-; SSE-NEXT: movq 24(%rdi), %rdx
-; SSE-NEXT: movq 16(%rdi), %rsi
-; SSE-NEXT: orq 32(%rdi), %rax
-; SSE-NEXT: orq 40(%rdi), %rcx
-; SSE-NEXT: orq 48(%rdi), %rsi
-; SSE-NEXT: orq %rax, %rsi
-; SSE-NEXT: orq 56(%rdi), %rdx
-; SSE-NEXT: orq %rcx, %rdx
-; SSE-NEXT: orq %rsi, %rdx
-; SSE-NEXT: sete %al
-; SSE-NEXT: retq
+; SSE2-LABEL: vecmp_load128x4:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movdqu 16(%rdi), %xmm1
+; SSE2-NEXT: movdqu 32(%rdi), %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movdqu 48(%rdi), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %eax
+; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: vecmp_load128x4:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqu (%rdi), %xmm0
+; SSE41-NEXT: movdqu 16(%rdi), %xmm1
+; SSE41-NEXT: movdqu 32(%rdi), %xmm2
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: movdqu 48(%rdi), %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
+; SSE41-NEXT: sete %al
+; SSE41-NEXT: retq
;
; AVX1-LABEL: vecmp_load128x4:
; AVX1: # %bb.0:
@@ -515,21 +539,34 @@ define i1 @vecmp_load128x4(ptr %p0) {
; PR144861
define i1 @vecmp_load256x2(ptr %p0) {
-; SSE-LABEL: vecmp_load256x2:
-; SSE: # %bb.0:
-; SSE-NEXT: movq 24(%rdi), %rax
-; SSE-NEXT: movq (%rdi), %rcx
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movq 16(%rdi), %rsi
-; SSE-NEXT: orq 48(%rdi), %rsi
-; SSE-NEXT: orq 32(%rdi), %rcx
-; SSE-NEXT: orq %rsi, %rcx
-; SSE-NEXT: orq 56(%rdi), %rax
-; SSE-NEXT: orq 40(%rdi), %rdx
-; SSE-NEXT: orq %rax, %rdx
-; SSE-NEXT: orq %rcx, %rdx
-; SSE-NEXT: sete %al
-; SSE-NEXT: retq
+; SSE2-LABEL: vecmp_load256x2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movdqu 16(%rdi), %xmm1
+; SSE2-NEXT: movdqu 32(%rdi), %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movdqu 48(%rdi), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %eax
+; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: vecmp_load256x2:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqu (%rdi), %xmm0
+; SSE41-NEXT: movdqu 16(%rdi), %xmm1
+; SSE41-NEXT: movdqu 32(%rdi), %xmm2
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: movdqu 48(%rdi), %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
+; SSE41-NEXT: sete %al
+; SSE41-NEXT: retq
;
; AVX1-LABEL: vecmp_load256x2:
; AVX1: # %bb.0:
@@ -559,33 +596,50 @@ define i1 @vecmp_load256x2(ptr %p0) {
}
define i1 @vecmp_load512x2(ptr %p0) {
-; SSE-LABEL: vecmp_load512x2:
-; SSE: # %bb.0:
-; SSE-NEXT: movq 24(%rdi), %rax
-; SSE-NEXT: movq 56(%rdi), %rdx
-; SSE-NEXT: movq 40(%rdi), %rsi
-; SSE-NEXT: movq 16(%rdi), %rcx
-; SSE-NEXT: movq 48(%rdi), %r8
-; SSE-NEXT: movq (%rdi), %r9
-; SSE-NEXT: movq 8(%rdi), %r10
-; SSE-NEXT: movq 32(%rdi), %r11
-; SSE-NEXT: orq 96(%rdi), %r11
-; SSE-NEXT: orq 64(%rdi), %r9
-; SSE-NEXT: orq %r11, %r9
-; SSE-NEXT: orq 112(%rdi), %r8
-; SSE-NEXT: orq 80(%rdi), %rcx
-; SSE-NEXT: orq %r8, %rcx
-; SSE-NEXT: orq %r9, %rcx
-; SSE-NEXT: orq 104(%rdi), %rsi
-; SSE-NEXT: orq 72(%rdi), %r10
-; SSE-NEXT: orq %rsi, %r10
-; SSE-NEXT: orq 120(%rdi), %rdx
-; SSE-NEXT: orq 88(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: sete %al
-; SSE-NEXT: retq
+; SSE2-LABEL: vecmp_load512x2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movdqu 16(%rdi), %xmm1
+; SSE2-NEXT: movdqu 32(%rdi), %xmm2
+; SSE2-NEXT: movdqu 48(%rdi), %xmm3
+; SSE2-NEXT: movdqu 80(%rdi), %xmm4
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: movdqu 112(%rdi), %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqu 64(%rdi), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movdqu 96(%rdi), %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %eax
+; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: vecmp_load512x2:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqu (%rdi), %xmm0
+; SSE41-NEXT: movdqu 16(%rdi), %xmm1
+; SSE41-NEXT: movdqu 32(%rdi), %xmm2
+; SSE41-NEXT: movdqu 48(%rdi), %xmm3
+; SSE41-NEXT: movdqu 80(%rdi), %xmm4
+; SSE41-NEXT: por %xmm1, %xmm4
+; SSE41-NEXT: movdqu 112(%rdi), %xmm1
+; SSE41-NEXT: por %xmm3, %xmm1
+; SSE41-NEXT: por %xmm4, %xmm1
+; SSE41-NEXT: movdqu 64(%rdi), %xmm3
+; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: movdqu 96(%rdi), %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
+; SSE41-NEXT: sete %al
+; SSE41-NEXT: retq
;
; AVX1-LABEL: vecmp_load512x2:
; AVX1: # %bb.0: