[llvm] [DAGCombiner][X86] Push bitcast/ext through freeze for loads (PR #163070)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 14 11:26:42 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Guy David (guy-david)
<details>
<summary>Changes</summary>
The current implementation tries to sink `FREEZE` into the operands of operations, which can leave a `FREEZE` between a load and a conversion such as `BITCAST`, `SIGN_EXTEND`, or `ZERO_EXTEND`, generating suboptimal load sequences.
I first tried to fold `bitcast(freeze(x))` into `freeze(bitcast(x))` directly (which also required excluding `BITCAST` from the freeze-sinking fold, as was done in https://github.com/llvm/llvm-project/commit/55c6bda01ef5a166a69b43956775272d9d67bda5#diff-d0eb75096db76ab253fc7f8ae6343c4b4516fc619d851898cbdac1a5bf481941R15465), but that broke an X86 pattern for boolean vectors, so the approach here is the path of least resistance.
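For illustration, here is the simplest shape of the pattern (taken from the new `freeze-bitcast-ext-load.ll` test added below), where the freeze sits between the load and the extension and previously prevented the extension from folding into the load:

```llvm
define i32 @test_sext_freeze_load_i8(ptr %p) {
  %v = load i8, ptr %p
  %f = freeze i8 %v
  %e = sext i8 %f to i32
  ret i32 %e
}
```

With this combine the DAG becomes `freeze(sext(load))`, and on x86-64 the function lowers to a single `movsbl (%rdi), %eax`.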
---
Full diff: https://github.com/llvm/llvm-project/pull/163070.diff
7 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+17)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+14)
- (modified) llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/avx10_2bf16-arith.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/avx512-ext.ll (+12-20)
- (added) llvm/test/CodeGen/X86/freeze-bitcast-ext-load.ll (+121)
- (modified) llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll (+6-6)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 787a81a99389f..5dfd3b7f2da61 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16944,6 +16944,23 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
return SDValue();
+ // fold: bitcast(freeze(load)) -> freeze(bitcast(load))
+ // fold: sext(freeze(load)) -> freeze(sext(load))
+ // fold: zext(freeze(load)) -> freeze(zext(load))
+ // This allows the conversion to potentially fold into the load.
+ if (N0.getOpcode() == ISD::LOAD && N->hasOneUse()) {
+ SDNode *User = *N->user_begin();
+ unsigned UserOpcode = User->getOpcode();
+ if (UserOpcode == ISD::BITCAST || UserOpcode == ISD::SIGN_EXTEND ||
+ UserOpcode == ISD::ZERO_EXTEND) {
+ SDValue NewConv =
+ DAG.getNode(UserOpcode, SDLoc(User), User->getValueType(0), N0);
+ SDValue FrozenConv = DAG.getFreeze(NewConv);
+ DAG.ReplaceAllUsesWith(User, FrozenConv.getNode());
+ return SDValue(N, 0);
+ }
+ }
+
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
// Try to push freeze through instructions that propagate but don't produce
// poison as far as possible. If an operand of freeze follows three
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c32b1a66356ea..a273c5875b76e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3448,6 +3448,20 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
return false;
+ // With low alignment, don't convert integer vectors to large scalar loads,
+ // because otherwise they get broken into many small scalar loads.
+ if (LoadVT.isVector() && LoadVT.isInteger() && !BitcastVT.isVector() &&
+ BitcastVT.isInteger()) {
+ const DataLayout &DL = DAG.getDataLayout();
+ unsigned MinAlign = DL.getPointerSize();
+ // Aligned well, will legalize into a clean sequence of loads.
+ if (MMO.getAlign() >= MinAlign)
+ return true;
+ // Aligned poorly for a large enough scalar.
+ if (BitcastVT.getSizeInBits() > 2 * DL.getPointerSizeInBits())
+ return false;
+ }
+
// If both types are legal vectors, it's always ok to convert them.
if (LoadVT.isVector() && BitcastVT.isVector() &&
isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
index 79849a7153c91..d9b4635042256 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
@@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src,
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512:
; X86: # %bb.0:
-; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08]
; X86-NEXT: vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
index 0f2c75b15d5b4..01b7618753a23 100644
--- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
@@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src,
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256:
; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08]
; X86-NEXT: vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1]
@@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128:
; X86: # %bb.0:
-; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08]
; X86-NEXT: vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index 1a712ffac5b7e..03f283a57a217 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -212,11 +212,9 @@ define <32 x i16> @zext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: vmovdqu (%rdi), %ymm2
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
@@ -237,11 +235,9 @@ define <32 x i16> @zext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT: vmovdqu (%rdi), %ymm2
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1
@@ -261,11 +257,9 @@ define <32 x i16> @sext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: vmovdqu (%rdi), %ymm2
-; KNL-NEXT: vpmovsxbw %xmm2, %ymm3
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
-; KNL-NEXT: vpmovsxbw %xmm2, %ymm2
-; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; KNL-NEXT: vpmovsxbw (%rdi), %ymm2
+; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm3
+; KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
@@ -286,11 +280,9 @@ define <32 x i16> @sext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT: vmovdqu (%rdi), %ymm2
-; AVX512DQNOBW-NEXT: vpmovsxbw %xmm2, %ymm3
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQNOBW-NEXT: vpmovsxbw %xmm2, %ymm2
-; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %ymm2
+; AVX512DQNOBW-NEXT: vpmovsxbw 16(%rdi), %ymm3
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/freeze-bitcast-ext-load.ll b/llvm/test/CodeGen/X86/freeze-bitcast-ext-load.ll
new file mode 100644
index 0000000000000..3699ade8d4188
--- /dev/null
+++ b/llvm/test/CodeGen/X86/freeze-bitcast-ext-load.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define double @test_bitcast_freeze_load(ptr %p) {
+; CHECK-LABEL: test_bitcast_freeze_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retq
+ %v = load <2 x float>, ptr %p
+ %f = freeze <2 x float> %v
+ %b = bitcast <2 x float> %f to double
+ ret double %b
+}
+
+define i32 @test_sext_freeze_load_i8(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movsbl (%rdi), %eax
+; CHECK-NEXT: retq
+ %v = load i8, ptr %p
+ %f = freeze i8 %v
+ %e = sext i8 %f to i32
+ ret i32 %e
+}
+
+define i64 @test_sext_freeze_load_i32(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movslq (%rdi), %rax
+; CHECK-NEXT: retq
+ %v = load i32, ptr %p
+ %f = freeze i32 %v
+ %e = sext i32 %f to i64
+ ret i64 %e
+}
+
+define i64 @test_sext_freeze_load_i16(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movswq (%rdi), %rax
+; CHECK-NEXT: retq
+ %v = load i16, ptr %p
+ %f = freeze i16 %v
+ %e = sext i16 %f to i64
+ ret i64 %e
+}
+
+define i32 @test_zext_freeze_load_i8(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: retq
+ %v = load i8, ptr %p
+ %f = freeze i8 %v
+ %e = zext i8 %f to i32
+ ret i32 %e
+}
+
+define i64 @test_zext_freeze_load_i32(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: retq
+ %v = load i32, ptr %p
+ %f = freeze i32 %v
+ %e = zext i32 %f to i64
+ ret i64 %e
+}
+
+define i64 @test_zext_freeze_load_i16(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzwl (%rdi), %eax
+; CHECK-NEXT: retq
+ %v = load i16, ptr %p
+ %f = freeze i16 %v
+ %e = zext i16 %f to i64
+ ret i64 %e
+}
+
+define i32 @test_sext_freeze_load_multiuse(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_multiuse:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: movsbl %al, %ecx
+; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: retq
+ %v = load i8, ptr %p
+ %f = freeze i8 %v
+ %e = sext i8 %f to i32
+ %z = zext i8 %f to i32
+ %r = add i32 %e, %z
+ ret i32 %r
+}
+
+define <4 x i32> @test_sext_freeze_load_v4i16(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: retq
+ %v = load <4 x i16>, ptr %p
+ %f = freeze <4 x i16> %v
+ %e = sext <4 x i16> %f to <4 x i32>
+ ret <4 x i32> %e
+}
+
+define <4 x i32> @test_zext_freeze_load_v4i16(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: retq
+ %v = load <4 x i16>, ptr %p
+ %f = freeze <4 x i16> %v
+ %e = zext <4 x i16> %f to <4 x i32>
+ ret <4 x i32> %e
+}
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 81c4d5d71084c..fce622a99bb6a 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -171,8 +171,8 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: movl (%rdi), %eax
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movb %al, (%rdx)
@@ -180,8 +180,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: movl (%rdi), %eax
+; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movb %al, (%rdx)
; X64-BMI2-NEXT: retq
@@ -248,8 +248,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: movl (%rdi), %eax
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
@@ -257,8 +257,8 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: movl (%rdi), %eax
+; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movw %ax, (%rdx)
; X64-BMI2-NEXT: retq
@@ -324,8 +324,8 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: movl (%rdi), %eax
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
@@ -333,8 +333,8 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: movl (%rdi), %eax
+; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movl %eax, (%rdx)
; X64-BMI2-NEXT: retq
``````````
</details>
https://github.com/llvm/llvm-project/pull/163070