[llvm] d40b7f0 - [DAG] Fold (shl (srl x, c), c) -> and(x, m) even if srl has other uses
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue May 17 05:40:27 PDT 2022
Author: Simon Pilgrim
Date: 2022-05-17T13:40:11+01:00
New Revision: d40b7f0d5aec3b70bd2a5d42c71b1ebb5c28a03b
URL: https://github.com/llvm/llvm-project/commit/d40b7f0d5aec3b70bd2a5d42c71b1ebb5c28a03b
DIFF: https://github.com/llvm/llvm-project/commit/d40b7f0d5aec3b70bd2a5d42c71b1ebb5c28a03b.diff
LOG: [DAG] Fold (shl (srl x, c), c) -> and(x, m) even if srl has other uses
If we're using a shift pair as a mask, relax the one-use limit when the shift amounts are equal - we'll only be generating a single AND node.
AArch64 has a couple of regressions due to this, so I've enforced the existing one-use limit inside an AArch64TargetLowering::shouldFoldConstantShiftPairToMask callback.
Part of the work to fix the regressions in D77804.
Differential Revision: https://reviews.llvm.org/D125607
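As a standalone illustration of the identity the fold relies on (a minimal sketch, not part of the patch; it assumes a 32-bit value and a constant shift amount c, and the helper names are purely illustrative):

  #include <cassert>
  #include <cstdint>

  // (x >> c) << c clears the low c bits, which is exactly an AND with the
  // mask ~((1 << c) - 1) - so when both shift amounts match, the pair can
  // be replaced by a single AND even if the srl has other uses.
  static uint32_t viaShiftPair(uint32_t X, unsigned C) { return (X >> C) << C; }
  static uint32_t viaMask(uint32_t X, unsigned C) {
    return X & ~((UINT32_C(1) << C) - 1u);
  }

  int main() {
    for (unsigned C = 0; C < 32; ++C)
      assert(viaShiftPair(0xDEADBEEFu, C) == viaMask(0xDEADBEEFu, C));
    return 0;
  }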
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
llvm/test/CodeGen/AMDGPU/load-lo16.ll
llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
llvm/test/CodeGen/ARM/combine-movc-sub.ll
llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1bde36a01872e..2a03c777f3fa9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8937,8 +8937,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
// (and (srl x, (sub c1, c2), MASK)
// Only fold this if the inner shift has no other uses -- if it does,
// folding this will increase the total number of instructions.
- // TODO - drop hasOneUse requirement if c1 == c2?
- if (N0.getOpcode() == ISD::SRL && N0.hasOneUse() &&
+ if (N0.getOpcode() == ISD::SRL &&
+ (N0.getOperand(1) == N1 || N0.hasOneUse()) &&
TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
/*AllowUndefs*/ false,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index de60ee4181f4b..cb2beda0e8a1e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13300,6 +13300,17 @@ AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
return true;
}
+bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
+ const SDNode *N, CombineLevel Level) const {
+ assert(((N->getOpcode() == ISD::SHL &&
+ N->getOperand(0).getOpcode() == ISD::SRL) ||
+ (N->getOpcode() == ISD::SRL &&
+ N->getOperand(0).getOpcode() == ISD::SHL)) &&
+ "Expected shift-shift mask");
+ // Don't allow multiuse shift folding with the same shift amount.
+ return N->getOperand(0)->hasOneUse();
+}
+
bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
assert(Ty->isIntegerTy());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index f1b4c0ade210a..6d6b7d44b57e2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -644,6 +644,10 @@ class AArch64TargetLowering : public TargetLowering {
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
+ /// Return true if it is profitable to fold a pair of shifts into a mask.
+ bool shouldFoldConstantShiftPairToMask(const SDNode *N,
+ CombineLevel Level) const override;
+
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index b5af6bde3122e..a80604fde3ce0 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -131,17 +131,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b32 s0, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s1, s2, 16
-; CI-NEXT: s_lshl_b32 s2, s1, 16
-; CI-NEXT: s_or_b32 s0, s0, s2
+; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
+; CI-NEXT: s_or_b32 s0, s0, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_lshr_b32 s2, s2, 16
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
-; CI-NEXT: ; use s1
+; CI-NEXT: ; use s2
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index a12952948ffc0..1ac91c35b9c8d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -595,12 +595,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in,
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
+; GFX803-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_mov_b32_e32 v2, 0
-; GFX803-NEXT: ds_write_b16 v2, v1
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_mov_b32_e32 v3, 0
+; GFX803-NEXT: ds_write_b16 v3, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: v_or_b32_e32 v0, v0, v2
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -647,12 +647,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
+; GFX803-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: ds_write_b16 v2, v0
; GFX803-NEXT: ds_write_b16 v3, v1
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 003211eea5358..dccee2d1dcef8 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -38,8 +38,8 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out,
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -85,8 +85,8 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out,
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -106,9 +106,9 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() {
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xff00, v0
+; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v2
@@ -123,9 +123,8 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() {
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1
-; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v0
+; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -148,9 +147,9 @@ define amdgpu_kernel void @scalar_to_vector_v4f16() {
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xff00, v0
+; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v2
@@ -165,9 +164,8 @@ define amdgpu_kernel void @scalar_to_vector_v4f16() {
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1
-; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v0
+; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
diff --git a/llvm/test/CodeGen/ARM/combine-movc-sub.ll b/llvm/test/CodeGen/ARM/combine-movc-sub.ll
index e554c55d16b97..f17c2623c5744 100644
--- a/llvm/test/CodeGen/ARM/combine-movc-sub.ll
+++ b/llvm/test/CodeGen/ARM/combine-movc-sub.ll
@@ -25,15 +25,15 @@ declare void @foo(%struct.PROOFSEARCH_HELP*, %struct.CLAUSE_HELP*)
define hidden fastcc %struct.LIST_HELP* @test(%struct.PROOFSEARCH_HELP* %Search, %struct.LIST_HELP* %ClauseList, i32 %Level, %struct.LIST_HELP** nocapture %New) {
; CHECK-LABEL: test:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: sub.w r9, r2, #32
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: sub.w r7, r2, #32
; CHECK-NEXT: mov r8, r0
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: mov r4, r2
-; CHECK-NEXT: add.w r6, r0, r9, lsr #5
+; CHECK-NEXT: add.w r6, r0, r7, lsr #5
; CHECK-NEXT: mov r5, r1
-; CHECK-NEXT: lsr.w r7, r9, #5
-; CHECK-NEXT: mov.w r10, #0
+; CHECK-NEXT: mov.w r9, #0
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_1: @ %for.inc
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
@@ -47,7 +47,7 @@ define hidden fastcc %struct.LIST_HELP* @test(%struct.PROOFSEARCH_HELP* %Search,
; CHECK-NEXT: add.w r0, r0, r6, lsl #2
; CHECK-NEXT: ldr r0, [r0, #40]
; CHECK-NEXT: it hi
-; CHECK-NEXT: subhi.w r2, r9, r7, lsl #5
+; CHECK-NEXT: andhi r2, r7, #31
; CHECK-NEXT: lsrs r0, r2
; CHECK-NEXT: lsls r0, r0, #31
; CHECK-NEXT: beq .LBB0_1
@@ -55,7 +55,7 @@ define hidden fastcc %struct.LIST_HELP* @test(%struct.PROOFSEARCH_HELP* %Search,
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: bl foo
-; CHECK-NEXT: str.w r10, [r5, #4]
+; CHECK-NEXT: str.w r9, [r5, #4]
; CHECK-NEXT: b .LBB0_1
entry:
%cmp4.i.i = icmp ugt i32 %Level, 31
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index bc1b226aa0bea..9eb6ff75031f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -318,9 +318,8 @@ define <vscale x 1 x i8> @extract_nxv8i8_nxv1i8_7(<vscale x 8 x i8> %vec) {
; CHECK-LABEL: extract_nxv8i8_nxv1i8_7:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: slli a1, a0, 3
-; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: sub a0, a0, a1
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
index a26dc48e339d5..ece1aa68bddb4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -304,11 +304,10 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_7(<vscale x 16 x i8> %vec, <vsc
; CHECK-LABEL: insert_nxv16i8_nxv1i8_7:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: slli a1, a0, 3
-; CHECK-NEXT: sub a0, a1, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, mu
-; CHECK-NEXT: vslideup.vx v8, v10, a0
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, mu
+; CHECK-NEXT: vslideup.vx v8, v10, a1
; CHECK-NEXT: ret
%v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 7)
ret <vscale x 16 x i8> %v
@@ -318,11 +317,10 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_15(<vscale x 16 x i8> %vec, <vs
; CHECK-LABEL: insert_nxv16i8_nxv1i8_15:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: slli a1, a0, 3
-; CHECK-NEXT: sub a0, a1, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, mu
-; CHECK-NEXT: vslideup.vx v9, v10, a0
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, mu
+; CHECK-NEXT: vslideup.vx v9, v10, a1
; CHECK-NEXT: ret
%v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 15)
ret <vscale x 16 x i8> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
index 223536b8632ce..63d375d288136 100644
--- a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
@@ -36,9 +36,8 @@ define <vscale x 7 x half> @load_nxv7f16(<vscale x 7 x half>* %ptr, <vscale x 7
; CHECK-LABEL: load_nxv7f16:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: srli a2, a2, 3
-; CHECK-NEXT: slli a3, a2, 3
-; CHECK-NEXT: sub a2, a3, a2
+; CHECK-NEXT: srli a3, a2, 3
+; CHECK-NEXT: sub a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, mu
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vse16.v v8, (a1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
index 3f72c420147f3..55a7219050c4c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
@@ -22,9 +22,8 @@ define void @store_nxv7f64(<vscale x 7 x double> %val, <vscale x 7 x double>* %p
; CHECK-LABEL: store_nxv7f64:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: srli a1, a1, 3
-; CHECK-NEXT: slli a2, a1, 3
-; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: srli a2, a1, 3
+; CHECK-NEXT: sub a1, a1, a2
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
; CHECK-NEXT: vse64.v v8, (a0)
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
index 0f3c80a57613c..897b27debc669 100644
--- a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
+++ b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
@@ -123,15 +123,10 @@ define void @fun2(<8 x i32> %src, <8 x i31>* %p)
define void @fun3(<3 x i31>* %src, <3 x i31>* %p)
; CHECK-LABEL: fun3:
; CHECK: # %bb.0:
-; CHECK-NEXT: l %r0, 8(%r2)
+; CHECK-NEXT: llgf %r0, 8(%r2)
; CHECK-NEXT: lg %r1, 0(%r2)
-; CHECK-NEXT: sllg %r2, %r1, 32
-; CHECK-NEXT: lr %r2, %r0
-; CHECK-NEXT: st %r0, 8(%r3)
-; CHECK-NEXT: srlg %r0, %r2, 32
-; CHECK-NEXT: lr %r1, %r0
-; CHECK-NEXT: nihh %r1, 8191
; CHECK-NEXT: stg %r1, 0(%r3)
+; CHECK-NEXT: st %r0, 8(%r3)
; CHECK-NEXT: br %r14
{
%tmp = load <3 x i31>, <3 x i31>* %src