[llvm] [SDAG] Improve `SimplifyDemandedBits` for mul (PR #90034)

Yingwei Zheng via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 25 02:41:31 PDT 2024


https://github.com/dtcxzyw created https://github.com/llvm/llvm-project/pull/90034

If the RHS is a constant with X trailing zeros, then the X MSBs of the LHS are not demanded: each of those LHS bits, once multiplied by the RHS, only contributes to product bits at or above the bit width, which are truncated away.

Alive2: https://alive2.llvm.org/ce/z/F5CyJW
Fixes https://github.com/llvm/llvm-project/issues/56645.


>From 2874c9a56d6cd8786a56505c16d7852866ad2203 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Thu, 25 Apr 2024 17:02:22 +0800
Subject: [PATCH 1/2] [SDAG] Add pre-commit tests. NFC.

---
 llvm/test/CodeGen/RISCV/mul.ll | 192 +++++++++++++++++++++++++++++++++
 1 file changed, 192 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 364e8c7b38dacc..161c18cec17ce4 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1843,3 +1843,195 @@ define i8 @mulsub_demand_2(i8 %x, i8 %y) nounwind {
   %r = or i8 %a, 240
   ret i8 %r
 }
+
+define i64 @muland_demand(i64 %x) nounwind {
+; RV32I-LABEL: muland_demand:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    andi a0, a0, -8
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    srli a1, a1, 2
+; RV32I-NEXT:    li a2, 12
+; RV32I-NEXT:    li a3, 0
+; RV32I-NEXT:    call __muldi3
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV32IM-LABEL: muland_demand:
+; RV32IM:       # %bb.0:
+; RV32IM-NEXT:    slli a1, a1, 2
+; RV32IM-NEXT:    srli a1, a1, 2
+; RV32IM-NEXT:    andi a0, a0, -8
+; RV32IM-NEXT:    li a2, 12
+; RV32IM-NEXT:    mulhu a3, a0, a2
+; RV32IM-NEXT:    mul a1, a1, a2
+; RV32IM-NEXT:    add a1, a3, a1
+; RV32IM-NEXT:    mul a0, a0, a2
+; RV32IM-NEXT:    ret
+;
+; RV64I-LABEL: muland_demand:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, -29
+; RV64I-NEXT:    srli a1, a1, 2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    li a1, 12
+; RV64I-NEXT:    tail __muldi3
+;
+; RV64IM-LABEL: muland_demand:
+; RV64IM:       # %bb.0:
+; RV64IM-NEXT:    li a1, -29
+; RV64IM-NEXT:    srli a1, a1, 2
+; RV64IM-NEXT:    and a0, a0, a1
+; RV64IM-NEXT:    li a1, 12
+; RV64IM-NEXT:    mul a0, a0, a1
+; RV64IM-NEXT:    ret
+  %and = and i64 %x, 4611686018427387896
+  %mul = mul i64 %and, 12
+  ret i64 %mul
+}
+
+define i64 @mulzext_demand(i32 signext %x) nounwind {
+; RV32I-LABEL: mulzext_demand:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    li a3, 3
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    li a2, 0
+; RV32I-NEXT:    call __muldi3
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV32IM-LABEL: mulzext_demand:
+; RV32IM:       # %bb.0:
+; RV32IM-NEXT:    slli a1, a0, 1
+; RV32IM-NEXT:    add a1, a1, a0
+; RV32IM-NEXT:    li a0, 0
+; RV32IM-NEXT:    ret
+;
+; RV64I-LABEL: mulzext_demand:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    li a1, 3
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    tail __muldi3
+;
+; RV64IM-LABEL: mulzext_demand:
+; RV64IM:       # %bb.0:
+; RV64IM-NEXT:    slli a0, a0, 32
+; RV64IM-NEXT:    srli a0, a0, 32
+; RV64IM-NEXT:    li a1, 3
+; RV64IM-NEXT:    slli a1, a1, 32
+; RV64IM-NEXT:    mul a0, a0, a1
+; RV64IM-NEXT:    ret
+  %ext = zext i32 %x to i64
+  %mul = mul i64 %ext, 12884901888
+  ret i64 %mul
+}
+
+define i32 @mulfshl_demand(i32 signext %x) nounwind {
+; RV32I-LABEL: mulfshl_demand:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srli a1, a0, 11
+; RV32I-NEXT:    slli a0, a0, 21
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    lui a1, 92808
+; RV32I-NEXT:    tail __mulsi3
+;
+; RV32IM-LABEL: mulfshl_demand:
+; RV32IM:       # %bb.0:
+; RV32IM-NEXT:    srli a1, a0, 11
+; RV32IM-NEXT:    slli a0, a0, 21
+; RV32IM-NEXT:    or a0, a0, a1
+; RV32IM-NEXT:    lui a1, 92808
+; RV32IM-NEXT:    mul a0, a0, a1
+; RV32IM-NEXT:    ret
+;
+; RV64I-LABEL: mulfshl_demand:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    srliw a1, a0, 11
+; RV64I-NEXT:    slliw a0, a0, 21
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    lui a1, 92808
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+;
+; RV64IM-LABEL: mulfshl_demand:
+; RV64IM:       # %bb.0:
+; RV64IM-NEXT:    srliw a1, a0, 11
+; RV64IM-NEXT:    slli a0, a0, 21
+; RV64IM-NEXT:    or a0, a0, a1
+; RV64IM-NEXT:    lui a1, 92808
+; RV64IM-NEXT:    mulw a0, a0, a1
+; RV64IM-NEXT:    ret
+  %fshl = tail call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 21)
+  %mul = mul i32 %fshl, 380141568
+  ret i32 %mul
+}
+
+define i32 @mulor_demand(i32 signext %x, i32 signext %y) nounwind {
+; RV32I-LABEL: mulor_demand:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lui a1, 2560
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    or a0, a0, s0
+; RV32I-NEXT:    lui a1, 92808
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    tail __mulsi3
+;
+; RV32IM-LABEL: mulor_demand:
+; RV32IM:       # %bb.0:
+; RV32IM-NEXT:    lui a2, 2560
+; RV32IM-NEXT:    mul a1, a1, a2
+; RV32IM-NEXT:    or a0, a1, a0
+; RV32IM-NEXT:    lui a1, 92808
+; RV32IM-NEXT:    mul a0, a0, a1
+; RV32IM-NEXT:    ret
+;
+; RV64I-LABEL: mulor_demand:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv a2, a1
+; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    lui a1, 2560
+; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    or a0, a0, s0
+; RV64I-NEXT:    lui a1, 92808
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+;
+; RV64IM-LABEL: mulor_demand:
+; RV64IM:       # %bb.0:
+; RV64IM-NEXT:    lui a2, 2560
+; RV64IM-NEXT:    mul a1, a1, a2
+; RV64IM-NEXT:    or a0, a1, a0
+; RV64IM-NEXT:    lui a1, 92808
+; RV64IM-NEXT:    mulw a0, a0, a1
+; RV64IM-NEXT:    ret
+  %mul1 = mul i32 %y, 10485760
+  %or = or disjoint i32 %mul1, %x
+  %mul2 = mul i32 %or, 380141568
+  ret i32 %mul2
+}

>From 3e155e8370c3d8126ffb614fb80a07fe93bb3b35 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Thu, 25 Apr 2024 17:28:02 +0800
Subject: [PATCH 2/2] [SDAG] Improve `SimplifyDemandedBits` for mul

---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  13 +-
 llvm/test/CodeGen/AArch64/neon-dotreduce.ll   | 536 +++++++++---------
 llvm/test/CodeGen/RISCV/mul.ll                |  55 +-
 .../CodeGen/RISCV/rv64-legal-i32/rv64zba.ll   |   9 -
 llvm/test/CodeGen/RISCV/rv64zba.ll            |   9 -
 llvm/test/CodeGen/RISCV/sextw-removal.ll      |  19 +-
 llvm/test/CodeGen/X86/combine-srem.ll         |   4 +-
 llvm/test/CodeGen/X86/pmul.ll                 |  11 +-
 llvm/test/CodeGen/X86/shrink_vmul.ll          |   4 +-
 9 files changed, 303 insertions(+), 357 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c938b3996be393..de294ec0203468 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2782,10 +2782,17 @@ bool TargetLowering::SimplifyDemandedBits(
     unsigned DemandedBitsLZ = DemandedBits.countl_zero();
     APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
     KnownBits KnownOp0, KnownOp1;
-    if (SimplifyDemandedBits(Op0, LoMask, DemandedElts, KnownOp0, TLO,
-                             Depth + 1) ||
-        SimplifyDemandedBits(Op1, LoMask, DemandedElts, KnownOp1, TLO,
+    auto GetDemandedBitsLHSMask = [&](APInt Demanded,
+                                      const KnownBits &KnownRHS) {
+      if (Op.getOpcode() == ISD::MUL)
+        Demanded &= APInt::getLowBitsSet(
+            BitWidth, BitWidth - KnownRHS.countMinTrailingZeros());
+      return Demanded;
+    };
+    if (SimplifyDemandedBits(Op1, LoMask, DemandedElts, KnownOp1, TLO,
                              Depth + 1) ||
+        SimplifyDemandedBits(Op0, GetDemandedBitsLHSMask(LoMask, KnownOp1),
+                             DemandedElts, KnownOp0, TLO, Depth + 1) ||
         // See if the operation should be performed at a smaller bit width.
         ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) {
       if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) {
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 736f66c935e749..40b8a47f92aa70 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -1709,289 +1709,289 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    fmov s4, w0
 ; CHECK-NEXT:    ldr b0, [sp, #80]
 ; CHECK-NEXT:    add x8, sp, #88
-; CHECK-NEXT:    ldr b2, [sp, #144]
-; CHECK-NEXT:    fmov s4, w0
+; CHECK-NEXT:    ldr b1, [sp, #144]
 ; CHECK-NEXT:    add x10, sp, #152
-; CHECK-NEXT:    ldr b3, [sp, #16]
+; CHECK-NEXT:    ldr b6, [sp, #16]
 ; CHECK-NEXT:    ld1 { v0.b }[1], [x8]
-; CHECK-NEXT:    ld1 { v2.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #24
-; CHECK-NEXT:    ldr b1, [sp, #344]
 ; CHECK-NEXT:    add x9, sp, #96
-; CHECK-NEXT:    ld1 { v3.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #352
+; CHECK-NEXT:    ldr b2, [sp, #344]
 ; CHECK-NEXT:    mov v4.b[1], w1
+; CHECK-NEXT:    ld1 { v1.b }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #24
+; CHECK-NEXT:    ld1 { v6.b }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #352
 ; CHECK-NEXT:    add x8, sp, #104
 ; CHECK-NEXT:    ld1 { v0.b }[2], [x9]
 ; CHECK-NEXT:    add x9, sp, #160
-; CHECK-NEXT:    ld1 { v1.b }[1], [x10]
-; CHECK-NEXT:    ld1 { v2.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #32
-; CHECK-NEXT:    add x12, sp, #360
-; CHECK-NEXT:    ld1 { v3.b }[2], [x9]
+; CHECK-NEXT:    ld1 { v2.b }[1], [x10]
+; CHECK-NEXT:    ld1 { v1.b }[2], [x9]
+; CHECK-NEXT:    add x10, sp, #32
 ; CHECK-NEXT:    add x11, sp, #112
-; CHECK-NEXT:    add x10, sp, #120
-; CHECK-NEXT:    ld1 { v1.b }[2], [x12]
-; CHECK-NEXT:    add x12, sp, #168
-; CHECK-NEXT:    ld1 { v0.b }[3], [x8]
 ; CHECK-NEXT:    mov v4.b[2], w2
-; CHECK-NEXT:    ld1 { v2.b }[3], [x12]
-; CHECK-NEXT:    add x12, sp, #40
-; CHECK-NEXT:    ld1 { v3.b }[3], [x12]
-; CHECK-NEXT:    add x13, sp, #176
-; CHECK-NEXT:    ldr b16, [sp, #216]
-; CHECK-NEXT:    ld1 { v0.b }[4], [x11]
-; CHECK-NEXT:    add x11, sp, #48
-; CHECK-NEXT:    add x12, sp, #368
-; CHECK-NEXT:    ld1 { v2.b }[4], [x13]
+; CHECK-NEXT:    ld1 { v6.b }[2], [x10]
+; CHECK-NEXT:    add x10, sp, #168
+; CHECK-NEXT:    ld1 { v0.b }[3], [x8]
+; CHECK-NEXT:    ldr b5, [sp, #216]
 ; CHECK-NEXT:    add x13, sp, #224
-; CHECK-NEXT:    add x9, sp, #128
+; CHECK-NEXT:    ld1 { v1.b }[3], [x10]
+; CHECK-NEXT:    add x10, sp, #40
+; CHECK-NEXT:    add x12, sp, #120
+; CHECK-NEXT:    ld1 { v6.b }[3], [x10]
+; CHECK-NEXT:    add x10, sp, #176
+; CHECK-NEXT:    ld1 { v5.b }[1], [x13]
 ; CHECK-NEXT:    mov v4.b[3], w3
-; CHECK-NEXT:    ld1 { v3.b }[4], [x11]
-; CHECK-NEXT:    ld1 { v16.b }[1], [x13]
-; CHECK-NEXT:    ld1 { v0.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #56
-; CHECK-NEXT:    ld1 { v1.b }[3], [x12]
-; CHECK-NEXT:    add x12, sp, #184
-; CHECK-NEXT:    ldr b5, [sp, #280]
-; CHECK-NEXT:    add x11, sp, #376
-; CHECK-NEXT:    ld1 { v3.b }[5], [x10]
-; CHECK-NEXT:    ld1 { v2.b }[5], [x12]
-; CHECK-NEXT:    add x10, sp, #232
+; CHECK-NEXT:    ld1 { v0.b }[4], [x11]
+; CHECK-NEXT:    add x11, sp, #48
+; CHECK-NEXT:    add x8, sp, #360
+; CHECK-NEXT:    ld1 { v1.b }[4], [x10]
+; CHECK-NEXT:    add x13, sp, #56
+; CHECK-NEXT:    ld1 { v6.b }[4], [x11]
+; CHECK-NEXT:    ldr b7, [sp, #280]
+; CHECK-NEXT:    ld1 { v2.b }[2], [x8]
+; CHECK-NEXT:    add x15, sp, #232
+; CHECK-NEXT:    ld1 { v0.b }[5], [x12]
+; CHECK-NEXT:    add x14, sp, #184
 ; CHECK-NEXT:    mov v4.b[4], w4
+; CHECK-NEXT:    ld1 { v5.b }[2], [x15]
+; CHECK-NEXT:    add x9, sp, #128
+; CHECK-NEXT:    ld1 { v6.b }[5], [x13]
+; CHECK-NEXT:    add x13, sp, #288
+; CHECK-NEXT:    add x10, sp, #368
+; CHECK-NEXT:    ld1 { v7.b }[1], [x13]
+; CHECK-NEXT:    ld1 { v1.b }[5], [x14]
+; CHECK-NEXT:    ld1 { v2.b }[3], [x10]
+; CHECK-NEXT:    add x15, sp, #240
 ; CHECK-NEXT:    ld1 { v0.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #288
-; CHECK-NEXT:    add x15, sp, #64
-; CHECK-NEXT:    ld1 { v16.b }[2], [x10]
-; CHECK-NEXT:    ldr b17, [sp, #408]
-; CHECK-NEXT:    ld1 { v5.b }[1], [x9]
-; CHECK-NEXT:    add x14, sp, #192
-; CHECK-NEXT:    ld1 { v1.b }[4], [x11]
-; CHECK-NEXT:    ld1 { v3.b }[6], [x15]
-; CHECK-NEXT:    add x15, sp, #416
-; CHECK-NEXT:    ld1 { v2.b }[6], [x14]
-; CHECK-NEXT:    add x14, sp, #240
-; CHECK-NEXT:    ld1 { v17.b }[1], [x15]
 ; CHECK-NEXT:    add x9, sp, #296
-; CHECK-NEXT:    add x8, sp, #136
 ; CHECK-NEXT:    mov v4.b[5], w5
-; CHECK-NEXT:    add x13, sp, #384
-; CHECK-NEXT:    ld1 { v16.b }[3], [x14]
-; CHECK-NEXT:    ld1 { v5.b }[2], [x9]
-; CHECK-NEXT:    ld1 { v1.b }[5], [x13]
-; CHECK-NEXT:    ld1 { v0.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #424
-; CHECK-NEXT:    add x9, sp, #248
-; CHECK-NEXT:    ld1 { v17.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #304
-; CHECK-NEXT:    add x10, sp, #392
-; CHECK-NEXT:    ld1 { v16.b }[4], [x9]
-; CHECK-NEXT:    ld1 { v5.b }[3], [x8]
+; CHECK-NEXT:    add x11, sp, #192
+; CHECK-NEXT:    ld1 { v5.b }[3], [x15]
+; CHECK-NEXT:    ldr b3, [sp, #408]
+; CHECK-NEXT:    ld1 { v7.b }[2], [x9]
+; CHECK-NEXT:    add x12, sp, #64
+; CHECK-NEXT:    add x13, sp, #376
+; CHECK-NEXT:    ld1 { v1.b }[6], [x11]
+; CHECK-NEXT:    add x11, sp, #416
+; CHECK-NEXT:    ld1 { v6.b }[6], [x12]
+; CHECK-NEXT:    add x12, sp, #248
+; CHECK-NEXT:    ld1 { v3.b }[1], [x11]
 ; CHECK-NEXT:    mov v4.b[6], w6
-; CHECK-NEXT:    ld1 { v1.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #432
-; CHECK-NEXT:    add x9, sp, #256
-; CHECK-NEXT:    ld1 { v17.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #312
-; CHECK-NEXT:    ldr b22, [sp, #608]
-; CHECK-NEXT:    add x8, sp, #400
-; CHECK-NEXT:    ld1 { v16.b }[5], [x9]
-; CHECK-NEXT:    ld1 { v5.b }[4], [x10]
-; CHECK-NEXT:    add x9, sp, #616
-; CHECK-NEXT:    ld1 { v1.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #440
-; CHECK-NEXT:    ld1 { v22.b }[1], [x9]
+; CHECK-NEXT:    ld1 { v2.b }[4], [x13]
+; CHECK-NEXT:    add x11, sp, #304
+; CHECK-NEXT:    ld1 { v5.b }[4], [x12]
+; CHECK-NEXT:    ld1 { v7.b }[3], [x11]
+; CHECK-NEXT:    add x8, sp, #136
+; CHECK-NEXT:    add x15, sp, #384
+; CHECK-NEXT:    add x9, sp, #424
+; CHECK-NEXT:    ld1 { v0.b }[7], [x8]
+; CHECK-NEXT:    ld1 { v3.b }[2], [x9]
+; CHECK-NEXT:    ld1 { v2.b }[5], [x15]
+; CHECK-NEXT:    add x8, sp, #312
 ; CHECK-NEXT:    mov v4.b[7], w7
-; CHECK-NEXT:    ld1 { v17.b }[4], [x8]
+; CHECK-NEXT:    add x9, sp, #256
+; CHECK-NEXT:    add x10, sp, #200
+; CHECK-NEXT:    ld1 { v7.b }[4], [x8]
+; CHECK-NEXT:    ld1 { v5.b }[5], [x9]
+; CHECK-NEXT:    add x14, sp, #72
+; CHECK-NEXT:    ld1 { v1.b }[7], [x10]
+; CHECK-NEXT:    add x10, sp, #432
+; CHECK-NEXT:    add x8, sp, #392
+; CHECK-NEXT:    ld1 { v6.b }[7], [x14]
+; CHECK-NEXT:    ld1 { v3.b }[3], [x10]
+; CHECK-NEXT:    ld1 { v2.b }[6], [x8]
 ; CHECK-NEXT:    add x8, sp, #320
+; CHECK-NEXT:    add x9, sp, #264
+; CHECK-NEXT:    sshll v21.8h, v4.8b, #0
+; CHECK-NEXT:    ldr b4, [sp, #208]
+; CHECK-NEXT:    ld1 { v7.b }[5], [x8]
+; CHECK-NEXT:    ld1 { v5.b }[6], [x9]
+; CHECK-NEXT:    add x10, sp, #440
+; CHECK-NEXT:    add x8, sp, #400
+; CHECK-NEXT:    sshll v16.8h, v6.8b, #0
+; CHECK-NEXT:    sshll v6.8h, v4.8b, #0
+; CHECK-NEXT:    ld1 { v3.b }[4], [x10]
+; CHECK-NEXT:    ld1 { v2.b }[7], [x8]
+; CHECK-NEXT:    add x8, sp, #272
+; CHECK-NEXT:    add x9, sp, #328
+; CHECK-NEXT:    ldr b4, [sp, #608]
+; CHECK-NEXT:    ld1 { v7.b }[6], [x9]
+; CHECK-NEXT:    ld1 { v5.b }[7], [x8]
+; CHECK-NEXT:    add x8, sp, #616
 ; CHECK-NEXT:    add x10, sp, #448
-; CHECK-NEXT:    ldr b6, [sp, #208]
-; CHECK-NEXT:    ld1 { v5.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #624
-; CHECK-NEXT:    ldr b7, [sp, #472]
-; CHECK-NEXT:    ld1 { v22.b }[2], [x8]
-; CHECK-NEXT:    ld1 { v17.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #328
-; CHECK-NEXT:    sshll v20.8h, v4.8b, #0
-; CHECK-NEXT:    ldr b4, [sp, #480]
+; CHECK-NEXT:    ld1 { v4.b }[1], [x8]
+; CHECK-NEXT:    ldr b18, [sp, #480]
+; CHECK-NEXT:    ld1 { v3.b }[5], [x10]
+; CHECK-NEXT:    add x9, sp, #336
+; CHECK-NEXT:    ldr b17, [sp, #472]
+; CHECK-NEXT:    add x8, sp, #488
+; CHECK-NEXT:    ld1 { v7.b }[7], [x9]
+; CHECK-NEXT:    add x9, sp, #624
+; CHECK-NEXT:    ld1 { v18.b }[1], [x8]
+; CHECK-NEXT:    sshll v22.8h, v5.8b, #0
 ; CHECK-NEXT:    add x8, sp, #456
-; CHECK-NEXT:    ld1 { v5.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #632
-; CHECK-NEXT:    sshll v6.8h, v6.8b, #0
-; CHECK-NEXT:    ld1 { v22.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #488
-; CHECK-NEXT:    ld1 { v17.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #336
-; CHECK-NEXT:    ld1 { v4.b }[1], [x10]
-; CHECK-NEXT:    sshll v7.8h, v7.8b, #0
-; CHECK-NEXT:    ld1 { v5.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #640
-; CHECK-NEXT:    add x9, sp, #264
-; CHECK-NEXT:    ld1 { v22.b }[4], [x8]
+; CHECK-NEXT:    sshll v5.8h, v17.8b, #0
+; CHECK-NEXT:    ld1 { v4.b }[2], [x9]
+; CHECK-NEXT:    ld1 { v3.b }[6], [x8]
 ; CHECK-NEXT:    add x8, sp, #496
-; CHECK-NEXT:    ld1 { v16.b }[6], [x9]
-; CHECK-NEXT:    ld1 { v4.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #648
-; CHECK-NEXT:    smull v18.4s, v6.4h, v7.4h
-; CHECK-NEXT:    ldr b7, [sp, #544]
-; CHECK-NEXT:    add x9, sp, #272
-; CHECK-NEXT:    movi v6.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v22.b }[5], [x8]
+; CHECK-NEXT:    sshll v17.8h, v7.8b, #0
+; CHECK-NEXT:    add x10, sp, #632
+; CHECK-NEXT:    ld1 { v18.b }[2], [x8]
+; CHECK-NEXT:    add x9, sp, #464
 ; CHECK-NEXT:    add x8, sp, #504
-; CHECK-NEXT:    ld1 { v16.b }[7], [x9]
-; CHECK-NEXT:    ld1 { v4.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #552
-; CHECK-NEXT:    add x9, sp, #656
-; CHECK-NEXT:    ld1 { v7.b }[1], [x8]
+; CHECK-NEXT:    smull v19.4s, v6.4h, v5.4h
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v4.b }[3], [x10]
+; CHECK-NEXT:    ld1 { v3.b }[7], [x9]
+; CHECK-NEXT:    smull v6.4s, v16.4h, v17.4h
+; CHECK-NEXT:    add x9, sp, #640
+; CHECK-NEXT:    ld1 { v18.b }[3], [x8]
+; CHECK-NEXT:    smull2 v16.4s, v16.8h, v17.8h
+; CHECK-NEXT:    ldr b17, [sp, #672]
+; CHECK-NEXT:    ld1 { v4.b }[4], [x9]
+; CHECK-NEXT:    add x9, sp, #680
+; CHECK-NEXT:    ldr b20, [sp, #544]
+; CHECK-NEXT:    mov v5.s[0], v19.s[0]
 ; CHECK-NEXT:    add x8, sp, #512
-; CHECK-NEXT:    ldr b21, [sp, #672]
-; CHECK-NEXT:    ld1 { v22.b }[6], [x9]
-; CHECK-NEXT:    mov v6.s[0], v18.s[0]
-; CHECK-NEXT:    add x9, sp, #664
-; CHECK-NEXT:    ld1 { v4.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #560
-; CHECK-NEXT:    sshll v23.8h, v16.8b, #0
-; CHECK-NEXT:    ld1 { v7.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #520
-; CHECK-NEXT:    movi v19.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v22.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #528
-; CHECK-NEXT:    add x10, sp, #464
-; CHECK-NEXT:    ld1 { v4.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #568
-; CHECK-NEXT:    smull2 v18.4s, v20.8h, v23.8h
-; CHECK-NEXT:    ld1 { v7.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #680
-; CHECK-NEXT:    smlal v6.4s, v20.4h, v23.4h
-; CHECK-NEXT:    ld1 { v21.b }[1], [x8]
-; CHECK-NEXT:    sshll v20.8h, v22.8b, #0
-; CHECK-NEXT:    ldr b22, [sp, #736]
-; CHECK-NEXT:    ld1 { v4.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #576
-; CHECK-NEXT:    ldr b23, [sp, #1000]
-; CHECK-NEXT:    ld1 { v7.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #688
-; CHECK-NEXT:    sshll v24.8h, v22.8b, #0
-; CHECK-NEXT:    ld1 { v21.b }[2], [x9]
+; CHECK-NEXT:    ld1 { v17.b }[1], [x9]
+; CHECK-NEXT:    add x11, sp, #552
+; CHECK-NEXT:    add x10, sp, #648
+; CHECK-NEXT:    ld1 { v18.b }[4], [x8]
+; CHECK-NEXT:    ld1 { v20.b }[1], [x11]
+; CHECK-NEXT:    ld1 { v4.b }[5], [x10]
+; CHECK-NEXT:    add x10, sp, #688
+; CHECK-NEXT:    add x9, sp, #520
+; CHECK-NEXT:    ld1 { v17.b }[2], [x10]
+; CHECK-NEXT:    add x10, sp, #560
+; CHECK-NEXT:    smull2 v7.4s, v21.8h, v22.8h
+; CHECK-NEXT:    ld1 { v18.b }[5], [x9]
+; CHECK-NEXT:    smlal v5.4s, v21.4h, v22.4h
+; CHECK-NEXT:    ld1 { v20.b }[2], [x10]
+; CHECK-NEXT:    ldr b21, [sp, #736]
+; CHECK-NEXT:    ldr b22, [sp, #1000]
+; CHECK-NEXT:    add x8, sp, #656
 ; CHECK-NEXT:    add x9, sp, #696
-; CHECK-NEXT:    sshll v25.8h, v23.8b, #0
-; CHECK-NEXT:    add x8, sp, #536
-; CHECK-NEXT:    ldr b22, [sp, #872]
-; CHECK-NEXT:    ldr b23, [sp, #936]
-; CHECK-NEXT:    ld1 { v4.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #584
-; CHECK-NEXT:    ld1 { v17.b }[7], [x10]
-; CHECK-NEXT:    ld1 { v21.b }[3], [x9]
-; CHECK-NEXT:    ld1 { v7.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #880
-; CHECK-NEXT:    add x9, sp, #704
-; CHECK-NEXT:    smull v25.4s, v24.4h, v25.4h
-; CHECK-NEXT:    ldr b24, [sp, #744]
-; CHECK-NEXT:    ld1 { v22.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #944
-; CHECK-NEXT:    add x10, sp, #888
-; CHECK-NEXT:    ld1 { v21.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #752
-; CHECK-NEXT:    ld1 { v23.b }[1], [x8]
-; CHECK-NEXT:    ld1 { v24.b }[1], [x9]
-; CHECK-NEXT:    add x8, sp, #712
+; CHECK-NEXT:    add x11, sp, #568
+; CHECK-NEXT:    ld1 { v4.b }[6], [x8]
+; CHECK-NEXT:    add x8, sp, #528
+; CHECK-NEXT:    ld1 { v17.b }[3], [x9]
+; CHECK-NEXT:    sshll v21.8h, v21.8b, #0
+; CHECK-NEXT:    sshll v24.8h, v22.8b, #0
+; CHECK-NEXT:    ld1 { v18.b }[6], [x8]
+; CHECK-NEXT:    ld1 { v20.b }[3], [x11]
+; CHECK-NEXT:    add x10, sp, #704
+; CHECK-NEXT:    ldr b23, [sp, #808]
+; CHECK-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-NEXT:    add x9, sp, #536
+; CHECK-NEXT:    ld1 { v17.b }[4], [x10]
+; CHECK-NEXT:    add x10, sp, #576
+; CHECK-NEXT:    ldr b22, [sp, #744]
+; CHECK-NEXT:    add x11, sp, #816
+; CHECK-NEXT:    smull v24.4s, v21.4h, v24.4h
+; CHECK-NEXT:    ld1 { v18.b }[7], [x9]
+; CHECK-NEXT:    ld1 { v20.b }[4], [x10]
+; CHECK-NEXT:    add x10, sp, #752
+; CHECK-NEXT:    ld1 { v23.b }[1], [x11]
+; CHECK-NEXT:    add x9, sp, #712
+; CHECK-NEXT:    ld1 { v22.b }[1], [x10]
+; CHECK-NEXT:    ld1 { v17.b }[5], [x9]
+; CHECK-NEXT:    add x9, sp, #584
+; CHECK-NEXT:    add x10, sp, #824
+; CHECK-NEXT:    sshll v21.8h, v18.8b, #0
+; CHECK-NEXT:    ld1 { v20.b }[5], [x9]
 ; CHECK-NEXT:    add x9, sp, #760
-; CHECK-NEXT:    ld1 { v22.b }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #952
-; CHECK-NEXT:    mov v19.s[0], v25.s[0]
-; CHECK-NEXT:    ldr b25, [sp, #808]
+; CHECK-NEXT:    ldr b18, [sp, #936]
 ; CHECK-NEXT:    ld1 { v23.b }[2], [x10]
-; CHECK-NEXT:    ld1 { v21.b }[5], [x8]
-; CHECK-NEXT:    ld1 { v24.b }[2], [x9]
-; CHECK-NEXT:    add x8, sp, #816
-; CHECK-NEXT:    add x9, sp, #896
-; CHECK-NEXT:    ld1 { v25.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #960
-; CHECK-NEXT:    ld1 { v22.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #768
-; CHECK-NEXT:    ld1 { v23.b }[3], [x8]
-; CHECK-NEXT:    add x10, sp, #904
-; CHECK-NEXT:    ld1 { v24.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #824
-; CHECK-NEXT:    add x8, sp, #720
-; CHECK-NEXT:    ld1 { v25.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #968
-; CHECK-NEXT:    ld1 { v22.b }[4], [x10]
-; CHECK-NEXT:    add x10, sp, #776
-; CHECK-NEXT:    ld1 { v23.b }[4], [x9]
-; CHECK-NEXT:    ld1 { v21.b }[6], [x8]
-; CHECK-NEXT:    ld1 { v24.b }[4], [x10]
-; CHECK-NEXT:    add x8, sp, #832
-; CHECK-NEXT:    add x9, sp, #912
-; CHECK-NEXT:    ld1 { v25.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #976
-; CHECK-NEXT:    ld1 { v22.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #784
-; CHECK-NEXT:    ld1 { v23.b }[5], [x8]
-; CHECK-NEXT:    add x10, sp, #920
-; CHECK-NEXT:    ld1 { v24.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #840
-; CHECK-NEXT:    add x8, sp, #728
-; CHECK-NEXT:    ld1 { v25.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #984
-; CHECK-NEXT:    ld1 { v22.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #792
-; CHECK-NEXT:    ld1 { v23.b }[6], [x9]
-; CHECK-NEXT:    ld1 { v21.b }[7], [x8]
-; CHECK-NEXT:    ld1 { v24.b }[6], [x10]
-; CHECK-NEXT:    add x8, sp, #848
-; CHECK-NEXT:    add x9, sp, #928
-; CHECK-NEXT:    ld1 { v25.b }[5], [x8]
-; CHECK-NEXT:    add x12, sp, #72
-; CHECK-NEXT:    add x8, sp, #992
-; CHECK-NEXT:    ld1 { v22.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #800
-; CHECK-NEXT:    ld1 { v3.b }[7], [x12]
-; CHECK-NEXT:    ld1 { v23.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #592
-; CHECK-NEXT:    ld1 { v24.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #856
-; CHECK-NEXT:    ld1 { v7.b }[6], [x8]
-; CHECK-NEXT:    add x11, sp, #200
-; CHECK-NEXT:    ld1 { v25.b }[6], [x9]
-; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-NEXT:    sshll v5.8h, v5.8b, #0
-; CHECK-NEXT:    sshll v4.8h, v4.8b, #0
-; CHECK-NEXT:    sshll v21.8h, v21.8b, #0
+; CHECK-NEXT:    mov v19.s[0], v24.s[0]
+; CHECK-NEXT:    ldr b24, [sp, #872]
+; CHECK-NEXT:    ld1 { v22.b }[2], [x9]
+; CHECK-NEXT:    add x9, sp, #944
+; CHECK-NEXT:    add x11, sp, #880
+; CHECK-NEXT:    add x10, sp, #768
+; CHECK-NEXT:    ld1 { v18.b }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #832
+; CHECK-NEXT:    ld1 { v24.b }[1], [x11]
+; CHECK-NEXT:    ld1 { v23.b }[3], [x9]
+; CHECK-NEXT:    ld1 { v22.b }[3], [x10]
+; CHECK-NEXT:    add x10, sp, #952
+; CHECK-NEXT:    add x12, sp, #888
+; CHECK-NEXT:    add x9, sp, #592
+; CHECK-NEXT:    add x11, sp, #776
+; CHECK-NEXT:    ld1 { v18.b }[2], [x10]
+; CHECK-NEXT:    add x10, sp, #840
+; CHECK-NEXT:    ld1 { v24.b }[2], [x12]
+; CHECK-NEXT:    ld1 { v23.b }[4], [x10]
+; CHECK-NEXT:    ld1 { v22.b }[4], [x11]
+; CHECK-NEXT:    ld1 { v20.b }[6], [x9]
+; CHECK-NEXT:    add x9, sp, #960
+; CHECK-NEXT:    add x11, sp, #896
+; CHECK-NEXT:    add x10, sp, #784
+; CHECK-NEXT:    ld1 { v18.b }[3], [x9]
+; CHECK-NEXT:    add x9, sp, #848
+; CHECK-NEXT:    ld1 { v24.b }[3], [x11]
+; CHECK-NEXT:    ld1 { v23.b }[5], [x9]
+; CHECK-NEXT:    ld1 { v22.b }[5], [x10]
+; CHECK-NEXT:    add x10, sp, #968
+; CHECK-NEXT:    add x12, sp, #904
+; CHECK-NEXT:    add x9, sp, #600
+; CHECK-NEXT:    add x11, sp, #792
+; CHECK-NEXT:    ld1 { v18.b }[4], [x10]
+; CHECK-NEXT:    add x10, sp, #856
+; CHECK-NEXT:    ld1 { v24.b }[4], [x12]
+; CHECK-NEXT:    ld1 { v23.b }[6], [x10]
+; CHECK-NEXT:    ld1 { v22.b }[6], [x11]
+; CHECK-NEXT:    ld1 { v20.b }[7], [x9]
+; CHECK-NEXT:    add x9, sp, #976
+; CHECK-NEXT:    add x11, sp, #912
+; CHECK-NEXT:    add x10, sp, #800
+; CHECK-NEXT:    ld1 { v18.b }[5], [x9]
+; CHECK-NEXT:    add x9, sp, #864
+; CHECK-NEXT:    ld1 { v24.b }[5], [x11]
+; CHECK-NEXT:    ld1 { v23.b }[7], [x9]
+; CHECK-NEXT:    add x9, sp, #720
+; CHECK-NEXT:    ld1 { v22.b }[7], [x10]
+; CHECK-NEXT:    add x10, sp, #984
+; CHECK-NEXT:    ld1 { v17.b }[6], [x9]
+; CHECK-NEXT:    add x9, sp, #920
+; CHECK-NEXT:    ld1 { v18.b }[6], [x10]
+; CHECK-NEXT:    ld1 { v24.b }[6], [x9]
+; CHECK-NEXT:    add x10, sp, #728
+; CHECK-NEXT:    add x8, sp, #664
+; CHECK-NEXT:    sshll v20.8h, v20.8b, #0
 ; CHECK-NEXT:    sshll v22.8h, v22.8b, #0
 ; CHECK-NEXT:    sshll v23.8h, v23.8b, #0
-; CHECK-NEXT:    add x8, sp, #600
-; CHECK-NEXT:    sshll v24.8h, v24.8b, #0
-; CHECK-NEXT:    add x9, sp, #864
-; CHECK-NEXT:    ld1 { v2.b }[7], [x11]
-; CHECK-NEXT:    ld1 { v7.b }[7], [x8]
-; CHECK-NEXT:    ld1 { v25.b }[7], [x9]
-; CHECK-NEXT:    smull v16.4s, v3.4h, v5.4h
-; CHECK-NEXT:    smull2 v3.4s, v3.8h, v5.8h
-; CHECK-NEXT:    smull v5.4s, v21.4h, v23.4h
-; CHECK-NEXT:    smull2 v21.4s, v21.8h, v23.8h
-; CHECK-NEXT:    smull2 v23.4s, v20.8h, v22.8h
-; CHECK-NEXT:    smlal v19.4s, v4.4h, v24.4h
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NEXT:    sshll v17.8h, v17.8b, #0
+; CHECK-NEXT:    add x9, sp, #992
+; CHECK-NEXT:    ld1 { v17.b }[7], [x10]
+; CHECK-NEXT:    add x10, sp, #928
+; CHECK-NEXT:    ld1 { v18.b }[7], [x9]
+; CHECK-NEXT:    ld1 { v4.b }[7], [x8]
+; CHECK-NEXT:    ld1 { v24.b }[7], [x10]
+; CHECK-NEXT:    smlal v19.4s, v21.4h, v22.4h
+; CHECK-NEXT:    smull2 v21.4s, v21.8h, v22.8h
+; CHECK-NEXT:    smull v22.4s, v20.4h, v23.4h
+; CHECK-NEXT:    smull2 v20.4s, v20.8h, v23.8h
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    sshll v7.8h, v7.8b, #0
-; CHECK-NEXT:    sshll v25.8h, v25.8b, #0
-; CHECK-NEXT:    smlal2 v3.4s, v2.8h, v17.8h
-; CHECK-NEXT:    smlal v16.4s, v2.4h, v17.4h
-; CHECK-NEXT:    smlal2 v23.4s, v4.8h, v24.8h
-; CHECK-NEXT:    smlal2 v18.4s, v0.8h, v1.8h
-; CHECK-NEXT:    smlal v6.4s, v0.4h, v1.4h
-; CHECK-NEXT:    smlal v19.4s, v20.4h, v22.4h
-; CHECK-NEXT:    smlal2 v21.4s, v7.8h, v25.8h
-; CHECK-NEXT:    smlal v5.4s, v7.4h, v25.4h
-; CHECK-NEXT:    add v0.4s, v18.4s, v3.4s
-; CHECK-NEXT:    add v1.4s, v6.4s, v16.4s
-; CHECK-NEXT:    add v2.4s, v23.4s, v21.4s
-; CHECK-NEXT:    add v3.4s, v19.4s, v5.4s
+; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-NEXT:    sshll v17.8h, v17.8b, #0
+; CHECK-NEXT:    sshll v18.8h, v18.8b, #0
+; CHECK-NEXT:    sshll v4.8h, v4.8b, #0
+; CHECK-NEXT:    sshll v23.8h, v24.8b, #0
+; CHECK-NEXT:    smlal2 v16.4s, v1.8h, v3.8h
+; CHECK-NEXT:    smlal v6.4s, v1.4h, v3.4h
+; CHECK-NEXT:    smlal2 v7.4s, v0.8h, v2.8h
+; CHECK-NEXT:    smlal v5.4s, v0.4h, v2.4h
+; CHECK-NEXT:    smlal2 v20.4s, v17.8h, v18.8h
+; CHECK-NEXT:    smlal v22.4s, v17.4h, v18.4h
+; CHECK-NEXT:    smlal2 v21.4s, v4.8h, v23.8h
+; CHECK-NEXT:    smlal v19.4s, v4.4h, v23.4h
+; CHECK-NEXT:    add v0.4s, v7.4s, v16.4s
+; CHECK-NEXT:    add v1.4s, v5.4s, v6.4s
+; CHECK-NEXT:    add v2.4s, v21.4s, v20.4s
+; CHECK-NEXT:    add v3.4s, v19.4s, v22.4s
 ; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    add v1.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
@@ -2050,10 +2050,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
 ; CHECK-NEXT:    ld1 { v3.b }[2], [x10]
 ; CHECK-NEXT:    ld1 { v5.b }[2], [x8]
 ; CHECK-NEXT:    add x8, sp, #176
-; CHECK-NEXT:    ldr b6, [sp, #544]
+; CHECK-NEXT:    ldr b6, [sp, #672]
 ; CHECK-NEXT:    ld1 { v0.b }[4], [x12]
-; CHECK-NEXT:    add x14, sp, #552
-; CHECK-NEXT:    ldr b7, [sp, #672]
+; CHECK-NEXT:    add x14, sp, #680
+; CHECK-NEXT:    ldr b7, [sp, #544]
 ; CHECK-NEXT:    ld1 { v2.b }[4], [x8]
 ; CHECK-NEXT:    add x13, sp, #40
 ; CHECK-NEXT:    ld1 { v6.b }[1], [x14]
@@ -2061,7 +2061,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
 ; CHECK-NEXT:    add x11, sp, #128
 ; CHECK-NEXT:    ld1 { v3.b }[3], [x13]
 ; CHECK-NEXT:    ld1 { v0.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #680
+; CHECK-NEXT:    add x9, sp, #552
 ; CHECK-NEXT:    add x13, sp, #184
 ; CHECK-NEXT:    ld1 { v7.b }[1], [x9]
 ; CHECK-NEXT:    ld1 { v2.b }[5], [x13]
@@ -2070,26 +2070,26 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
 ; CHECK-NEXT:    ld1 { v4.b }[2], [x13]
 ; CHECK-NEXT:    add x10, sp, #136
 ; CHECK-NEXT:    ld1 { v0.b }[6], [x11]
-; CHECK-NEXT:    add x11, sp, #560
+; CHECK-NEXT:    add x11, sp, #688
 ; CHECK-NEXT:    ld1 { v5.b }[3], [x15]
 ; CHECK-NEXT:    ld1 { v6.b }[2], [x11]
-; CHECK-NEXT:    add x11, sp, #688
+; CHECK-NEXT:    add x11, sp, #560
 ; CHECK-NEXT:    mov v1.b[3], w3
 ; CHECK-NEXT:    ld1 { v7.b }[2], [x11]
 ; CHECK-NEXT:    add x9, sp, #632
 ; CHECK-NEXT:    add x11, sp, #512
 ; CHECK-NEXT:    ld1 { v0.b }[7], [x10]
 ; CHECK-NEXT:    ld1 { v4.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #568
-; CHECK-NEXT:    add x10, sp, #696
+; CHECK-NEXT:    add x9, sp, #696
+; CHECK-NEXT:    add x10, sp, #568
 ; CHECK-NEXT:    ld1 { v6.b }[3], [x9]
 ; CHECK-NEXT:    ld1 { v5.b }[4], [x11]
 ; CHECK-NEXT:    ld1 { v7.b }[3], [x10]
 ; CHECK-NEXT:    add x9, sp, #640
 ; CHECK-NEXT:    mov v1.b[4], w4
 ; CHECK-NEXT:    ld1 { v4.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #576
-; CHECK-NEXT:    add x10, sp, #704
+; CHECK-NEXT:    add x9, sp, #704
+; CHECK-NEXT:    add x10, sp, #576
 ; CHECK-NEXT:    add x11, sp, #520
 ; CHECK-NEXT:    ld1 { v6.b }[4], [x9]
 ; CHECK-NEXT:    ldr b18, [sp, #736]
@@ -2101,8 +2101,8 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
 ; CHECK-NEXT:    add x9, sp, #648
 ; CHECK-NEXT:    ld1 { v3.b }[4], [x8]
 ; CHECK-NEXT:    add x10, sp, #528
-; CHECK-NEXT:    add x11, sp, #584
-; CHECK-NEXT:    add x12, sp, #712
+; CHECK-NEXT:    add x11, sp, #712
+; CHECK-NEXT:    add x12, sp, #584
 ; CHECK-NEXT:    sshll v18.8h, v18.8b, #0
 ; CHECK-NEXT:    mov v1.b[5], w5
 ; CHECK-NEXT:    ld1 { v6.b }[5], [x11]
@@ -2114,8 +2114,8 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
 ; CHECK-NEXT:    ld1 { v3.b }[5], [x14]
 ; CHECK-NEXT:    add x9, sp, #656
 ; CHECK-NEXT:    add x10, sp, #536
-; CHECK-NEXT:    add x11, sp, #592
-; CHECK-NEXT:    add x12, sp, #720
+; CHECK-NEXT:    add x11, sp, #720
+; CHECK-NEXT:    add x12, sp, #592
 ; CHECK-NEXT:    sshll v18.4s, v18.4h, #0
 ; CHECK-NEXT:    ldr b16, [sp, #208]
 ; CHECK-NEXT:    ld1 { v6.b }[6], [x11]
@@ -2127,8 +2127,8 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
 ; CHECK-NEXT:    sshll v16.8h, v16.8b, #0
 ; CHECK-NEXT:    ld1 { v3.b }[6], [x8]
 ; CHECK-NEXT:    add x8, sp, #664
-; CHECK-NEXT:    add x9, sp, #600
-; CHECK-NEXT:    add x10, sp, #728
+; CHECK-NEXT:    add x9, sp, #728
+; CHECK-NEXT:    add x10, sp, #600
 ; CHECK-NEXT:    mov v17.s[0], v18.s[0]
 ; CHECK-NEXT:    ld1 { v6.b }[7], [x9]
 ; CHECK-NEXT:    ld1 { v7.b }[7], [x10]
@@ -2151,7 +2151,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
 ; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
 ; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
 ; CHECK-NEXT:    saddl2 v16.4s, v7.8h, v6.8h
-; CHECK-NEXT:    saddl2 v5.4s, v4.8h, v5.8h
+; CHECK-NEXT:    saddl2 v5.4s, v5.8h, v4.8h
 ; CHECK-NEXT:    saddl v6.4s, v7.4h, v6.4h
 ; CHECK-NEXT:    saddw v4.4s, v17.4s, v4.4h
 ; CHECK-NEXT:    saddl2 v17.4s, v1.8h, v0.8h
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 161c18cec17ce4..42ea425f99c0a6 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1861,12 +1861,10 @@ define i64 @muland_demand(i64 %x) nounwind {
 ;
 ; RV32IM-LABEL: muland_demand:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    slli a1, a1, 2
-; RV32IM-NEXT:    srli a1, a1, 2
 ; RV32IM-NEXT:    andi a0, a0, -8
 ; RV32IM-NEXT:    li a2, 12
-; RV32IM-NEXT:    mulhu a3, a0, a2
 ; RV32IM-NEXT:    mul a1, a1, a2
+; RV32IM-NEXT:    mulhu a3, a0, a2
 ; RV32IM-NEXT:    add a1, a3, a1
 ; RV32IM-NEXT:    mul a0, a0, a2
 ; RV32IM-NEXT:    ret
@@ -1881,9 +1879,7 @@ define i64 @muland_demand(i64 %x) nounwind {
 ;
 ; RV64IM-LABEL: muland_demand:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    li a1, -29
-; RV64IM-NEXT:    srli a1, a1, 2
-; RV64IM-NEXT:    and a0, a0, a1
+; RV64IM-NEXT:    andi a0, a0, -8
 ; RV64IM-NEXT:    li a1, 12
 ; RV64IM-NEXT:    mul a0, a0, a1
 ; RV64IM-NEXT:    ret
@@ -1898,7 +1894,6 @@ define i64 @mulzext_demand(i32 signext %x) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    li a3, 3
-; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    li a2, 0
 ; RV32I-NEXT:    call __muldi3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -1914,16 +1909,12 @@ define i64 @mulzext_demand(i32 signext %x) nounwind {
 ;
 ; RV64I-LABEL: mulzext_demand:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    li a1, 3
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    tail __muldi3
 ;
 ; RV64IM-LABEL: mulzext_demand:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    slli a0, a0, 32
-; RV64IM-NEXT:    srli a0, a0, 32
 ; RV64IM-NEXT:    li a1, 3
 ; RV64IM-NEXT:    slli a1, a1, 32
 ; RV64IM-NEXT:    mul a0, a0, a1
@@ -1936,17 +1927,13 @@ define i64 @mulzext_demand(i32 signext %x) nounwind {
 define i32 @mulfshl_demand(i32 signext %x) nounwind {
 ; RV32I-LABEL: mulfshl_demand:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    srli a1, a0, 11
-; RV32I-NEXT:    slli a0, a0, 21
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 11
 ; RV32I-NEXT:    lui a1, 92808
 ; RV32I-NEXT:    tail __mulsi3
 ;
 ; RV32IM-LABEL: mulfshl_demand:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    srli a1, a0, 11
-; RV32IM-NEXT:    slli a0, a0, 21
-; RV32IM-NEXT:    or a0, a0, a1
+; RV32IM-NEXT:    srli a0, a0, 11
 ; RV32IM-NEXT:    lui a1, 92808
 ; RV32IM-NEXT:    mul a0, a0, a1
 ; RV32IM-NEXT:    ret
@@ -1955,9 +1942,7 @@ define i32 @mulfshl_demand(i32 signext %x) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    srliw a1, a0, 11
-; RV64I-NEXT:    slliw a0, a0, 21
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    srliw a0, a0, 11
 ; RV64I-NEXT:    lui a1, 92808
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -1966,9 +1951,7 @@ define i32 @mulfshl_demand(i32 signext %x) nounwind {
 ;
 ; RV64IM-LABEL: mulfshl_demand:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    srliw a1, a0, 11
-; RV64IM-NEXT:    slli a0, a0, 21
-; RV64IM-NEXT:    or a0, a0, a1
+; RV64IM-NEXT:    srliw a0, a0, 11
 ; RV64IM-NEXT:    lui a1, 92808
 ; RV64IM-NEXT:    mulw a0, a0, a1
 ; RV64IM-NEXT:    ret
@@ -1980,26 +1963,11 @@ define i32 @mulfshl_demand(i32 signext %x) nounwind {
 define i32 @mulor_demand(i32 signext %x, i32 signext %y) nounwind {
 ; RV32I-LABEL: mulor_demand:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lui a1, 2560
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    call __mulsi3
-; RV32I-NEXT:    or a0, a0, s0
 ; RV32I-NEXT:    lui a1, 92808
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    tail __mulsi3
 ;
 ; RV32IM-LABEL: mulor_demand:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lui a2, 2560
-; RV32IM-NEXT:    mul a1, a1, a2
-; RV32IM-NEXT:    or a0, a1, a0
 ; RV32IM-NEXT:    lui a1, 92808
 ; RV32IM-NEXT:    mul a0, a0, a1
 ; RV32IM-NEXT:    ret
@@ -2008,25 +1976,14 @@ define i32 @mulor_demand(i32 signext %x, i32 signext %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lui a1, 2560
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    or a0, a0, s0
 ; RV64I-NEXT:    lui a1, 92808
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ;
 ; RV64IM-LABEL: mulor_demand:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lui a2, 2560
-; RV64IM-NEXT:    mul a1, a1, a2
-; RV64IM-NEXT:    or a0, a1, a0
 ; RV64IM-NEXT:    lui a1, 92808
 ; RV64IM-NEXT:    mulw a0, a0, a1
 ; RV64IM-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
index 9f06a9dd124cef..ad9e763a6a0af1 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
@@ -638,8 +638,6 @@ define i64 @zext_mul288(i32 signext %a) {
 define i64 @zext_mul12884901888(i32 signext %a) {
 ; RV64I-LABEL: zext_mul12884901888:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    li a1, 3
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    mul a0, a0, a1
@@ -647,7 +645,6 @@ define i64 @zext_mul12884901888(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul12884901888:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    andi a0, a0, -1
 ; RV64ZBA-NEXT:    sh1add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
 ; RV64ZBA-NEXT:    ret
@@ -661,8 +658,6 @@ define i64 @zext_mul12884901888(i32 signext %a) {
 define i64 @zext_mul21474836480(i32 signext %a) {
 ; RV64I-LABEL: zext_mul21474836480:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    li a1, 5
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    mul a0, a0, a1
@@ -670,7 +665,6 @@ define i64 @zext_mul21474836480(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul21474836480:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    andi a0, a0, -1
 ; RV64ZBA-NEXT:    sh2add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
 ; RV64ZBA-NEXT:    ret
@@ -684,8 +678,6 @@ define i64 @zext_mul21474836480(i32 signext %a) {
 define i64 @zext_mul38654705664(i32 signext %a) {
 ; RV64I-LABEL: zext_mul38654705664:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    li a1, 9
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    mul a0, a0, a1
@@ -693,7 +685,6 @@ define i64 @zext_mul38654705664(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul38654705664:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    andi a0, a0, -1
 ; RV64ZBA-NEXT:    sh3add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
 ; RV64ZBA-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index 0efc45b99289a7..e9a160a2b9bd4c 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -855,8 +855,6 @@ define i64 @zext_mul288(i32 signext %a) {
 define i64 @zext_mul12884901888(i32 signext %a) {
 ; RV64I-LABEL: zext_mul12884901888:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    li a1, 3
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    mul a0, a0, a1
@@ -864,7 +862,6 @@ define i64 @zext_mul12884901888(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul12884901888:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    andi a0, a0, -1
 ; RV64ZBA-NEXT:    sh1add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
 ; RV64ZBA-NEXT:    ret
@@ -878,8 +875,6 @@ define i64 @zext_mul12884901888(i32 signext %a) {
 define i64 @zext_mul21474836480(i32 signext %a) {
 ; RV64I-LABEL: zext_mul21474836480:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    li a1, 5
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    mul a0, a0, a1
@@ -887,7 +882,6 @@ define i64 @zext_mul21474836480(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul21474836480:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    andi a0, a0, -1
 ; RV64ZBA-NEXT:    sh2add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
 ; RV64ZBA-NEXT:    ret
@@ -901,8 +895,6 @@ define i64 @zext_mul21474836480(i32 signext %a) {
 define i64 @zext_mul38654705664(i32 signext %a) {
 ; RV64I-LABEL: zext_mul38654705664:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    li a1, 9
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    mul a0, a0, a1
@@ -910,7 +902,6 @@ define i64 @zext_mul38654705664(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul38654705664:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    andi a0, a0, -1
 ; RV64ZBA-NEXT:    sh3add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
 ; RV64ZBA-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll
index f707cb31e3eced..8cf78551d28f98 100644
--- a/llvm/test/CodeGen/RISCV/sextw-removal.ll
+++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll
@@ -1047,25 +1047,25 @@ define signext i32 @bug(i32 signext %x) {
 ; CHECK-NEXT:    seqz a2, a2
 ; CHECK-NEXT:    slli a3, a2, 3
 ; CHECK-NEXT:    sllw a1, a1, a3
-; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    negw a2, a2
 ; CHECK-NEXT:    andi a2, a2, -8
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    srliw a2, a1, 28
 ; CHECK-NEXT:    seqz a2, a2
 ; CHECK-NEXT:    slli a3, a2, 2
 ; CHECK-NEXT:    sllw a1, a1, a3
-; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    negw a2, a2
 ; CHECK-NEXT:    andi a2, a2, -4
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    srliw a2, a1, 30
 ; CHECK-NEXT:    seqz a2, a2
 ; CHECK-NEXT:    slli a3, a2, 1
 ; CHECK-NEXT:    sllw a1, a1, a3
-; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    negw a2, a2
 ; CHECK-NEXT:    andi a2, a2, -2
 ; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    srai a1, a1, 31
 ; CHECK-NEXT:    not a1, a1
+; CHECK-NEXT:    srli a1, a1, 31
 ; CHECK-NEXT:    addw a0, a0, a1
 ; CHECK-NEXT:  .LBB18_4: # %cleanup
 ; CHECK-NEXT:    ret
@@ -1087,28 +1087,27 @@ define signext i32 @bug(i32 signext %x) {
 ; NOREMOVAL-NEXT:    seqz a2, a2
 ; NOREMOVAL-NEXT:    slli a3, a2, 3
 ; NOREMOVAL-NEXT:    sllw a1, a1, a3
-; NOREMOVAL-NEXT:    neg a2, a2
+; NOREMOVAL-NEXT:    negw a2, a2
 ; NOREMOVAL-NEXT:    andi a2, a2, -8
 ; NOREMOVAL-NEXT:    add a0, a0, a2
 ; NOREMOVAL-NEXT:    srliw a2, a1, 28
 ; NOREMOVAL-NEXT:    seqz a2, a2
 ; NOREMOVAL-NEXT:    slli a3, a2, 2
 ; NOREMOVAL-NEXT:    sllw a1, a1, a3
-; NOREMOVAL-NEXT:    neg a2, a2
+; NOREMOVAL-NEXT:    negw a2, a2
 ; NOREMOVAL-NEXT:    andi a2, a2, -4
 ; NOREMOVAL-NEXT:    add a0, a0, a2
 ; NOREMOVAL-NEXT:    srliw a2, a1, 30
 ; NOREMOVAL-NEXT:    seqz a2, a2
 ; NOREMOVAL-NEXT:    slli a3, a2, 1
 ; NOREMOVAL-NEXT:    sllw a1, a1, a3
-; NOREMOVAL-NEXT:    neg a2, a2
+; NOREMOVAL-NEXT:    negw a2, a2
 ; NOREMOVAL-NEXT:    andi a2, a2, -2
 ; NOREMOVAL-NEXT:    add a0, a0, a2
-; NOREMOVAL-NEXT:    srai a1, a1, 31
 ; NOREMOVAL-NEXT:    not a1, a1
-; NOREMOVAL-NEXT:    add a0, a0, a1
+; NOREMOVAL-NEXT:    srli a1, a1, 31
+; NOREMOVAL-NEXT:    addw a0, a0, a1
 ; NOREMOVAL-NEXT:  .LBB18_4: # %cleanup
-; NOREMOVAL-NEXT:    sext.w a0, a0
 ; NOREMOVAL-NEXT:    ret
 entry:
   %tobool.not = icmp eq i32 %x, 0
diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll
index 49ce2455ae8c7a..4ed00a9d66bd35 100644
--- a/llvm/test/CodeGen/X86/combine-srem.ll
+++ b/llvm/test/CodeGen/X86/combine-srem.ll
@@ -329,7 +329,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
 ; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    psrad $3, %xmm2
-; SSE-NEXT:    psrad $1, %xmm1
+; SSE-NEXT:    psrld $1, %xmm1
 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
 ; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -351,7 +351,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
 ; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm3
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm3
-; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index dcded7a877abb1..1f82c4a5a2d92b 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -1173,13 +1173,14 @@ define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
 ;
 ; SSE41-LABEL: mul_v4i64_zero_lower:
 ; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
 ; SSE41-NEXT:    psrlq $32, %xmm2
-; SSE41-NEXT:    pmuludq %xmm3, %xmm2
+; SSE41-NEXT:    pmuludq %xmm0, %xmm2
 ; SSE41-NEXT:    psrlq $32, %xmm1
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT:    pmuludq %xmm1, %xmm0
-; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; SSE41-NEXT:    pmuludq %xmm1, %xmm3
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
+; SSE41-NEXT:    movaps %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: mul_v4i64_zero_lower:
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 62051d17099403..f3f7f0515e3060 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1863,7 +1863,7 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    psrld $16, %xmm0
+; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
 ; X86-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    psllq $32, %xmm0
 ; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
@@ -1884,7 +1884,7 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
 ; X64-SSE:       # %bb.0: # %entry
 ; X64-SSE-NEXT:    movq c(%rip), %rax
 ; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    psrld $16, %xmm0
+; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
 ; X64-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT:    psllq $32, %xmm0
 ; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)



More information about the llvm-commits mailing list