[llvm] [RISCV] Porting hasAllNBitUsers to RISCV GISel for instruction select (PR #124678)
Luke Quinn via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 3 06:02:08 PST 2025
https://github.com/lquinn2015 updated https://github.com/llvm/llvm-project/pull/124678
>From 84380ea5a42784e923069a8812bca3854b846be1 Mon Sep 17 00:00:00 2001
From: Luke Quinn <quic_lquinn at quicinc.com>
Date: Mon, 3 Feb 2025 05:54:51 -0800
Subject: [PATCH 1/3] [RISCV] PreTest: RISCVOptWInstrs pass for GISel, add 4
 tests required to show baseline checks
Signed-off-by: Luke Quinn <quic_lquinn at quicinc.com>
---
.../RISCV/GlobalISel/div-by-constant.ll | 811 ++
.../CodeGen/RISCV/GlobalISel/rotl-rotr.ll | 3412 +++++
llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll | 962 ++
...lar-shift-by-byte-multiple-legalization.ll | 10982 ++++++++++++++++
4 files changed, 16167 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
new file mode 100644
index 000000000000000..e3616a79add9f29
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
@@ -0,0 +1,811 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck -check-prefixes=RV32,RV32IM %s
+; RUN: llc -mtriple=riscv32 -mattr=+m,+zba,+zbb \
+; RUN: -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck -check-prefixes=RV32,RV32IMZB %s
+; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck -check-prefixes=RV64,RV64IM %s
+; RUN: llc -mtriple=riscv64 -mattr=+m,+zba,+zbb \
+; RUN: -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck -check-prefixes=RV64,RV64IMZB %s
+
+; Test that there is a single shift after the mul and no addition.
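+; For reference (the usual round-up-multiplier expansion): with
+; m = ceil(2^34 / 5) = 0xCCCCCCCD, floor(x / 5) = (x * m) >> 34 for every
+; unsigned 32-bit x, which appears below as a mulhu (>> 32) followed by a
+; single srli by 2.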
+define i32 @udiv_constant_no_add(i32 %a) nounwind {
+; RV32-LABEL: udiv_constant_no_add:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a1, 838861
+; RV32-NEXT: addi a1, a1, -819
+; RV32-NEXT: mulhu a0, a0, a1
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: ret
+;
+; RV64IM-LABEL: udiv_constant_no_add:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a0, a0, 32
+; RV64IM-NEXT: lui a1, 205
+; RV64IM-NEXT: srli a0, a0, 32
+; RV64IM-NEXT: addiw a1, a1, -819
+; RV64IM-NEXT: slli a1, a1, 12
+; RV64IM-NEXT: addi a1, a1, -819
+; RV64IM-NEXT: mul a0, a0, a1
+; RV64IM-NEXT: srli a0, a0, 34
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: udiv_constant_no_add:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: zext.w a0, a0
+; RV64IMZB-NEXT: lui a1, 838861
+; RV64IMZB-NEXT: addi a1, a1, -819
+; RV64IMZB-NEXT: zext.w a1, a1
+; RV64IMZB-NEXT: mul a0, a0, a1
+; RV64IMZB-NEXT: srli a0, a0, 34
+; RV64IMZB-NEXT: ret
+ %1 = udiv i32 %a, 5
+ ret i32 %1
+}
+
+; This constant requires a sub, srli, add sequence after the mul.
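+; For reference: m = ceil(2^35 / 7) = 0x124924925 does not fit in 32 bits, so
+; with t = mulhu(x, 0x24924925) the expansion computes
+; floor(x / 7) = (((x - t) >> 1) + t) >> 2, i.e. the mulhu, sub, srli 1, add,
+; srli 2 sequence in the checks below.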
+define i32 @udiv_constant_add(i32 %a) nounwind {
+; RV32-LABEL: udiv_constant_add:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a1, 149797
+; RV32-NEXT: addi a1, a1, -1755
+; RV32-NEXT: mulhu a1, a0, a1
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: srli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: ret
+;
+; RV64IM-LABEL: udiv_constant_add:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: lui a1, 149797
+; RV64IM-NEXT: slli a2, a0, 32
+; RV64IM-NEXT: addiw a1, a1, -1755
+; RV64IM-NEXT: srli a2, a2, 32
+; RV64IM-NEXT: mul a1, a2, a1
+; RV64IM-NEXT: srli a1, a1, 32
+; RV64IM-NEXT: sub a0, a0, a1
+; RV64IM-NEXT: srliw a0, a0, 1
+; RV64IM-NEXT: add a0, a0, a1
+; RV64IM-NEXT: srliw a0, a0, 2
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: udiv_constant_add:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: lui a1, 149797
+; RV64IMZB-NEXT: addiw a1, a1, -1755
+; RV64IMZB-NEXT: zext.w a2, a0
+; RV64IMZB-NEXT: mul a1, a2, a1
+; RV64IMZB-NEXT: srli a1, a1, 32
+; RV64IMZB-NEXT: sub a0, a0, a1
+; RV64IMZB-NEXT: srliw a0, a0, 1
+; RV64IMZB-NEXT: add a0, a0, a1
+; RV64IMZB-NEXT: srliw a0, a0, 2
+; RV64IMZB-NEXT: ret
+ %1 = udiv i32 %a, 7
+ ret i32 %1
+}
+
+define i64 @udiv64_constant_no_add(i64 %a) nounwind {
+; RV32-LABEL: udiv64_constant_no_add:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a2, 838861
+; RV32-NEXT: mulhu a3, a0, zero
+; RV32-NEXT: addi a4, a2, -819
+; RV32-NEXT: addi a2, a2, -820
+; RV32-NEXT: mul a5, a1, a4
+; RV32-NEXT: mul a6, a0, a2
+; RV32-NEXT: mulhu a7, a0, a4
+; RV32-NEXT: mul t0, zero, a4
+; RV32-NEXT: mul t1, a1, a2
+; RV32-NEXT: mulhu t2, a1, a4
+; RV32-NEXT: mulhu a0, a0, a2
+; RV32-NEXT: mulhu a1, a1, a2
+; RV32-NEXT: mul a2, zero, a2
+; RV32-NEXT: mulhu a4, zero, a4
+; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: add a2, t0, a2
+; RV32-NEXT: add t0, t0, t1
+; RV32-NEXT: add a1, a4, a1
+; RV32-NEXT: sltu a4, a5, a6
+; RV32-NEXT: add a5, a5, a7
+; RV32-NEXT: sltu a6, t0, t1
+; RV32-NEXT: sltiu t1, t0, 0
+; RV32-NEXT: add t0, t0, t2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: sltu a2, a5, a7
+; RV32-NEXT: add a6, a6, t1
+; RV32-NEXT: sltu a5, t0, t2
+; RV32-NEXT: add t0, t0, a0
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: add a2, a4, a2
+; RV32-NEXT: add a5, a6, a5
+; RV32-NEXT: sltu a0, t0, a0
+; RV32-NEXT: add a0, a5, a0
+; RV32-NEXT: add t0, t0, a2
+; RV32-NEXT: sltu a2, t0, a2
+; RV32-NEXT: srli a3, t0, 2
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a1, 30
+; RV32-NEXT: or a0, a3, a0
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv64_constant_no_add:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 1035469
+; RV64-NEXT: addi a1, a1, -819
+; RV64-NEXT: slli a1, a1, 12
+; RV64-NEXT: addi a1, a1, -819
+; RV64-NEXT: slli a1, a1, 12
+; RV64-NEXT: addi a1, a1, -819
+; RV64-NEXT: slli a1, a1, 12
+; RV64-NEXT: addi a1, a1, -819
+; RV64-NEXT: mulhu a0, a0, a1
+; RV64-NEXT: srli a0, a0, 2
+; RV64-NEXT: ret
+ %1 = udiv i64 %a, 5
+ ret i64 %1
+}
+
+define i64 @udiv64_constant_add(i64 %a) nounwind {
+; RV32-LABEL: udiv64_constant_add:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a2, 599186
+; RV32-NEXT: lui a3, 149797
+; RV32-NEXT: mulhu a4, a0, zero
+; RV32-NEXT: addi a2, a2, 1171
+; RV32-NEXT: addi a3, a3, -1756
+; RV32-NEXT: mul a5, a1, a2
+; RV32-NEXT: mul a6, a0, a3
+; RV32-NEXT: mulhu a7, a0, a2
+; RV32-NEXT: mul t0, zero, a2
+; RV32-NEXT: mulhu t1, zero, a2
+; RV32-NEXT: mulhu t2, a1, a3
+; RV32-NEXT: add t1, t1, t2
+; RV32-NEXT: mul t2, zero, a3
+; RV32-NEXT: add t2, t0, t2
+; RV32-NEXT: add t1, t2, t1
+; RV32-NEXT: mul t2, a1, a3
+; RV32-NEXT: mulhu a2, a1, a2
+; RV32-NEXT: mulhu a3, a0, a3
+; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: add t0, t0, t2
+; RV32-NEXT: sltu a6, a5, a6
+; RV32-NEXT: add a5, a5, a7
+; RV32-NEXT: sltu t2, t0, t2
+; RV32-NEXT: sltu a5, a5, a7
+; RV32-NEXT: sltiu a7, t0, 0
+; RV32-NEXT: add t0, t0, a2
+; RV32-NEXT: add a7, t2, a7
+; RV32-NEXT: sltu a2, t0, a2
+; RV32-NEXT: add t0, t0, a3
+; RV32-NEXT: add a4, t1, a4
+; RV32-NEXT: add a5, a6, a5
+; RV32-NEXT: add a2, a7, a2
+; RV32-NEXT: sltu a3, t0, a3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add t0, t0, a5
+; RV32-NEXT: sltu a3, t0, a5
+; RV32-NEXT: sub a5, a0, t0
+; RV32-NEXT: sltu a0, a0, t0
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: sub a1, a1, a0
+; RV32-NEXT: srli a5, a5, 1
+; RV32-NEXT: add a2, a4, a2
+; RV32-NEXT: sub a1, a1, a2
+; RV32-NEXT: slli a0, a1, 31
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: or a0, a5, a0
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a0, a0, t0
+; RV32-NEXT: sltu a2, a0, t0
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: slli a2, a1, 30
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv64_constant_add:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a1, %hi(.LCPI3_0)
+; RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1)
+; RV64-NEXT: mulhu a1, a0, a1
+; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: srli a0, a0, 1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: srli a0, a0, 2
+; RV64-NEXT: ret
+ %1 = udiv i64 %a, 7
+ ret i64 %1
+}
+
+define i8 @udiv8_constant_no_add(i8 %a) nounwind {
+; RV32-LABEL: udiv8_constant_no_add:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: li a1, 205
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: srli a0, a0, 10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv8_constant_no_add:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: li a1, 205
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: srli a0, a0, 10
+; RV64-NEXT: ret
+ %1 = udiv i8 %a, 5
+ ret i8 %1
+}
+
+define i8 @udiv8_constant_add(i8 %a) nounwind {
+; RV32-LABEL: udiv8_constant_add:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 37
+; RV32-NEXT: andi a2, a0, 255
+; RV32-NEXT: mul a1, a2, a1
+; RV32-NEXT: srli a1, a1, 8
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: srli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv8_constant_add:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 37
+; RV64-NEXT: andi a2, a0, 255
+; RV64-NEXT: mul a1, a2, a1
+; RV64-NEXT: srli a1, a1, 8
+; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: srli a0, a0, 1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: srli a0, a0, 2
+; RV64-NEXT: ret
+ %1 = udiv i8 %a, 7
+ ret i8 %1
+}
+
+define i16 @udiv16_constant_no_add(i16 %a) nounwind {
+; RV32IM-LABEL: udiv16_constant_no_add:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: lui a1, 13
+; RV32IM-NEXT: srli a0, a0, 16
+; RV32IM-NEXT: addi a1, a1, -819
+; RV32IM-NEXT: mul a0, a0, a1
+; RV32IM-NEXT: srli a0, a0, 18
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: udiv16_constant_no_add:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: zext.h a0, a0
+; RV32IMZB-NEXT: lui a1, 13
+; RV32IMZB-NEXT: addi a1, a1, -819
+; RV32IMZB-NEXT: mul a0, a0, a1
+; RV32IMZB-NEXT: srli a0, a0, 18
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: udiv16_constant_no_add:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: lui a1, 13
+; RV64IM-NEXT: srli a0, a0, 48
+; RV64IM-NEXT: addiw a1, a1, -819
+; RV64IM-NEXT: mul a0, a0, a1
+; RV64IM-NEXT: srli a0, a0, 18
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: udiv16_constant_no_add:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: zext.h a0, a0
+; RV64IMZB-NEXT: lui a1, 13
+; RV64IMZB-NEXT: addiw a1, a1, -819
+; RV64IMZB-NEXT: mul a0, a0, a1
+; RV64IMZB-NEXT: srli a0, a0, 18
+; RV64IMZB-NEXT: ret
+ %1 = udiv i16 %a, 5
+ ret i16 %1
+}
+
+define i16 @udiv16_constant_add(i16 %a) nounwind {
+; RV32IM-LABEL: udiv16_constant_add:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: lui a1, 2
+; RV32IM-NEXT: lui a2, 16
+; RV32IM-NEXT: addi a1, a1, 1171
+; RV32IM-NEXT: addi a2, a2, -1
+; RV32IM-NEXT: and a3, a0, a2
+; RV32IM-NEXT: mul a1, a3, a1
+; RV32IM-NEXT: srli a1, a1, 16
+; RV32IM-NEXT: sub a0, a0, a1
+; RV32IM-NEXT: and a0, a0, a2
+; RV32IM-NEXT: srli a0, a0, 1
+; RV32IM-NEXT: add a0, a0, a1
+; RV32IM-NEXT: and a0, a0, a2
+; RV32IM-NEXT: srli a0, a0, 2
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: udiv16_constant_add:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: lui a1, 2
+; RV32IMZB-NEXT: addi a1, a1, 1171
+; RV32IMZB-NEXT: zext.h a2, a0
+; RV32IMZB-NEXT: mul a1, a2, a1
+; RV32IMZB-NEXT: srli a1, a1, 16
+; RV32IMZB-NEXT: sub a0, a0, a1
+; RV32IMZB-NEXT: zext.h a0, a0
+; RV32IMZB-NEXT: srli a0, a0, 1
+; RV32IMZB-NEXT: add a0, a0, a1
+; RV32IMZB-NEXT: zext.h a0, a0
+; RV32IMZB-NEXT: srli a0, a0, 2
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: udiv16_constant_add:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: lui a1, 2
+; RV64IM-NEXT: lui a2, 16
+; RV64IM-NEXT: addiw a1, a1, 1171
+; RV64IM-NEXT: addiw a2, a2, -1
+; RV64IM-NEXT: and a3, a0, a2
+; RV64IM-NEXT: mul a1, a3, a1
+; RV64IM-NEXT: srli a1, a1, 16
+; RV64IM-NEXT: sub a0, a0, a1
+; RV64IM-NEXT: and a0, a0, a2
+; RV64IM-NEXT: srli a0, a0, 1
+; RV64IM-NEXT: add a0, a0, a1
+; RV64IM-NEXT: and a0, a0, a2
+; RV64IM-NEXT: srli a0, a0, 2
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: udiv16_constant_add:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: lui a1, 2
+; RV64IMZB-NEXT: addi a1, a1, 1171
+; RV64IMZB-NEXT: zext.h a2, a0
+; RV64IMZB-NEXT: mul a1, a2, a1
+; RV64IMZB-NEXT: srli a1, a1, 16
+; RV64IMZB-NEXT: sub a0, a0, a1
+; RV64IMZB-NEXT: zext.h a0, a0
+; RV64IMZB-NEXT: srli a0, a0, 1
+; RV64IMZB-NEXT: add a0, a0, a1
+; RV64IMZB-NEXT: zext.h a0, a0
+; RV64IMZB-NEXT: srli a0, a0, 2
+; RV64IMZB-NEXT: ret
+ %1 = udiv i16 %a, 7
+ ret i16 %1
+}
+
+; Test the simplest case: an srli and an add after the mul. No srai.
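+; This and the following sdiv comments describe the multiply-based expansion
+; used by SelectionDAG; in these GlobalISel checks the division currently
+; remains a plain div/divw. For reference, the no-srai form for 3 is
+; q = mulh(x, 0x55555556) followed by q = q + (q srli 31).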
+define i32 @sdiv_constant_no_srai(i32 %a) nounwind {
+; RV32-LABEL: sdiv_constant_no_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 3
+; RV32-NEXT: div a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv_constant_no_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 3
+; RV64-NEXT: divw a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i32 %a, 3
+ ret i32 %1
+}
+
+; This constant requires an srai between the mul and the add.
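+; For reference, the srai form for 5 is q = mulh(x, 0x66666667);
+; q = q srai 1; q = q + (q srli 31).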
+define i32 @sdiv_constant_srai(i32 %a) nounwind {
+; RV32-LABEL: sdiv_constant_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 5
+; RV32-NEXT: div a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv_constant_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 5
+; RV64-NEXT: divw a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i32 %a, 5
+ ret i32 %1
+}
+
+; This constant requires an add and an srai after the mul.
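+; For reference, the add-plus-srai form for 7 is q = mulh(x, 0x92492493) + x;
+; q = q srai 2; q = q + (q srli 31).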
+define i32 @sdiv_constant_add_srai(i32 %a) nounwind {
+; RV32-LABEL: sdiv_constant_add_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 7
+; RV32-NEXT: div a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv_constant_add_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 7
+; RV64-NEXT: divw a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i32 %a, 7
+ ret i32 %1
+}
+
+; This constant requires a sub and an srai after the mul.
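+; For reference, the sub-plus-srai form for -7 is q = mulh(x, 0x6DB6DB6D) - x;
+; q = q srai 2; q = q + (q srli 31).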
+define i32 @sdiv_constant_sub_srai(i32 %a) nounwind {
+; RV32-LABEL: sdiv_constant_sub_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, -7
+; RV32-NEXT: div a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv_constant_sub_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, -7
+; RV64-NEXT: divw a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i32 %a, -7
+ ret i32 %1
+}
+
+define i64 @sdiv64_constant_no_srai(i64 %a) nounwind {
+; RV32-LABEL: sdiv64_constant_no_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: li a2, 3
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __divdi3
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv64_constant_no_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 3
+; RV64-NEXT: div a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i64 %a, 3
+ ret i64 %1
+}
+
+define i64 @sdiv64_constant_srai(i64 %a) nounwind {
+; RV32-LABEL: sdiv64_constant_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: li a2, 5
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __divdi3
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv64_constant_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 5
+; RV64-NEXT: div a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i64 %a, 5
+ ret i64 %1
+}
+
+define i64 @sdiv64_constant_add_srai(i64 %a) nounwind {
+; RV32-LABEL: sdiv64_constant_add_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: li a2, 15
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __divdi3
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv64_constant_add_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 15
+; RV64-NEXT: div a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i64 %a, 15
+ ret i64 %1
+}
+
+define i64 @sdiv64_constant_sub_srai(i64 %a) nounwind {
+; RV32-LABEL: sdiv64_constant_sub_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: li a2, -3
+; RV32-NEXT: li a3, -1
+; RV32-NEXT: call __divdi3
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv64_constant_sub_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, -3
+; RV64-NEXT: div a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i64 %a, -3
+ ret i64 %1
+}
+
+define i8 @sdiv8_constant_no_srai(i8 %a) nounwind {
+; RV32IM-LABEL: sdiv8_constant_no_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, 3
+; RV32IM-NEXT: slli a0, a0, 24
+; RV32IM-NEXT: srai a0, a0, 24
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv8_constant_no_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, 3
+; RV32IMZB-NEXT: sext.b a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv8_constant_no_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, 3
+; RV64IM-NEXT: slli a0, a0, 56
+; RV64IM-NEXT: srai a0, a0, 56
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv8_constant_no_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, 3
+; RV64IMZB-NEXT: sext.b a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i8 %a, 3
+ ret i8 %1
+}
+
+define i8 @sdiv8_constant_srai(i8 %a) nounwind {
+; RV32IM-LABEL: sdiv8_constant_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, 5
+; RV32IM-NEXT: slli a0, a0, 24
+; RV32IM-NEXT: srai a0, a0, 24
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv8_constant_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, 5
+; RV32IMZB-NEXT: sext.b a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv8_constant_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, 5
+; RV64IM-NEXT: slli a0, a0, 56
+; RV64IM-NEXT: srai a0, a0, 56
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv8_constant_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, 5
+; RV64IMZB-NEXT: sext.b a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i8 %a, 5
+ ret i8 %1
+}
+
+define i8 @sdiv8_constant_add_srai(i8 %a) nounwind {
+; RV32IM-LABEL: sdiv8_constant_add_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, 7
+; RV32IM-NEXT: slli a0, a0, 24
+; RV32IM-NEXT: srai a0, a0, 24
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv8_constant_add_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, 7
+; RV32IMZB-NEXT: sext.b a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv8_constant_add_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, 7
+; RV64IM-NEXT: slli a0, a0, 56
+; RV64IM-NEXT: srai a0, a0, 56
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv8_constant_add_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, 7
+; RV64IMZB-NEXT: sext.b a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i8 %a, 7
+ ret i8 %1
+}
+
+define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind {
+; RV32IM-LABEL: sdiv8_constant_sub_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, -7
+; RV32IM-NEXT: slli a0, a0, 24
+; RV32IM-NEXT: srai a0, a0, 24
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv8_constant_sub_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, -7
+; RV32IMZB-NEXT: sext.b a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv8_constant_sub_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, -7
+; RV64IM-NEXT: slli a0, a0, 56
+; RV64IM-NEXT: srai a0, a0, 56
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv8_constant_sub_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, -7
+; RV64IMZB-NEXT: sext.b a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i8 %a, -7
+ ret i8 %1
+}
+
+define i16 @sdiv16_constant_no_srai(i16 %a) nounwind {
+; RV32IM-LABEL: sdiv16_constant_no_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, 3
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: srai a0, a0, 16
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv16_constant_no_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, 3
+; RV32IMZB-NEXT: sext.h a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv16_constant_no_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, 3
+; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: srai a0, a0, 48
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv16_constant_no_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, 3
+; RV64IMZB-NEXT: sext.h a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i16 %a, 3
+ ret i16 %1
+}
+
+define i16 @sdiv16_constant_srai(i16 %a) nounwind {
+; RV32IM-LABEL: sdiv16_constant_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, 5
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: srai a0, a0, 16
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv16_constant_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, 5
+; RV32IMZB-NEXT: sext.h a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv16_constant_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, 5
+; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: srai a0, a0, 48
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv16_constant_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, 5
+; RV64IMZB-NEXT: sext.h a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i16 %a, 5
+ ret i16 %1
+}
+
+define i16 @sdiv16_constant_add_srai(i16 %a) nounwind {
+; RV32IM-LABEL: sdiv16_constant_add_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, 15
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: srai a0, a0, 16
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv16_constant_add_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, 15
+; RV32IMZB-NEXT: sext.h a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv16_constant_add_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, 15
+; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: srai a0, a0, 48
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv16_constant_add_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, 15
+; RV64IMZB-NEXT: sext.h a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i16 %a, 15
+ ret i16 %1
+}
+
+define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind {
+; RV32IM-LABEL: sdiv16_constant_sub_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, -15
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: srai a0, a0, 16
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv16_constant_sub_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, -15
+; RV32IMZB-NEXT: sext.h a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv16_constant_sub_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, -15
+; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: srai a0, a0, 48
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv16_constant_sub_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, -15
+; RV64IMZB-NEXT: sext.h a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i16 %a, -15
+ ret i16 %1
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
new file mode 100644
index 000000000000000..46d1661983c6acb
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
@@ -0,0 +1,3412 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV32I
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs -enable-legalize-types-checking -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV64I
+; RUN: llc -mtriple=riscv32 -mattr=+zbb -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV32ZBB
+; RUN: llc -mtriple=riscv64 -mattr=+zbb -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV64ZBB
+; RUN: llc -mtriple=riscv32 -mattr=+xtheadbb -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV32XTHEADBB
+; RUN: llc -mtriple=riscv64 -mattr=+xtheadbb -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV64XTHEADBB
+
+; NOTE: -enable-legalize-types-checking is on one RUN line due to a previous
+; assertion failure in an expensive-checks build for @rotr_32_mask_multiple.
+
+; These IR sequences are idioms for rotates. If rotate instructions are
+; supported, they will be turned into ISD::ROTL or ISD::ROTR.
+
+define i32 @rotl_32(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotl_32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: neg a2, a1
+; RV32I-NEXT: sll a1, a0, a1
+; RV32I-NEXT: srl a0, a0, a2
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: sllw a1, a0, a1
+; RV64I-NEXT: srlw a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_32:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: rol a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_32:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rolw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_32:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: neg a2, a1
+; RV32XTHEADBB-NEXT: sll a1, a0, a1
+; RV32XTHEADBB-NEXT: srl a0, a0, a2
+; RV32XTHEADBB-NEXT: or a0, a1, a0
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_32:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: sllw a1, a0, a1
+; RV64XTHEADBB-NEXT: srlw a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i32 32, %y
+ %b = shl i32 %x, %y
+ %c = lshr i32 %x, %z
+ %d = or i32 %b, %c
+ ret i32 %d
+}
+
+define i32 @rotr_32(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotr_32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: neg a2, a1
+; RV32I-NEXT: srl a1, a0, a1
+; RV32I-NEXT: sll a0, a0, a2
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: srlw a1, a0, a1
+; RV64I-NEXT: sllw a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_32:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: ror a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_32:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rorw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_32:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: neg a2, a1
+; RV32XTHEADBB-NEXT: srl a1, a0, a1
+; RV32XTHEADBB-NEXT: sll a0, a0, a2
+; RV32XTHEADBB-NEXT: or a0, a1, a0
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_32:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: srlw a1, a0, a1
+; RV64XTHEADBB-NEXT: sllw a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i32 32, %y
+ %b = lshr i32 %x, %y
+ %c = shl i32 %x, %z
+ %d = or i32 %b, %c
+ ret i32 %d
+}
+
+define i64 @rotl_64(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotl_64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a6, a2, 63
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: bltu a6, a4, .LBB2_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: sll a7, a0, a6
+; RV32I-NEXT: j .LBB2_3
+; RV32I-NEXT: .LBB2_2:
+; RV32I-NEXT: sll a3, a0, a2
+; RV32I-NEXT: neg a5, a6
+; RV32I-NEXT: srl a5, a0, a5
+; RV32I-NEXT: sll a7, a1, a2
+; RV32I-NEXT: or a7, a5, a7
+; RV32I-NEXT: .LBB2_3:
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: mv a2, a1
+; RV32I-NEXT: beqz a6, .LBB2_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a2, a7
+; RV32I-NEXT: .LBB2_5:
+; RV32I-NEXT: andi a6, a5, 63
+; RV32I-NEXT: bltu a6, a4, .LBB2_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl a7, a1, a6
+; RV32I-NEXT: bnez a6, .LBB2_8
+; RV32I-NEXT: j .LBB2_9
+; RV32I-NEXT: .LBB2_7:
+; RV32I-NEXT: srl a7, a0, a5
+; RV32I-NEXT: neg t0, a6
+; RV32I-NEXT: sll t0, a1, t0
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: beqz a6, .LBB2_9
+; RV32I-NEXT: .LBB2_8:
+; RV32I-NEXT: mv a0, a7
+; RV32I-NEXT: .LBB2_9:
+; RV32I-NEXT: bltu a6, a4, .LBB2_11
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB2_12
+; RV32I-NEXT: .LBB2_11:
+; RV32I-NEXT: srl a1, a1, a5
+; RV32I-NEXT: .LBB2_12:
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: sll a1, a0, a1
+; RV64I-NEXT: srl a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_64:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi a6, a2, 63
+; RV32ZBB-NEXT: li a4, 32
+; RV32ZBB-NEXT: bltu a6, a4, .LBB2_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: li a3, 0
+; RV32ZBB-NEXT: sll a7, a0, a6
+; RV32ZBB-NEXT: j .LBB2_3
+; RV32ZBB-NEXT: .LBB2_2:
+; RV32ZBB-NEXT: sll a3, a0, a2
+; RV32ZBB-NEXT: neg a5, a6
+; RV32ZBB-NEXT: srl a5, a0, a5
+; RV32ZBB-NEXT: sll a7, a1, a2
+; RV32ZBB-NEXT: or a7, a5, a7
+; RV32ZBB-NEXT: .LBB2_3:
+; RV32ZBB-NEXT: neg a5, a2
+; RV32ZBB-NEXT: mv a2, a1
+; RV32ZBB-NEXT: beqz a6, .LBB2_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: mv a2, a7
+; RV32ZBB-NEXT: .LBB2_5:
+; RV32ZBB-NEXT: andi a6, a5, 63
+; RV32ZBB-NEXT: bltu a6, a4, .LBB2_7
+; RV32ZBB-NEXT: # %bb.6:
+; RV32ZBB-NEXT: srl a7, a1, a6
+; RV32ZBB-NEXT: bnez a6, .LBB2_8
+; RV32ZBB-NEXT: j .LBB2_9
+; RV32ZBB-NEXT: .LBB2_7:
+; RV32ZBB-NEXT: srl a7, a0, a5
+; RV32ZBB-NEXT: neg t0, a6
+; RV32ZBB-NEXT: sll t0, a1, t0
+; RV32ZBB-NEXT: or a7, a7, t0
+; RV32ZBB-NEXT: beqz a6, .LBB2_9
+; RV32ZBB-NEXT: .LBB2_8:
+; RV32ZBB-NEXT: mv a0, a7
+; RV32ZBB-NEXT: .LBB2_9:
+; RV32ZBB-NEXT: bltu a6, a4, .LBB2_11
+; RV32ZBB-NEXT: # %bb.10:
+; RV32ZBB-NEXT: li a1, 0
+; RV32ZBB-NEXT: j .LBB2_12
+; RV32ZBB-NEXT: .LBB2_11:
+; RV32ZBB-NEXT: srl a1, a1, a5
+; RV32ZBB-NEXT: .LBB2_12:
+; RV32ZBB-NEXT: or a0, a3, a0
+; RV32ZBB-NEXT: or a1, a2, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_64:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rol a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_64:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a6, a2, 63
+; RV32XTHEADBB-NEXT: li a4, 32
+; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB2_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: li a3, 0
+; RV32XTHEADBB-NEXT: sll a7, a0, a6
+; RV32XTHEADBB-NEXT: j .LBB2_3
+; RV32XTHEADBB-NEXT: .LBB2_2:
+; RV32XTHEADBB-NEXT: sll a3, a0, a2
+; RV32XTHEADBB-NEXT: neg a5, a6
+; RV32XTHEADBB-NEXT: srl a5, a0, a5
+; RV32XTHEADBB-NEXT: sll a7, a1, a2
+; RV32XTHEADBB-NEXT: or a7, a5, a7
+; RV32XTHEADBB-NEXT: .LBB2_3:
+; RV32XTHEADBB-NEXT: neg a5, a2
+; RV32XTHEADBB-NEXT: mv a2, a1
+; RV32XTHEADBB-NEXT: beqz a6, .LBB2_5
+; RV32XTHEADBB-NEXT: # %bb.4:
+; RV32XTHEADBB-NEXT: mv a2, a7
+; RV32XTHEADBB-NEXT: .LBB2_5:
+; RV32XTHEADBB-NEXT: andi a6, a5, 63
+; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB2_7
+; RV32XTHEADBB-NEXT: # %bb.6:
+; RV32XTHEADBB-NEXT: srl a7, a1, a6
+; RV32XTHEADBB-NEXT: bnez a6, .LBB2_8
+; RV32XTHEADBB-NEXT: j .LBB2_9
+; RV32XTHEADBB-NEXT: .LBB2_7:
+; RV32XTHEADBB-NEXT: srl a7, a0, a5
+; RV32XTHEADBB-NEXT: neg t0, a6
+; RV32XTHEADBB-NEXT: sll t0, a1, t0
+; RV32XTHEADBB-NEXT: or a7, a7, t0
+; RV32XTHEADBB-NEXT: beqz a6, .LBB2_9
+; RV32XTHEADBB-NEXT: .LBB2_8:
+; RV32XTHEADBB-NEXT: mv a0, a7
+; RV32XTHEADBB-NEXT: .LBB2_9:
+; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB2_11
+; RV32XTHEADBB-NEXT: # %bb.10:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: j .LBB2_12
+; RV32XTHEADBB-NEXT: .LBB2_11:
+; RV32XTHEADBB-NEXT: srl a1, a1, a5
+; RV32XTHEADBB-NEXT: .LBB2_12:
+; RV32XTHEADBB-NEXT: or a0, a3, a0
+; RV32XTHEADBB-NEXT: or a1, a2, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_64:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: sll a1, a0, a1
+; RV64XTHEADBB-NEXT: srl a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i64 64, %y
+ %b = shl i64 %x, %y
+ %c = lshr i64 %x, %z
+ %d = or i64 %b, %c
+ ret i64 %d
+}
+
+define i64 @rotr_64(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotr_64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a5, a2, 63
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: bltu a5, a4, .LBB3_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a6, a1, a5
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: bnez a5, .LBB3_3
+; RV32I-NEXT: j .LBB3_4
+; RV32I-NEXT: .LBB3_2:
+; RV32I-NEXT: srl a3, a0, a2
+; RV32I-NEXT: neg a6, a5
+; RV32I-NEXT: sll a6, a1, a6
+; RV32I-NEXT: or a6, a3, a6
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: beqz a5, .LBB3_4
+; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: mv a3, a6
+; RV32I-NEXT: .LBB3_4:
+; RV32I-NEXT: neg a6, a2
+; RV32I-NEXT: bltu a5, a4, .LBB3_7
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a2, 0
+; RV32I-NEXT: andi a5, a6, 63
+; RV32I-NEXT: bgeu a5, a4, .LBB3_8
+; RV32I-NEXT: .LBB3_6:
+; RV32I-NEXT: sll a4, a0, a6
+; RV32I-NEXT: neg a7, a5
+; RV32I-NEXT: srl a0, a0, a7
+; RV32I-NEXT: sll a6, a1, a6
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: bnez a5, .LBB3_9
+; RV32I-NEXT: j .LBB3_10
+; RV32I-NEXT: .LBB3_7:
+; RV32I-NEXT: srl a2, a1, a2
+; RV32I-NEXT: andi a5, a6, 63
+; RV32I-NEXT: bltu a5, a4, .LBB3_6
+; RV32I-NEXT: .LBB3_8:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: sll a0, a0, a5
+; RV32I-NEXT: beqz a5, .LBB3_10
+; RV32I-NEXT: .LBB3_9:
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB3_10:
+; RV32I-NEXT: or a0, a3, a4
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: srl a1, a0, a1
+; RV64I-NEXT: sll a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_64:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi a5, a2, 63
+; RV32ZBB-NEXT: li a4, 32
+; RV32ZBB-NEXT: bltu a5, a4, .LBB3_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl a6, a1, a5
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: bnez a5, .LBB3_3
+; RV32ZBB-NEXT: j .LBB3_4
+; RV32ZBB-NEXT: .LBB3_2:
+; RV32ZBB-NEXT: srl a3, a0, a2
+; RV32ZBB-NEXT: neg a6, a5
+; RV32ZBB-NEXT: sll a6, a1, a6
+; RV32ZBB-NEXT: or a6, a3, a6
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: beqz a5, .LBB3_4
+; RV32ZBB-NEXT: .LBB3_3:
+; RV32ZBB-NEXT: mv a3, a6
+; RV32ZBB-NEXT: .LBB3_4:
+; RV32ZBB-NEXT: neg a6, a2
+; RV32ZBB-NEXT: bltu a5, a4, .LBB3_7
+; RV32ZBB-NEXT: # %bb.5:
+; RV32ZBB-NEXT: li a2, 0
+; RV32ZBB-NEXT: andi a5, a6, 63
+; RV32ZBB-NEXT: bgeu a5, a4, .LBB3_8
+; RV32ZBB-NEXT: .LBB3_6:
+; RV32ZBB-NEXT: sll a4, a0, a6
+; RV32ZBB-NEXT: neg a7, a5
+; RV32ZBB-NEXT: srl a0, a0, a7
+; RV32ZBB-NEXT: sll a6, a1, a6
+; RV32ZBB-NEXT: or a0, a0, a6
+; RV32ZBB-NEXT: bnez a5, .LBB3_9
+; RV32ZBB-NEXT: j .LBB3_10
+; RV32ZBB-NEXT: .LBB3_7:
+; RV32ZBB-NEXT: srl a2, a1, a2
+; RV32ZBB-NEXT: andi a5, a6, 63
+; RV32ZBB-NEXT: bltu a5, a4, .LBB3_6
+; RV32ZBB-NEXT: .LBB3_8:
+; RV32ZBB-NEXT: li a4, 0
+; RV32ZBB-NEXT: sll a0, a0, a5
+; RV32ZBB-NEXT: beqz a5, .LBB3_10
+; RV32ZBB-NEXT: .LBB3_9:
+; RV32ZBB-NEXT: mv a1, a0
+; RV32ZBB-NEXT: .LBB3_10:
+; RV32ZBB-NEXT: or a0, a3, a4
+; RV32ZBB-NEXT: or a1, a2, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_64:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: ror a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_64:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a5, a2, 63
+; RV32XTHEADBB-NEXT: li a4, 32
+; RV32XTHEADBB-NEXT: bltu a5, a4, .LBB3_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: srl a6, a1, a5
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: bnez a5, .LBB3_3
+; RV32XTHEADBB-NEXT: j .LBB3_4
+; RV32XTHEADBB-NEXT: .LBB3_2:
+; RV32XTHEADBB-NEXT: srl a3, a0, a2
+; RV32XTHEADBB-NEXT: neg a6, a5
+; RV32XTHEADBB-NEXT: sll a6, a1, a6
+; RV32XTHEADBB-NEXT: or a6, a3, a6
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: beqz a5, .LBB3_4
+; RV32XTHEADBB-NEXT: .LBB3_3:
+; RV32XTHEADBB-NEXT: mv a3, a6
+; RV32XTHEADBB-NEXT: .LBB3_4:
+; RV32XTHEADBB-NEXT: neg a6, a2
+; RV32XTHEADBB-NEXT: bltu a5, a4, .LBB3_7
+; RV32XTHEADBB-NEXT: # %bb.5:
+; RV32XTHEADBB-NEXT: li a2, 0
+; RV32XTHEADBB-NEXT: andi a5, a6, 63
+; RV32XTHEADBB-NEXT: bgeu a5, a4, .LBB3_8
+; RV32XTHEADBB-NEXT: .LBB3_6:
+; RV32XTHEADBB-NEXT: sll a4, a0, a6
+; RV32XTHEADBB-NEXT: neg a7, a5
+; RV32XTHEADBB-NEXT: srl a0, a0, a7
+; RV32XTHEADBB-NEXT: sll a6, a1, a6
+; RV32XTHEADBB-NEXT: or a0, a0, a6
+; RV32XTHEADBB-NEXT: bnez a5, .LBB3_9
+; RV32XTHEADBB-NEXT: j .LBB3_10
+; RV32XTHEADBB-NEXT: .LBB3_7:
+; RV32XTHEADBB-NEXT: srl a2, a1, a2
+; RV32XTHEADBB-NEXT: andi a5, a6, 63
+; RV32XTHEADBB-NEXT: bltu a5, a4, .LBB3_6
+; RV32XTHEADBB-NEXT: .LBB3_8:
+; RV32XTHEADBB-NEXT: li a4, 0
+; RV32XTHEADBB-NEXT: sll a0, a0, a5
+; RV32XTHEADBB-NEXT: beqz a5, .LBB3_10
+; RV32XTHEADBB-NEXT: .LBB3_9:
+; RV32XTHEADBB-NEXT: mv a1, a0
+; RV32XTHEADBB-NEXT: .LBB3_10:
+; RV32XTHEADBB-NEXT: or a0, a3, a4
+; RV32XTHEADBB-NEXT: or a1, a2, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_64:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: srl a1, a0, a1
+; RV64XTHEADBB-NEXT: sll a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i64 64, %y
+ %b = lshr i64 %x, %y
+ %c = shl i64 %x, %z
+ %d = or i64 %b, %c
+ ret i64 %d
+}
+
+define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotl_32_mask:
+; RV32I: # %bb.0:
+; RV32I-NEXT: neg a2, a1
+; RV32I-NEXT: sll a1, a0, a1
+; RV32I-NEXT: srl a0, a0, a2
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_32_mask:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: sllw a1, a0, a1
+; RV64I-NEXT: srlw a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_32_mask:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: neg a2, a1
+; RV32ZBB-NEXT: sll a1, a0, a1
+; RV32ZBB-NEXT: srl a0, a0, a2
+; RV32ZBB-NEXT: or a0, a1, a0
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_32_mask:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: sllw a1, a0, a1
+; RV64ZBB-NEXT: srlw a0, a0, a2
+; RV64ZBB-NEXT: or a0, a1, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_32_mask:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: neg a2, a1
+; RV32XTHEADBB-NEXT: sll a1, a0, a1
+; RV32XTHEADBB-NEXT: srl a0, a0, a2
+; RV32XTHEADBB-NEXT: or a0, a1, a0
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_32_mask:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: sllw a1, a0, a1
+; RV64XTHEADBB-NEXT: srlw a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i32 0, %y
+ %and = and i32 %z, 31
+ %b = shl i32 %x, %y
+ %c = lshr i32 %x, %and
+ %d = or i32 %b, %c
+ ret i32 %d
+}
+
+define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotl_32_mask_and_63_and_31:
+; RV32I: # %bb.0:
+; RV32I-NEXT: sll a2, a0, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_32_mask_and_63_and_31:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sllw a2, a0, a1
+; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: srlw a0, a0, a1
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_32_mask_and_63_and_31:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: sll a2, a0, a1
+; RV32ZBB-NEXT: neg a1, a1
+; RV32ZBB-NEXT: srl a0, a0, a1
+; RV32ZBB-NEXT: or a0, a2, a0
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_32_mask_and_63_and_31:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: sllw a2, a0, a1
+; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: srlw a0, a0, a1
+; RV64ZBB-NEXT: or a0, a2, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_32_mask_and_63_and_31:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: sll a2, a0, a1
+; RV32XTHEADBB-NEXT: neg a1, a1
+; RV32XTHEADBB-NEXT: srl a0, a0, a1
+; RV32XTHEADBB-NEXT: or a0, a2, a0
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_32_mask_and_63_and_31:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: sllw a2, a0, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: srlw a0, a0, a1
+; RV64XTHEADBB-NEXT: or a0, a2, a0
+; RV64XTHEADBB-NEXT: ret
+ %a = and i32 %y, 63
+ %b = shl i32 %x, %a
+ %c = sub i32 0, %y
+ %d = and i32 %c, 31
+ %e = lshr i32 %x, %d
+ %f = or i32 %b, %e
+ ret i32 %f
+}
+
+define i32 @rotl_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotl_32_mask_or_64_or_32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: ori a1, a1, 64
+; RV32I-NEXT: sll a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_32_mask_or_64_or_32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: ori a1, a1, 64
+; RV64I-NEXT: sllw a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_32_mask_or_64_or_32:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: ori a1, a1, 64
+; RV32ZBB-NEXT: sll a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_32_mask_or_64_or_32:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: ori a1, a1, 64
+; RV64ZBB-NEXT: sllw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_32_mask_or_64_or_32:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: ori a1, a1, 64
+; RV32XTHEADBB-NEXT: sll a0, a0, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_32_mask_or_64_or_32:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: ori a1, a1, 64
+; RV64XTHEADBB-NEXT: sllw a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %a = or i32 %y, 64
+ %b = shl i32 %x, %a
+ %c = sub i32 0, %y
+ %d = or i32 %c, 32
+ %e = lshr i32 %x, %d
+ %f = or i32 %b, %e
+ ret i32 %f
+}
+
+define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotr_32_mask:
+; RV32I: # %bb.0:
+; RV32I-NEXT: neg a2, a1
+; RV32I-NEXT: srl a1, a0, a1
+; RV32I-NEXT: sll a0, a0, a2
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_32_mask:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: srlw a1, a0, a1
+; RV64I-NEXT: sllw a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_32_mask:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: neg a2, a1
+; RV32ZBB-NEXT: srl a1, a0, a1
+; RV32ZBB-NEXT: sll a0, a0, a2
+; RV32ZBB-NEXT: or a0, a1, a0
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_32_mask:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: srlw a1, a0, a1
+; RV64ZBB-NEXT: sllw a0, a0, a2
+; RV64ZBB-NEXT: or a0, a1, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_32_mask:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: neg a2, a1
+; RV32XTHEADBB-NEXT: srl a1, a0, a1
+; RV32XTHEADBB-NEXT: sll a0, a0, a2
+; RV32XTHEADBB-NEXT: or a0, a1, a0
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_32_mask:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: srlw a1, a0, a1
+; RV64XTHEADBB-NEXT: sllw a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i32 0, %y
+ %and = and i32 %z, 31
+ %b = lshr i32 %x, %y
+ %c = shl i32 %x, %and
+ %d = or i32 %b, %c
+ ret i32 %d
+}
+
+define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotr_32_mask_and_63_and_31:
+; RV32I: # %bb.0:
+; RV32I-NEXT: srl a2, a0, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: sll a0, a0, a1
+; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_32_mask_and_63_and_31:
+; RV64I: # %bb.0:
+; RV64I-NEXT: srlw a2, a0, a1
+; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: sllw a0, a0, a1
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_32_mask_and_63_and_31:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: srl a2, a0, a1
+; RV32ZBB-NEXT: neg a1, a1
+; RV32ZBB-NEXT: sll a0, a0, a1
+; RV32ZBB-NEXT: or a0, a2, a0
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_32_mask_and_63_and_31:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: srlw a2, a0, a1
+; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: sllw a0, a0, a1
+; RV64ZBB-NEXT: or a0, a2, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_32_mask_and_63_and_31:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: srl a2, a0, a1
+; RV32XTHEADBB-NEXT: neg a1, a1
+; RV32XTHEADBB-NEXT: sll a0, a0, a1
+; RV32XTHEADBB-NEXT: or a0, a2, a0
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_32_mask_and_63_and_31:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: srlw a2, a0, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: sllw a0, a0, a1
+; RV64XTHEADBB-NEXT: or a0, a2, a0
+; RV64XTHEADBB-NEXT: ret
+ %a = and i32 %y, 63
+ %b = lshr i32 %x, %a
+ %c = sub i32 0, %y
+ %d = and i32 %c, 31
+ %e = shl i32 %x, %d
+ %f = or i32 %b, %e
+ ret i32 %f
+}
+
+define i32 @rotr_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotr_32_mask_or_64_or_32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: ori a1, a1, 64
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_32_mask_or_64_or_32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: ori a1, a1, 64
+; RV64I-NEXT: srlw a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_32_mask_or_64_or_32:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: ori a1, a1, 64
+; RV32ZBB-NEXT: srl a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_32_mask_or_64_or_32:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: ori a1, a1, 64
+; RV64ZBB-NEXT: srlw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_32_mask_or_64_or_32:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: ori a1, a1, 64
+; RV32XTHEADBB-NEXT: srl a0, a0, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_32_mask_or_64_or_32:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: ori a1, a1, 64
+; RV64XTHEADBB-NEXT: srlw a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %a = or i32 %y, 64
+ %b = lshr i32 %x, %a
+ %c = sub i32 0, %y
+ %d = or i32 %c, 32
+ %e = shl i32 %x, %d
+ %f = or i32 %b, %e
+ ret i32 %f
+}
+
+define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotl_64_mask:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a5, 32
+; RV32I-NEXT: neg a4, a2
+; RV32I-NEXT: bltu a2, a5, .LBB10_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: sll t0, a0, a2
+; RV32I-NEXT: j .LBB10_3
+; RV32I-NEXT: .LBB10_2:
+; RV32I-NEXT: sll a3, a0, a2
+; RV32I-NEXT: neg a6, a2
+; RV32I-NEXT: srl a6, a0, a6
+; RV32I-NEXT: sll a7, a1, a2
+; RV32I-NEXT: or t0, a6, a7
+; RV32I-NEXT: .LBB10_3:
+; RV32I-NEXT: andi a7, a4, 63
+; RV32I-NEXT: mv a6, a1
+; RV32I-NEXT: beqz a2, .LBB10_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a6, t0
+; RV32I-NEXT: .LBB10_5:
+; RV32I-NEXT: bltu a7, a5, .LBB10_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl a2, a1, a7
+; RV32I-NEXT: bnez a7, .LBB10_8
+; RV32I-NEXT: j .LBB10_9
+; RV32I-NEXT: .LBB10_7:
+; RV32I-NEXT: srl a2, a0, a4
+; RV32I-NEXT: neg t0, a7
+; RV32I-NEXT: sll t0, a1, t0
+; RV32I-NEXT: or a2, a2, t0
+; RV32I-NEXT: beqz a7, .LBB10_9
+; RV32I-NEXT: .LBB10_8:
+; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: .LBB10_9:
+; RV32I-NEXT: bltu a7, a5, .LBB10_11
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB10_12
+; RV32I-NEXT: .LBB10_11:
+; RV32I-NEXT: srl a1, a1, a4
+; RV32I-NEXT: .LBB10_12:
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: or a1, a6, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_64_mask:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: sll a1, a0, a1
+; RV64I-NEXT: srl a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_64_mask:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: li a5, 32
+; RV32ZBB-NEXT: neg a4, a2
+; RV32ZBB-NEXT: bltu a2, a5, .LBB10_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: li a3, 0
+; RV32ZBB-NEXT: sll t0, a0, a2
+; RV32ZBB-NEXT: j .LBB10_3
+; RV32ZBB-NEXT: .LBB10_2:
+; RV32ZBB-NEXT: sll a3, a0, a2
+; RV32ZBB-NEXT: neg a6, a2
+; RV32ZBB-NEXT: srl a6, a0, a6
+; RV32ZBB-NEXT: sll a7, a1, a2
+; RV32ZBB-NEXT: or t0, a6, a7
+; RV32ZBB-NEXT: .LBB10_3:
+; RV32ZBB-NEXT: andi a7, a4, 63
+; RV32ZBB-NEXT: mv a6, a1
+; RV32ZBB-NEXT: beqz a2, .LBB10_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: mv a6, t0
+; RV32ZBB-NEXT: .LBB10_5:
+; RV32ZBB-NEXT: bltu a7, a5, .LBB10_7
+; RV32ZBB-NEXT: # %bb.6:
+; RV32ZBB-NEXT: srl a2, a1, a7
+; RV32ZBB-NEXT: bnez a7, .LBB10_8
+; RV32ZBB-NEXT: j .LBB10_9
+; RV32ZBB-NEXT: .LBB10_7:
+; RV32ZBB-NEXT: srl a2, a0, a4
+; RV32ZBB-NEXT: neg t0, a7
+; RV32ZBB-NEXT: sll t0, a1, t0
+; RV32ZBB-NEXT: or a2, a2, t0
+; RV32ZBB-NEXT: beqz a7, .LBB10_9
+; RV32ZBB-NEXT: .LBB10_8:
+; RV32ZBB-NEXT: mv a0, a2
+; RV32ZBB-NEXT: .LBB10_9:
+; RV32ZBB-NEXT: bltu a7, a5, .LBB10_11
+; RV32ZBB-NEXT: # %bb.10:
+; RV32ZBB-NEXT: li a1, 0
+; RV32ZBB-NEXT: j .LBB10_12
+; RV32ZBB-NEXT: .LBB10_11:
+; RV32ZBB-NEXT: srl a1, a1, a4
+; RV32ZBB-NEXT: .LBB10_12:
+; RV32ZBB-NEXT: or a0, a3, a0
+; RV32ZBB-NEXT: or a1, a6, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_64_mask:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: sll a1, a0, a1
+; RV64ZBB-NEXT: srl a0, a0, a2
+; RV64ZBB-NEXT: or a0, a1, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_64_mask:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: li a5, 32
+; RV32XTHEADBB-NEXT: neg a4, a2
+; RV32XTHEADBB-NEXT: bltu a2, a5, .LBB10_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: li a3, 0
+; RV32XTHEADBB-NEXT: sll t0, a0, a2
+; RV32XTHEADBB-NEXT: j .LBB10_3
+; RV32XTHEADBB-NEXT: .LBB10_2:
+; RV32XTHEADBB-NEXT: sll a3, a0, a2
+; RV32XTHEADBB-NEXT: neg a6, a2
+; RV32XTHEADBB-NEXT: srl a6, a0, a6
+; RV32XTHEADBB-NEXT: sll a7, a1, a2
+; RV32XTHEADBB-NEXT: or t0, a6, a7
+; RV32XTHEADBB-NEXT: .LBB10_3:
+; RV32XTHEADBB-NEXT: andi a7, a4, 63
+; RV32XTHEADBB-NEXT: mv a6, a1
+; RV32XTHEADBB-NEXT: beqz a2, .LBB10_5
+; RV32XTHEADBB-NEXT: # %bb.4:
+; RV32XTHEADBB-NEXT: mv a6, t0
+; RV32XTHEADBB-NEXT: .LBB10_5:
+; RV32XTHEADBB-NEXT: bltu a7, a5, .LBB10_7
+; RV32XTHEADBB-NEXT: # %bb.6:
+; RV32XTHEADBB-NEXT: srl a2, a1, a7
+; RV32XTHEADBB-NEXT: bnez a7, .LBB10_8
+; RV32XTHEADBB-NEXT: j .LBB10_9
+; RV32XTHEADBB-NEXT: .LBB10_7:
+; RV32XTHEADBB-NEXT: srl a2, a0, a4
+; RV32XTHEADBB-NEXT: neg t0, a7
+; RV32XTHEADBB-NEXT: sll t0, a1, t0
+; RV32XTHEADBB-NEXT: or a2, a2, t0
+; RV32XTHEADBB-NEXT: beqz a7, .LBB10_9
+; RV32XTHEADBB-NEXT: .LBB10_8:
+; RV32XTHEADBB-NEXT: mv a0, a2
+; RV32XTHEADBB-NEXT: .LBB10_9:
+; RV32XTHEADBB-NEXT: bltu a7, a5, .LBB10_11
+; RV32XTHEADBB-NEXT: # %bb.10:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: j .LBB10_12
+; RV32XTHEADBB-NEXT: .LBB10_11:
+; RV32XTHEADBB-NEXT: srl a1, a1, a4
+; RV32XTHEADBB-NEXT: .LBB10_12:
+; RV32XTHEADBB-NEXT: or a0, a3, a0
+; RV32XTHEADBB-NEXT: or a1, a6, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_64_mask:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: sll a1, a0, a1
+; RV64XTHEADBB-NEXT: srl a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i64 0, %y
+ %and = and i64 %z, 63
+ %b = shl i64 %x, %y
+ %c = lshr i64 %x, %and
+ %d = or i64 %b, %c
+ ret i64 %d
+}
+
+define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotl_64_mask_and_127_and_63:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a6, a2, 127
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: bltu a6, a4, .LBB11_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: sll a7, a0, a6
+; RV32I-NEXT: mv a5, a1
+; RV32I-NEXT: bnez a6, .LBB11_3
+; RV32I-NEXT: j .LBB11_4
+; RV32I-NEXT: .LBB11_2:
+; RV32I-NEXT: sll a3, a0, a2
+; RV32I-NEXT: neg a5, a6
+; RV32I-NEXT: srl a5, a0, a5
+; RV32I-NEXT: sll a7, a1, a2
+; RV32I-NEXT: or a7, a5, a7
+; RV32I-NEXT: mv a5, a1
+; RV32I-NEXT: beqz a6, .LBB11_4
+; RV32I-NEXT: .LBB11_3:
+; RV32I-NEXT: mv a5, a7
+; RV32I-NEXT: .LBB11_4:
+; RV32I-NEXT: neg a2, a2
+; RV32I-NEXT: andi a6, a2, 63
+; RV32I-NEXT: bltu a6, a4, .LBB11_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: srl a7, a1, a6
+; RV32I-NEXT: bnez a6, .LBB11_7
+; RV32I-NEXT: j .LBB11_8
+; RV32I-NEXT: .LBB11_6:
+; RV32I-NEXT: srl a7, a0, a2
+; RV32I-NEXT: neg t0, a6
+; RV32I-NEXT: sll t0, a1, t0
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: beqz a6, .LBB11_8
+; RV32I-NEXT: .LBB11_7:
+; RV32I-NEXT: mv a0, a7
+; RV32I-NEXT: .LBB11_8:
+; RV32I-NEXT: bltu a6, a4, .LBB11_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB11_11
+; RV32I-NEXT: .LBB11_10:
+; RV32I-NEXT: srl a1, a1, a2
+; RV32I-NEXT: .LBB11_11:
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: or a1, a5, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_64_mask_and_127_and_63:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sll a2, a0, a1
+; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: srl a0, a0, a1
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_64_mask_and_127_and_63:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi a6, a2, 127
+; RV32ZBB-NEXT: li a4, 32
+; RV32ZBB-NEXT: bltu a6, a4, .LBB11_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: li a3, 0
+; RV32ZBB-NEXT: sll a7, a0, a6
+; RV32ZBB-NEXT: mv a5, a1
+; RV32ZBB-NEXT: bnez a6, .LBB11_3
+; RV32ZBB-NEXT: j .LBB11_4
+; RV32ZBB-NEXT: .LBB11_2:
+; RV32ZBB-NEXT: sll a3, a0, a2
+; RV32ZBB-NEXT: neg a5, a6
+; RV32ZBB-NEXT: srl a5, a0, a5
+; RV32ZBB-NEXT: sll a7, a1, a2
+; RV32ZBB-NEXT: or a7, a5, a7
+; RV32ZBB-NEXT: mv a5, a1
+; RV32ZBB-NEXT: beqz a6, .LBB11_4
+; RV32ZBB-NEXT: .LBB11_3:
+; RV32ZBB-NEXT: mv a5, a7
+; RV32ZBB-NEXT: .LBB11_4:
+; RV32ZBB-NEXT: neg a2, a2
+; RV32ZBB-NEXT: andi a6, a2, 63
+; RV32ZBB-NEXT: bltu a6, a4, .LBB11_6
+; RV32ZBB-NEXT: # %bb.5:
+; RV32ZBB-NEXT: srl a7, a1, a6
+; RV32ZBB-NEXT: bnez a6, .LBB11_7
+; RV32ZBB-NEXT: j .LBB11_8
+; RV32ZBB-NEXT: .LBB11_6:
+; RV32ZBB-NEXT: srl a7, a0, a2
+; RV32ZBB-NEXT: neg t0, a6
+; RV32ZBB-NEXT: sll t0, a1, t0
+; RV32ZBB-NEXT: or a7, a7, t0
+; RV32ZBB-NEXT: beqz a6, .LBB11_8
+; RV32ZBB-NEXT: .LBB11_7:
+; RV32ZBB-NEXT: mv a0, a7
+; RV32ZBB-NEXT: .LBB11_8:
+; RV32ZBB-NEXT: bltu a6, a4, .LBB11_10
+; RV32ZBB-NEXT: # %bb.9:
+; RV32ZBB-NEXT: li a1, 0
+; RV32ZBB-NEXT: j .LBB11_11
+; RV32ZBB-NEXT: .LBB11_10:
+; RV32ZBB-NEXT: srl a1, a1, a2
+; RV32ZBB-NEXT: .LBB11_11:
+; RV32ZBB-NEXT: or a0, a3, a0
+; RV32ZBB-NEXT: or a1, a5, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_64_mask_and_127_and_63:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: sll a2, a0, a1
+; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: srl a0, a0, a1
+; RV64ZBB-NEXT: or a0, a2, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_64_mask_and_127_and_63:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a6, a2, 127
+; RV32XTHEADBB-NEXT: li a4, 32
+; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB11_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: li a3, 0
+; RV32XTHEADBB-NEXT: sll a7, a0, a6
+; RV32XTHEADBB-NEXT: mv a5, a1
+; RV32XTHEADBB-NEXT: bnez a6, .LBB11_3
+; RV32XTHEADBB-NEXT: j .LBB11_4
+; RV32XTHEADBB-NEXT: .LBB11_2:
+; RV32XTHEADBB-NEXT: sll a3, a0, a2
+; RV32XTHEADBB-NEXT: neg a5, a6
+; RV32XTHEADBB-NEXT: srl a5, a0, a5
+; RV32XTHEADBB-NEXT: sll a7, a1, a2
+; RV32XTHEADBB-NEXT: or a7, a5, a7
+; RV32XTHEADBB-NEXT: mv a5, a1
+; RV32XTHEADBB-NEXT: beqz a6, .LBB11_4
+; RV32XTHEADBB-NEXT: .LBB11_3:
+; RV32XTHEADBB-NEXT: mv a5, a7
+; RV32XTHEADBB-NEXT: .LBB11_4:
+; RV32XTHEADBB-NEXT: neg a2, a2
+; RV32XTHEADBB-NEXT: andi a6, a2, 63
+; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB11_6
+; RV32XTHEADBB-NEXT: # %bb.5:
+; RV32XTHEADBB-NEXT: srl a7, a1, a6
+; RV32XTHEADBB-NEXT: bnez a6, .LBB11_7
+; RV32XTHEADBB-NEXT: j .LBB11_8
+; RV32XTHEADBB-NEXT: .LBB11_6:
+; RV32XTHEADBB-NEXT: srl a7, a0, a2
+; RV32XTHEADBB-NEXT: neg t0, a6
+; RV32XTHEADBB-NEXT: sll t0, a1, t0
+; RV32XTHEADBB-NEXT: or a7, a7, t0
+; RV32XTHEADBB-NEXT: beqz a6, .LBB11_8
+; RV32XTHEADBB-NEXT: .LBB11_7:
+; RV32XTHEADBB-NEXT: mv a0, a7
+; RV32XTHEADBB-NEXT: .LBB11_8:
+; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB11_10
+; RV32XTHEADBB-NEXT: # %bb.9:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: j .LBB11_11
+; RV32XTHEADBB-NEXT: .LBB11_10:
+; RV32XTHEADBB-NEXT: srl a1, a1, a2
+; RV32XTHEADBB-NEXT: .LBB11_11:
+; RV32XTHEADBB-NEXT: or a0, a3, a0
+; RV32XTHEADBB-NEXT: or a1, a5, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_64_mask_and_127_and_63:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: sll a2, a0, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: srl a0, a0, a1
+; RV64XTHEADBB-NEXT: or a0, a2, a0
+; RV64XTHEADBB-NEXT: ret
+ %a = and i64 %y, 127
+ %b = shl i64 %x, %a
+ %c = sub i64 0, %y
+ %d = and i64 %c, 63
+ %e = lshr i64 %x, %d
+ %f = or i64 %b, %e
+ ret i64 %f
+}
+
+define i64 @rotl_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotl_64_mask_or_128_or_64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: ori a2, a2, 128
+; RV32I-NEXT: li a0, 32
+; RV32I-NEXT: bltu a2, a0, .LBB12_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: sll a3, a3, a2
+; RV32I-NEXT: bnez a2, .LBB12_3
+; RV32I-NEXT: j .LBB12_4
+; RV32I-NEXT: .LBB12_2:
+; RV32I-NEXT: sll a0, a3, a2
+; RV32I-NEXT: neg a4, a2
+; RV32I-NEXT: srl a3, a3, a4
+; RV32I-NEXT: sll a4, a1, a2
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: beqz a2, .LBB12_4
+; RV32I-NEXT: .LBB12_3:
+; RV32I-NEXT: mv a1, a3
+; RV32I-NEXT: .LBB12_4:
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_64_mask_or_128_or_64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: ori a1, a1, 128
+; RV64I-NEXT: sll a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_64_mask_or_128_or_64:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: ori a2, a2, 128
+; RV32ZBB-NEXT: li a0, 32
+; RV32ZBB-NEXT: bltu a2, a0, .LBB12_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: li a0, 0
+; RV32ZBB-NEXT: sll a3, a3, a2
+; RV32ZBB-NEXT: bnez a2, .LBB12_3
+; RV32ZBB-NEXT: j .LBB12_4
+; RV32ZBB-NEXT: .LBB12_2:
+; RV32ZBB-NEXT: sll a0, a3, a2
+; RV32ZBB-NEXT: neg a4, a2
+; RV32ZBB-NEXT: srl a3, a3, a4
+; RV32ZBB-NEXT: sll a4, a1, a2
+; RV32ZBB-NEXT: or a3, a3, a4
+; RV32ZBB-NEXT: beqz a2, .LBB12_4
+; RV32ZBB-NEXT: .LBB12_3:
+; RV32ZBB-NEXT: mv a1, a3
+; RV32ZBB-NEXT: .LBB12_4:
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_64_mask_or_128_or_64:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: ori a1, a1, 128
+; RV64ZBB-NEXT: sll a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_64_mask_or_128_or_64:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: ori a2, a2, 128
+; RV32XTHEADBB-NEXT: li a0, 32
+; RV32XTHEADBB-NEXT: bltu a2, a0, .LBB12_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: li a0, 0
+; RV32XTHEADBB-NEXT: sll a3, a3, a2
+; RV32XTHEADBB-NEXT: bnez a2, .LBB12_3
+; RV32XTHEADBB-NEXT: j .LBB12_4
+; RV32XTHEADBB-NEXT: .LBB12_2:
+; RV32XTHEADBB-NEXT: sll a0, a3, a2
+; RV32XTHEADBB-NEXT: neg a4, a2
+; RV32XTHEADBB-NEXT: srl a3, a3, a4
+; RV32XTHEADBB-NEXT: sll a4, a1, a2
+; RV32XTHEADBB-NEXT: or a3, a3, a4
+; RV32XTHEADBB-NEXT: beqz a2, .LBB12_4
+; RV32XTHEADBB-NEXT: .LBB12_3:
+; RV32XTHEADBB-NEXT: mv a1, a3
+; RV32XTHEADBB-NEXT: .LBB12_4:
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_64_mask_or_128_or_64:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: ori a1, a1, 128
+; RV64XTHEADBB-NEXT: sll a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %a = or i64 %y, 128
+ %b = shl i64 %x, %a
+ %c = sub i64 0, %y
+ %d = or i64 %c, 64
+ %e = lshr i64 %x, %d
+ %f = or i64 %b, %e
+ ret i64 %f
+}
+
+define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotr_64_mask:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: bltu a2, a4, .LBB13_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a5, a1, a2
+; RV32I-NEXT: j .LBB13_3
+; RV32I-NEXT: .LBB13_2:
+; RV32I-NEXT: srl a3, a0, a2
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: sll a5, a1, a5
+; RV32I-NEXT: or a5, a3, a5
+; RV32I-NEXT: .LBB13_3:
+; RV32I-NEXT: neg a6, a2
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: beqz a2, .LBB13_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, a5
+; RV32I-NEXT: .LBB13_5:
+; RV32I-NEXT: andi a5, a6, 63
+; RV32I-NEXT: bltu a2, a4, .LBB13_8
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li a2, 0
+; RV32I-NEXT: bgeu a5, a4, .LBB13_9
+; RV32I-NEXT: .LBB13_7:
+; RV32I-NEXT: sll a4, a0, a6
+; RV32I-NEXT: neg a7, a5
+; RV32I-NEXT: srl a0, a0, a7
+; RV32I-NEXT: sll a6, a1, a6
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: bnez a5, .LBB13_10
+; RV32I-NEXT: j .LBB13_11
+; RV32I-NEXT: .LBB13_8:
+; RV32I-NEXT: srl a2, a1, a2
+; RV32I-NEXT: bltu a5, a4, .LBB13_7
+; RV32I-NEXT: .LBB13_9:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: sll a0, a0, a5
+; RV32I-NEXT: beqz a5, .LBB13_11
+; RV32I-NEXT: .LBB13_10:
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB13_11:
+; RV32I-NEXT: or a0, a3, a4
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_64_mask:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: srl a1, a0, a1
+; RV64I-NEXT: sll a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_64_mask:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: li a4, 32
+; RV32ZBB-NEXT: bltu a2, a4, .LBB13_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl a5, a1, a2
+; RV32ZBB-NEXT: j .LBB13_3
+; RV32ZBB-NEXT: .LBB13_2:
+; RV32ZBB-NEXT: srl a3, a0, a2
+; RV32ZBB-NEXT: neg a5, a2
+; RV32ZBB-NEXT: sll a5, a1, a5
+; RV32ZBB-NEXT: or a5, a3, a5
+; RV32ZBB-NEXT: .LBB13_3:
+; RV32ZBB-NEXT: neg a6, a2
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: beqz a2, .LBB13_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: mv a3, a5
+; RV32ZBB-NEXT: .LBB13_5:
+; RV32ZBB-NEXT: andi a5, a6, 63
+; RV32ZBB-NEXT: bltu a2, a4, .LBB13_8
+; RV32ZBB-NEXT: # %bb.6:
+; RV32ZBB-NEXT: li a2, 0
+; RV32ZBB-NEXT: bgeu a5, a4, .LBB13_9
+; RV32ZBB-NEXT: .LBB13_7:
+; RV32ZBB-NEXT: sll a4, a0, a6
+; RV32ZBB-NEXT: neg a7, a5
+; RV32ZBB-NEXT: srl a0, a0, a7
+; RV32ZBB-NEXT: sll a6, a1, a6
+; RV32ZBB-NEXT: or a0, a0, a6
+; RV32ZBB-NEXT: bnez a5, .LBB13_10
+; RV32ZBB-NEXT: j .LBB13_11
+; RV32ZBB-NEXT: .LBB13_8:
+; RV32ZBB-NEXT: srl a2, a1, a2
+; RV32ZBB-NEXT: bltu a5, a4, .LBB13_7
+; RV32ZBB-NEXT: .LBB13_9:
+; RV32ZBB-NEXT: li a4, 0
+; RV32ZBB-NEXT: sll a0, a0, a5
+; RV32ZBB-NEXT: beqz a5, .LBB13_11
+; RV32ZBB-NEXT: .LBB13_10:
+; RV32ZBB-NEXT: mv a1, a0
+; RV32ZBB-NEXT: .LBB13_11:
+; RV32ZBB-NEXT: or a0, a3, a4
+; RV32ZBB-NEXT: or a1, a2, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_64_mask:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: srl a1, a0, a1
+; RV64ZBB-NEXT: sll a0, a0, a2
+; RV64ZBB-NEXT: or a0, a1, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_64_mask:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: li a4, 32
+; RV32XTHEADBB-NEXT: bltu a2, a4, .LBB13_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: srl a5, a1, a2
+; RV32XTHEADBB-NEXT: j .LBB13_3
+; RV32XTHEADBB-NEXT: .LBB13_2:
+; RV32XTHEADBB-NEXT: srl a3, a0, a2
+; RV32XTHEADBB-NEXT: neg a5, a2
+; RV32XTHEADBB-NEXT: sll a5, a1, a5
+; RV32XTHEADBB-NEXT: or a5, a3, a5
+; RV32XTHEADBB-NEXT: .LBB13_3:
+; RV32XTHEADBB-NEXT: neg a6, a2
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: beqz a2, .LBB13_5
+; RV32XTHEADBB-NEXT: # %bb.4:
+; RV32XTHEADBB-NEXT: mv a3, a5
+; RV32XTHEADBB-NEXT: .LBB13_5:
+; RV32XTHEADBB-NEXT: andi a5, a6, 63
+; RV32XTHEADBB-NEXT: bltu a2, a4, .LBB13_8
+; RV32XTHEADBB-NEXT: # %bb.6:
+; RV32XTHEADBB-NEXT: li a2, 0
+; RV32XTHEADBB-NEXT: bgeu a5, a4, .LBB13_9
+; RV32XTHEADBB-NEXT: .LBB13_7:
+; RV32XTHEADBB-NEXT: sll a4, a0, a6
+; RV32XTHEADBB-NEXT: neg a7, a5
+; RV32XTHEADBB-NEXT: srl a0, a0, a7
+; RV32XTHEADBB-NEXT: sll a6, a1, a6
+; RV32XTHEADBB-NEXT: or a0, a0, a6
+; RV32XTHEADBB-NEXT: bnez a5, .LBB13_10
+; RV32XTHEADBB-NEXT: j .LBB13_11
+; RV32XTHEADBB-NEXT: .LBB13_8:
+; RV32XTHEADBB-NEXT: srl a2, a1, a2
+; RV32XTHEADBB-NEXT: bltu a5, a4, .LBB13_7
+; RV32XTHEADBB-NEXT: .LBB13_9:
+; RV32XTHEADBB-NEXT: li a4, 0
+; RV32XTHEADBB-NEXT: sll a0, a0, a5
+; RV32XTHEADBB-NEXT: beqz a5, .LBB13_11
+; RV32XTHEADBB-NEXT: .LBB13_10:
+; RV32XTHEADBB-NEXT: mv a1, a0
+; RV32XTHEADBB-NEXT: .LBB13_11:
+; RV32XTHEADBB-NEXT: or a0, a3, a4
+; RV32XTHEADBB-NEXT: or a1, a2, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_64_mask:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: srl a1, a0, a1
+; RV64XTHEADBB-NEXT: sll a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i64 0, %y
+ %and = and i64 %z, 63
+ %b = lshr i64 %x, %y
+ %c = shl i64 %x, %and
+ %d = or i64 %b, %c
+ ret i64 %d
+}
+
+define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotr_64_mask_and_127_and_63:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a4, a2, 127
+; RV32I-NEXT: li a5, 32
+; RV32I-NEXT: bltu a4, a5, .LBB14_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a6, a1, a4
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: bnez a4, .LBB14_3
+; RV32I-NEXT: j .LBB14_4
+; RV32I-NEXT: .LBB14_2:
+; RV32I-NEXT: srl a3, a0, a2
+; RV32I-NEXT: neg a6, a4
+; RV32I-NEXT: sll a6, a1, a6
+; RV32I-NEXT: or a6, a3, a6
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: beqz a4, .LBB14_4
+; RV32I-NEXT: .LBB14_3:
+; RV32I-NEXT: mv a3, a6
+; RV32I-NEXT: .LBB14_4:
+; RV32I-NEXT: bltu a4, a5, .LBB14_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: j .LBB14_7
+; RV32I-NEXT: .LBB14_6:
+; RV32I-NEXT: srl a4, a1, a2
+; RV32I-NEXT: .LBB14_7:
+; RV32I-NEXT: neg a7, a2
+; RV32I-NEXT: andi a6, a7, 63
+; RV32I-NEXT: bltu a6, a5, .LBB14_9
+; RV32I-NEXT: # %bb.8:
+; RV32I-NEXT: li a2, 0
+; RV32I-NEXT: sll a0, a0, a6
+; RV32I-NEXT: bnez a6, .LBB14_10
+; RV32I-NEXT: j .LBB14_11
+; RV32I-NEXT: .LBB14_9:
+; RV32I-NEXT: sll a2, a0, a7
+; RV32I-NEXT: neg a5, a6
+; RV32I-NEXT: srl a0, a0, a5
+; RV32I-NEXT: sll a5, a1, a7
+; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: beqz a6, .LBB14_11
+; RV32I-NEXT: .LBB14_10:
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB14_11:
+; RV32I-NEXT: or a0, a3, a2
+; RV32I-NEXT: or a1, a4, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_64_mask_and_127_and_63:
+; RV64I: # %bb.0:
+; RV64I-NEXT: srl a2, a0, a1
+; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: sll a0, a0, a1
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_64_mask_and_127_and_63:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi a4, a2, 127
+; RV32ZBB-NEXT: li a5, 32
+; RV32ZBB-NEXT: bltu a4, a5, .LBB14_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl a6, a1, a4
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: bnez a4, .LBB14_3
+; RV32ZBB-NEXT: j .LBB14_4
+; RV32ZBB-NEXT: .LBB14_2:
+; RV32ZBB-NEXT: srl a3, a0, a2
+; RV32ZBB-NEXT: neg a6, a4
+; RV32ZBB-NEXT: sll a6, a1, a6
+; RV32ZBB-NEXT: or a6, a3, a6
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: beqz a4, .LBB14_4
+; RV32ZBB-NEXT: .LBB14_3:
+; RV32ZBB-NEXT: mv a3, a6
+; RV32ZBB-NEXT: .LBB14_4:
+; RV32ZBB-NEXT: bltu a4, a5, .LBB14_6
+; RV32ZBB-NEXT: # %bb.5:
+; RV32ZBB-NEXT: li a4, 0
+; RV32ZBB-NEXT: j .LBB14_7
+; RV32ZBB-NEXT: .LBB14_6:
+; RV32ZBB-NEXT: srl a4, a1, a2
+; RV32ZBB-NEXT: .LBB14_7:
+; RV32ZBB-NEXT: neg a7, a2
+; RV32ZBB-NEXT: andi a6, a7, 63
+; RV32ZBB-NEXT: bltu a6, a5, .LBB14_9
+; RV32ZBB-NEXT: # %bb.8:
+; RV32ZBB-NEXT: li a2, 0
+; RV32ZBB-NEXT: sll a0, a0, a6
+; RV32ZBB-NEXT: bnez a6, .LBB14_10
+; RV32ZBB-NEXT: j .LBB14_11
+; RV32ZBB-NEXT: .LBB14_9:
+; RV32ZBB-NEXT: sll a2, a0, a7
+; RV32ZBB-NEXT: neg a5, a6
+; RV32ZBB-NEXT: srl a0, a0, a5
+; RV32ZBB-NEXT: sll a5, a1, a7
+; RV32ZBB-NEXT: or a0, a0, a5
+; RV32ZBB-NEXT: beqz a6, .LBB14_11
+; RV32ZBB-NEXT: .LBB14_10:
+; RV32ZBB-NEXT: mv a1, a0
+; RV32ZBB-NEXT: .LBB14_11:
+; RV32ZBB-NEXT: or a0, a3, a2
+; RV32ZBB-NEXT: or a1, a4, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_64_mask_and_127_and_63:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: srl a2, a0, a1
+; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: sll a0, a0, a1
+; RV64ZBB-NEXT: or a0, a2, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_64_mask_and_127_and_63:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a4, a2, 127
+; RV32XTHEADBB-NEXT: li a5, 32
+; RV32XTHEADBB-NEXT: bltu a4, a5, .LBB14_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: srl a6, a1, a4
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: bnez a4, .LBB14_3
+; RV32XTHEADBB-NEXT: j .LBB14_4
+; RV32XTHEADBB-NEXT: .LBB14_2:
+; RV32XTHEADBB-NEXT: srl a3, a0, a2
+; RV32XTHEADBB-NEXT: neg a6, a4
+; RV32XTHEADBB-NEXT: sll a6, a1, a6
+; RV32XTHEADBB-NEXT: or a6, a3, a6
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: beqz a4, .LBB14_4
+; RV32XTHEADBB-NEXT: .LBB14_3:
+; RV32XTHEADBB-NEXT: mv a3, a6
+; RV32XTHEADBB-NEXT: .LBB14_4:
+; RV32XTHEADBB-NEXT: bltu a4, a5, .LBB14_6
+; RV32XTHEADBB-NEXT: # %bb.5:
+; RV32XTHEADBB-NEXT: li a4, 0
+; RV32XTHEADBB-NEXT: j .LBB14_7
+; RV32XTHEADBB-NEXT: .LBB14_6:
+; RV32XTHEADBB-NEXT: srl a4, a1, a2
+; RV32XTHEADBB-NEXT: .LBB14_7:
+; RV32XTHEADBB-NEXT: neg a7, a2
+; RV32XTHEADBB-NEXT: andi a6, a7, 63
+; RV32XTHEADBB-NEXT: bltu a6, a5, .LBB14_9
+; RV32XTHEADBB-NEXT: # %bb.8:
+; RV32XTHEADBB-NEXT: li a2, 0
+; RV32XTHEADBB-NEXT: sll a0, a0, a6
+; RV32XTHEADBB-NEXT: bnez a6, .LBB14_10
+; RV32XTHEADBB-NEXT: j .LBB14_11
+; RV32XTHEADBB-NEXT: .LBB14_9:
+; RV32XTHEADBB-NEXT: sll a2, a0, a7
+; RV32XTHEADBB-NEXT: neg a5, a6
+; RV32XTHEADBB-NEXT: srl a0, a0, a5
+; RV32XTHEADBB-NEXT: sll a5, a1, a7
+; RV32XTHEADBB-NEXT: or a0, a0, a5
+; RV32XTHEADBB-NEXT: beqz a6, .LBB14_11
+; RV32XTHEADBB-NEXT: .LBB14_10:
+; RV32XTHEADBB-NEXT: mv a1, a0
+; RV32XTHEADBB-NEXT: .LBB14_11:
+; RV32XTHEADBB-NEXT: or a0, a3, a2
+; RV32XTHEADBB-NEXT: or a1, a4, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_64_mask_and_127_and_63:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: srl a2, a0, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: sll a0, a0, a1
+; RV64XTHEADBB-NEXT: or a0, a2, a0
+; RV64XTHEADBB-NEXT: ret
+ %a = and i64 %y, 127
+ %b = lshr i64 %x, %a
+ %c = sub i64 0, %y
+ %d = and i64 %c, 63
+ %e = shl i64 %x, %d
+ %f = or i64 %b, %e
+ ret i64 %f
+}
+
+define i64 @rotr_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotr_64_mask_or_128_or_64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: ori a2, a2, 128
+; RV32I-NEXT: li a3, 32
+; RV32I-NEXT: bltu a2, a3, .LBB15_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a4, a1, a2
+; RV32I-NEXT: bnez a2, .LBB15_3
+; RV32I-NEXT: j .LBB15_4
+; RV32I-NEXT: .LBB15_2:
+; RV32I-NEXT: srl a4, a0, a2
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: sll a5, a1, a5
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: beqz a2, .LBB15_4
+; RV32I-NEXT: .LBB15_3:
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB15_4:
+; RV32I-NEXT: bltu a2, a3, .LBB15_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB15_6:
+; RV32I-NEXT: srl a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_64_mask_or_128_or_64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: ori a1, a1, 128
+; RV64I-NEXT: srl a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_64_mask_or_128_or_64:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: ori a2, a2, 128
+; RV32ZBB-NEXT: li a3, 32
+; RV32ZBB-NEXT: bltu a2, a3, .LBB15_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl a4, a1, a2
+; RV32ZBB-NEXT: bnez a2, .LBB15_3
+; RV32ZBB-NEXT: j .LBB15_4
+; RV32ZBB-NEXT: .LBB15_2:
+; RV32ZBB-NEXT: srl a4, a0, a2
+; RV32ZBB-NEXT: neg a5, a2
+; RV32ZBB-NEXT: sll a5, a1, a5
+; RV32ZBB-NEXT: or a4, a4, a5
+; RV32ZBB-NEXT: beqz a2, .LBB15_4
+; RV32ZBB-NEXT: .LBB15_3:
+; RV32ZBB-NEXT: mv a0, a4
+; RV32ZBB-NEXT: .LBB15_4:
+; RV32ZBB-NEXT: bltu a2, a3, .LBB15_6
+; RV32ZBB-NEXT: # %bb.5:
+; RV32ZBB-NEXT: li a1, 0
+; RV32ZBB-NEXT: ret
+; RV32ZBB-NEXT: .LBB15_6:
+; RV32ZBB-NEXT: srl a1, a1, a2
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_64_mask_or_128_or_64:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: ori a1, a1, 128
+; RV64ZBB-NEXT: srl a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_64_mask_or_128_or_64:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: ori a2, a2, 128
+; RV32XTHEADBB-NEXT: li a3, 32
+; RV32XTHEADBB-NEXT: bltu a2, a3, .LBB15_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: srl a4, a1, a2
+; RV32XTHEADBB-NEXT: bnez a2, .LBB15_3
+; RV32XTHEADBB-NEXT: j .LBB15_4
+; RV32XTHEADBB-NEXT: .LBB15_2:
+; RV32XTHEADBB-NEXT: srl a4, a0, a2
+; RV32XTHEADBB-NEXT: neg a5, a2
+; RV32XTHEADBB-NEXT: sll a5, a1, a5
+; RV32XTHEADBB-NEXT: or a4, a4, a5
+; RV32XTHEADBB-NEXT: beqz a2, .LBB15_4
+; RV32XTHEADBB-NEXT: .LBB15_3:
+; RV32XTHEADBB-NEXT: mv a0, a4
+; RV32XTHEADBB-NEXT: .LBB15_4:
+; RV32XTHEADBB-NEXT: bltu a2, a3, .LBB15_6
+; RV32XTHEADBB-NEXT: # %bb.5:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: ret
+; RV32XTHEADBB-NEXT: .LBB15_6:
+; RV32XTHEADBB-NEXT: srl a1, a1, a2
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_64_mask_or_128_or_64:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: ori a1, a1, 128
+; RV64XTHEADBB-NEXT: srl a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %a = or i64 %y, 128
+ %b = lshr i64 %x, %a
+ %c = sub i64 0, %y
+ %d = or i64 %c, 64
+ %e = shl i64 %x, %d
+ %f = or i64 %b, %e
+ ret i64 %f
+}
+
+; Test that we're able to remove a mask on the rotate amount that has more than
+; one use.
+define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind {
+; RV32I-LABEL: rotl_32_mask_shared:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a3, a2, 31
+; RV32I-NEXT: sll a4, a0, a2
+; RV32I-NEXT: neg a3, a3
+; RV32I-NEXT: srl a0, a0, a3
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: sll a1, a1, a2
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_32_mask_shared:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 31
+; RV64I-NEXT: sllw a4, a0, a2
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: srlw a0, a0, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: sllw a1, a1, a2
+; RV64I-NEXT: addw a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_32_mask_shared:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: rol a0, a0, a2
+; RV32ZBB-NEXT: sll a1, a1, a2
+; RV32ZBB-NEXT: add a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_32_mask_shared:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rolw a0, a0, a2
+; RV64ZBB-NEXT: sllw a1, a1, a2
+; RV64ZBB-NEXT: addw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_32_mask_shared:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a3, a2, 31
+; RV32XTHEADBB-NEXT: sll a4, a0, a2
+; RV32XTHEADBB-NEXT: neg a3, a3
+; RV32XTHEADBB-NEXT: srl a0, a0, a3
+; RV32XTHEADBB-NEXT: or a0, a4, a0
+; RV32XTHEADBB-NEXT: sll a1, a1, a2
+; RV32XTHEADBB-NEXT: add a0, a0, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_32_mask_shared:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 31
+; RV64XTHEADBB-NEXT: sllw a4, a0, a2
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: srlw a0, a0, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: sllw a1, a1, a2
+; RV64XTHEADBB-NEXT: addw a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i32 %amt, 31
+ %1 = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 %maskedamt)
+ %2 = shl i32 %b, %maskedamt
+ %3 = add i32 %1, %2
+ ret i32 %3
+}
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+
+define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 signext %amt) nounwind {
+; RV32I-LABEL: rotl_64_mask_shared:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a5, a4, 63
+; RV32I-NEXT: li t0, 32
+; RV32I-NEXT: neg a7, a5
+; RV32I-NEXT: bltu a5, t0, .LBB17_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: sll t3, a0, a5
+; RV32I-NEXT: j .LBB17_3
+; RV32I-NEXT: .LBB17_2:
+; RV32I-NEXT: sll a6, a0, a4
+; RV32I-NEXT: srl t1, a0, a7
+; RV32I-NEXT: sll t2, a1, a4
+; RV32I-NEXT: or t3, t1, t2
+; RV32I-NEXT: .LBB17_3:
+; RV32I-NEXT: neg t2, a5
+; RV32I-NEXT: mv t1, a1
+; RV32I-NEXT: beqz a5, .LBB17_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv t1, t3
+; RV32I-NEXT: .LBB17_5:
+; RV32I-NEXT: andi t3, t2, 63
+; RV32I-NEXT: bltu t3, t0, .LBB17_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl t4, a1, t3
+; RV32I-NEXT: bnez t3, .LBB17_8
+; RV32I-NEXT: j .LBB17_9
+; RV32I-NEXT: .LBB17_7:
+; RV32I-NEXT: srl t4, a0, t2
+; RV32I-NEXT: neg t5, t3
+; RV32I-NEXT: sll t5, a1, t5
+; RV32I-NEXT: or t4, t4, t5
+; RV32I-NEXT: beqz t3, .LBB17_9
+; RV32I-NEXT: .LBB17_8:
+; RV32I-NEXT: mv a0, t4
+; RV32I-NEXT: .LBB17_9:
+; RV32I-NEXT: bltu t3, t0, .LBB17_12
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: bgeu a5, t0, .LBB17_13
+; RV32I-NEXT: .LBB17_11:
+; RV32I-NEXT: sll t0, a2, a4
+; RV32I-NEXT: srl a2, a2, a7
+; RV32I-NEXT: sll a4, a3, a4
+; RV32I-NEXT: or a2, a2, a4
+; RV32I-NEXT: j .LBB17_14
+; RV32I-NEXT: .LBB17_12:
+; RV32I-NEXT: srl a1, a1, t2
+; RV32I-NEXT: bltu a5, t0, .LBB17_11
+; RV32I-NEXT: .LBB17_13:
+; RV32I-NEXT: li t0, 0
+; RV32I-NEXT: sll a2, a2, a5
+; RV32I-NEXT: .LBB17_14:
+; RV32I-NEXT: or a0, a6, a0
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: beqz a5, .LBB17_16
+; RV32I-NEXT: # %bb.15:
+; RV32I-NEXT: mv a3, a2
+; RV32I-NEXT: .LBB17_16:
+; RV32I-NEXT: add a0, a0, t0
+; RV32I-NEXT: sltu a2, a0, t0
+; RV32I-NEXT: add a1, a1, a3
+; RV32I-NEXT: add a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_64_mask_shared:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 63
+; RV64I-NEXT: sll a4, a0, a2
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: srl a0, a0, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: sll a1, a1, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_64_mask_shared:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi a5, a4, 63
+; RV32ZBB-NEXT: li t0, 32
+; RV32ZBB-NEXT: neg a7, a5
+; RV32ZBB-NEXT: bltu a5, t0, .LBB17_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: li a6, 0
+; RV32ZBB-NEXT: sll t3, a0, a5
+; RV32ZBB-NEXT: j .LBB17_3
+; RV32ZBB-NEXT: .LBB17_2:
+; RV32ZBB-NEXT: sll a6, a0, a4
+; RV32ZBB-NEXT: srl t1, a0, a7
+; RV32ZBB-NEXT: sll t2, a1, a4
+; RV32ZBB-NEXT: or t3, t1, t2
+; RV32ZBB-NEXT: .LBB17_3:
+; RV32ZBB-NEXT: neg t2, a5
+; RV32ZBB-NEXT: mv t1, a1
+; RV32ZBB-NEXT: beqz a5, .LBB17_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: mv t1, t3
+; RV32ZBB-NEXT: .LBB17_5:
+; RV32ZBB-NEXT: andi t3, t2, 63
+; RV32ZBB-NEXT: bltu t3, t0, .LBB17_7
+; RV32ZBB-NEXT: # %bb.6:
+; RV32ZBB-NEXT: srl t4, a1, t3
+; RV32ZBB-NEXT: bnez t3, .LBB17_8
+; RV32ZBB-NEXT: j .LBB17_9
+; RV32ZBB-NEXT: .LBB17_7:
+; RV32ZBB-NEXT: srl t4, a0, t2
+; RV32ZBB-NEXT: neg t5, t3
+; RV32ZBB-NEXT: sll t5, a1, t5
+; RV32ZBB-NEXT: or t4, t4, t5
+; RV32ZBB-NEXT: beqz t3, .LBB17_9
+; RV32ZBB-NEXT: .LBB17_8:
+; RV32ZBB-NEXT: mv a0, t4
+; RV32ZBB-NEXT: .LBB17_9:
+; RV32ZBB-NEXT: bltu t3, t0, .LBB17_12
+; RV32ZBB-NEXT: # %bb.10:
+; RV32ZBB-NEXT: li a1, 0
+; RV32ZBB-NEXT: bgeu a5, t0, .LBB17_13
+; RV32ZBB-NEXT: .LBB17_11:
+; RV32ZBB-NEXT: sll t0, a2, a4
+; RV32ZBB-NEXT: srl a2, a2, a7
+; RV32ZBB-NEXT: sll a4, a3, a4
+; RV32ZBB-NEXT: or a2, a2, a4
+; RV32ZBB-NEXT: j .LBB17_14
+; RV32ZBB-NEXT: .LBB17_12:
+; RV32ZBB-NEXT: srl a1, a1, t2
+; RV32ZBB-NEXT: bltu a5, t0, .LBB17_11
+; RV32ZBB-NEXT: .LBB17_13:
+; RV32ZBB-NEXT: li t0, 0
+; RV32ZBB-NEXT: sll a2, a2, a5
+; RV32ZBB-NEXT: .LBB17_14:
+; RV32ZBB-NEXT: or a0, a6, a0
+; RV32ZBB-NEXT: or a1, t1, a1
+; RV32ZBB-NEXT: beqz a5, .LBB17_16
+; RV32ZBB-NEXT: # %bb.15:
+; RV32ZBB-NEXT: mv a3, a2
+; RV32ZBB-NEXT: .LBB17_16:
+; RV32ZBB-NEXT: add a0, a0, t0
+; RV32ZBB-NEXT: sltu a2, a0, t0
+; RV32ZBB-NEXT: add a1, a1, a3
+; RV32ZBB-NEXT: add a1, a1, a2
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_64_mask_shared:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rol a0, a0, a2
+; RV64ZBB-NEXT: sll a1, a1, a2
+; RV64ZBB-NEXT: add a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_64_mask_shared:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a5, a4, 63
+; RV32XTHEADBB-NEXT: li t0, 32
+; RV32XTHEADBB-NEXT: neg a7, a5
+; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB17_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: li a6, 0
+; RV32XTHEADBB-NEXT: sll t3, a0, a5
+; RV32XTHEADBB-NEXT: j .LBB17_3
+; RV32XTHEADBB-NEXT: .LBB17_2:
+; RV32XTHEADBB-NEXT: sll a6, a0, a4
+; RV32XTHEADBB-NEXT: srl t1, a0, a7
+; RV32XTHEADBB-NEXT: sll t2, a1, a4
+; RV32XTHEADBB-NEXT: or t3, t1, t2
+; RV32XTHEADBB-NEXT: .LBB17_3:
+; RV32XTHEADBB-NEXT: neg t2, a5
+; RV32XTHEADBB-NEXT: mv t1, a1
+; RV32XTHEADBB-NEXT: beqz a5, .LBB17_5
+; RV32XTHEADBB-NEXT: # %bb.4:
+; RV32XTHEADBB-NEXT: mv t1, t3
+; RV32XTHEADBB-NEXT: .LBB17_5:
+; RV32XTHEADBB-NEXT: andi t3, t2, 63
+; RV32XTHEADBB-NEXT: bltu t3, t0, .LBB17_7
+; RV32XTHEADBB-NEXT: # %bb.6:
+; RV32XTHEADBB-NEXT: srl t4, a1, t3
+; RV32XTHEADBB-NEXT: bnez t3, .LBB17_8
+; RV32XTHEADBB-NEXT: j .LBB17_9
+; RV32XTHEADBB-NEXT: .LBB17_7:
+; RV32XTHEADBB-NEXT: srl t4, a0, t2
+; RV32XTHEADBB-NEXT: neg t5, t3
+; RV32XTHEADBB-NEXT: sll t5, a1, t5
+; RV32XTHEADBB-NEXT: or t4, t4, t5
+; RV32XTHEADBB-NEXT: beqz t3, .LBB17_9
+; RV32XTHEADBB-NEXT: .LBB17_8:
+; RV32XTHEADBB-NEXT: mv a0, t4
+; RV32XTHEADBB-NEXT: .LBB17_9:
+; RV32XTHEADBB-NEXT: bltu t3, t0, .LBB17_12
+; RV32XTHEADBB-NEXT: # %bb.10:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: bgeu a5, t0, .LBB17_13
+; RV32XTHEADBB-NEXT: .LBB17_11:
+; RV32XTHEADBB-NEXT: sll t0, a2, a4
+; RV32XTHEADBB-NEXT: srl a2, a2, a7
+; RV32XTHEADBB-NEXT: sll a4, a3, a4
+; RV32XTHEADBB-NEXT: or a2, a2, a4
+; RV32XTHEADBB-NEXT: j .LBB17_14
+; RV32XTHEADBB-NEXT: .LBB17_12:
+; RV32XTHEADBB-NEXT: srl a1, a1, t2
+; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB17_11
+; RV32XTHEADBB-NEXT: .LBB17_13:
+; RV32XTHEADBB-NEXT: li t0, 0
+; RV32XTHEADBB-NEXT: sll a2, a2, a5
+; RV32XTHEADBB-NEXT: .LBB17_14:
+; RV32XTHEADBB-NEXT: or a0, a6, a0
+; RV32XTHEADBB-NEXT: or a1, t1, a1
+; RV32XTHEADBB-NEXT: beqz a5, .LBB17_16
+; RV32XTHEADBB-NEXT: # %bb.15:
+; RV32XTHEADBB-NEXT: mv a3, a2
+; RV32XTHEADBB-NEXT: .LBB17_16:
+; RV32XTHEADBB-NEXT: add a0, a0, t0
+; RV32XTHEADBB-NEXT: sltu a2, a0, t0
+; RV32XTHEADBB-NEXT: add a1, a1, a3
+; RV32XTHEADBB-NEXT: add a1, a1, a2
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_64_mask_shared:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 63
+; RV64XTHEADBB-NEXT: sll a4, a0, a2
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: srl a0, a0, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: sll a1, a1, a2
+; RV64XTHEADBB-NEXT: add a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i64 %amt, 63
+ %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %maskedamt)
+ %2 = shl i64 %b, %maskedamt
+ %3 = add i64 %1, %2
+ ret i64 %3
+}
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+
+define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind {
+; RV32I-LABEL: rotr_32_mask_shared:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a3, a2, 31
+; RV32I-NEXT: srl a4, a0, a2
+; RV32I-NEXT: neg a3, a3
+; RV32I-NEXT: sll a0, a0, a3
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: sll a1, a1, a2
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_32_mask_shared:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 31
+; RV64I-NEXT: srlw a4, a0, a2
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: sllw a0, a0, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: sllw a1, a1, a2
+; RV64I-NEXT: addw a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_32_mask_shared:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: ror a0, a0, a2
+; RV32ZBB-NEXT: sll a1, a1, a2
+; RV32ZBB-NEXT: add a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_32_mask_shared:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rorw a0, a0, a2
+; RV64ZBB-NEXT: sllw a1, a1, a2
+; RV64ZBB-NEXT: addw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_32_mask_shared:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a3, a2, 31
+; RV32XTHEADBB-NEXT: srl a4, a0, a2
+; RV32XTHEADBB-NEXT: neg a3, a3
+; RV32XTHEADBB-NEXT: sll a0, a0, a3
+; RV32XTHEADBB-NEXT: or a0, a4, a0
+; RV32XTHEADBB-NEXT: sll a1, a1, a2
+; RV32XTHEADBB-NEXT: add a0, a0, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_32_mask_shared:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 31
+; RV64XTHEADBB-NEXT: srlw a4, a0, a2
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: sllw a0, a0, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: sllw a1, a1, a2
+; RV64XTHEADBB-NEXT: addw a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i32 %amt, 31
+ %1 = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 %maskedamt)
+ %2 = shl i32 %b, %maskedamt
+ %3 = add i32 %1, %2
+ ret i32 %3
+}
+declare i32 @llvm.fshr.i32(i32, i32, i32)
+
+define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 signext %amt) nounwind {
+; RV32I-LABEL: rotr_64_mask_shared:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a5, a4, 63
+; RV32I-NEXT: li t0, 32
+; RV32I-NEXT: neg a6, a5
+; RV32I-NEXT: bltu a5, t0, .LBB19_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl t1, a1, a5
+; RV32I-NEXT: mv a7, a0
+; RV32I-NEXT: bnez a5, .LBB19_3
+; RV32I-NEXT: j .LBB19_4
+; RV32I-NEXT: .LBB19_2:
+; RV32I-NEXT: srl a7, a0, a4
+; RV32I-NEXT: sll t1, a1, a6
+; RV32I-NEXT: or t1, a7, t1
+; RV32I-NEXT: mv a7, a0
+; RV32I-NEXT: beqz a5, .LBB19_4
+; RV32I-NEXT: .LBB19_3:
+; RV32I-NEXT: mv a7, t1
+; RV32I-NEXT: .LBB19_4:
+; RV32I-NEXT: neg t4, a5
+; RV32I-NEXT: bltu a5, t0, .LBB19_7
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li t1, 0
+; RV32I-NEXT: andi t3, t4, 63
+; RV32I-NEXT: bgeu t3, t0, .LBB19_8
+; RV32I-NEXT: .LBB19_6:
+; RV32I-NEXT: sll t2, a0, t4
+; RV32I-NEXT: neg t5, t3
+; RV32I-NEXT: srl a0, a0, t5
+; RV32I-NEXT: sll t4, a1, t4
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: bnez t3, .LBB19_9
+; RV32I-NEXT: j .LBB19_10
+; RV32I-NEXT: .LBB19_7:
+; RV32I-NEXT: srl t1, a1, a4
+; RV32I-NEXT: andi t3, t4, 63
+; RV32I-NEXT: bltu t3, t0, .LBB19_6
+; RV32I-NEXT: .LBB19_8:
+; RV32I-NEXT: li t2, 0
+; RV32I-NEXT: sll a0, a0, t3
+; RV32I-NEXT: beqz t3, .LBB19_10
+; RV32I-NEXT: .LBB19_9:
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB19_10:
+; RV32I-NEXT: bltu a5, t0, .LBB19_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t0, 0
+; RV32I-NEXT: sll a0, a2, a5
+; RV32I-NEXT: j .LBB19_13
+; RV32I-NEXT: .LBB19_12:
+; RV32I-NEXT: sll t0, a2, a4
+; RV32I-NEXT: srl a0, a2, a6
+; RV32I-NEXT: sll a2, a3, a4
+; RV32I-NEXT: or a0, a0, a2
+; RV32I-NEXT: .LBB19_13:
+; RV32I-NEXT: or a2, a7, t2
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: beqz a5, .LBB19_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: .LBB19_15:
+; RV32I-NEXT: add a0, a2, t0
+; RV32I-NEXT: sltu a2, a0, t0
+; RV32I-NEXT: add a1, a1, a3
+; RV32I-NEXT: add a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_64_mask_shared:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 63
+; RV64I-NEXT: srl a4, a0, a2
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: sll a0, a0, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: sll a1, a1, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_64_mask_shared:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi a5, a4, 63
+; RV32ZBB-NEXT: li t0, 32
+; RV32ZBB-NEXT: neg a6, a5
+; RV32ZBB-NEXT: bltu a5, t0, .LBB19_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl t1, a1, a5
+; RV32ZBB-NEXT: mv a7, a0
+; RV32ZBB-NEXT: bnez a5, .LBB19_3
+; RV32ZBB-NEXT: j .LBB19_4
+; RV32ZBB-NEXT: .LBB19_2:
+; RV32ZBB-NEXT: srl a7, a0, a4
+; RV32ZBB-NEXT: sll t1, a1, a6
+; RV32ZBB-NEXT: or t1, a7, t1
+; RV32ZBB-NEXT: mv a7, a0
+; RV32ZBB-NEXT: beqz a5, .LBB19_4
+; RV32ZBB-NEXT: .LBB19_3:
+; RV32ZBB-NEXT: mv a7, t1
+; RV32ZBB-NEXT: .LBB19_4:
+; RV32ZBB-NEXT: neg t4, a5
+; RV32ZBB-NEXT: bltu a5, t0, .LBB19_7
+; RV32ZBB-NEXT: # %bb.5:
+; RV32ZBB-NEXT: li t1, 0
+; RV32ZBB-NEXT: andi t3, t4, 63
+; RV32ZBB-NEXT: bgeu t3, t0, .LBB19_8
+; RV32ZBB-NEXT: .LBB19_6:
+; RV32ZBB-NEXT: sll t2, a0, t4
+; RV32ZBB-NEXT: neg t5, t3
+; RV32ZBB-NEXT: srl a0, a0, t5
+; RV32ZBB-NEXT: sll t4, a1, t4
+; RV32ZBB-NEXT: or a0, a0, t4
+; RV32ZBB-NEXT: bnez t3, .LBB19_9
+; RV32ZBB-NEXT: j .LBB19_10
+; RV32ZBB-NEXT: .LBB19_7:
+; RV32ZBB-NEXT: srl t1, a1, a4
+; RV32ZBB-NEXT: andi t3, t4, 63
+; RV32ZBB-NEXT: bltu t3, t0, .LBB19_6
+; RV32ZBB-NEXT: .LBB19_8:
+; RV32ZBB-NEXT: li t2, 0
+; RV32ZBB-NEXT: sll a0, a0, t3
+; RV32ZBB-NEXT: beqz t3, .LBB19_10
+; RV32ZBB-NEXT: .LBB19_9:
+; RV32ZBB-NEXT: mv a1, a0
+; RV32ZBB-NEXT: .LBB19_10:
+; RV32ZBB-NEXT: bltu a5, t0, .LBB19_12
+; RV32ZBB-NEXT: # %bb.11:
+; RV32ZBB-NEXT: li t0, 0
+; RV32ZBB-NEXT: sll a0, a2, a5
+; RV32ZBB-NEXT: j .LBB19_13
+; RV32ZBB-NEXT: .LBB19_12:
+; RV32ZBB-NEXT: sll t0, a2, a4
+; RV32ZBB-NEXT: srl a0, a2, a6
+; RV32ZBB-NEXT: sll a2, a3, a4
+; RV32ZBB-NEXT: or a0, a0, a2
+; RV32ZBB-NEXT: .LBB19_13:
+; RV32ZBB-NEXT: or a2, a7, t2
+; RV32ZBB-NEXT: or a1, t1, a1
+; RV32ZBB-NEXT: beqz a5, .LBB19_15
+; RV32ZBB-NEXT: # %bb.14:
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: .LBB19_15:
+; RV32ZBB-NEXT: add a0, a2, t0
+; RV32ZBB-NEXT: sltu a2, a0, t0
+; RV32ZBB-NEXT: add a1, a1, a3
+; RV32ZBB-NEXT: add a1, a1, a2
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_64_mask_shared:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: ror a0, a0, a2
+; RV64ZBB-NEXT: sll a1, a1, a2
+; RV64ZBB-NEXT: add a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_64_mask_shared:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a5, a4, 63
+; RV32XTHEADBB-NEXT: li t0, 32
+; RV32XTHEADBB-NEXT: neg a6, a5
+; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB19_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: srl t1, a1, a5
+; RV32XTHEADBB-NEXT: mv a7, a0
+; RV32XTHEADBB-NEXT: bnez a5, .LBB19_3
+; RV32XTHEADBB-NEXT: j .LBB19_4
+; RV32XTHEADBB-NEXT: .LBB19_2:
+; RV32XTHEADBB-NEXT: srl a7, a0, a4
+; RV32XTHEADBB-NEXT: sll t1, a1, a6
+; RV32XTHEADBB-NEXT: or t1, a7, t1
+; RV32XTHEADBB-NEXT: mv a7, a0
+; RV32XTHEADBB-NEXT: beqz a5, .LBB19_4
+; RV32XTHEADBB-NEXT: .LBB19_3:
+; RV32XTHEADBB-NEXT: mv a7, t1
+; RV32XTHEADBB-NEXT: .LBB19_4:
+; RV32XTHEADBB-NEXT: neg t4, a5
+; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB19_7
+; RV32XTHEADBB-NEXT: # %bb.5:
+; RV32XTHEADBB-NEXT: li t1, 0
+; RV32XTHEADBB-NEXT: andi t3, t4, 63
+; RV32XTHEADBB-NEXT: bgeu t3, t0, .LBB19_8
+; RV32XTHEADBB-NEXT: .LBB19_6:
+; RV32XTHEADBB-NEXT: sll t2, a0, t4
+; RV32XTHEADBB-NEXT: neg t5, t3
+; RV32XTHEADBB-NEXT: srl a0, a0, t5
+; RV32XTHEADBB-NEXT: sll t4, a1, t4
+; RV32XTHEADBB-NEXT: or a0, a0, t4
+; RV32XTHEADBB-NEXT: bnez t3, .LBB19_9
+; RV32XTHEADBB-NEXT: j .LBB19_10
+; RV32XTHEADBB-NEXT: .LBB19_7:
+; RV32XTHEADBB-NEXT: srl t1, a1, a4
+; RV32XTHEADBB-NEXT: andi t3, t4, 63
+; RV32XTHEADBB-NEXT: bltu t3, t0, .LBB19_6
+; RV32XTHEADBB-NEXT: .LBB19_8:
+; RV32XTHEADBB-NEXT: li t2, 0
+; RV32XTHEADBB-NEXT: sll a0, a0, t3
+; RV32XTHEADBB-NEXT: beqz t3, .LBB19_10
+; RV32XTHEADBB-NEXT: .LBB19_9:
+; RV32XTHEADBB-NEXT: mv a1, a0
+; RV32XTHEADBB-NEXT: .LBB19_10:
+; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB19_12
+; RV32XTHEADBB-NEXT: # %bb.11:
+; RV32XTHEADBB-NEXT: li t0, 0
+; RV32XTHEADBB-NEXT: sll a0, a2, a5
+; RV32XTHEADBB-NEXT: j .LBB19_13
+; RV32XTHEADBB-NEXT: .LBB19_12:
+; RV32XTHEADBB-NEXT: sll t0, a2, a4
+; RV32XTHEADBB-NEXT: srl a0, a2, a6
+; RV32XTHEADBB-NEXT: sll a2, a3, a4
+; RV32XTHEADBB-NEXT: or a0, a0, a2
+; RV32XTHEADBB-NEXT: .LBB19_13:
+; RV32XTHEADBB-NEXT: or a2, a7, t2
+; RV32XTHEADBB-NEXT: or a1, t1, a1
+; RV32XTHEADBB-NEXT: beqz a5, .LBB19_15
+; RV32XTHEADBB-NEXT: # %bb.14:
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: .LBB19_15:
+; RV32XTHEADBB-NEXT: add a0, a2, t0
+; RV32XTHEADBB-NEXT: sltu a2, a0, t0
+; RV32XTHEADBB-NEXT: add a1, a1, a3
+; RV32XTHEADBB-NEXT: add a1, a1, a2
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_64_mask_shared:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 63
+; RV64XTHEADBB-NEXT: srl a4, a0, a2
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: sll a0, a0, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: sll a1, a1, a2
+; RV64XTHEADBB-NEXT: add a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i64 %amt, 63
+ %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %maskedamt)
+ %2 = shl i64 %b, %maskedamt
+ %3 = add i64 %1, %2
+ ret i64 %3
+}
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+
+define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind {
+; RV32I-LABEL: rotl_32_mask_multiple:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a3, a2, 31
+; RV32I-NEXT: sll a4, a0, a2
+; RV32I-NEXT: sll a2, a1, a2
+; RV32I-NEXT: neg a3, a3
+; RV32I-NEXT: srl a0, a0, a3
+; RV32I-NEXT: srl a1, a1, a3
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_32_mask_multiple:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 31
+; RV64I-NEXT: sllw a4, a0, a2
+; RV64I-NEXT: sllw a2, a1, a2
+; RV64I-NEXT: neg a5, a3
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: srlw a0, a0, a5
+; RV64I-NEXT: srlw a1, a1, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: addw a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_32_mask_multiple:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: rol a0, a0, a2
+; RV32ZBB-NEXT: rol a1, a1, a2
+; RV32ZBB-NEXT: add a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_32_mask_multiple:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rolw a0, a0, a2
+; RV64ZBB-NEXT: rolw a1, a1, a2
+; RV64ZBB-NEXT: addw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_32_mask_multiple:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a3, a2, 31
+; RV32XTHEADBB-NEXT: sll a4, a0, a2
+; RV32XTHEADBB-NEXT: sll a2, a1, a2
+; RV32XTHEADBB-NEXT: neg a3, a3
+; RV32XTHEADBB-NEXT: srl a0, a0, a3
+; RV32XTHEADBB-NEXT: srl a1, a1, a3
+; RV32XTHEADBB-NEXT: or a0, a4, a0
+; RV32XTHEADBB-NEXT: or a1, a2, a1
+; RV32XTHEADBB-NEXT: add a0, a0, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_32_mask_multiple:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 31
+; RV64XTHEADBB-NEXT: sllw a4, a0, a2
+; RV64XTHEADBB-NEXT: sllw a2, a1, a2
+; RV64XTHEADBB-NEXT: neg a5, a3
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: srlw a0, a0, a5
+; RV64XTHEADBB-NEXT: srlw a1, a1, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: or a1, a2, a1
+; RV64XTHEADBB-NEXT: addw a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i32 %amt, 31
+ %1 = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 %maskedamt)
+ %2 = tail call i32 @llvm.fshl.i32(i32 %b, i32 %b, i32 %maskedamt)
+ %3 = add i32 %1, %2
+ ret i32 %3
+}
+
+define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
+; RV32I-LABEL: rotl_64_mask_multiple:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi t1, a4, 63
+; RV32I-NEXT: li a5, 32
+; RV32I-NEXT: neg t3, t1
+; RV32I-NEXT: bltu t1, a5, .LBB21_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: sll t2, a0, t1
+; RV32I-NEXT: j .LBB21_3
+; RV32I-NEXT: .LBB21_2:
+; RV32I-NEXT: sll a6, a0, a4
+; RV32I-NEXT: srl a7, a0, t3
+; RV32I-NEXT: sll t0, a1, a4
+; RV32I-NEXT: or t2, a7, t0
+; RV32I-NEXT: .LBB21_3:
+; RV32I-NEXT: neg a7, t1
+; RV32I-NEXT: mv t0, a1
+; RV32I-NEXT: beqz t1, .LBB21_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv t0, t2
+; RV32I-NEXT: .LBB21_5:
+; RV32I-NEXT: andi t2, a7, 63
+; RV32I-NEXT: neg t4, t2
+; RV32I-NEXT: bltu t2, a5, .LBB21_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl t5, a1, t2
+; RV32I-NEXT: bnez t2, .LBB21_8
+; RV32I-NEXT: j .LBB21_9
+; RV32I-NEXT: .LBB21_7:
+; RV32I-NEXT: srl t5, a0, a7
+; RV32I-NEXT: sll t6, a1, t4
+; RV32I-NEXT: or t5, t5, t6
+; RV32I-NEXT: beqz t2, .LBB21_9
+; RV32I-NEXT: .LBB21_8:
+; RV32I-NEXT: mv a0, t5
+; RV32I-NEXT: .LBB21_9:
+; RV32I-NEXT: bltu t2, a5, .LBB21_12
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: bgeu t1, a5, .LBB21_13
+; RV32I-NEXT: .LBB21_11:
+; RV32I-NEXT: sll a1, a2, a4
+; RV32I-NEXT: srl t3, a2, t3
+; RV32I-NEXT: sll a4, a3, a4
+; RV32I-NEXT: or t3, t3, a4
+; RV32I-NEXT: mv a4, a3
+; RV32I-NEXT: bnez t1, .LBB21_14
+; RV32I-NEXT: j .LBB21_15
+; RV32I-NEXT: .LBB21_12:
+; RV32I-NEXT: srl t5, a1, a7
+; RV32I-NEXT: bltu t1, a5, .LBB21_11
+; RV32I-NEXT: .LBB21_13:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: sll t3, a2, t1
+; RV32I-NEXT: mv a4, a3
+; RV32I-NEXT: beqz t1, .LBB21_15
+; RV32I-NEXT: .LBB21_14:
+; RV32I-NEXT: mv a4, t3
+; RV32I-NEXT: .LBB21_15:
+; RV32I-NEXT: bltu t2, a5, .LBB21_17
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: srl t1, a3, t2
+; RV32I-NEXT: bnez t2, .LBB21_18
+; RV32I-NEXT: j .LBB21_19
+; RV32I-NEXT: .LBB21_17:
+; RV32I-NEXT: srl t1, a2, a7
+; RV32I-NEXT: sll t3, a3, t4
+; RV32I-NEXT: or t1, t1, t3
+; RV32I-NEXT: beqz t2, .LBB21_19
+; RV32I-NEXT: .LBB21_18:
+; RV32I-NEXT: mv a2, t1
+; RV32I-NEXT: .LBB21_19:
+; RV32I-NEXT: or a0, a6, a0
+; RV32I-NEXT: or a6, t0, t5
+; RV32I-NEXT: bltu t2, a5, .LBB21_21
+; RV32I-NEXT: # %bb.20:
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: j .LBB21_22
+; RV32I-NEXT: .LBB21_21:
+; RV32I-NEXT: srl a3, a3, a7
+; RV32I-NEXT: .LBB21_22:
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: sltu a1, a0, a1
+; RV32I-NEXT: add a3, a6, a3
+; RV32I-NEXT: add a1, a3, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_64_mask_multiple:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 63
+; RV64I-NEXT: sll a4, a0, a2
+; RV64I-NEXT: sll a2, a1, a2
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: srl a0, a0, a3
+; RV64I-NEXT: srl a1, a1, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_64_mask_multiple:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi t1, a4, 63
+; RV32ZBB-NEXT: li a5, 32
+; RV32ZBB-NEXT: neg t3, t1
+; RV32ZBB-NEXT: bltu t1, a5, .LBB21_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: li a6, 0
+; RV32ZBB-NEXT: sll t2, a0, t1
+; RV32ZBB-NEXT: j .LBB21_3
+; RV32ZBB-NEXT: .LBB21_2:
+; RV32ZBB-NEXT: sll a6, a0, a4
+; RV32ZBB-NEXT: srl a7, a0, t3
+; RV32ZBB-NEXT: sll t0, a1, a4
+; RV32ZBB-NEXT: or t2, a7, t0
+; RV32ZBB-NEXT: .LBB21_3:
+; RV32ZBB-NEXT: neg a7, t1
+; RV32ZBB-NEXT: mv t0, a1
+; RV32ZBB-NEXT: beqz t1, .LBB21_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: mv t0, t2
+; RV32ZBB-NEXT: .LBB21_5:
+; RV32ZBB-NEXT: andi t2, a7, 63
+; RV32ZBB-NEXT: neg t4, t2
+; RV32ZBB-NEXT: bltu t2, a5, .LBB21_7
+; RV32ZBB-NEXT: # %bb.6:
+; RV32ZBB-NEXT: srl t5, a1, t2
+; RV32ZBB-NEXT: bnez t2, .LBB21_8
+; RV32ZBB-NEXT: j .LBB21_9
+; RV32ZBB-NEXT: .LBB21_7:
+; RV32ZBB-NEXT: srl t5, a0, a7
+; RV32ZBB-NEXT: sll t6, a1, t4
+; RV32ZBB-NEXT: or t5, t5, t6
+; RV32ZBB-NEXT: beqz t2, .LBB21_9
+; RV32ZBB-NEXT: .LBB21_8:
+; RV32ZBB-NEXT: mv a0, t5
+; RV32ZBB-NEXT: .LBB21_9:
+; RV32ZBB-NEXT: bltu t2, a5, .LBB21_12
+; RV32ZBB-NEXT: # %bb.10:
+; RV32ZBB-NEXT: li t5, 0
+; RV32ZBB-NEXT: bgeu t1, a5, .LBB21_13
+; RV32ZBB-NEXT: .LBB21_11:
+; RV32ZBB-NEXT: sll a1, a2, a4
+; RV32ZBB-NEXT: srl t3, a2, t3
+; RV32ZBB-NEXT: sll a4, a3, a4
+; RV32ZBB-NEXT: or t3, t3, a4
+; RV32ZBB-NEXT: mv a4, a3
+; RV32ZBB-NEXT: bnez t1, .LBB21_14
+; RV32ZBB-NEXT: j .LBB21_15
+; RV32ZBB-NEXT: .LBB21_12:
+; RV32ZBB-NEXT: srl t5, a1, a7
+; RV32ZBB-NEXT: bltu t1, a5, .LBB21_11
+; RV32ZBB-NEXT: .LBB21_13:
+; RV32ZBB-NEXT: li a1, 0
+; RV32ZBB-NEXT: sll t3, a2, t1
+; RV32ZBB-NEXT: mv a4, a3
+; RV32ZBB-NEXT: beqz t1, .LBB21_15
+; RV32ZBB-NEXT: .LBB21_14:
+; RV32ZBB-NEXT: mv a4, t3
+; RV32ZBB-NEXT: .LBB21_15:
+; RV32ZBB-NEXT: bltu t2, a5, .LBB21_17
+; RV32ZBB-NEXT: # %bb.16:
+; RV32ZBB-NEXT: srl t1, a3, t2
+; RV32ZBB-NEXT: bnez t2, .LBB21_18
+; RV32ZBB-NEXT: j .LBB21_19
+; RV32ZBB-NEXT: .LBB21_17:
+; RV32ZBB-NEXT: srl t1, a2, a7
+; RV32ZBB-NEXT: sll t3, a3, t4
+; RV32ZBB-NEXT: or t1, t1, t3
+; RV32ZBB-NEXT: beqz t2, .LBB21_19
+; RV32ZBB-NEXT: .LBB21_18:
+; RV32ZBB-NEXT: mv a2, t1
+; RV32ZBB-NEXT: .LBB21_19:
+; RV32ZBB-NEXT: or a0, a6, a0
+; RV32ZBB-NEXT: or a6, t0, t5
+; RV32ZBB-NEXT: bltu t2, a5, .LBB21_21
+; RV32ZBB-NEXT: # %bb.20:
+; RV32ZBB-NEXT: li a3, 0
+; RV32ZBB-NEXT: j .LBB21_22
+; RV32ZBB-NEXT: .LBB21_21:
+; RV32ZBB-NEXT: srl a3, a3, a7
+; RV32ZBB-NEXT: .LBB21_22:
+; RV32ZBB-NEXT: or a1, a1, a2
+; RV32ZBB-NEXT: or a3, a4, a3
+; RV32ZBB-NEXT: add a0, a0, a1
+; RV32ZBB-NEXT: sltu a1, a0, a1
+; RV32ZBB-NEXT: add a3, a6, a3
+; RV32ZBB-NEXT: add a1, a3, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_64_mask_multiple:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rol a0, a0, a2
+; RV64ZBB-NEXT: rol a1, a1, a2
+; RV64ZBB-NEXT: add a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_64_mask_multiple:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi t1, a4, 63
+; RV32XTHEADBB-NEXT: li a5, 32
+; RV32XTHEADBB-NEXT: neg t3, t1
+; RV32XTHEADBB-NEXT: bltu t1, a5, .LBB21_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: li a6, 0
+; RV32XTHEADBB-NEXT: sll t2, a0, t1
+; RV32XTHEADBB-NEXT: j .LBB21_3
+; RV32XTHEADBB-NEXT: .LBB21_2:
+; RV32XTHEADBB-NEXT: sll a6, a0, a4
+; RV32XTHEADBB-NEXT: srl a7, a0, t3
+; RV32XTHEADBB-NEXT: sll t0, a1, a4
+; RV32XTHEADBB-NEXT: or t2, a7, t0
+; RV32XTHEADBB-NEXT: .LBB21_3:
+; RV32XTHEADBB-NEXT: neg a7, t1
+; RV32XTHEADBB-NEXT: mv t0, a1
+; RV32XTHEADBB-NEXT: beqz t1, .LBB21_5
+; RV32XTHEADBB-NEXT: # %bb.4:
+; RV32XTHEADBB-NEXT: mv t0, t2
+; RV32XTHEADBB-NEXT: .LBB21_5:
+; RV32XTHEADBB-NEXT: andi t2, a7, 63
+; RV32XTHEADBB-NEXT: neg t4, t2
+; RV32XTHEADBB-NEXT: bltu t2, a5, .LBB21_7
+; RV32XTHEADBB-NEXT: # %bb.6:
+; RV32XTHEADBB-NEXT: srl t5, a1, t2
+; RV32XTHEADBB-NEXT: bnez t2, .LBB21_8
+; RV32XTHEADBB-NEXT: j .LBB21_9
+; RV32XTHEADBB-NEXT: .LBB21_7:
+; RV32XTHEADBB-NEXT: srl t5, a0, a7
+; RV32XTHEADBB-NEXT: sll t6, a1, t4
+; RV32XTHEADBB-NEXT: or t5, t5, t6
+; RV32XTHEADBB-NEXT: beqz t2, .LBB21_9
+; RV32XTHEADBB-NEXT: .LBB21_8:
+; RV32XTHEADBB-NEXT: mv a0, t5
+; RV32XTHEADBB-NEXT: .LBB21_9:
+; RV32XTHEADBB-NEXT: bltu t2, a5, .LBB21_12
+; RV32XTHEADBB-NEXT: # %bb.10:
+; RV32XTHEADBB-NEXT: li t5, 0
+; RV32XTHEADBB-NEXT: bgeu t1, a5, .LBB21_13
+; RV32XTHEADBB-NEXT: .LBB21_11:
+; RV32XTHEADBB-NEXT: sll a1, a2, a4
+; RV32XTHEADBB-NEXT: srl t3, a2, t3
+; RV32XTHEADBB-NEXT: sll a4, a3, a4
+; RV32XTHEADBB-NEXT: or t3, t3, a4
+; RV32XTHEADBB-NEXT: mv a4, a3
+; RV32XTHEADBB-NEXT: bnez t1, .LBB21_14
+; RV32XTHEADBB-NEXT: j .LBB21_15
+; RV32XTHEADBB-NEXT: .LBB21_12:
+; RV32XTHEADBB-NEXT: srl t5, a1, a7
+; RV32XTHEADBB-NEXT: bltu t1, a5, .LBB21_11
+; RV32XTHEADBB-NEXT: .LBB21_13:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: sll t3, a2, t1
+; RV32XTHEADBB-NEXT: mv a4, a3
+; RV32XTHEADBB-NEXT: beqz t1, .LBB21_15
+; RV32XTHEADBB-NEXT: .LBB21_14:
+; RV32XTHEADBB-NEXT: mv a4, t3
+; RV32XTHEADBB-NEXT: .LBB21_15:
+; RV32XTHEADBB-NEXT: bltu t2, a5, .LBB21_17
+; RV32XTHEADBB-NEXT: # %bb.16:
+; RV32XTHEADBB-NEXT: srl t1, a3, t2
+; RV32XTHEADBB-NEXT: bnez t2, .LBB21_18
+; RV32XTHEADBB-NEXT: j .LBB21_19
+; RV32XTHEADBB-NEXT: .LBB21_17:
+; RV32XTHEADBB-NEXT: srl t1, a2, a7
+; RV32XTHEADBB-NEXT: sll t3, a3, t4
+; RV32XTHEADBB-NEXT: or t1, t1, t3
+; RV32XTHEADBB-NEXT: beqz t2, .LBB21_19
+; RV32XTHEADBB-NEXT: .LBB21_18:
+; RV32XTHEADBB-NEXT: mv a2, t1
+; RV32XTHEADBB-NEXT: .LBB21_19:
+; RV32XTHEADBB-NEXT: or a0, a6, a0
+; RV32XTHEADBB-NEXT: or a6, t0, t5
+; RV32XTHEADBB-NEXT: bltu t2, a5, .LBB21_21
+; RV32XTHEADBB-NEXT: # %bb.20:
+; RV32XTHEADBB-NEXT: li a3, 0
+; RV32XTHEADBB-NEXT: j .LBB21_22
+; RV32XTHEADBB-NEXT: .LBB21_21:
+; RV32XTHEADBB-NEXT: srl a3, a3, a7
+; RV32XTHEADBB-NEXT: .LBB21_22:
+; RV32XTHEADBB-NEXT: or a1, a1, a2
+; RV32XTHEADBB-NEXT: or a3, a4, a3
+; RV32XTHEADBB-NEXT: add a0, a0, a1
+; RV32XTHEADBB-NEXT: sltu a1, a0, a1
+; RV32XTHEADBB-NEXT: add a3, a6, a3
+; RV32XTHEADBB-NEXT: add a1, a3, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_64_mask_multiple:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 63
+; RV64XTHEADBB-NEXT: sll a4, a0, a2
+; RV64XTHEADBB-NEXT: sll a2, a1, a2
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: srl a0, a0, a3
+; RV64XTHEADBB-NEXT: srl a1, a1, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: or a1, a2, a1
+; RV64XTHEADBB-NEXT: add a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i64 %amt, 63
+ %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %maskedamt)
+ %2 = tail call i64 @llvm.fshl.i64(i64 %b, i64 %b, i64 %maskedamt)
+ %3 = add i64 %1, %2
+ ret i64 %3
+}
+
+define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind {
+; RV32I-LABEL: rotr_32_mask_multiple:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a3, a2, 31
+; RV32I-NEXT: srl a4, a0, a2
+; RV32I-NEXT: srl a2, a1, a2
+; RV32I-NEXT: neg a3, a3
+; RV32I-NEXT: sll a0, a0, a3
+; RV32I-NEXT: sll a1, a1, a3
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_32_mask_multiple:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 31
+; RV64I-NEXT: srlw a4, a0, a2
+; RV64I-NEXT: srlw a2, a1, a2
+; RV64I-NEXT: neg a5, a3
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: sllw a0, a0, a5
+; RV64I-NEXT: sllw a1, a1, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: addw a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_32_mask_multiple:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: ror a0, a0, a2
+; RV32ZBB-NEXT: ror a1, a1, a2
+; RV32ZBB-NEXT: add a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_32_mask_multiple:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rorw a0, a0, a2
+; RV64ZBB-NEXT: rorw a1, a1, a2
+; RV64ZBB-NEXT: addw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_32_mask_multiple:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a3, a2, 31
+; RV32XTHEADBB-NEXT: srl a4, a0, a2
+; RV32XTHEADBB-NEXT: srl a2, a1, a2
+; RV32XTHEADBB-NEXT: neg a3, a3
+; RV32XTHEADBB-NEXT: sll a0, a0, a3
+; RV32XTHEADBB-NEXT: sll a1, a1, a3
+; RV32XTHEADBB-NEXT: or a0, a4, a0
+; RV32XTHEADBB-NEXT: or a1, a2, a1
+; RV32XTHEADBB-NEXT: add a0, a0, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_32_mask_multiple:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 31
+; RV64XTHEADBB-NEXT: srlw a4, a0, a2
+; RV64XTHEADBB-NEXT: srlw a2, a1, a2
+; RV64XTHEADBB-NEXT: neg a5, a3
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: sllw a0, a0, a5
+; RV64XTHEADBB-NEXT: sllw a1, a1, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: or a1, a2, a1
+; RV64XTHEADBB-NEXT: addw a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i32 %amt, 31
+ %1 = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 %maskedamt)
+ %2 = tail call i32 @llvm.fshr.i32(i32 %b, i32 %b, i32 %maskedamt)
+ %3 = add i32 %1, %2
+ ret i32 %3
+}
+
+define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
+; RV32I-LABEL: rotr_64_mask_multiple:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi t0, a4, 63
+; RV32I-NEXT: li a6, 32
+; RV32I-NEXT: neg t4, t0
+; RV32I-NEXT: bltu t0, a6, .LBB23_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a7, a1, t0
+; RV32I-NEXT: mv a5, a0
+; RV32I-NEXT: bnez t0, .LBB23_3
+; RV32I-NEXT: j .LBB23_4
+; RV32I-NEXT: .LBB23_2:
+; RV32I-NEXT: srl a5, a0, a4
+; RV32I-NEXT: sll a7, a1, t4
+; RV32I-NEXT: or a7, a5, a7
+; RV32I-NEXT: mv a5, a0
+; RV32I-NEXT: beqz t0, .LBB23_4
+; RV32I-NEXT: .LBB23_3:
+; RV32I-NEXT: mv a5, a7
+; RV32I-NEXT: .LBB23_4:
+; RV32I-NEXT: neg t2, t0
+; RV32I-NEXT: bltu t0, a6, .LBB23_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: j .LBB23_7
+; RV32I-NEXT: .LBB23_6:
+; RV32I-NEXT: srl a7, a1, a4
+; RV32I-NEXT: .LBB23_7:
+; RV32I-NEXT: andi t1, t2, 63
+; RV32I-NEXT: neg t5, t1
+; RV32I-NEXT: bltu t1, a6, .LBB23_9
+; RV32I-NEXT: # %bb.8:
+; RV32I-NEXT: li t3, 0
+; RV32I-NEXT: sll a0, a0, t1
+; RV32I-NEXT: bnez t1, .LBB23_10
+; RV32I-NEXT: j .LBB23_11
+; RV32I-NEXT: .LBB23_9:
+; RV32I-NEXT: sll t3, a0, t2
+; RV32I-NEXT: srl a0, a0, t5
+; RV32I-NEXT: sll t6, a1, t2
+; RV32I-NEXT: or a0, a0, t6
+; RV32I-NEXT: beqz t1, .LBB23_11
+; RV32I-NEXT: .LBB23_10:
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB23_11:
+; RV32I-NEXT: bltu t0, a6, .LBB23_13
+; RV32I-NEXT: # %bb.12:
+; RV32I-NEXT: srl t4, a3, t0
+; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: bnez t0, .LBB23_14
+; RV32I-NEXT: j .LBB23_15
+; RV32I-NEXT: .LBB23_13:
+; RV32I-NEXT: srl a0, a2, a4
+; RV32I-NEXT: sll t4, a3, t4
+; RV32I-NEXT: or t4, a0, t4
+; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: beqz t0, .LBB23_15
+; RV32I-NEXT: .LBB23_14:
+; RV32I-NEXT: mv a0, t4
+; RV32I-NEXT: .LBB23_15:
+; RV32I-NEXT: bltu t0, a6, .LBB23_18
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: bgeu t1, a6, .LBB23_19
+; RV32I-NEXT: .LBB23_17:
+; RV32I-NEXT: sll a6, a2, t2
+; RV32I-NEXT: srl a2, a2, t5
+; RV32I-NEXT: sll t0, a3, t2
+; RV32I-NEXT: or a2, a2, t0
+; RV32I-NEXT: j .LBB23_20
+; RV32I-NEXT: .LBB23_18:
+; RV32I-NEXT: srl a4, a3, a4
+; RV32I-NEXT: bltu t1, a6, .LBB23_17
+; RV32I-NEXT: .LBB23_19:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: sll a2, a2, t1
+; RV32I-NEXT: .LBB23_20:
+; RV32I-NEXT: or a5, a5, t3
+; RV32I-NEXT: or a1, a7, a1
+; RV32I-NEXT: beqz t1, .LBB23_22
+; RV32I-NEXT: # %bb.21:
+; RV32I-NEXT: mv a3, a2
+; RV32I-NEXT: .LBB23_22:
+; RV32I-NEXT: or a2, a0, a6
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: add a0, a5, a2
+; RV32I-NEXT: sltu a2, a0, a2
+; RV32I-NEXT: add a1, a1, a3
+; RV32I-NEXT: add a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_64_mask_multiple:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 63
+; RV64I-NEXT: srl a4, a0, a2
+; RV64I-NEXT: srl a2, a1, a2
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: sll a0, a0, a3
+; RV64I-NEXT: sll a1, a1, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_64_mask_multiple:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi t0, a4, 63
+; RV32ZBB-NEXT: li a6, 32
+; RV32ZBB-NEXT: neg t4, t0
+; RV32ZBB-NEXT: bltu t0, a6, .LBB23_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl a7, a1, t0
+; RV32ZBB-NEXT: mv a5, a0
+; RV32ZBB-NEXT: bnez t0, .LBB23_3
+; RV32ZBB-NEXT: j .LBB23_4
+; RV32ZBB-NEXT: .LBB23_2:
+; RV32ZBB-NEXT: srl a5, a0, a4
+; RV32ZBB-NEXT: sll a7, a1, t4
+; RV32ZBB-NEXT: or a7, a5, a7
+; RV32ZBB-NEXT: mv a5, a0
+; RV32ZBB-NEXT: beqz t0, .LBB23_4
+; RV32ZBB-NEXT: .LBB23_3:
+; RV32ZBB-NEXT: mv a5, a7
+; RV32ZBB-NEXT: .LBB23_4:
+; RV32ZBB-NEXT: neg t2, t0
+; RV32ZBB-NEXT: bltu t0, a6, .LBB23_6
+; RV32ZBB-NEXT: # %bb.5:
+; RV32ZBB-NEXT: li a7, 0
+; RV32ZBB-NEXT: j .LBB23_7
+; RV32ZBB-NEXT: .LBB23_6:
+; RV32ZBB-NEXT: srl a7, a1, a4
+; RV32ZBB-NEXT: .LBB23_7:
+; RV32ZBB-NEXT: andi t1, t2, 63
+; RV32ZBB-NEXT: neg t5, t1
+; RV32ZBB-NEXT: bltu t1, a6, .LBB23_9
+; RV32ZBB-NEXT: # %bb.8:
+; RV32ZBB-NEXT: li t3, 0
+; RV32ZBB-NEXT: sll a0, a0, t1
+; RV32ZBB-NEXT: bnez t1, .LBB23_10
+; RV32ZBB-NEXT: j .LBB23_11
+; RV32ZBB-NEXT: .LBB23_9:
+; RV32ZBB-NEXT: sll t3, a0, t2
+; RV32ZBB-NEXT: srl a0, a0, t5
+; RV32ZBB-NEXT: sll t6, a1, t2
+; RV32ZBB-NEXT: or a0, a0, t6
+; RV32ZBB-NEXT: beqz t1, .LBB23_11
+; RV32ZBB-NEXT: .LBB23_10:
+; RV32ZBB-NEXT: mv a1, a0
+; RV32ZBB-NEXT: .LBB23_11:
+; RV32ZBB-NEXT: bltu t0, a6, .LBB23_13
+; RV32ZBB-NEXT: # %bb.12:
+; RV32ZBB-NEXT: srl t4, a3, t0
+; RV32ZBB-NEXT: mv a0, a2
+; RV32ZBB-NEXT: bnez t0, .LBB23_14
+; RV32ZBB-NEXT: j .LBB23_15
+; RV32ZBB-NEXT: .LBB23_13:
+; RV32ZBB-NEXT: srl a0, a2, a4
+; RV32ZBB-NEXT: sll t4, a3, t4
+; RV32ZBB-NEXT: or t4, a0, t4
+; RV32ZBB-NEXT: mv a0, a2
+; RV32ZBB-NEXT: beqz t0, .LBB23_15
+; RV32ZBB-NEXT: .LBB23_14:
+; RV32ZBB-NEXT: mv a0, t4
+; RV32ZBB-NEXT: .LBB23_15:
+; RV32ZBB-NEXT: bltu t0, a6, .LBB23_18
+; RV32ZBB-NEXT: # %bb.16:
+; RV32ZBB-NEXT: li a4, 0
+; RV32ZBB-NEXT: bgeu t1, a6, .LBB23_19
+; RV32ZBB-NEXT: .LBB23_17:
+; RV32ZBB-NEXT: sll a6, a2, t2
+; RV32ZBB-NEXT: srl a2, a2, t5
+; RV32ZBB-NEXT: sll t0, a3, t2
+; RV32ZBB-NEXT: or a2, a2, t0
+; RV32ZBB-NEXT: j .LBB23_20
+; RV32ZBB-NEXT: .LBB23_18:
+; RV32ZBB-NEXT: srl a4, a3, a4
+; RV32ZBB-NEXT: bltu t1, a6, .LBB23_17
+; RV32ZBB-NEXT: .LBB23_19:
+; RV32ZBB-NEXT: li a6, 0
+; RV32ZBB-NEXT: sll a2, a2, t1
+; RV32ZBB-NEXT: .LBB23_20:
+; RV32ZBB-NEXT: or a5, a5, t3
+; RV32ZBB-NEXT: or a1, a7, a1
+; RV32ZBB-NEXT: beqz t1, .LBB23_22
+; RV32ZBB-NEXT: # %bb.21:
+; RV32ZBB-NEXT: mv a3, a2
+; RV32ZBB-NEXT: .LBB23_22:
+; RV32ZBB-NEXT: or a2, a0, a6
+; RV32ZBB-NEXT: or a3, a4, a3
+; RV32ZBB-NEXT: add a0, a5, a2
+; RV32ZBB-NEXT: sltu a2, a0, a2
+; RV32ZBB-NEXT: add a1, a1, a3
+; RV32ZBB-NEXT: add a1, a1, a2
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_64_mask_multiple:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: ror a0, a0, a2
+; RV64ZBB-NEXT: ror a1, a1, a2
+; RV64ZBB-NEXT: add a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_64_mask_multiple:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi t0, a4, 63
+; RV32XTHEADBB-NEXT: li a6, 32
+; RV32XTHEADBB-NEXT: neg t4, t0
+; RV32XTHEADBB-NEXT: bltu t0, a6, .LBB23_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: srl a7, a1, t0
+; RV32XTHEADBB-NEXT: mv a5, a0
+; RV32XTHEADBB-NEXT: bnez t0, .LBB23_3
+; RV32XTHEADBB-NEXT: j .LBB23_4
+; RV32XTHEADBB-NEXT: .LBB23_2:
+; RV32XTHEADBB-NEXT: srl a5, a0, a4
+; RV32XTHEADBB-NEXT: sll a7, a1, t4
+; RV32XTHEADBB-NEXT: or a7, a5, a7
+; RV32XTHEADBB-NEXT: mv a5, a0
+; RV32XTHEADBB-NEXT: beqz t0, .LBB23_4
+; RV32XTHEADBB-NEXT: .LBB23_3:
+; RV32XTHEADBB-NEXT: mv a5, a7
+; RV32XTHEADBB-NEXT: .LBB23_4:
+; RV32XTHEADBB-NEXT: neg t2, t0
+; RV32XTHEADBB-NEXT: bltu t0, a6, .LBB23_6
+; RV32XTHEADBB-NEXT: # %bb.5:
+; RV32XTHEADBB-NEXT: li a7, 0
+; RV32XTHEADBB-NEXT: j .LBB23_7
+; RV32XTHEADBB-NEXT: .LBB23_6:
+; RV32XTHEADBB-NEXT: srl a7, a1, a4
+; RV32XTHEADBB-NEXT: .LBB23_7:
+; RV32XTHEADBB-NEXT: andi t1, t2, 63
+; RV32XTHEADBB-NEXT: neg t5, t1
+; RV32XTHEADBB-NEXT: bltu t1, a6, .LBB23_9
+; RV32XTHEADBB-NEXT: # %bb.8:
+; RV32XTHEADBB-NEXT: li t3, 0
+; RV32XTHEADBB-NEXT: sll a0, a0, t1
+; RV32XTHEADBB-NEXT: bnez t1, .LBB23_10
+; RV32XTHEADBB-NEXT: j .LBB23_11
+; RV32XTHEADBB-NEXT: .LBB23_9:
+; RV32XTHEADBB-NEXT: sll t3, a0, t2
+; RV32XTHEADBB-NEXT: srl a0, a0, t5
+; RV32XTHEADBB-NEXT: sll t6, a1, t2
+; RV32XTHEADBB-NEXT: or a0, a0, t6
+; RV32XTHEADBB-NEXT: beqz t1, .LBB23_11
+; RV32XTHEADBB-NEXT: .LBB23_10:
+; RV32XTHEADBB-NEXT: mv a1, a0
+; RV32XTHEADBB-NEXT: .LBB23_11:
+; RV32XTHEADBB-NEXT: bltu t0, a6, .LBB23_13
+; RV32XTHEADBB-NEXT: # %bb.12:
+; RV32XTHEADBB-NEXT: srl t4, a3, t0
+; RV32XTHEADBB-NEXT: mv a0, a2
+; RV32XTHEADBB-NEXT: bnez t0, .LBB23_14
+; RV32XTHEADBB-NEXT: j .LBB23_15
+; RV32XTHEADBB-NEXT: .LBB23_13:
+; RV32XTHEADBB-NEXT: srl a0, a2, a4
+; RV32XTHEADBB-NEXT: sll t4, a3, t4
+; RV32XTHEADBB-NEXT: or t4, a0, t4
+; RV32XTHEADBB-NEXT: mv a0, a2
+; RV32XTHEADBB-NEXT: beqz t0, .LBB23_15
+; RV32XTHEADBB-NEXT: .LBB23_14:
+; RV32XTHEADBB-NEXT: mv a0, t4
+; RV32XTHEADBB-NEXT: .LBB23_15:
+; RV32XTHEADBB-NEXT: bltu t0, a6, .LBB23_18
+; RV32XTHEADBB-NEXT: # %bb.16:
+; RV32XTHEADBB-NEXT: li a4, 0
+; RV32XTHEADBB-NEXT: bgeu t1, a6, .LBB23_19
+; RV32XTHEADBB-NEXT: .LBB23_17:
+; RV32XTHEADBB-NEXT: sll a6, a2, t2
+; RV32XTHEADBB-NEXT: srl a2, a2, t5
+; RV32XTHEADBB-NEXT: sll t0, a3, t2
+; RV32XTHEADBB-NEXT: or a2, a2, t0
+; RV32XTHEADBB-NEXT: j .LBB23_20
+; RV32XTHEADBB-NEXT: .LBB23_18:
+; RV32XTHEADBB-NEXT: srl a4, a3, a4
+; RV32XTHEADBB-NEXT: bltu t1, a6, .LBB23_17
+; RV32XTHEADBB-NEXT: .LBB23_19:
+; RV32XTHEADBB-NEXT: li a6, 0
+; RV32XTHEADBB-NEXT: sll a2, a2, t1
+; RV32XTHEADBB-NEXT: .LBB23_20:
+; RV32XTHEADBB-NEXT: or a5, a5, t3
+; RV32XTHEADBB-NEXT: or a1, a7, a1
+; RV32XTHEADBB-NEXT: beqz t1, .LBB23_22
+; RV32XTHEADBB-NEXT: # %bb.21:
+; RV32XTHEADBB-NEXT: mv a3, a2
+; RV32XTHEADBB-NEXT: .LBB23_22:
+; RV32XTHEADBB-NEXT: or a2, a0, a6
+; RV32XTHEADBB-NEXT: or a3, a4, a3
+; RV32XTHEADBB-NEXT: add a0, a5, a2
+; RV32XTHEADBB-NEXT: sltu a2, a0, a2
+; RV32XTHEADBB-NEXT: add a1, a1, a3
+; RV32XTHEADBB-NEXT: add a1, a1, a2
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_64_mask_multiple:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 63
+; RV64XTHEADBB-NEXT: srl a4, a0, a2
+; RV64XTHEADBB-NEXT: srl a2, a1, a2
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: sll a0, a0, a3
+; RV64XTHEADBB-NEXT: sll a1, a1, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: or a1, a2, a1
+; RV64XTHEADBB-NEXT: add a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i64 %amt, 63
+ %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %maskedamt)
+ %2 = tail call i64 @llvm.fshr.i64(i64 %b, i64 %b, i64 %maskedamt)
+ %3 = add i64 %1, %2
+ ret i64 %3
+}
+
+define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotl_64_zext:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a6, 64
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: srl a7, a0, a5
+; RV32I-NEXT: bltu a2, a4, .LBB24_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: sll t1, a0, a2
+; RV32I-NEXT: j .LBB24_3
+; RV32I-NEXT: .LBB24_2:
+; RV32I-NEXT: sll a3, a0, a2
+; RV32I-NEXT: sll t0, a1, a2
+; RV32I-NEXT: or t1, a7, t0
+; RV32I-NEXT: .LBB24_3:
+; RV32I-NEXT: sub t0, a6, a2
+; RV32I-NEXT: mv a6, a1
+; RV32I-NEXT: beqz a2, .LBB24_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: .LBB24_5:
+; RV32I-NEXT: bltu t0, a4, .LBB24_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl a2, a1, t0
+; RV32I-NEXT: bnez t0, .LBB24_8
+; RV32I-NEXT: j .LBB24_9
+; RV32I-NEXT: .LBB24_7:
+; RV32I-NEXT: neg a2, t0
+; RV32I-NEXT: sll a2, a1, a2
+; RV32I-NEXT: or a2, a7, a2
+; RV32I-NEXT: beqz t0, .LBB24_9
+; RV32I-NEXT: .LBB24_8:
+; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: .LBB24_9:
+; RV32I-NEXT: bltu t0, a4, .LBB24_11
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB24_12
+; RV32I-NEXT: .LBB24_11:
+; RV32I-NEXT: srl a1, a1, a5
+; RV32I-NEXT: .LBB24_12:
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: or a1, a6, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_64_zext:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 64
+; RV64I-NEXT: sub a2, a2, a1
+; RV64I-NEXT: sll a1, a0, a1
+; RV64I-NEXT: srl a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_64_zext:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: li a6, 64
+; RV32ZBB-NEXT: li a4, 32
+; RV32ZBB-NEXT: neg a5, a2
+; RV32ZBB-NEXT: srl a7, a0, a5
+; RV32ZBB-NEXT: bltu a2, a4, .LBB24_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: li a3, 0
+; RV32ZBB-NEXT: sll t1, a0, a2
+; RV32ZBB-NEXT: j .LBB24_3
+; RV32ZBB-NEXT: .LBB24_2:
+; RV32ZBB-NEXT: sll a3, a0, a2
+; RV32ZBB-NEXT: sll t0, a1, a2
+; RV32ZBB-NEXT: or t1, a7, t0
+; RV32ZBB-NEXT: .LBB24_3:
+; RV32ZBB-NEXT: sub t0, a6, a2
+; RV32ZBB-NEXT: mv a6, a1
+; RV32ZBB-NEXT: beqz a2, .LBB24_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: mv a6, t1
+; RV32ZBB-NEXT: .LBB24_5:
+; RV32ZBB-NEXT: bltu t0, a4, .LBB24_7
+; RV32ZBB-NEXT: # %bb.6:
+; RV32ZBB-NEXT: srl a2, a1, t0
+; RV32ZBB-NEXT: bnez t0, .LBB24_8
+; RV32ZBB-NEXT: j .LBB24_9
+; RV32ZBB-NEXT: .LBB24_7:
+; RV32ZBB-NEXT: neg a2, t0
+; RV32ZBB-NEXT: sll a2, a1, a2
+; RV32ZBB-NEXT: or a2, a7, a2
+; RV32ZBB-NEXT: beqz t0, .LBB24_9
+; RV32ZBB-NEXT: .LBB24_8:
+; RV32ZBB-NEXT: mv a0, a2
+; RV32ZBB-NEXT: .LBB24_9:
+; RV32ZBB-NEXT: bltu t0, a4, .LBB24_11
+; RV32ZBB-NEXT: # %bb.10:
+; RV32ZBB-NEXT: li a1, 0
+; RV32ZBB-NEXT: j .LBB24_12
+; RV32ZBB-NEXT: .LBB24_11:
+; RV32ZBB-NEXT: srl a1, a1, a5
+; RV32ZBB-NEXT: .LBB24_12:
+; RV32ZBB-NEXT: or a0, a3, a0
+; RV32ZBB-NEXT: or a1, a6, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_64_zext:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: li a2, 64
+; RV64ZBB-NEXT: sub a2, a2, a1
+; RV64ZBB-NEXT: sll a1, a0, a1
+; RV64ZBB-NEXT: srl a0, a0, a2
+; RV64ZBB-NEXT: or a0, a1, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_64_zext:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: li a6, 64
+; RV32XTHEADBB-NEXT: li a4, 32
+; RV32XTHEADBB-NEXT: neg a5, a2
+; RV32XTHEADBB-NEXT: srl a7, a0, a5
+; RV32XTHEADBB-NEXT: bltu a2, a4, .LBB24_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: li a3, 0
+; RV32XTHEADBB-NEXT: sll t1, a0, a2
+; RV32XTHEADBB-NEXT: j .LBB24_3
+; RV32XTHEADBB-NEXT: .LBB24_2:
+; RV32XTHEADBB-NEXT: sll a3, a0, a2
+; RV32XTHEADBB-NEXT: sll t0, a1, a2
+; RV32XTHEADBB-NEXT: or t1, a7, t0
+; RV32XTHEADBB-NEXT: .LBB24_3:
+; RV32XTHEADBB-NEXT: sub t0, a6, a2
+; RV32XTHEADBB-NEXT: mv a6, a1
+; RV32XTHEADBB-NEXT: beqz a2, .LBB24_5
+; RV32XTHEADBB-NEXT: # %bb.4:
+; RV32XTHEADBB-NEXT: mv a6, t1
+; RV32XTHEADBB-NEXT: .LBB24_5:
+; RV32XTHEADBB-NEXT: bltu t0, a4, .LBB24_7
+; RV32XTHEADBB-NEXT: # %bb.6:
+; RV32XTHEADBB-NEXT: srl a2, a1, t0
+; RV32XTHEADBB-NEXT: bnez t0, .LBB24_8
+; RV32XTHEADBB-NEXT: j .LBB24_9
+; RV32XTHEADBB-NEXT: .LBB24_7:
+; RV32XTHEADBB-NEXT: neg a2, t0
+; RV32XTHEADBB-NEXT: sll a2, a1, a2
+; RV32XTHEADBB-NEXT: or a2, a7, a2
+; RV32XTHEADBB-NEXT: beqz t0, .LBB24_9
+; RV32XTHEADBB-NEXT: .LBB24_8:
+; RV32XTHEADBB-NEXT: mv a0, a2
+; RV32XTHEADBB-NEXT: .LBB24_9:
+; RV32XTHEADBB-NEXT: bltu t0, a4, .LBB24_11
+; RV32XTHEADBB-NEXT: # %bb.10:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: j .LBB24_12
+; RV32XTHEADBB-NEXT: .LBB24_11:
+; RV32XTHEADBB-NEXT: srl a1, a1, a5
+; RV32XTHEADBB-NEXT: .LBB24_12:
+; RV32XTHEADBB-NEXT: or a0, a3, a0
+; RV32XTHEADBB-NEXT: or a1, a6, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_64_zext:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: li a2, 64
+; RV64XTHEADBB-NEXT: sub a2, a2, a1
+; RV64XTHEADBB-NEXT: sll a1, a0, a1
+; RV64XTHEADBB-NEXT: srl a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i32 64, %y
+ %zext = zext i32 %z to i64
+ %zexty = zext i32 %y to i64
+ %b = shl i64 %x, %zexty
+ %c = lshr i64 %x, %zext
+ %d = or i64 %b, %c
+ ret i64 %d
+}
+
+define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotr_64_zext:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a5, 32
+; RV32I-NEXT: neg a6, a2
+; RV32I-NEXT: sll a4, a1, a6
+; RV32I-NEXT: bltu a2, a5, .LBB25_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a7, a1, a2
+; RV32I-NEXT: j .LBB25_3
+; RV32I-NEXT: .LBB25_2:
+; RV32I-NEXT: srl a3, a0, a2
+; RV32I-NEXT: or a7, a3, a4
+; RV32I-NEXT: .LBB25_3:
+; RV32I-NEXT: li t0, 64
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: beqz a2, .LBB25_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, a7
+; RV32I-NEXT: .LBB25_5:
+; RV32I-NEXT: sub a7, t0, a2
+; RV32I-NEXT: bltu a2, a5, .LBB25_8
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li a2, 0
+; RV32I-NEXT: bgeu a7, a5, .LBB25_9
+; RV32I-NEXT: .LBB25_7:
+; RV32I-NEXT: sll a5, a0, a6
+; RV32I-NEXT: neg a6, a7
+; RV32I-NEXT: srl a0, a0, a6
+; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: bnez a7, .LBB25_10
+; RV32I-NEXT: j .LBB25_11
+; RV32I-NEXT: .LBB25_8:
+; RV32I-NEXT: srl a2, a1, a2
+; RV32I-NEXT: bltu a7, a5, .LBB25_7
+; RV32I-NEXT: .LBB25_9:
+; RV32I-NEXT: li a5, 0
+; RV32I-NEXT: sll a0, a0, a7
+; RV32I-NEXT: beqz a7, .LBB25_11
+; RV32I-NEXT: .LBB25_10:
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB25_11:
+; RV32I-NEXT: or a0, a3, a5
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_64_zext:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 64
+; RV64I-NEXT: sub a2, a2, a1
+; RV64I-NEXT: srl a1, a0, a1
+; RV64I-NEXT: sll a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_64_zext:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: li a5, 32
+; RV32ZBB-NEXT: neg a6, a2
+; RV32ZBB-NEXT: sll a4, a1, a6
+; RV32ZBB-NEXT: bltu a2, a5, .LBB25_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl a7, a1, a2
+; RV32ZBB-NEXT: j .LBB25_3
+; RV32ZBB-NEXT: .LBB25_2:
+; RV32ZBB-NEXT: srl a3, a0, a2
+; RV32ZBB-NEXT: or a7, a3, a4
+; RV32ZBB-NEXT: .LBB25_3:
+; RV32ZBB-NEXT: li t0, 64
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: beqz a2, .LBB25_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: mv a3, a7
+; RV32ZBB-NEXT: .LBB25_5:
+; RV32ZBB-NEXT: sub a7, t0, a2
+; RV32ZBB-NEXT: bltu a2, a5, .LBB25_8
+; RV32ZBB-NEXT: # %bb.6:
+; RV32ZBB-NEXT: li a2, 0
+; RV32ZBB-NEXT: bgeu a7, a5, .LBB25_9
+; RV32ZBB-NEXT: .LBB25_7:
+; RV32ZBB-NEXT: sll a5, a0, a6
+; RV32ZBB-NEXT: neg a6, a7
+; RV32ZBB-NEXT: srl a0, a0, a6
+; RV32ZBB-NEXT: or a0, a0, a4
+; RV32ZBB-NEXT: bnez a7, .LBB25_10
+; RV32ZBB-NEXT: j .LBB25_11
+; RV32ZBB-NEXT: .LBB25_8:
+; RV32ZBB-NEXT: srl a2, a1, a2
+; RV32ZBB-NEXT: bltu a7, a5, .LBB25_7
+; RV32ZBB-NEXT: .LBB25_9:
+; RV32ZBB-NEXT: li a5, 0
+; RV32ZBB-NEXT: sll a0, a0, a7
+; RV32ZBB-NEXT: beqz a7, .LBB25_11
+; RV32ZBB-NEXT: .LBB25_10:
+; RV32ZBB-NEXT: mv a1, a0
+; RV32ZBB-NEXT: .LBB25_11:
+; RV32ZBB-NEXT: or a0, a3, a5
+; RV32ZBB-NEXT: or a1, a2, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_64_zext:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: li a2, 64
+; RV64ZBB-NEXT: sub a2, a2, a1
+; RV64ZBB-NEXT: srl a1, a0, a1
+; RV64ZBB-NEXT: sll a0, a0, a2
+; RV64ZBB-NEXT: or a0, a1, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_64_zext:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: li a5, 32
+; RV32XTHEADBB-NEXT: neg a6, a2
+; RV32XTHEADBB-NEXT: sll a4, a1, a6
+; RV32XTHEADBB-NEXT: bltu a2, a5, .LBB25_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: srl a7, a1, a2
+; RV32XTHEADBB-NEXT: j .LBB25_3
+; RV32XTHEADBB-NEXT: .LBB25_2:
+; RV32XTHEADBB-NEXT: srl a3, a0, a2
+; RV32XTHEADBB-NEXT: or a7, a3, a4
+; RV32XTHEADBB-NEXT: .LBB25_3:
+; RV32XTHEADBB-NEXT: li t0, 64
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: beqz a2, .LBB25_5
+; RV32XTHEADBB-NEXT: # %bb.4:
+; RV32XTHEADBB-NEXT: mv a3, a7
+; RV32XTHEADBB-NEXT: .LBB25_5:
+; RV32XTHEADBB-NEXT: sub a7, t0, a2
+; RV32XTHEADBB-NEXT: bltu a2, a5, .LBB25_8
+; RV32XTHEADBB-NEXT: # %bb.6:
+; RV32XTHEADBB-NEXT: li a2, 0
+; RV32XTHEADBB-NEXT: bgeu a7, a5, .LBB25_9
+; RV32XTHEADBB-NEXT: .LBB25_7:
+; RV32XTHEADBB-NEXT: sll a5, a0, a6
+; RV32XTHEADBB-NEXT: neg a6, a7
+; RV32XTHEADBB-NEXT: srl a0, a0, a6
+; RV32XTHEADBB-NEXT: or a0, a0, a4
+; RV32XTHEADBB-NEXT: bnez a7, .LBB25_10
+; RV32XTHEADBB-NEXT: j .LBB25_11
+; RV32XTHEADBB-NEXT: .LBB25_8:
+; RV32XTHEADBB-NEXT: srl a2, a1, a2
+; RV32XTHEADBB-NEXT: bltu a7, a5, .LBB25_7
+; RV32XTHEADBB-NEXT: .LBB25_9:
+; RV32XTHEADBB-NEXT: li a5, 0
+; RV32XTHEADBB-NEXT: sll a0, a0, a7
+; RV32XTHEADBB-NEXT: beqz a7, .LBB25_11
+; RV32XTHEADBB-NEXT: .LBB25_10:
+; RV32XTHEADBB-NEXT: mv a1, a0
+; RV32XTHEADBB-NEXT: .LBB25_11:
+; RV32XTHEADBB-NEXT: or a0, a3, a5
+; RV32XTHEADBB-NEXT: or a1, a2, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_64_zext:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: li a2, 64
+; RV64XTHEADBB-NEXT: sub a2, a2, a1
+; RV64XTHEADBB-NEXT: srl a1, a0, a1
+; RV64XTHEADBB-NEXT: sll a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i32 64, %y
+ %zext = zext i32 %z to i64
+ %zexty = zext i32 %y to i64
+ %b = lshr i64 %x, %zexty
+ %c = shl i64 %x, %zext
+ %d = or i64 %b, %c
+ ret i64 %d
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
new file mode 100644
index 000000000000000..71a5ecc77a1b08b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
@@ -0,0 +1,962 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV32I
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV64I
+
+; Basic shift support is tested as part of ALU.ll. This file ensures that
+; shifts which may not be supported natively are lowered properly.
+
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+declare i128 @llvm.fshr.i128(i128, i128, i128)
+
+define i64 @lshr64(i64 %a, i64 %b) nounwind {
+; RV32I-LABEL: lshr64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a3, 32
+; RV32I-NEXT: bltu a2, a3, .LBB0_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a4, a1, a2
+; RV32I-NEXT: bnez a2, .LBB0_3
+; RV32I-NEXT: j .LBB0_4
+; RV32I-NEXT: .LBB0_2:
+; RV32I-NEXT: srl a4, a0, a2
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: sll a5, a1, a5
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: beqz a2, .LBB0_4
+; RV32I-NEXT: .LBB0_3:
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB0_4:
+; RV32I-NEXT: bltu a2, a3, .LBB0_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB0_6:
+; RV32I-NEXT: srl a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: lshr64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: srl a0, a0, a1
+; RV64I-NEXT: ret
+ %1 = lshr i64 %a, %b
+ ret i64 %1
+}
+
+define i64 @lshr64_minsize(i64 %a, i64 %b) minsize nounwind {
+; RV32I-LABEL: lshr64_minsize:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a3, 32
+; RV32I-NEXT: bltu a2, a3, .LBB1_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a4, a1, a2
+; RV32I-NEXT: bnez a2, .LBB1_3
+; RV32I-NEXT: j .LBB1_4
+; RV32I-NEXT: .LBB1_2:
+; RV32I-NEXT: srl a4, a0, a2
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: sll a5, a1, a5
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: beqz a2, .LBB1_4
+; RV32I-NEXT: .LBB1_3:
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB1_4:
+; RV32I-NEXT: bltu a2, a3, .LBB1_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB1_6:
+; RV32I-NEXT: srl a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: lshr64_minsize:
+; RV64I: # %bb.0:
+; RV64I-NEXT: srl a0, a0, a1
+; RV64I-NEXT: ret
+ %1 = lshr i64 %a, %b
+ ret i64 %1
+}
+
+define i64 @ashr64(i64 %a, i64 %b) nounwind {
+; RV32I-LABEL: ashr64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a3, 32
+; RV32I-NEXT: bltu a2, a3, .LBB2_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra a4, a1, a2
+; RV32I-NEXT: bnez a2, .LBB2_3
+; RV32I-NEXT: j .LBB2_4
+; RV32I-NEXT: .LBB2_2:
+; RV32I-NEXT: srl a4, a0, a2
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: sll a5, a1, a5
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: beqz a2, .LBB2_4
+; RV32I-NEXT: .LBB2_3:
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB2_4:
+; RV32I-NEXT: bltu a2, a3, .LBB2_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: srai a1, a1, 31
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB2_6:
+; RV32I-NEXT: sra a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ashr64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sra a0, a0, a1
+; RV64I-NEXT: ret
+ %1 = ashr i64 %a, %b
+ ret i64 %1
+}
+
+define i64 @ashr64_minsize(i64 %a, i64 %b) minsize nounwind {
+; RV32I-LABEL: ashr64_minsize:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a3, 32
+; RV32I-NEXT: bltu a2, a3, .LBB3_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra a4, a1, a2
+; RV32I-NEXT: bnez a2, .LBB3_3
+; RV32I-NEXT: j .LBB3_4
+; RV32I-NEXT: .LBB3_2:
+; RV32I-NEXT: srl a4, a0, a2
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: sll a5, a1, a5
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: beqz a2, .LBB3_4
+; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB3_4:
+; RV32I-NEXT: bltu a2, a3, .LBB3_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: srai a1, a1, 31
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB3_6:
+; RV32I-NEXT: sra a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ashr64_minsize:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sra a0, a0, a1
+; RV64I-NEXT: ret
+ %1 = ashr i64 %a, %b
+ ret i64 %1
+}
+
+define i64 @shl64(i64 %a, i64 %b) nounwind {
+; RV32I-LABEL: shl64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: li a0, 32
+; RV32I-NEXT: bltu a2, a0, .LBB4_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: sll a3, a3, a2
+; RV32I-NEXT: bnez a2, .LBB4_3
+; RV32I-NEXT: j .LBB4_4
+; RV32I-NEXT: .LBB4_2:
+; RV32I-NEXT: sll a0, a3, a2
+; RV32I-NEXT: neg a4, a2
+; RV32I-NEXT: srl a3, a3, a4
+; RV32I-NEXT: sll a4, a1, a2
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: beqz a2, .LBB4_4
+; RV32I-NEXT: .LBB4_3:
+; RV32I-NEXT: mv a1, a3
+; RV32I-NEXT: .LBB4_4:
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: shl64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sll a0, a0, a1
+; RV64I-NEXT: ret
+ %1 = shl i64 %a, %b
+ ret i64 %1
+}
+
+define i64 @shl64_minsize(i64 %a, i64 %b) minsize nounwind {
+; RV32I-LABEL: shl64_minsize:
+; RV32I: # %bb.0:
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: li a0, 32
+; RV32I-NEXT: bltu a2, a0, .LBB5_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: sll a3, a3, a2
+; RV32I-NEXT: bnez a2, .LBB5_3
+; RV32I-NEXT: j .LBB5_4
+; RV32I-NEXT: .LBB5_2:
+; RV32I-NEXT: sll a0, a3, a2
+; RV32I-NEXT: neg a4, a2
+; RV32I-NEXT: srl a3, a3, a4
+; RV32I-NEXT: sll a4, a1, a2
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: beqz a2, .LBB5_4
+; RV32I-NEXT: .LBB5_3:
+; RV32I-NEXT: mv a1, a3
+; RV32I-NEXT: .LBB5_4:
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: shl64_minsize:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sll a0, a0, a1
+; RV64I-NEXT: ret
+ %1 = shl i64 %a, %b
+ ret i64 %1
+}
+
+define i128 @lshr128(i128 %a, i128 %b) nounwind {
+; RV32I-LABEL: lshr128:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lw a2, 0(a2)
+; RV32I-NEXT: lw a3, 8(a1)
+; RV32I-NEXT: lw a7, 12(a1)
+; RV32I-NEXT: li t0, 32
+; RV32I-NEXT: srl t2, a3, a2
+; RV32I-NEXT: neg t6, a2
+; RV32I-NEXT: sll t5, a7, t6
+; RV32I-NEXT: bltu a2, t0, .LBB6_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a5, a7, a2
+; RV32I-NEXT: mv a4, a3
+; RV32I-NEXT: bnez a2, .LBB6_3
+; RV32I-NEXT: j .LBB6_4
+; RV32I-NEXT: .LBB6_2:
+; RV32I-NEXT: or a5, t2, t5
+; RV32I-NEXT: mv a4, a3
+; RV32I-NEXT: beqz a2, .LBB6_4
+; RV32I-NEXT: .LBB6_3:
+; RV32I-NEXT: mv a4, a5
+; RV32I-NEXT: .LBB6_4:
+; RV32I-NEXT: lw a5, 0(a1)
+; RV32I-NEXT: lw a1, 4(a1)
+; RV32I-NEXT: bltu a2, t0, .LBB6_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: srl t4, a1, a2
+; RV32I-NEXT: j .LBB6_7
+; RV32I-NEXT: .LBB6_6:
+; RV32I-NEXT: srl a6, a7, a2
+; RV32I-NEXT: srl t1, a5, a2
+; RV32I-NEXT: sll t3, a1, t6
+; RV32I-NEXT: or t4, t1, t3
+; RV32I-NEXT: .LBB6_7:
+; RV32I-NEXT: li t1, 64
+; RV32I-NEXT: mv t3, a5
+; RV32I-NEXT: beqz a2, .LBB6_9
+; RV32I-NEXT: # %bb.8:
+; RV32I-NEXT: mv t3, t4
+; RV32I-NEXT: .LBB6_9:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s0, t1, a2
+; RV32I-NEXT: bltu a2, t0, .LBB6_12
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: bgeu s0, t0, .LBB6_13
+; RV32I-NEXT: .LBB6_11:
+; RV32I-NEXT: sll t6, a3, t6
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl s1, a3, s1
+; RV32I-NEXT: or s2, s1, t5
+; RV32I-NEXT: j .LBB6_14
+; RV32I-NEXT: .LBB6_12:
+; RV32I-NEXT: srl t4, a1, a2
+; RV32I-NEXT: bltu s0, t0, .LBB6_11
+; RV32I-NEXT: .LBB6_13:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s2, a3, s0
+; RV32I-NEXT: .LBB6_14:
+; RV32I-NEXT: addi s1, a2, -64
+; RV32I-NEXT: mv t5, a7
+; RV32I-NEXT: beqz s0, .LBB6_16
+; RV32I-NEXT: # %bb.15:
+; RV32I-NEXT: mv t5, s2
+; RV32I-NEXT: .LBB6_16:
+; RV32I-NEXT: bltu s1, t0, .LBB6_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: srl t2, a7, s1
+; RV32I-NEXT: bnez s1, .LBB6_19
+; RV32I-NEXT: j .LBB6_20
+; RV32I-NEXT: .LBB6_18:
+; RV32I-NEXT: neg s0, s1
+; RV32I-NEXT: sll s0, a7, s0
+; RV32I-NEXT: or t2, t2, s0
+; RV32I-NEXT: beqz s1, .LBB6_20
+; RV32I-NEXT: .LBB6_19:
+; RV32I-NEXT: mv a3, t2
+; RV32I-NEXT: .LBB6_20:
+; RV32I-NEXT: bltu s1, t0, .LBB6_22
+; RV32I-NEXT: # %bb.21:
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: bltu a2, t1, .LBB6_23
+; RV32I-NEXT: j .LBB6_24
+; RV32I-NEXT: .LBB6_22:
+; RV32I-NEXT: srl a7, a7, a2
+; RV32I-NEXT: bgeu a2, t1, .LBB6_24
+; RV32I-NEXT: .LBB6_23:
+; RV32I-NEXT: or a3, t3, t6
+; RV32I-NEXT: or a7, t4, t5
+; RV32I-NEXT: .LBB6_24:
+; RV32I-NEXT: bnez a2, .LBB6_28
+; RV32I-NEXT: # %bb.25:
+; RV32I-NEXT: bltu a2, t1, .LBB6_27
+; RV32I-NEXT: .LBB6_26:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: .LBB6_27:
+; RV32I-NEXT: sw a5, 0(a0)
+; RV32I-NEXT: sw a1, 4(a0)
+; RV32I-NEXT: sw a4, 8(a0)
+; RV32I-NEXT: sw a6, 12(a0)
+; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB6_28:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: mv a1, a7
+; RV32I-NEXT: bgeu a2, t1, .LBB6_26
+; RV32I-NEXT: j .LBB6_27
+;
+; RV64I-LABEL: lshr128:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a3, 64
+; RV64I-NEXT: bltu a2, a3, .LBB6_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sub a4, a2, a3
+; RV64I-NEXT: srl a4, a1, a4
+; RV64I-NEXT: bnez a2, .LBB6_3
+; RV64I-NEXT: j .LBB6_4
+; RV64I-NEXT: .LBB6_2:
+; RV64I-NEXT: srl a4, a0, a2
+; RV64I-NEXT: negw a5, a2
+; RV64I-NEXT: sll a5, a1, a5
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: beqz a2, .LBB6_4
+; RV64I-NEXT: .LBB6_3:
+; RV64I-NEXT: mv a0, a4
+; RV64I-NEXT: .LBB6_4:
+; RV64I-NEXT: bltu a2, a3, .LBB6_6
+; RV64I-NEXT: # %bb.5:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB6_6:
+; RV64I-NEXT: srl a1, a1, a2
+; RV64I-NEXT: ret
+ %1 = lshr i128 %a, %b
+ ret i128 %1
+}
+
+define i128 @ashr128(i128 %a, i128 %b) nounwind {
+; RV32I-LABEL: ashr128:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lw a2, 0(a2)
+; RV32I-NEXT: lw a4, 8(a1)
+; RV32I-NEXT: lw a3, 12(a1)
+; RV32I-NEXT: li t0, 32
+; RV32I-NEXT: srl t2, a4, a2
+; RV32I-NEXT: neg t6, a2
+; RV32I-NEXT: sll t5, a3, t6
+; RV32I-NEXT: bltu a2, t0, .LBB7_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra a6, a3, a2
+; RV32I-NEXT: mv a5, a4
+; RV32I-NEXT: bnez a2, .LBB7_3
+; RV32I-NEXT: j .LBB7_4
+; RV32I-NEXT: .LBB7_2:
+; RV32I-NEXT: or a6, t2, t5
+; RV32I-NEXT: mv a5, a4
+; RV32I-NEXT: beqz a2, .LBB7_4
+; RV32I-NEXT: .LBB7_3:
+; RV32I-NEXT: mv a5, a6
+; RV32I-NEXT: .LBB7_4:
+; RV32I-NEXT: lw a6, 0(a1)
+; RV32I-NEXT: lw a1, 4(a1)
+; RV32I-NEXT: bltu a2, t0, .LBB7_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: srai a7, a3, 31
+; RV32I-NEXT: srl t4, a1, a2
+; RV32I-NEXT: j .LBB7_7
+; RV32I-NEXT: .LBB7_6:
+; RV32I-NEXT: sra a7, a3, a2
+; RV32I-NEXT: srl t1, a6, a2
+; RV32I-NEXT: sll t3, a1, t6
+; RV32I-NEXT: or t4, t1, t3
+; RV32I-NEXT: .LBB7_7:
+; RV32I-NEXT: li t1, 64
+; RV32I-NEXT: mv t3, a6
+; RV32I-NEXT: beqz a2, .LBB7_9
+; RV32I-NEXT: # %bb.8:
+; RV32I-NEXT: mv t3, t4
+; RV32I-NEXT: .LBB7_9:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s0, t1, a2
+; RV32I-NEXT: bltu a2, t0, .LBB7_12
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: bgeu s0, t0, .LBB7_13
+; RV32I-NEXT: .LBB7_11:
+; RV32I-NEXT: sll t6, a4, t6
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl s1, a4, s1
+; RV32I-NEXT: or s2, s1, t5
+; RV32I-NEXT: j .LBB7_14
+; RV32I-NEXT: .LBB7_12:
+; RV32I-NEXT: srl t4, a1, a2
+; RV32I-NEXT: bltu s0, t0, .LBB7_11
+; RV32I-NEXT: .LBB7_13:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s2, a4, s0
+; RV32I-NEXT: .LBB7_14:
+; RV32I-NEXT: addi s1, a2, -64
+; RV32I-NEXT: mv t5, a3
+; RV32I-NEXT: beqz s0, .LBB7_16
+; RV32I-NEXT: # %bb.15:
+; RV32I-NEXT: mv t5, s2
+; RV32I-NEXT: .LBB7_16:
+; RV32I-NEXT: bltu s1, t0, .LBB7_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: sra t2, a3, s1
+; RV32I-NEXT: bnez s1, .LBB7_19
+; RV32I-NEXT: j .LBB7_20
+; RV32I-NEXT: .LBB7_18:
+; RV32I-NEXT: neg s0, s1
+; RV32I-NEXT: sll s0, a3, s0
+; RV32I-NEXT: or t2, t2, s0
+; RV32I-NEXT: beqz s1, .LBB7_20
+; RV32I-NEXT: .LBB7_19:
+; RV32I-NEXT: mv a4, t2
+; RV32I-NEXT: .LBB7_20:
+; RV32I-NEXT: bltu s1, t0, .LBB7_22
+; RV32I-NEXT: # %bb.21:
+; RV32I-NEXT: srai t0, a3, 31
+; RV32I-NEXT: bltu a2, t1, .LBB7_23
+; RV32I-NEXT: j .LBB7_24
+; RV32I-NEXT: .LBB7_22:
+; RV32I-NEXT: sra t0, a3, a2
+; RV32I-NEXT: bgeu a2, t1, .LBB7_24
+; RV32I-NEXT: .LBB7_23:
+; RV32I-NEXT: or a4, t3, t6
+; RV32I-NEXT: or t0, t4, t5
+; RV32I-NEXT: .LBB7_24:
+; RV32I-NEXT: bnez a2, .LBB7_28
+; RV32I-NEXT: # %bb.25:
+; RV32I-NEXT: bltu a2, t1, .LBB7_27
+; RV32I-NEXT: .LBB7_26:
+; RV32I-NEXT: srai a5, a3, 31
+; RV32I-NEXT: mv a7, a5
+; RV32I-NEXT: .LBB7_27:
+; RV32I-NEXT: sw a6, 0(a0)
+; RV32I-NEXT: sw a1, 4(a0)
+; RV32I-NEXT: sw a5, 8(a0)
+; RV32I-NEXT: sw a7, 12(a0)
+; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB7_28:
+; RV32I-NEXT: mv a6, a4
+; RV32I-NEXT: mv a1, t0
+; RV32I-NEXT: bgeu a2, t1, .LBB7_26
+; RV32I-NEXT: j .LBB7_27
+;
+; RV64I-LABEL: ashr128:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a3, 64
+; RV64I-NEXT: bltu a2, a3, .LBB7_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sub a4, a2, a3
+; RV64I-NEXT: sra a4, a1, a4
+; RV64I-NEXT: bnez a2, .LBB7_3
+; RV64I-NEXT: j .LBB7_4
+; RV64I-NEXT: .LBB7_2:
+; RV64I-NEXT: srl a4, a0, a2
+; RV64I-NEXT: negw a5, a2
+; RV64I-NEXT: sll a5, a1, a5
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: beqz a2, .LBB7_4
+; RV64I-NEXT: .LBB7_3:
+; RV64I-NEXT: mv a0, a4
+; RV64I-NEXT: .LBB7_4:
+; RV64I-NEXT: bltu a2, a3, .LBB7_6
+; RV64I-NEXT: # %bb.5:
+; RV64I-NEXT: srai a1, a1, 63
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB7_6:
+; RV64I-NEXT: sra a1, a1, a2
+; RV64I-NEXT: ret
+ %1 = ashr i128 %a, %b
+ ret i128 %1
+}
+
+define i128 @shl128(i128 %a, i128 %b) nounwind {
+; RV32I-LABEL: shl128:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lw a2, 0(a2)
+; RV32I-NEXT: lw a7, 0(a1)
+; RV32I-NEXT: lw a3, 4(a1)
+; RV32I-NEXT: li a6, 64
+; RV32I-NEXT: li t1, 32
+; RV32I-NEXT: neg t5, a2
+; RV32I-NEXT: srl t2, a7, t5
+; RV32I-NEXT: sll t0, a3, a2
+; RV32I-NEXT: bltu a2, t1, .LBB8_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: sll t3, a7, a2
+; RV32I-NEXT: j .LBB8_3
+; RV32I-NEXT: .LBB8_2:
+; RV32I-NEXT: sll a4, a7, a2
+; RV32I-NEXT: or t3, t2, t0
+; RV32I-NEXT: .LBB8_3:
+; RV32I-NEXT: sub t4, a6, a2
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: beqz a2, .LBB8_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a5, t3
+; RV32I-NEXT: .LBB8_5:
+; RV32I-NEXT: bltu t4, t1, .LBB8_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl t2, a3, t4
+; RV32I-NEXT: mv t3, a7
+; RV32I-NEXT: bnez t4, .LBB8_8
+; RV32I-NEXT: j .LBB8_9
+; RV32I-NEXT: .LBB8_7:
+; RV32I-NEXT: neg t3, t4
+; RV32I-NEXT: sll t3, a3, t3
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: mv t3, a7
+; RV32I-NEXT: beqz t4, .LBB8_9
+; RV32I-NEXT: .LBB8_8:
+; RV32I-NEXT: mv t3, t2
+; RV32I-NEXT: .LBB8_9:
+; RV32I-NEXT: bltu t4, t1, .LBB8_11
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: j .LBB8_12
+; RV32I-NEXT: .LBB8_11:
+; RV32I-NEXT: srl t4, a3, t5
+; RV32I-NEXT: .LBB8_12:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lw t2, 8(a1)
+; RV32I-NEXT: lw a1, 12(a1)
+; RV32I-NEXT: bltu a2, t1, .LBB8_14
+; RV32I-NEXT: # %bb.13:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s1, t2, a2
+; RV32I-NEXT: j .LBB8_15
+; RV32I-NEXT: .LBB8_14:
+; RV32I-NEXT: sll t6, t2, a2
+; RV32I-NEXT: srl t5, t2, t5
+; RV32I-NEXT: sll s0, a1, a2
+; RV32I-NEXT: or s1, t5, s0
+; RV32I-NEXT: .LBB8_15:
+; RV32I-NEXT: addi s0, a2, -64
+; RV32I-NEXT: mv t5, a1
+; RV32I-NEXT: beqz a2, .LBB8_17
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: mv t5, s1
+; RV32I-NEXT: .LBB8_17:
+; RV32I-NEXT: bltu s0, t1, .LBB8_19
+; RV32I-NEXT: # %bb.18:
+; RV32I-NEXT: li t1, 0
+; RV32I-NEXT: sll a7, a7, s0
+; RV32I-NEXT: bnez s0, .LBB8_20
+; RV32I-NEXT: j .LBB8_21
+; RV32I-NEXT: .LBB8_19:
+; RV32I-NEXT: sll t1, a7, a2
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl a7, a7, s1
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: beqz s0, .LBB8_21
+; RV32I-NEXT: .LBB8_20:
+; RV32I-NEXT: mv a3, a7
+; RV32I-NEXT: .LBB8_21:
+; RV32I-NEXT: bltu a2, a6, .LBB8_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: li a5, 0
+; RV32I-NEXT: bnez a2, .LBB8_24
+; RV32I-NEXT: j .LBB8_25
+; RV32I-NEXT: .LBB8_23:
+; RV32I-NEXT: or t1, t3, t6
+; RV32I-NEXT: or a3, t4, t5
+; RV32I-NEXT: beqz a2, .LBB8_25
+; RV32I-NEXT: .LBB8_24:
+; RV32I-NEXT: mv t2, t1
+; RV32I-NEXT: mv a1, a3
+; RV32I-NEXT: .LBB8_25:
+; RV32I-NEXT: sw a4, 0(a0)
+; RV32I-NEXT: sw a5, 4(a0)
+; RV32I-NEXT: sw t2, 8(a0)
+; RV32I-NEXT: sw a1, 12(a0)
+; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: shl128:
+; RV64I: # %bb.0:
+; RV64I-NEXT: mv a3, a0
+; RV64I-NEXT: li a4, 64
+; RV64I-NEXT: bltu a2, a4, .LBB8_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: li a0, 0
+; RV64I-NEXT: sub a4, a2, a4
+; RV64I-NEXT: sll a3, a3, a4
+; RV64I-NEXT: bnez a2, .LBB8_3
+; RV64I-NEXT: j .LBB8_4
+; RV64I-NEXT: .LBB8_2:
+; RV64I-NEXT: sll a0, a3, a2
+; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: srl a3, a3, a4
+; RV64I-NEXT: sll a4, a1, a2
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: beqz a2, .LBB8_4
+; RV64I-NEXT: .LBB8_3:
+; RV64I-NEXT: mv a1, a3
+; RV64I-NEXT: .LBB8_4:
+; RV64I-NEXT: ret
+ %1 = shl i128 %a, %b
+ ret i128 %1
+}
+
+define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind {
+; RV32I-LABEL: fshr64_minsize:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a5, a2, 63
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: bltu a5, a4, .LBB9_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a6, a1, a5
+; RV32I-NEXT: j .LBB9_3
+; RV32I-NEXT: .LBB9_2:
+; RV32I-NEXT: srl a3, a0, a2
+; RV32I-NEXT: neg a6, a5
+; RV32I-NEXT: sll a6, a1, a6
+; RV32I-NEXT: or a6, a3, a6
+; RV32I-NEXT: .LBB9_3:
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: beqz a5, .LBB9_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, a6
+; RV32I-NEXT: .LBB9_5:
+; RV32I-NEXT: neg a6, a2
+; RV32I-NEXT: bltu a5, a4, .LBB9_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li a2, 0
+; RV32I-NEXT: j .LBB9_8
+; RV32I-NEXT: .LBB9_7:
+; RV32I-NEXT: srl a2, a1, a2
+; RV32I-NEXT: .LBB9_8:
+; RV32I-NEXT: andi a5, a6, 63
+; RV32I-NEXT: bltu a5, a4, .LBB9_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: sll a0, a0, a5
+; RV32I-NEXT: bnez a5, .LBB9_11
+; RV32I-NEXT: j .LBB9_12
+; RV32I-NEXT: .LBB9_10:
+; RV32I-NEXT: sll a4, a0, a6
+; RV32I-NEXT: neg a7, a5
+; RV32I-NEXT: srl a0, a0, a7
+; RV32I-NEXT: sll a6, a1, a6
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: beqz a5, .LBB9_12
+; RV32I-NEXT: .LBB9_11:
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB9_12:
+; RV32I-NEXT: or a0, a3, a4
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: fshr64_minsize:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: srl a1, a0, a1
+; RV64I-NEXT: sll a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+ %res = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b)
+ ret i64 %res
+}
+
+define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
+; RV32I-LABEL: fshr128_minsize:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lw t3, 0(a2)
+; RV32I-NEXT: lw a2, 8(a1)
+; RV32I-NEXT: lw a3, 12(a1)
+; RV32I-NEXT: andi t4, t3, 127
+; RV32I-NEXT: li a6, 32
+; RV32I-NEXT: neg t6, t4
+; RV32I-NEXT: sll t5, a3, t6
+; RV32I-NEXT: bltu t4, a6, .LBB10_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a5, a3, t4
+; RV32I-NEXT: j .LBB10_3
+; RV32I-NEXT: .LBB10_2:
+; RV32I-NEXT: srl a4, a2, t3
+; RV32I-NEXT: or a5, a4, t5
+; RV32I-NEXT: .LBB10_3:
+; RV32I-NEXT: mv a4, a2
+; RV32I-NEXT: beqz t4, .LBB10_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a4, a5
+; RV32I-NEXT: .LBB10_5:
+; RV32I-NEXT: lw a7, 0(a1)
+; RV32I-NEXT: lw a5, 4(a1)
+; RV32I-NEXT: bltu t4, a6, .LBB10_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: srl t2, a5, t4
+; RV32I-NEXT: j .LBB10_8
+; RV32I-NEXT: .LBB10_7:
+; RV32I-NEXT: srl a1, a3, t3
+; RV32I-NEXT: srl t0, a7, t3
+; RV32I-NEXT: sll t1, a5, t6
+; RV32I-NEXT: or t2, t0, t1
+; RV32I-NEXT: .LBB10_8:
+; RV32I-NEXT: li t0, 64
+; RV32I-NEXT: mv t1, a7
+; RV32I-NEXT: beqz t4, .LBB10_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t1, t2
+; RV32I-NEXT: .LBB10_10:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: sw s0, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s0, t0, t4
+; RV32I-NEXT: bltu t4, a6, .LBB10_13
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t2, 0
+; RV32I-NEXT: bgeu s0, a6, .LBB10_14
+; RV32I-NEXT: .LBB10_12:
+; RV32I-NEXT: sll t6, a2, t6
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl s1, a2, s1
+; RV32I-NEXT: or s2, s1, t5
+; RV32I-NEXT: j .LBB10_15
+; RV32I-NEXT: .LBB10_13:
+; RV32I-NEXT: srl t2, a5, t3
+; RV32I-NEXT: bltu s0, a6, .LBB10_12
+; RV32I-NEXT: .LBB10_14:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s2, a2, s0
+; RV32I-NEXT: .LBB10_15:
+; RV32I-NEXT: addi s1, t4, -64
+; RV32I-NEXT: mv t5, a3
+; RV32I-NEXT: beqz s0, .LBB10_17
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: mv t5, s2
+; RV32I-NEXT: .LBB10_17:
+; RV32I-NEXT: bltu s1, a6, .LBB10_19
+; RV32I-NEXT: # %bb.18:
+; RV32I-NEXT: srl s2, a3, s1
+; RV32I-NEXT: j .LBB10_20
+; RV32I-NEXT: .LBB10_19:
+; RV32I-NEXT: srl s0, a2, t4
+; RV32I-NEXT: neg s2, s1
+; RV32I-NEXT: sll s2, a3, s2
+; RV32I-NEXT: or s2, s0, s2
+; RV32I-NEXT: .LBB10_20:
+; RV32I-NEXT: mv s0, a2
+; RV32I-NEXT: beqz s1, .LBB10_22
+; RV32I-NEXT: # %bb.21:
+; RV32I-NEXT: mv s0, s2
+; RV32I-NEXT: .LBB10_22:
+; RV32I-NEXT: bltu s1, a6, .LBB10_24
+; RV32I-NEXT: # %bb.23:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bltu t4, t0, .LBB10_25
+; RV32I-NEXT: j .LBB10_26
+; RV32I-NEXT: .LBB10_24:
+; RV32I-NEXT: srl s1, a3, t4
+; RV32I-NEXT: bgeu t4, t0, .LBB10_26
+; RV32I-NEXT: .LBB10_25:
+; RV32I-NEXT: or s0, t1, t6
+; RV32I-NEXT: or s1, t2, t5
+; RV32I-NEXT: .LBB10_26:
+; RV32I-NEXT: mv t1, a7
+; RV32I-NEXT: mv t2, a5
+; RV32I-NEXT: beqz t4, .LBB10_28
+; RV32I-NEXT: # %bb.27:
+; RV32I-NEXT: mv t1, s0
+; RV32I-NEXT: mv t2, s1
+; RV32I-NEXT: .LBB10_28:
+; RV32I-NEXT: neg t6, t3
+; RV32I-NEXT: bltu t4, t0, .LBB10_30
+; RV32I-NEXT: # %bb.29:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: .LBB10_30:
+; RV32I-NEXT: andi t3, t6, 127
+; RV32I-NEXT: neg s2, t3
+; RV32I-NEXT: srl s0, a7, s2
+; RV32I-NEXT: bltu t3, a6, .LBB10_32
+; RV32I-NEXT: # %bb.31:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: sll s3, a7, t3
+; RV32I-NEXT: j .LBB10_33
+; RV32I-NEXT: .LBB10_32:
+; RV32I-NEXT: sll t4, a7, t6
+; RV32I-NEXT: sll t5, a5, t6
+; RV32I-NEXT: or s3, s0, t5
+; RV32I-NEXT: .LBB10_33:
+; RV32I-NEXT: sub s1, t0, t3
+; RV32I-NEXT: mv t5, a5
+; RV32I-NEXT: beqz t3, .LBB10_35
+; RV32I-NEXT: # %bb.34:
+; RV32I-NEXT: mv t5, s3
+; RV32I-NEXT: .LBB10_35:
+; RV32I-NEXT: bltu s1, a6, .LBB10_37
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: srl s3, a5, s1
+; RV32I-NEXT: j .LBB10_38
+; RV32I-NEXT: .LBB10_37:
+; RV32I-NEXT: neg s3, s1
+; RV32I-NEXT: sll s3, a5, s3
+; RV32I-NEXT: or s3, s0, s3
+; RV32I-NEXT: .LBB10_38:
+; RV32I-NEXT: mv s0, a7
+; RV32I-NEXT: beqz s1, .LBB10_40
+; RV32I-NEXT: # %bb.39:
+; RV32I-NEXT: mv s0, s3
+; RV32I-NEXT: .LBB10_40:
+; RV32I-NEXT: bltu s1, a6, .LBB10_43
+; RV32I-NEXT: # %bb.41:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bgeu t3, a6, .LBB10_44
+; RV32I-NEXT: .LBB10_42:
+; RV32I-NEXT: sll s3, a2, t6
+; RV32I-NEXT: srl s2, a2, s2
+; RV32I-NEXT: sll t6, a3, t6
+; RV32I-NEXT: or s4, s2, t6
+; RV32I-NEXT: j .LBB10_45
+; RV32I-NEXT: .LBB10_43:
+; RV32I-NEXT: srl s1, a5, s2
+; RV32I-NEXT: bltu t3, a6, .LBB10_42
+; RV32I-NEXT: .LBB10_44:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll s4, a2, t3
+; RV32I-NEXT: .LBB10_45:
+; RV32I-NEXT: addi s2, t3, -64
+; RV32I-NEXT: mv t6, a3
+; RV32I-NEXT: beqz t3, .LBB10_47
+; RV32I-NEXT: # %bb.46:
+; RV32I-NEXT: mv t6, s4
+; RV32I-NEXT: .LBB10_47:
+; RV32I-NEXT: bltu s2, a6, .LBB10_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: sll a7, a7, s2
+; RV32I-NEXT: bnez s2, .LBB10_50
+; RV32I-NEXT: j .LBB10_51
+; RV32I-NEXT: .LBB10_49:
+; RV32I-NEXT: sll a6, a7, t3
+; RV32I-NEXT: neg s4, s2
+; RV32I-NEXT: srl a7, a7, s4
+; RV32I-NEXT: sll s4, a5, t3
+; RV32I-NEXT: or a7, a7, s4
+; RV32I-NEXT: beqz s2, .LBB10_51
+; RV32I-NEXT: .LBB10_50:
+; RV32I-NEXT: mv a5, a7
+; RV32I-NEXT: .LBB10_51:
+; RV32I-NEXT: bltu t3, t0, .LBB10_53
+; RV32I-NEXT: # %bb.52:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: bnez t3, .LBB10_54
+; RV32I-NEXT: j .LBB10_55
+; RV32I-NEXT: .LBB10_53:
+; RV32I-NEXT: or a6, s0, s3
+; RV32I-NEXT: or a5, s1, t6
+; RV32I-NEXT: beqz t3, .LBB10_55
+; RV32I-NEXT: .LBB10_54:
+; RV32I-NEXT: mv a2, a6
+; RV32I-NEXT: mv a3, a5
+; RV32I-NEXT: .LBB10_55:
+; RV32I-NEXT: or a5, t1, t4
+; RV32I-NEXT: or a6, t2, t5
+; RV32I-NEXT: or a2, a4, a2
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: sw a5, 0(a0)
+; RV32I-NEXT: sw a6, 4(a0)
+; RV32I-NEXT: sw a2, 8(a0)
+; RV32I-NEXT: sw a1, 12(a0)
+; RV32I-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: fshr128_minsize:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a5, a2, 127
+; RV64I-NEXT: li a4, 64
+; RV64I-NEXT: bltu a5, a4, .LBB10_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sub a3, a5, a4
+; RV64I-NEXT: srl a6, a1, a3
+; RV64I-NEXT: j .LBB10_3
+; RV64I-NEXT: .LBB10_2:
+; RV64I-NEXT: srl a3, a0, a2
+; RV64I-NEXT: negw a6, a5
+; RV64I-NEXT: sll a6, a1, a6
+; RV64I-NEXT: or a6, a3, a6
+; RV64I-NEXT: .LBB10_3:
+; RV64I-NEXT: mv a3, a0
+; RV64I-NEXT: beqz a5, .LBB10_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a3, a6
+; RV64I-NEXT: .LBB10_5:
+; RV64I-NEXT: neg a7, a2
+; RV64I-NEXT: bltu a5, a4, .LBB10_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: li a2, 0
+; RV64I-NEXT: j .LBB10_8
+; RV64I-NEXT: .LBB10_7:
+; RV64I-NEXT: srl a2, a1, a2
+; RV64I-NEXT: .LBB10_8:
+; RV64I-NEXT: andi a6, a7, 127
+; RV64I-NEXT: bltu a6, a4, .LBB10_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: li a5, 0
+; RV64I-NEXT: sub a4, a6, a4
+; RV64I-NEXT: sll a0, a0, a4
+; RV64I-NEXT: bnez a6, .LBB10_11
+; RV64I-NEXT: j .LBB10_12
+; RV64I-NEXT: .LBB10_10:
+; RV64I-NEXT: sll a5, a0, a7
+; RV64I-NEXT: negw a4, a6
+; RV64I-NEXT: srl a0, a0, a4
+; RV64I-NEXT: sll a4, a1, a7
+; RV64I-NEXT: or a0, a0, a4
+; RV64I-NEXT: beqz a6, .LBB10_12
+; RV64I-NEXT: .LBB10_11:
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: .LBB10_12:
+; RV64I-NEXT: or a0, a3, a5
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: ret
+ %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 %b)
+ ret i128 %res
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
new file mode 100644
index 000000000000000..4ede69324289895
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -0,0 +1,10982 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs -global-isel < %s | FileCheck %s -check-prefixes=RV64I
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -global-isel < %s | FileCheck %s -check-prefixes=RV32I
+
+define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_4bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a0, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: lbu a4, 0(a1)
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
+; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a4, a6, a4
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a1, a1, a4
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: srlw a0, a0, a1
+; RV64I-NEXT: srliw a1, a0, 16
+; RV64I-NEXT: slli a3, a0, 48
+; RV64I-NEXT: srliw a4, a0, 24
+; RV64I-NEXT: srli a3, a3, 48
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb a3, 1(a2)
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: sb a4, 3(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_4bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: lbu a4, 0(a1)
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a4, a6, a4
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: or a1, a1, a4
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: slli a3, a0, 16
+; RV32I-NEXT: srli a4, a0, 24
+; RV32I-NEXT: srli a3, a3, 16
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a0, 0(a2)
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: sb a1, 2(a2)
+; RV32I-NEXT: sb a4, 3(a2)
+; RV32I-NEXT: ret
+ %src = load i32, ptr %src.ptr, align 1
+ %byteOff = load i32, ptr %byteOff.ptr, align 1
+ %bitOff = shl i32 %byteOff, 3
+ %res = lshr i32 %src, %bitOff
+ store i32 %res, ptr %dst, align 1
+ ret void
+}
+define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_4bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a0, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: lbu a4, 0(a1)
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
+; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a4, a6, a4
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a1, a1, a4
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: sllw a0, a0, a1
+; RV64I-NEXT: srliw a1, a0, 16
+; RV64I-NEXT: slli a3, a0, 48
+; RV64I-NEXT: srliw a4, a0, 24
+; RV64I-NEXT: srli a3, a3, 48
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb a3, 1(a2)
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: sb a4, 3(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_4bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: lbu a4, 0(a1)
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a4, a6, a4
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: or a1, a1, a4
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: sll a0, a0, a1
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: slli a3, a0, 16
+; RV32I-NEXT: srli a4, a0, 24
+; RV32I-NEXT: srli a3, a3, 16
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a0, 0(a2)
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: sb a1, 2(a2)
+; RV32I-NEXT: sb a4, 3(a2)
+; RV32I-NEXT: ret
+ %src = load i32, ptr %src.ptr, align 1
+ %byteOff = load i32, ptr %byteOff.ptr, align 1
+ %bitOff = shl i32 %byteOff, 3
+ %res = shl i32 %src, %bitOff
+ store i32 %res, ptr %dst, align 1
+ ret void
+}
+define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_4bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a0, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: lbu a4, 0(a1)
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
+; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a4, a6, a4
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a1, a1, a4
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: sraw a0, a0, a1
+; RV64I-NEXT: srliw a1, a0, 16
+; RV64I-NEXT: slli a3, a0, 48
+; RV64I-NEXT: srliw a4, a0, 24
+; RV64I-NEXT: srli a3, a3, 48
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb a3, 1(a2)
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: sb a4, 3(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: ashr_4bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: lbu a4, 0(a1)
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a4, a6, a4
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: or a1, a1, a4
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: sra a0, a0, a1
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: slli a3, a0, 16
+; RV32I-NEXT: srli a4, a0, 24
+; RV32I-NEXT: srli a3, a3, 16
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a0, 0(a2)
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: sb a1, 2(a2)
+; RV32I-NEXT: sb a4, 3(a2)
+; RV32I-NEXT: ret
+ %src = load i32, ptr %src.ptr, align 1
+ %byteOff = load i32, ptr %byteOff.ptr, align 1
+ %bitOff = shl i32 %byteOff, 3
+ %res = ashr i32 %src, %bitOff
+ store i32 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_8bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 0(a1)
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu t2, 2(a1)
+; RV64I-NEXT: lbu t3, 3(a1)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 4(a1)
+; RV64I-NEXT: lbu t0, 5(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: or t2, t3, t2
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a6, t0, a6
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lui a4, 16
+; RV64I-NEXT: addi a4, a4, -1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: or a5, t2, a5
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: srl a0, a0, a1
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: srliw a3, a0, 16
+; RV64I-NEXT: and a5, a0, a4
+; RV64I-NEXT: srliw a6, a0, 24
+; RV64I-NEXT: srli a7, a0, 48
+; RV64I-NEXT: srli t0, a0, 56
+; RV64I-NEXT: srli a5, a5, 8
+; RV64I-NEXT: and a4, a1, a4
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb a5, 1(a2)
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_8bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu a0, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or t1, a0, t1
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: slli a0, a4, 16
+; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: slli a3, t1, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: bltu a1, a4, .LBB3_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a5, a3, a1
+; RV32I-NEXT: bnez a1, .LBB3_3
+; RV32I-NEXT: j .LBB3_4
+; RV32I-NEXT: .LBB3_2:
+; RV32I-NEXT: srl a5, a0, a1
+; RV32I-NEXT: neg a6, a1
+; RV32I-NEXT: sll a6, a3, a6
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: beqz a1, .LBB3_4
+; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: mv a0, a5
+; RV32I-NEXT: .LBB3_4:
+; RV32I-NEXT: bltu a1, a4, .LBB3_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB3_7
+; RV32I-NEXT: .LBB3_6:
+; RV32I-NEXT: srl a1, a3, a1
+; RV32I-NEXT: .LBB3_7:
+; RV32I-NEXT: srli a3, a0, 16
+; RV32I-NEXT: lui a4, 16
+; RV32I-NEXT: srli a5, a0, 24
+; RV32I-NEXT: srli a6, a1, 16
+; RV32I-NEXT: srli a7, a1, 24
+; RV32I-NEXT: addi a4, a4, -1
+; RV32I-NEXT: and t0, a0, a4
+; RV32I-NEXT: and a4, a1, a4
+; RV32I-NEXT: srli t0, t0, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a0, 0(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: sb a5, 3(a2)
+; RV32I-NEXT: sb a1, 4(a2)
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: sb a6, 6(a2)
+; RV32I-NEXT: sb a7, 7(a2)
+; RV32I-NEXT: ret
+ %src = load i64, ptr %src.ptr, align 1
+ %byteOff = load i64, ptr %byteOff.ptr, align 1
+ %bitOff = shl i64 %byteOff, 3
+ %res = lshr i64 %src, %bitOff
+ store i64 %res, ptr %dst, align 1
+ ret void
+}
+define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_8bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 0(a1)
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu t2, 2(a1)
+; RV64I-NEXT: lbu t3, 3(a1)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 4(a1)
+; RV64I-NEXT: lbu t0, 5(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: or t2, t3, t2
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a6, t0, a6
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lui a4, 16
+; RV64I-NEXT: addi a4, a4, -1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: or a5, t2, a5
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: sll a0, a0, a1
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: srliw a3, a0, 16
+; RV64I-NEXT: and a5, a0, a4
+; RV64I-NEXT: srliw a6, a0, 24
+; RV64I-NEXT: srli a7, a0, 48
+; RV64I-NEXT: srli t0, a0, 56
+; RV64I-NEXT: srli a5, a5, 8
+; RV64I-NEXT: and a4, a1, a4
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb a5, 1(a2)
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_8bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu a0, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or a0, a0, t1
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: slli a4, a4, 16
+; RV32I-NEXT: or a4, a4, a3
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a3, a1, a6
+; RV32I-NEXT: slli a3, a3, 3
+; RV32I-NEXT: li a1, 32
+; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: bltu a3, a1, .LBB4_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: sll a4, a4, a3
+; RV32I-NEXT: bnez a3, .LBB4_3
+; RV32I-NEXT: j .LBB4_4
+; RV32I-NEXT: .LBB4_2:
+; RV32I-NEXT: sll a1, a4, a3
+; RV32I-NEXT: neg a5, a3
+; RV32I-NEXT: srl a4, a4, a5
+; RV32I-NEXT: sll a5, a0, a3
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: beqz a3, .LBB4_4
+; RV32I-NEXT: .LBB4_3:
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB4_4:
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: lui a4, 16
+; RV32I-NEXT: srli a5, a1, 24
+; RV32I-NEXT: srli a6, a0, 16
+; RV32I-NEXT: srli a7, a0, 24
+; RV32I-NEXT: addi a4, a4, -1
+; RV32I-NEXT: and t0, a1, a4
+; RV32I-NEXT: and a4, a0, a4
+; RV32I-NEXT: srli t0, t0, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: sb a5, 3(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: sb a6, 6(a2)
+; RV32I-NEXT: sb a7, 7(a2)
+; RV32I-NEXT: ret
+ %src = load i64, ptr %src.ptr, align 1
+ %byteOff = load i64, ptr %byteOff.ptr, align 1
+ %bitOff = shl i64 %byteOff, 3
+ %res = shl i64 %src, %bitOff
+ store i64 %res, ptr %dst, align 1
+ ret void
+}
+define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_8bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 0(a1)
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu t2, 2(a1)
+; RV64I-NEXT: lbu t3, 3(a1)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 4(a1)
+; RV64I-NEXT: lbu t0, 5(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: or t2, t3, t2
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a6, t0, a6
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lui a4, 16
+; RV64I-NEXT: addi a4, a4, -1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: or a5, t2, a5
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: sra a0, a0, a1
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: srliw a3, a0, 16
+; RV64I-NEXT: and a5, a0, a4
+; RV64I-NEXT: srliw a6, a0, 24
+; RV64I-NEXT: srli a7, a0, 48
+; RV64I-NEXT: srli t0, a0, 56
+; RV64I-NEXT: srli a5, a5, 8
+; RV64I-NEXT: and a4, a1, a4
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb a5, 1(a2)
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: ashr_8bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu a0, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or t1, a0, t1
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: slli a0, a4, 16
+; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: slli a3, t1, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: bltu a1, a4, .LBB5_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra a5, a3, a1
+; RV32I-NEXT: bnez a1, .LBB5_3
+; RV32I-NEXT: j .LBB5_4
+; RV32I-NEXT: .LBB5_2:
+; RV32I-NEXT: srl a5, a0, a1
+; RV32I-NEXT: neg a6, a1
+; RV32I-NEXT: sll a6, a3, a6
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: beqz a1, .LBB5_4
+; RV32I-NEXT: .LBB5_3:
+; RV32I-NEXT: mv a0, a5
+; RV32I-NEXT: .LBB5_4:
+; RV32I-NEXT: bltu a1, a4, .LBB5_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: srai a1, a3, 31
+; RV32I-NEXT: j .LBB5_7
+; RV32I-NEXT: .LBB5_6:
+; RV32I-NEXT: sra a1, a3, a1
+; RV32I-NEXT: .LBB5_7:
+; RV32I-NEXT: srli a3, a0, 16
+; RV32I-NEXT: lui a4, 16
+; RV32I-NEXT: srli a5, a0, 24
+; RV32I-NEXT: srli a6, a1, 16
+; RV32I-NEXT: srli a7, a1, 24
+; RV32I-NEXT: addi a4, a4, -1
+; RV32I-NEXT: and t0, a0, a4
+; RV32I-NEXT: and a4, a1, a4
+; RV32I-NEXT: srli t0, t0, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a0, 0(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: sb a5, 3(a2)
+; RV32I-NEXT: sb a1, 4(a2)
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: sb a6, 6(a2)
+; RV32I-NEXT: sb a7, 7(a2)
+; RV32I-NEXT: ret
+ %src = load i64, ptr %src.ptr, align 1
+ %byteOff = load i64, ptr %byteOff.ptr, align 1
+ %bitOff = shl i64 %byteOff, 3
+ %res = ashr i64 %src, %bitOff
+ store i64 %res, ptr %dst, align 1
+ ret void
+}
+
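+; Logical shift right of an i128 loaded from memory by a byte-multiple offset.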
+define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_16bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu s0, 14(a0)
+; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or t1, t4, t3
+; RV64I-NEXT: or t2, t6, t5
+; RV64I-NEXT: lbu t3, 0(a1)
+; RV64I-NEXT: lbu t4, 1(a1)
+; RV64I-NEXT: lbu t5, 2(a1)
+; RV64I-NEXT: lbu t6, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a0, a0, s0
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t3, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu s0, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, s0
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or a5, t5, a6
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t3
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a6, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a4, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: li a4, 64
+; RV64I-NEXT: or a3, a6, a7
+; RV64I-NEXT: bltu a1, a4, .LBB6_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: srl a5, a3, a5
+; RV64I-NEXT: bnez a1, .LBB6_3
+; RV64I-NEXT: j .LBB6_4
+; RV64I-NEXT: .LBB6_2:
+; RV64I-NEXT: srl a5, a0, a1
+; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: sll a6, a3, a6
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: beqz a1, .LBB6_4
+; RV64I-NEXT: .LBB6_3:
+; RV64I-NEXT: mv a0, a5
+; RV64I-NEXT: .LBB6_4:
+; RV64I-NEXT: bltu a1, a4, .LBB6_6
+; RV64I-NEXT: # %bb.5:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: j .LBB6_7
+; RV64I-NEXT: .LBB6_6:
+; RV64I-NEXT: srl a1, a3, a1
+; RV64I-NEXT: .LBB6_7:
+; RV64I-NEXT: srli a3, a0, 32
+; RV64I-NEXT: srliw a4, a0, 16
+; RV64I-NEXT: lui a5, 16
+; RV64I-NEXT: srliw a6, a0, 24
+; RV64I-NEXT: srli a7, a0, 48
+; RV64I-NEXT: srli t0, a0, 56
+; RV64I-NEXT: srli t1, a1, 32
+; RV64I-NEXT: srliw t2, a1, 16
+; RV64I-NEXT: srliw t3, a1, 24
+; RV64I-NEXT: srli t4, a1, 48
+; RV64I-NEXT: srli t5, a1, 56
+; RV64I-NEXT: addi a5, a5, -1
+; RV64I-NEXT: and t6, a0, a5
+; RV64I-NEXT: srli t6, t6, 8
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb t6, 1(a2)
+; RV64I-NEXT: sb a4, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: and a0, a3, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: sb a0, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: and a0, a1, a5
+; RV64I-NEXT: and a3, t1, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: sb t2, 10(a2)
+; RV64I-NEXT: sb t3, 11(a2)
+; RV64I-NEXT: sb t1, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t4, 14(a2)
+; RV64I-NEXT: sb t5, 15(a2)
+; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_16bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a5, 1(a0)
+; RV32I-NEXT: lbu a6, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu a4, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t4, 8(a0)
+; RV32I-NEXT: lbu t5, 9(a0)
+; RV32I-NEXT: lbu t6, 10(a0)
+; RV32I-NEXT: lbu s0, 11(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t3, a7, a6
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu a6, 12(a0)
+; RV32I-NEXT: lbu a7, 13(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or t6, a7, a6
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu s0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or t2, a0, t2
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or s1, a7, a6
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or s0, a1, s0
+; RV32I-NEXT: li a7, 32
+; RV32I-NEXT: slli a1, a5, 8
+; RV32I-NEXT: slli a0, t0, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli s0, s0, 16
+; RV32I-NEXT: or a6, t5, t4
+; RV32I-NEXT: or t0, t2, t6
+; RV32I-NEXT: or a5, s0, s1
+; RV32I-NEXT: slli a5, a5, 3
+; RV32I-NEXT: srl t2, a6, a5
+; RV32I-NEXT: neg t5, a5
+; RV32I-NEXT: sll t4, t0, t5
+; RV32I-NEXT: bltu a5, a7, .LBB6_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl t6, t0, a5
+; RV32I-NEXT: j .LBB6_3
+; RV32I-NEXT: .LBB6_2:
+; RV32I-NEXT: or t6, t2, t4
+; RV32I-NEXT: .LBB6_3:
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: or a3, a0, a4
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: mv a0, a6
+; RV32I-NEXT: beqz a5, .LBB6_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a0, t6
+; RV32I-NEXT: .LBB6_5:
+; RV32I-NEXT: or a4, t3, a1
+; RV32I-NEXT: or a3, t1, a3
+; RV32I-NEXT: bltu a5, a7, .LBB6_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: srl t6, a3, a5
+; RV32I-NEXT: j .LBB6_8
+; RV32I-NEXT: .LBB6_7:
+; RV32I-NEXT: srl a1, t0, a5
+; RV32I-NEXT: srl t1, a4, a5
+; RV32I-NEXT: sll t3, a3, t5
+; RV32I-NEXT: or t6, t1, t3
+; RV32I-NEXT: .LBB6_8:
+; RV32I-NEXT: li t1, 64
+; RV32I-NEXT: mv t3, a4
+; RV32I-NEXT: beqz a5, .LBB6_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t3, t6
+; RV32I-NEXT: .LBB6_10:
+; RV32I-NEXT: sub s0, t1, a5
+; RV32I-NEXT: bltu a5, a7, .LBB6_13
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: bgeu s0, a7, .LBB6_14
+; RV32I-NEXT: .LBB6_12:
+; RV32I-NEXT: sll t5, a6, t5
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl s1, a6, s1
+; RV32I-NEXT: or s2, s1, t4
+; RV32I-NEXT: j .LBB6_15
+; RV32I-NEXT: .LBB6_13:
+; RV32I-NEXT: srl t6, a3, a5
+; RV32I-NEXT: bltu s0, a7, .LBB6_12
+; RV32I-NEXT: .LBB6_14:
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: sll s2, a6, s0
+; RV32I-NEXT: .LBB6_15:
+; RV32I-NEXT: addi s1, a5, -64
+; RV32I-NEXT: mv t4, t0
+; RV32I-NEXT: beqz s0, .LBB6_17
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: mv t4, s2
+; RV32I-NEXT: .LBB6_17:
+; RV32I-NEXT: bltu s1, a7, .LBB6_19
+; RV32I-NEXT: # %bb.18:
+; RV32I-NEXT: srl t2, t0, s1
+; RV32I-NEXT: bnez s1, .LBB6_20
+; RV32I-NEXT: j .LBB6_21
+; RV32I-NEXT: .LBB6_19:
+; RV32I-NEXT: neg s0, s1
+; RV32I-NEXT: sll s0, t0, s0
+; RV32I-NEXT: or t2, t2, s0
+; RV32I-NEXT: beqz s1, .LBB6_21
+; RV32I-NEXT: .LBB6_20:
+; RV32I-NEXT: mv a6, t2
+; RV32I-NEXT: .LBB6_21:
+; RV32I-NEXT: bltu s1, a7, .LBB6_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: bltu a5, t1, .LBB6_24
+; RV32I-NEXT: j .LBB6_25
+; RV32I-NEXT: .LBB6_23:
+; RV32I-NEXT: srl a7, t0, a5
+; RV32I-NEXT: bgeu a5, t1, .LBB6_25
+; RV32I-NEXT: .LBB6_24:
+; RV32I-NEXT: or a6, t3, t5
+; RV32I-NEXT: or a7, t6, t4
+; RV32I-NEXT: .LBB6_25:
+; RV32I-NEXT: bnez a5, .LBB6_29
+; RV32I-NEXT: # %bb.26:
+; RV32I-NEXT: bltu a5, t1, .LBB6_28
+; RV32I-NEXT: .LBB6_27:
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: .LBB6_28:
+; RV32I-NEXT: srli a5, a4, 16
+; RV32I-NEXT: lui a6, 16
+; RV32I-NEXT: srli a7, a4, 24
+; RV32I-NEXT: srli t0, a3, 16
+; RV32I-NEXT: srli t1, a3, 24
+; RV32I-NEXT: srli t2, a0, 16
+; RV32I-NEXT: srli t3, a0, 24
+; RV32I-NEXT: srli t4, a1, 16
+; RV32I-NEXT: srli t5, a1, 24
+; RV32I-NEXT: addi a6, a6, -1
+; RV32I-NEXT: and t6, a4, a6
+; RV32I-NEXT: srli t6, t6, 8
+; RV32I-NEXT: sb a4, 0(a2)
+; RV32I-NEXT: sb t6, 1(a2)
+; RV32I-NEXT: sb a5, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, a3, a6
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: sb t0, 6(a2)
+; RV32I-NEXT: sb t1, 7(a2)
+; RV32I-NEXT: and a3, a0, a6
+; RV32I-NEXT: and a4, a1, a6
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a0, 8(a2)
+; RV32I-NEXT: sb a3, 9(a2)
+; RV32I-NEXT: sb t2, 10(a2)
+; RV32I-NEXT: sb t3, 11(a2)
+; RV32I-NEXT: sb a1, 12(a2)
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: sb t4, 14(a2)
+; RV32I-NEXT: sb t5, 15(a2)
+; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB6_29:
+; RV32I-NEXT: mv a4, a6
+; RV32I-NEXT: mv a3, a7
+; RV32I-NEXT: bgeu a5, t1, .LBB6_27
+; RV32I-NEXT: j .LBB6_28
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = lshr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
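+; Logical shift right of an i128 by a word-multiple (32-bit) offset.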
+define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_16bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu s0, 14(a0)
+; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or t1, t4, t3
+; RV64I-NEXT: or t2, t6, t5
+; RV64I-NEXT: lbu t3, 0(a1)
+; RV64I-NEXT: lbu t4, 1(a1)
+; RV64I-NEXT: lbu t5, 2(a1)
+; RV64I-NEXT: lbu t6, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a0, a0, s0
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t3, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu s0, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, s0
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or a5, t5, a6
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t3
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a6, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a4, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 5
+; RV64I-NEXT: li a4, 64
+; RV64I-NEXT: or a3, a6, a7
+; RV64I-NEXT: bltu a1, a4, .LBB7_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: srl a5, a3, a5
+; RV64I-NEXT: bnez a1, .LBB7_3
+; RV64I-NEXT: j .LBB7_4
+; RV64I-NEXT: .LBB7_2:
+; RV64I-NEXT: srl a5, a0, a1
+; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: sll a6, a3, a6
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: beqz a1, .LBB7_4
+; RV64I-NEXT: .LBB7_3:
+; RV64I-NEXT: mv a0, a5
+; RV64I-NEXT: .LBB7_4:
+; RV64I-NEXT: bltu a1, a4, .LBB7_6
+; RV64I-NEXT: # %bb.5:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: j .LBB7_7
+; RV64I-NEXT: .LBB7_6:
+; RV64I-NEXT: srl a1, a3, a1
+; RV64I-NEXT: .LBB7_7:
+; RV64I-NEXT: srli a3, a0, 32
+; RV64I-NEXT: srliw a4, a0, 16
+; RV64I-NEXT: lui a5, 16
+; RV64I-NEXT: srliw a6, a0, 24
+; RV64I-NEXT: srli a7, a0, 48
+; RV64I-NEXT: srli t0, a0, 56
+; RV64I-NEXT: srli t1, a1, 32
+; RV64I-NEXT: srliw t2, a1, 16
+; RV64I-NEXT: srliw t3, a1, 24
+; RV64I-NEXT: srli t4, a1, 48
+; RV64I-NEXT: srli t5, a1, 56
+; RV64I-NEXT: addi a5, a5, -1
+; RV64I-NEXT: and t6, a0, a5
+; RV64I-NEXT: srli t6, t6, 8
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb t6, 1(a2)
+; RV64I-NEXT: sb a4, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: and a0, a3, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: sb a0, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: and a0, a1, a5
+; RV64I-NEXT: and a3, t1, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: sb t2, 10(a2)
+; RV64I-NEXT: sb t3, 11(a2)
+; RV64I-NEXT: sb t1, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t4, 14(a2)
+; RV64I-NEXT: sb t5, 15(a2)
+; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_16bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a5, 1(a0)
+; RV32I-NEXT: lbu a6, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu a4, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t4, 8(a0)
+; RV32I-NEXT: lbu t5, 9(a0)
+; RV32I-NEXT: lbu t6, 10(a0)
+; RV32I-NEXT: lbu s0, 11(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t3, a7, a6
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu a6, 12(a0)
+; RV32I-NEXT: lbu a7, 13(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or t6, a7, a6
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu s0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or t2, a0, t2
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or s1, a7, a6
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or s0, a1, s0
+; RV32I-NEXT: li a7, 32
+; RV32I-NEXT: slli a1, a5, 8
+; RV32I-NEXT: slli a0, t0, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli s0, s0, 16
+; RV32I-NEXT: or a6, t5, t4
+; RV32I-NEXT: or t0, t2, t6
+; RV32I-NEXT: or a5, s0, s1
+; RV32I-NEXT: slli a5, a5, 5
+; RV32I-NEXT: srl t2, a6, a5
+; RV32I-NEXT: neg t5, a5
+; RV32I-NEXT: sll t4, t0, t5
+; RV32I-NEXT: bltu a5, a7, .LBB7_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl t6, t0, a5
+; RV32I-NEXT: j .LBB7_3
+; RV32I-NEXT: .LBB7_2:
+; RV32I-NEXT: or t6, t2, t4
+; RV32I-NEXT: .LBB7_3:
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: or a3, a0, a4
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: mv a0, a6
+; RV32I-NEXT: beqz a5, .LBB7_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a0, t6
+; RV32I-NEXT: .LBB7_5:
+; RV32I-NEXT: or a4, t3, a1
+; RV32I-NEXT: or a3, t1, a3
+; RV32I-NEXT: bltu a5, a7, .LBB7_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: srl t6, a3, a5
+; RV32I-NEXT: j .LBB7_8
+; RV32I-NEXT: .LBB7_7:
+; RV32I-NEXT: srl a1, t0, a5
+; RV32I-NEXT: srl t1, a4, a5
+; RV32I-NEXT: sll t3, a3, t5
+; RV32I-NEXT: or t6, t1, t3
+; RV32I-NEXT: .LBB7_8:
+; RV32I-NEXT: li t1, 64
+; RV32I-NEXT: mv t3, a4
+; RV32I-NEXT: beqz a5, .LBB7_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t3, t6
+; RV32I-NEXT: .LBB7_10:
+; RV32I-NEXT: sub s0, t1, a5
+; RV32I-NEXT: bltu a5, a7, .LBB7_13
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: bgeu s0, a7, .LBB7_14
+; RV32I-NEXT: .LBB7_12:
+; RV32I-NEXT: sll t5, a6, t5
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl s1, a6, s1
+; RV32I-NEXT: or s2, s1, t4
+; RV32I-NEXT: j .LBB7_15
+; RV32I-NEXT: .LBB7_13:
+; RV32I-NEXT: srl t6, a3, a5
+; RV32I-NEXT: bltu s0, a7, .LBB7_12
+; RV32I-NEXT: .LBB7_14:
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: sll s2, a6, s0
+; RV32I-NEXT: .LBB7_15:
+; RV32I-NEXT: addi s1, a5, -64
+; RV32I-NEXT: mv t4, t0
+; RV32I-NEXT: beqz s0, .LBB7_17
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: mv t4, s2
+; RV32I-NEXT: .LBB7_17:
+; RV32I-NEXT: bltu s1, a7, .LBB7_19
+; RV32I-NEXT: # %bb.18:
+; RV32I-NEXT: srl t2, t0, s1
+; RV32I-NEXT: bnez s1, .LBB7_20
+; RV32I-NEXT: j .LBB7_21
+; RV32I-NEXT: .LBB7_19:
+; RV32I-NEXT: neg s0, s1
+; RV32I-NEXT: sll s0, t0, s0
+; RV32I-NEXT: or t2, t2, s0
+; RV32I-NEXT: beqz s1, .LBB7_21
+; RV32I-NEXT: .LBB7_20:
+; RV32I-NEXT: mv a6, t2
+; RV32I-NEXT: .LBB7_21:
+; RV32I-NEXT: bltu s1, a7, .LBB7_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: bltu a5, t1, .LBB7_24
+; RV32I-NEXT: j .LBB7_25
+; RV32I-NEXT: .LBB7_23:
+; RV32I-NEXT: srl a7, t0, a5
+; RV32I-NEXT: bgeu a5, t1, .LBB7_25
+; RV32I-NEXT: .LBB7_24:
+; RV32I-NEXT: or a6, t3, t5
+; RV32I-NEXT: or a7, t6, t4
+; RV32I-NEXT: .LBB7_25:
+; RV32I-NEXT: bnez a5, .LBB7_29
+; RV32I-NEXT: # %bb.26:
+; RV32I-NEXT: bltu a5, t1, .LBB7_28
+; RV32I-NEXT: .LBB7_27:
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: .LBB7_28:
+; RV32I-NEXT: srli a5, a4, 16
+; RV32I-NEXT: lui a6, 16
+; RV32I-NEXT: srli a7, a4, 24
+; RV32I-NEXT: srli t0, a3, 16
+; RV32I-NEXT: srli t1, a3, 24
+; RV32I-NEXT: srli t2, a0, 16
+; RV32I-NEXT: srli t3, a0, 24
+; RV32I-NEXT: srli t4, a1, 16
+; RV32I-NEXT: srli t5, a1, 24
+; RV32I-NEXT: addi a6, a6, -1
+; RV32I-NEXT: and t6, a4, a6
+; RV32I-NEXT: srli t6, t6, 8
+; RV32I-NEXT: sb a4, 0(a2)
+; RV32I-NEXT: sb t6, 1(a2)
+; RV32I-NEXT: sb a5, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, a3, a6
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: sb t0, 6(a2)
+; RV32I-NEXT: sb t1, 7(a2)
+; RV32I-NEXT: and a3, a0, a6
+; RV32I-NEXT: and a4, a1, a6
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a0, 8(a2)
+; RV32I-NEXT: sb a3, 9(a2)
+; RV32I-NEXT: sb t2, 10(a2)
+; RV32I-NEXT: sb t3, 11(a2)
+; RV32I-NEXT: sb a1, 12(a2)
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: sb t4, 14(a2)
+; RV32I-NEXT: sb t5, 15(a2)
+; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB7_29:
+; RV32I-NEXT: mv a4, a6
+; RV32I-NEXT: mv a3, a7
+; RV32I-NEXT: bgeu a5, t1, .LBB7_27
+; RV32I-NEXT: j .LBB7_28
+ %src = load i128, ptr %src.ptr, align 1
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
+ %res = lshr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
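+; Shift left of an i128 loaded from memory by a byte-multiple offset.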
+define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_16bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu s0, 14(a0)
+; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or t1, t4, t3
+; RV64I-NEXT: or t2, t6, t5
+; RV64I-NEXT: lbu t3, 0(a1)
+; RV64I-NEXT: lbu t4, 1(a1)
+; RV64I-NEXT: lbu t5, 2(a1)
+; RV64I-NEXT: lbu t6, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a0, a0, s0
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t3, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu s0, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, s0
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or a5, t5, a6
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t3
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a4, a4, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a3, a1, 3
+; RV64I-NEXT: li a5, 64
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: bltu a3, a5, .LBB8_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: sub a5, a3, a5
+; RV64I-NEXT: sll a4, a4, a5
+; RV64I-NEXT: bnez a3, .LBB8_3
+; RV64I-NEXT: j .LBB8_4
+; RV64I-NEXT: .LBB8_2:
+; RV64I-NEXT: sll a1, a4, a3
+; RV64I-NEXT: negw a5, a3
+; RV64I-NEXT: srl a4, a4, a5
+; RV64I-NEXT: sll a5, a0, a3
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: beqz a3, .LBB8_4
+; RV64I-NEXT: .LBB8_3:
+; RV64I-NEXT: mv a0, a4
+; RV64I-NEXT: .LBB8_4:
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: srliw a4, a1, 16
+; RV64I-NEXT: lui a5, 16
+; RV64I-NEXT: srliw a6, a1, 24
+; RV64I-NEXT: srli a7, a1, 48
+; RV64I-NEXT: srli t0, a1, 56
+; RV64I-NEXT: srli t1, a0, 32
+; RV64I-NEXT: srliw t2, a0, 16
+; RV64I-NEXT: srliw t3, a0, 24
+; RV64I-NEXT: srli t4, a0, 48
+; RV64I-NEXT: srli t5, a0, 56
+; RV64I-NEXT: addi a5, a5, -1
+; RV64I-NEXT: and t6, a1, a5
+; RV64I-NEXT: srli t6, t6, 8
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: sb t6, 1(a2)
+; RV64I-NEXT: sb a4, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: and a1, a3, a5
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: and a1, a0, a5
+; RV64I-NEXT: and a3, t1, a5
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: sb t2, 10(a2)
+; RV64I-NEXT: sb t3, 11(a2)
+; RV64I-NEXT: sb t1, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t4, 14(a2)
+; RV64I-NEXT: sb t5, 15(a2)
+; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_16bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: lbu t1, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, a6
+; RV32I-NEXT: li a6, 64
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: li t1, 32
+; RV32I-NEXT: slli a4, a4, 16
+; RV32I-NEXT: slli t2, t0, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or t0, a4, a3
+; RV32I-NEXT: or a4, t2, a5
+; RV32I-NEXT: or a5, a1, a7
+; RV32I-NEXT: slli a5, a5, 3
+; RV32I-NEXT: neg t3, a5
+; RV32I-NEXT: srl t4, t0, t3
+; RV32I-NEXT: sll t2, a4, a5
+; RV32I-NEXT: bltu a5, t1, .LBB8_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: sll t5, t0, a5
+; RV32I-NEXT: j .LBB8_3
+; RV32I-NEXT: .LBB8_2:
+; RV32I-NEXT: sll a1, t0, a5
+; RV32I-NEXT: or t5, t4, t2
+; RV32I-NEXT: .LBB8_3:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: sw s0, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu s3, 11(a0)
+; RV32I-NEXT: lbu s1, 15(a0)
+; RV32I-NEXT: sub a7, a6, a5
+; RV32I-NEXT: mv a3, a4
+; RV32I-NEXT: beqz a5, .LBB8_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, t5
+; RV32I-NEXT: .LBB8_5:
+; RV32I-NEXT: lbu s2, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu s0, 13(a0)
+; RV32I-NEXT: lbu t6, 14(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: bltu a7, t1, .LBB8_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl s4, a4, a7
+; RV32I-NEXT: j .LBB8_8
+; RV32I-NEXT: .LBB8_7:
+; RV32I-NEXT: neg s4, a7
+; RV32I-NEXT: sll s4, a4, s4
+; RV32I-NEXT: or s4, t4, s4
+; RV32I-NEXT: .LBB8_8:
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: lbu s6, 8(a0)
+; RV32I-NEXT: lbu s5, 12(a0)
+; RV32I-NEXT: or s3, s3, t5
+; RV32I-NEXT: slli t5, s0, 8
+; RV32I-NEXT: or s1, s1, t6
+; RV32I-NEXT: mv t4, t0
+; RV32I-NEXT: beqz a7, .LBB8_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t4, s4
+; RV32I-NEXT: .LBB8_10:
+; RV32I-NEXT: or a0, s2, s6
+; RV32I-NEXT: slli s0, s3, 16
+; RV32I-NEXT: or t6, t5, s5
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: bltu a7, t1, .LBB8_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: j .LBB8_13
+; RV32I-NEXT: .LBB8_12:
+; RV32I-NEXT: srl t5, a4, t3
+; RV32I-NEXT: .LBB8_13:
+; RV32I-NEXT: or a7, s0, a0
+; RV32I-NEXT: or a0, s1, t6
+; RV32I-NEXT: bltu a5, t1, .LBB8_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s1, a7, a5
+; RV32I-NEXT: j .LBB8_16
+; RV32I-NEXT: .LBB8_15:
+; RV32I-NEXT: sll t6, a7, a5
+; RV32I-NEXT: srl t3, a7, t3
+; RV32I-NEXT: sll s0, a0, a5
+; RV32I-NEXT: or s1, t3, s0
+; RV32I-NEXT: .LBB8_16:
+; RV32I-NEXT: addi s0, a5, -64
+; RV32I-NEXT: mv t3, a0
+; RV32I-NEXT: beqz a5, .LBB8_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t3, s1
+; RV32I-NEXT: .LBB8_18:
+; RV32I-NEXT: bltu s0, t1, .LBB8_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: li t1, 0
+; RV32I-NEXT: sll t0, t0, s0
+; RV32I-NEXT: bnez s0, .LBB8_21
+; RV32I-NEXT: j .LBB8_22
+; RV32I-NEXT: .LBB8_20:
+; RV32I-NEXT: sll t1, t0, a5
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl t0, t0, s1
+; RV32I-NEXT: or t0, t0, t2
+; RV32I-NEXT: beqz s0, .LBB8_22
+; RV32I-NEXT: .LBB8_21:
+; RV32I-NEXT: mv a4, t0
+; RV32I-NEXT: .LBB8_22:
+; RV32I-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: bltu a5, a6, .LBB8_24
+; RV32I-NEXT: # %bb.23:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: bnez a5, .LBB8_25
+; RV32I-NEXT: j .LBB8_26
+; RV32I-NEXT: .LBB8_24:
+; RV32I-NEXT: or t1, t4, t6
+; RV32I-NEXT: or a4, t5, t3
+; RV32I-NEXT: beqz a5, .LBB8_26
+; RV32I-NEXT: .LBB8_25:
+; RV32I-NEXT: mv a7, t1
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB8_26:
+; RV32I-NEXT: srli a4, a1, 16
+; RV32I-NEXT: lui a5, 16
+; RV32I-NEXT: srli a6, a1, 24
+; RV32I-NEXT: srli t0, a3, 16
+; RV32I-NEXT: srli t1, a3, 24
+; RV32I-NEXT: srli t2, a7, 16
+; RV32I-NEXT: srli t3, a7, 24
+; RV32I-NEXT: srli t4, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: addi a5, a5, -1
+; RV32I-NEXT: and t6, a1, a5
+; RV32I-NEXT: srli t6, t6, 8
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb t6, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb a6, 3(a2)
+; RV32I-NEXT: and a1, a3, a5
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb a1, 5(a2)
+; RV32I-NEXT: sb t0, 6(a2)
+; RV32I-NEXT: sb t1, 7(a2)
+; RV32I-NEXT: and a1, a7, a5
+; RV32I-NEXT: and a5, a0, a5
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a7, 8(a2)
+; RV32I-NEXT: sb a1, 9(a2)
+; RV32I-NEXT: sb t2, 10(a2)
+; RV32I-NEXT: sb t3, 11(a2)
+; RV32I-NEXT: sb a0, 12(a2)
+; RV32I-NEXT: sb a5, 13(a2)
+; RV32I-NEXT: sb t4, 14(a2)
+; RV32I-NEXT: sb t5, 15(a2)
+; RV32I-NEXT: ret
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = shl i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
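+; Shift left of an i128 by a word-multiple (32-bit) offset.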
+define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_16bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu s0, 14(a0)
+; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or t1, t4, t3
+; RV64I-NEXT: or t2, t6, t5
+; RV64I-NEXT: lbu t3, 0(a1)
+; RV64I-NEXT: lbu t4, 1(a1)
+; RV64I-NEXT: lbu t5, 2(a1)
+; RV64I-NEXT: lbu t6, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a0, a0, s0
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t3, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu s0, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, s0
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or a5, t5, a6
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t3
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a4, a4, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a3, a1, 5
+; RV64I-NEXT: li a5, 64
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: bltu a3, a5, .LBB9_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: sub a5, a3, a5
+; RV64I-NEXT: sll a4, a4, a5
+; RV64I-NEXT: bnez a3, .LBB9_3
+; RV64I-NEXT: j .LBB9_4
+; RV64I-NEXT: .LBB9_2:
+; RV64I-NEXT: sll a1, a4, a3
+; RV64I-NEXT: negw a5, a3
+; RV64I-NEXT: srl a4, a4, a5
+; RV64I-NEXT: sll a5, a0, a3
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: beqz a3, .LBB9_4
+; RV64I-NEXT: .LBB9_3:
+; RV64I-NEXT: mv a0, a4
+; RV64I-NEXT: .LBB9_4:
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: srliw a4, a1, 16
+; RV64I-NEXT: lui a5, 16
+; RV64I-NEXT: srliw a6, a1, 24
+; RV64I-NEXT: srli a7, a1, 48
+; RV64I-NEXT: srli t0, a1, 56
+; RV64I-NEXT: srli t1, a0, 32
+; RV64I-NEXT: srliw t2, a0, 16
+; RV64I-NEXT: srliw t3, a0, 24
+; RV64I-NEXT: srli t4, a0, 48
+; RV64I-NEXT: srli t5, a0, 56
+; RV64I-NEXT: addi a5, a5, -1
+; RV64I-NEXT: and t6, a1, a5
+; RV64I-NEXT: srli t6, t6, 8
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: sb t6, 1(a2)
+; RV64I-NEXT: sb a4, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: and a1, a3, a5
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: and a1, a0, a5
+; RV64I-NEXT: and a3, t1, a5
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: sb t2, 10(a2)
+; RV64I-NEXT: sb t3, 11(a2)
+; RV64I-NEXT: sb t1, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t4, 14(a2)
+; RV64I-NEXT: sb t5, 15(a2)
+; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_16bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: lbu t1, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, a6
+; RV32I-NEXT: li a6, 64
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: li t1, 32
+; RV32I-NEXT: slli a4, a4, 16
+; RV32I-NEXT: slli t2, t0, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or t0, a4, a3
+; RV32I-NEXT: or a4, t2, a5
+; RV32I-NEXT: or a5, a1, a7
+; RV32I-NEXT: slli a5, a5, 5
+; RV32I-NEXT: neg t3, a5
+; RV32I-NEXT: srl t4, t0, t3
+; RV32I-NEXT: sll t2, a4, a5
+; RV32I-NEXT: bltu a5, t1, .LBB9_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: sll t5, t0, a5
+; RV32I-NEXT: j .LBB9_3
+; RV32I-NEXT: .LBB9_2:
+; RV32I-NEXT: sll a1, t0, a5
+; RV32I-NEXT: or t5, t4, t2
+; RV32I-NEXT: .LBB9_3:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: sw s0, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu s3, 11(a0)
+; RV32I-NEXT: lbu s1, 15(a0)
+; RV32I-NEXT: sub a7, a6, a5
+; RV32I-NEXT: mv a3, a4
+; RV32I-NEXT: beqz a5, .LBB9_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, t5
+; RV32I-NEXT: .LBB9_5:
+; RV32I-NEXT: lbu s2, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu s0, 13(a0)
+; RV32I-NEXT: lbu t6, 14(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: bltu a7, t1, .LBB9_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl s4, a4, a7
+; RV32I-NEXT: j .LBB9_8
+; RV32I-NEXT: .LBB9_7:
+; RV32I-NEXT: neg s4, a7
+; RV32I-NEXT: sll s4, a4, s4
+; RV32I-NEXT: or s4, t4, s4
+; RV32I-NEXT: .LBB9_8:
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: lbu s6, 8(a0)
+; RV32I-NEXT: lbu s5, 12(a0)
+; RV32I-NEXT: or s3, s3, t5
+; RV32I-NEXT: slli t5, s0, 8
+; RV32I-NEXT: or s1, s1, t6
+; RV32I-NEXT: mv t4, t0
+; RV32I-NEXT: beqz a7, .LBB9_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t4, s4
+; RV32I-NEXT: .LBB9_10:
+; RV32I-NEXT: or a0, s2, s6
+; RV32I-NEXT: slli s0, s3, 16
+; RV32I-NEXT: or t6, t5, s5
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: bltu a7, t1, .LBB9_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: j .LBB9_13
+; RV32I-NEXT: .LBB9_12:
+; RV32I-NEXT: srl t5, a4, t3
+; RV32I-NEXT: .LBB9_13:
+; RV32I-NEXT: or a7, s0, a0
+; RV32I-NEXT: or a0, s1, t6
+; RV32I-NEXT: bltu a5, t1, .LBB9_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s1, a7, a5
+; RV32I-NEXT: j .LBB9_16
+; RV32I-NEXT: .LBB9_15:
+; RV32I-NEXT: sll t6, a7, a5
+; RV32I-NEXT: srl t3, a7, t3
+; RV32I-NEXT: sll s0, a0, a5
+; RV32I-NEXT: or s1, t3, s0
+; RV32I-NEXT: .LBB9_16:
+; RV32I-NEXT: addi s0, a5, -64
+; RV32I-NEXT: mv t3, a0
+; RV32I-NEXT: beqz a5, .LBB9_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t3, s1
+; RV32I-NEXT: .LBB9_18:
+; RV32I-NEXT: bltu s0, t1, .LBB9_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: li t1, 0
+; RV32I-NEXT: sll t0, t0, s0
+; RV32I-NEXT: bnez s0, .LBB9_21
+; RV32I-NEXT: j .LBB9_22
+; RV32I-NEXT: .LBB9_20:
+; RV32I-NEXT: sll t1, t0, a5
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl t0, t0, s1
+; RV32I-NEXT: or t0, t0, t2
+; RV32I-NEXT: beqz s0, .LBB9_22
+; RV32I-NEXT: .LBB9_21:
+; RV32I-NEXT: mv a4, t0
+; RV32I-NEXT: .LBB9_22:
+; RV32I-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: bltu a5, a6, .LBB9_24
+; RV32I-NEXT: # %bb.23:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: bnez a5, .LBB9_25
+; RV32I-NEXT: j .LBB9_26
+; RV32I-NEXT: .LBB9_24:
+; RV32I-NEXT: or t1, t4, t6
+; RV32I-NEXT: or a4, t5, t3
+; RV32I-NEXT: beqz a5, .LBB9_26
+; RV32I-NEXT: .LBB9_25:
+; RV32I-NEXT: mv a7, t1
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB9_26:
+; RV32I-NEXT: srli a4, a1, 16
+; RV32I-NEXT: lui a5, 16
+; RV32I-NEXT: srli a6, a1, 24
+; RV32I-NEXT: srli t0, a3, 16
+; RV32I-NEXT: srli t1, a3, 24
+; RV32I-NEXT: srli t2, a7, 16
+; RV32I-NEXT: srli t3, a7, 24
+; RV32I-NEXT: srli t4, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: addi a5, a5, -1
+; RV32I-NEXT: and t6, a1, a5
+; RV32I-NEXT: srli t6, t6, 8
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb t6, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb a6, 3(a2)
+; RV32I-NEXT: and a1, a3, a5
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb a1, 5(a2)
+; RV32I-NEXT: sb t0, 6(a2)
+; RV32I-NEXT: sb t1, 7(a2)
+; RV32I-NEXT: and a1, a7, a5
+; RV32I-NEXT: and a5, a0, a5
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a7, 8(a2)
+; RV32I-NEXT: sb a1, 9(a2)
+; RV32I-NEXT: sb t2, 10(a2)
+; RV32I-NEXT: sb t3, 11(a2)
+; RV32I-NEXT: sb a0, 12(a2)
+; RV32I-NEXT: sb a5, 13(a2)
+; RV32I-NEXT: sb t4, 14(a2)
+; RV32I-NEXT: sb t5, 15(a2)
+; RV32I-NEXT: ret
+ %src = load i128, ptr %src.ptr, align 1
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
+ %res = shl i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
+
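+; Arithmetic shift right of an i128 loaded from memory by a byte-multiple offset.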
+define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_16bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu s0, 14(a0)
+; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or t1, t4, t3
+; RV64I-NEXT: or t2, t6, t5
+; RV64I-NEXT: lbu t3, 0(a1)
+; RV64I-NEXT: lbu t4, 1(a1)
+; RV64I-NEXT: lbu t5, 2(a1)
+; RV64I-NEXT: lbu t6, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a0, a0, s0
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t3, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu s0, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, s0
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or a5, t5, a6
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t3
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a6, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a4, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: li a4, 64
+; RV64I-NEXT: or a3, a6, a7
+; RV64I-NEXT: bltu a1, a4, .LBB10_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: sra a5, a3, a5
+; RV64I-NEXT: bnez a1, .LBB10_3
+; RV64I-NEXT: j .LBB10_4
+; RV64I-NEXT: .LBB10_2:
+; RV64I-NEXT: srl a5, a0, a1
+; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: sll a6, a3, a6
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: beqz a1, .LBB10_4
+; RV64I-NEXT: .LBB10_3:
+; RV64I-NEXT: mv a0, a5
+; RV64I-NEXT: .LBB10_4:
+; RV64I-NEXT: bltu a1, a4, .LBB10_6
+; RV64I-NEXT: # %bb.5:
+; RV64I-NEXT: srai a1, a3, 63
+; RV64I-NEXT: j .LBB10_7
+; RV64I-NEXT: .LBB10_6:
+; RV64I-NEXT: sra a1, a3, a1
+; RV64I-NEXT: .LBB10_7:
+; RV64I-NEXT: srli a3, a0, 32
+; RV64I-NEXT: srliw a4, a0, 16
+; RV64I-NEXT: lui a5, 16
+; RV64I-NEXT: srliw a6, a0, 24
+; RV64I-NEXT: srli a7, a0, 48
+; RV64I-NEXT: srli t0, a0, 56
+; RV64I-NEXT: srli t1, a1, 32
+; RV64I-NEXT: srliw t2, a1, 16
+; RV64I-NEXT: srliw t3, a1, 24
+; RV64I-NEXT: srli t4, a1, 48
+; RV64I-NEXT: srli t5, a1, 56
+; RV64I-NEXT: addi a5, a5, -1
+; RV64I-NEXT: and t6, a0, a5
+; RV64I-NEXT: srli t6, t6, 8
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb t6, 1(a2)
+; RV64I-NEXT: sb a4, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: and a0, a3, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: sb a0, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: and a0, a1, a5
+; RV64I-NEXT: and a3, t1, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: sb t2, 10(a2)
+; RV64I-NEXT: sb t3, 11(a2)
+; RV64I-NEXT: sb t1, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t4, 14(a2)
+; RV64I-NEXT: sb t5, 15(a2)
+; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: ashr_16bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a5, 1(a0)
+; RV32I-NEXT: lbu a6, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu a4, 4(a0)
+; RV32I-NEXT: lbu t2, 5(a0)
+; RV32I-NEXT: lbu t0, 6(a0)
+; RV32I-NEXT: lbu t1, 7(a0)
+; RV32I-NEXT: lbu t4, 8(a0)
+; RV32I-NEXT: lbu t5, 9(a0)
+; RV32I-NEXT: lbu t6, 10(a0)
+; RV32I-NEXT: lbu s0, 11(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t3, a7, a6
+; RV32I-NEXT: or t1, t1, t0
+; RV32I-NEXT: lbu a6, 12(a0)
+; RV32I-NEXT: lbu a7, 13(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t6, 1(a1)
+; RV32I-NEXT: lbu s0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or s1, a0, t0
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: or t6, t6, a7
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or s0, a1, s0
+; RV32I-NEXT: li t0, 32
+; RV32I-NEXT: slli a1, a5, 8
+; RV32I-NEXT: slli a0, t2, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s0, s0, 16
+; RV32I-NEXT: or a7, t5, t4
+; RV32I-NEXT: or a5, s1, a6
+; RV32I-NEXT: or a6, s0, t6
+; RV32I-NEXT: slli a6, a6, 3
+; RV32I-NEXT: srl t2, a7, a6
+; RV32I-NEXT: neg t6, a6
+; RV32I-NEXT: sll t4, a5, t6
+; RV32I-NEXT: bltu a6, t0, .LBB10_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra t5, a5, a6
+; RV32I-NEXT: j .LBB10_3
+; RV32I-NEXT: .LBB10_2:
+; RV32I-NEXT: or t5, t2, t4
+; RV32I-NEXT: .LBB10_3:
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: or a3, a0, a4
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: mv a0, a7
+; RV32I-NEXT: beqz a6, .LBB10_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a0, t5
+; RV32I-NEXT: .LBB10_5:
+; RV32I-NEXT: or a4, t3, a1
+; RV32I-NEXT: or a3, t1, a3
+; RV32I-NEXT: bltu a6, t0, .LBB10_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srai a1, a5, 31
+; RV32I-NEXT: srl t5, a3, a6
+; RV32I-NEXT: j .LBB10_8
+; RV32I-NEXT: .LBB10_7:
+; RV32I-NEXT: sra a1, a5, a6
+; RV32I-NEXT: srl t1, a4, a6
+; RV32I-NEXT: sll t3, a3, t6
+; RV32I-NEXT: or t5, t1, t3
+; RV32I-NEXT: .LBB10_8:
+; RV32I-NEXT: li t1, 64
+; RV32I-NEXT: mv t3, a4
+; RV32I-NEXT: beqz a6, .LBB10_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t3, t5
+; RV32I-NEXT: .LBB10_10:
+; RV32I-NEXT: sub s0, t1, a6
+; RV32I-NEXT: bltu a6, t0, .LBB10_13
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: bgeu s0, t0, .LBB10_14
+; RV32I-NEXT: .LBB10_12:
+; RV32I-NEXT: sll t6, a7, t6
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl s1, a7, s1
+; RV32I-NEXT: or s2, s1, t4
+; RV32I-NEXT: j .LBB10_15
+; RV32I-NEXT: .LBB10_13:
+; RV32I-NEXT: srl t5, a3, a6
+; RV32I-NEXT: bltu s0, t0, .LBB10_12
+; RV32I-NEXT: .LBB10_14:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s2, a7, s0
+; RV32I-NEXT: .LBB10_15:
+; RV32I-NEXT: addi s1, a6, -64
+; RV32I-NEXT: mv t4, a5
+; RV32I-NEXT: beqz s0, .LBB10_17
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: mv t4, s2
+; RV32I-NEXT: .LBB10_17:
+; RV32I-NEXT: bltu s1, t0, .LBB10_19
+; RV32I-NEXT: # %bb.18:
+; RV32I-NEXT: sra t2, a5, s1
+; RV32I-NEXT: bnez s1, .LBB10_20
+; RV32I-NEXT: j .LBB10_21
+; RV32I-NEXT: .LBB10_19:
+; RV32I-NEXT: neg s0, s1
+; RV32I-NEXT: sll s0, a5, s0
+; RV32I-NEXT: or t2, t2, s0
+; RV32I-NEXT: beqz s1, .LBB10_21
+; RV32I-NEXT: .LBB10_20:
+; RV32I-NEXT: mv a7, t2
+; RV32I-NEXT: .LBB10_21:
+; RV32I-NEXT: bltu s1, t0, .LBB10_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: srai t0, a5, 31
+; RV32I-NEXT: bltu a6, t1, .LBB10_24
+; RV32I-NEXT: j .LBB10_25
+; RV32I-NEXT: .LBB10_23:
+; RV32I-NEXT: sra t0, a5, a6
+; RV32I-NEXT: bgeu a6, t1, .LBB10_25
+; RV32I-NEXT: .LBB10_24:
+; RV32I-NEXT: or a7, t3, t6
+; RV32I-NEXT: or t0, t5, t4
+; RV32I-NEXT: .LBB10_25:
+; RV32I-NEXT: bnez a6, .LBB10_29
+; RV32I-NEXT: # %bb.26:
+; RV32I-NEXT: bltu a6, t1, .LBB10_28
+; RV32I-NEXT: .LBB10_27:
+; RV32I-NEXT: srai a0, a5, 31
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB10_28:
+; RV32I-NEXT: srli a5, a4, 16
+; RV32I-NEXT: lui a6, 16
+; RV32I-NEXT: srli a7, a4, 24
+; RV32I-NEXT: srli t0, a3, 16
+; RV32I-NEXT: srli t1, a3, 24
+; RV32I-NEXT: srli t2, a0, 16
+; RV32I-NEXT: srli t3, a0, 24
+; RV32I-NEXT: srli t4, a1, 16
+; RV32I-NEXT: srli t5, a1, 24
+; RV32I-NEXT: addi a6, a6, -1
+; RV32I-NEXT: and t6, a4, a6
+; RV32I-NEXT: srli t6, t6, 8
+; RV32I-NEXT: sb a4, 0(a2)
+; RV32I-NEXT: sb t6, 1(a2)
+; RV32I-NEXT: sb a5, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, a3, a6
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: sb t0, 6(a2)
+; RV32I-NEXT: sb t1, 7(a2)
+; RV32I-NEXT: and a3, a0, a6
+; RV32I-NEXT: and a4, a1, a6
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a0, 8(a2)
+; RV32I-NEXT: sb a3, 9(a2)
+; RV32I-NEXT: sb t2, 10(a2)
+; RV32I-NEXT: sb t3, 11(a2)
+; RV32I-NEXT: sb a1, 12(a2)
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: sb t4, 14(a2)
+; RV32I-NEXT: sb t5, 15(a2)
+; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB10_29:
+; RV32I-NEXT: mv a4, a7
+; RV32I-NEXT: mv a3, t0
+; RV32I-NEXT: bgeu a6, t1, .LBB10_27
+; RV32I-NEXT: j .LBB10_28
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = ashr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
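+; Arithmetic shift right of an i128 by a word-multiple (32-bit) offset.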
+define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_16bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu s0, 14(a0)
+; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or t1, t4, t3
+; RV64I-NEXT: or t2, t6, t5
+; RV64I-NEXT: lbu t3, 0(a1)
+; RV64I-NEXT: lbu t4, 1(a1)
+; RV64I-NEXT: lbu t5, 2(a1)
+; RV64I-NEXT: lbu t6, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a0, a0, s0
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t3, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu s0, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, s0
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or a5, t5, a6
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t3
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a6, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a4, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 5
+; RV64I-NEXT: li a4, 64
+; RV64I-NEXT: or a3, a6, a7
+; RV64I-NEXT: bltu a1, a4, .LBB11_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: sra a5, a3, a5
+; RV64I-NEXT: bnez a1, .LBB11_3
+; RV64I-NEXT: j .LBB11_4
+; RV64I-NEXT: .LBB11_2:
+; RV64I-NEXT: srl a5, a0, a1
+; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: sll a6, a3, a6
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: beqz a1, .LBB11_4
+; RV64I-NEXT: .LBB11_3:
+; RV64I-NEXT: mv a0, a5
+; RV64I-NEXT: .LBB11_4:
+; RV64I-NEXT: bltu a1, a4, .LBB11_6
+; RV64I-NEXT: # %bb.5:
+; RV64I-NEXT: srai a1, a3, 63
+; RV64I-NEXT: j .LBB11_7
+; RV64I-NEXT: .LBB11_6:
+; RV64I-NEXT: sra a1, a3, a1
+; RV64I-NEXT: .LBB11_7:
+; RV64I-NEXT: srli a3, a0, 32
+; RV64I-NEXT: srliw a4, a0, 16
+; RV64I-NEXT: lui a5, 16
+; RV64I-NEXT: srliw a6, a0, 24
+; RV64I-NEXT: srli a7, a0, 48
+; RV64I-NEXT: srli t0, a0, 56
+; RV64I-NEXT: srli t1, a1, 32
+; RV64I-NEXT: srliw t2, a1, 16
+; RV64I-NEXT: srliw t3, a1, 24
+; RV64I-NEXT: srli t4, a1, 48
+; RV64I-NEXT: srli t5, a1, 56
+; RV64I-NEXT: addi a5, a5, -1
+; RV64I-NEXT: and t6, a0, a5
+; RV64I-NEXT: srli t6, t6, 8
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb t6, 1(a2)
+; RV64I-NEXT: sb a4, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: and a0, a3, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: sb a0, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: and a0, a1, a5
+; RV64I-NEXT: and a3, t1, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: sb t2, 10(a2)
+; RV64I-NEXT: sb t3, 11(a2)
+; RV64I-NEXT: sb t1, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t4, 14(a2)
+; RV64I-NEXT: sb t5, 15(a2)
+; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: ashr_16bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a5, 1(a0)
+; RV32I-NEXT: lbu a6, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu a4, 4(a0)
+; RV32I-NEXT: lbu t2, 5(a0)
+; RV32I-NEXT: lbu t0, 6(a0)
+; RV32I-NEXT: lbu t1, 7(a0)
+; RV32I-NEXT: lbu t4, 8(a0)
+; RV32I-NEXT: lbu t5, 9(a0)
+; RV32I-NEXT: lbu t6, 10(a0)
+; RV32I-NEXT: lbu s0, 11(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t3, a7, a6
+; RV32I-NEXT: or t1, t1, t0
+; RV32I-NEXT: lbu a6, 12(a0)
+; RV32I-NEXT: lbu a7, 13(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t6, 1(a1)
+; RV32I-NEXT: lbu s0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or s1, a0, t0
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: or t6, t6, a7
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or s0, a1, s0
+; RV32I-NEXT: li t0, 32
+; RV32I-NEXT: slli a1, a5, 8
+; RV32I-NEXT: slli a0, t2, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s0, s0, 16
+; RV32I-NEXT: or a7, t5, t4
+; RV32I-NEXT: or a5, s1, a6
+; RV32I-NEXT: or a6, s0, t6
+; RV32I-NEXT: slli a6, a6, 5
+; RV32I-NEXT: srl t2, a7, a6
+; RV32I-NEXT: neg t6, a6
+; RV32I-NEXT: sll t4, a5, t6
+; RV32I-NEXT: bltu a6, t0, .LBB11_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra t5, a5, a6
+; RV32I-NEXT: j .LBB11_3
+; RV32I-NEXT: .LBB11_2:
+; RV32I-NEXT: or t5, t2, t4
+; RV32I-NEXT: .LBB11_3:
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: or a3, a0, a4
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: mv a0, a7
+; RV32I-NEXT: beqz a6, .LBB11_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a0, t5
+; RV32I-NEXT: .LBB11_5:
+; RV32I-NEXT: or a4, t3, a1
+; RV32I-NEXT: or a3, t1, a3
+; RV32I-NEXT: bltu a6, t0, .LBB11_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srai a1, a5, 31
+; RV32I-NEXT: srl t5, a3, a6
+; RV32I-NEXT: j .LBB11_8
+; RV32I-NEXT: .LBB11_7:
+; RV32I-NEXT: sra a1, a5, a6
+; RV32I-NEXT: srl t1, a4, a6
+; RV32I-NEXT: sll t3, a3, t6
+; RV32I-NEXT: or t5, t1, t3
+; RV32I-NEXT: .LBB11_8:
+; RV32I-NEXT: li t1, 64
+; RV32I-NEXT: mv t3, a4
+; RV32I-NEXT: beqz a6, .LBB11_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t3, t5
+; RV32I-NEXT: .LBB11_10:
+; RV32I-NEXT: sub s0, t1, a6
+; RV32I-NEXT: bltu a6, t0, .LBB11_13
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: bgeu s0, t0, .LBB11_14
+; RV32I-NEXT: .LBB11_12:
+; RV32I-NEXT: sll t6, a7, t6
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl s1, a7, s1
+; RV32I-NEXT: or s2, s1, t4
+; RV32I-NEXT: j .LBB11_15
+; RV32I-NEXT: .LBB11_13:
+; RV32I-NEXT: srl t5, a3, a6
+; RV32I-NEXT: bltu s0, t0, .LBB11_12
+; RV32I-NEXT: .LBB11_14:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s2, a7, s0
+; RV32I-NEXT: .LBB11_15:
+; RV32I-NEXT: addi s1, a6, -64
+; RV32I-NEXT: mv t4, a5
+; RV32I-NEXT: beqz s0, .LBB11_17
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: mv t4, s2
+; RV32I-NEXT: .LBB11_17:
+; RV32I-NEXT: bltu s1, t0, .LBB11_19
+; RV32I-NEXT: # %bb.18:
+; RV32I-NEXT: sra t2, a5, s1
+; RV32I-NEXT: bnez s1, .LBB11_20
+; RV32I-NEXT: j .LBB11_21
+; RV32I-NEXT: .LBB11_19:
+; RV32I-NEXT: neg s0, s1
+; RV32I-NEXT: sll s0, a5, s0
+; RV32I-NEXT: or t2, t2, s0
+; RV32I-NEXT: beqz s1, .LBB11_21
+; RV32I-NEXT: .LBB11_20:
+; RV32I-NEXT: mv a7, t2
+; RV32I-NEXT: .LBB11_21:
+; RV32I-NEXT: bltu s1, t0, .LBB11_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: srai t0, a5, 31
+; RV32I-NEXT: bltu a6, t1, .LBB11_24
+; RV32I-NEXT: j .LBB11_25
+; RV32I-NEXT: .LBB11_23:
+; RV32I-NEXT: sra t0, a5, a6
+; RV32I-NEXT: bgeu a6, t1, .LBB11_25
+; RV32I-NEXT: .LBB11_24:
+; RV32I-NEXT: or a7, t3, t6
+; RV32I-NEXT: or t0, t5, t4
+; RV32I-NEXT: .LBB11_25:
+; RV32I-NEXT: bnez a6, .LBB11_29
+; RV32I-NEXT: # %bb.26:
+; RV32I-NEXT: bltu a6, t1, .LBB11_28
+; RV32I-NEXT: .LBB11_27:
+; RV32I-NEXT: srai a0, a5, 31
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB11_28:
+; RV32I-NEXT: srli a5, a4, 16
+; RV32I-NEXT: lui a6, 16
+; RV32I-NEXT: srli a7, a4, 24
+; RV32I-NEXT: srli t0, a3, 16
+; RV32I-NEXT: srli t1, a3, 24
+; RV32I-NEXT: srli t2, a0, 16
+; RV32I-NEXT: srli t3, a0, 24
+; RV32I-NEXT: srli t4, a1, 16
+; RV32I-NEXT: srli t5, a1, 24
+; RV32I-NEXT: addi a6, a6, -1
+; RV32I-NEXT: and t6, a4, a6
+; RV32I-NEXT: srli t6, t6, 8
+; RV32I-NEXT: sb a4, 0(a2)
+; RV32I-NEXT: sb t6, 1(a2)
+; RV32I-NEXT: sb a5, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, a3, a6
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: sb t0, 6(a2)
+; RV32I-NEXT: sb t1, 7(a2)
+; RV32I-NEXT: and a3, a0, a6
+; RV32I-NEXT: and a4, a1, a6
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a0, 8(a2)
+; RV32I-NEXT: sb a3, 9(a2)
+; RV32I-NEXT: sb t2, 10(a2)
+; RV32I-NEXT: sb t3, 11(a2)
+; RV32I-NEXT: sb a1, 12(a2)
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: sb t4, 14(a2)
+; RV32I-NEXT: sb t5, 15(a2)
+; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB11_29:
+; RV32I-NEXT: mv a4, a7
+; RV32I-NEXT: mv a3, t0
+; RV32I-NEXT: bgeu a6, t1, .LBB11_27
+; RV32I-NEXT: j .LBB11_28
+ %src = load i128, ptr %src.ptr, align 1
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
+ %res = ashr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -96
+; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: or a4, t4, t3
+; RV64I-NEXT: or a6, t6, t5
+; RV64I-NEXT: or t0, s1, s0
+; RV64I-NEXT: lbu t5, 24(a0)
+; RV64I-NEXT: lbu t6, 25(a0)
+; RV64I-NEXT: lbu s0, 26(a0)
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or t4, s3, s2
+; RV64I-NEXT: or t2, s5, s4
+; RV64I-NEXT: or t3, s7, s6
+; RV64I-NEXT: lbu s2, 28(a0)
+; RV64I-NEXT: lbu s3, 29(a0)
+; RV64I-NEXT: lbu s4, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s5, s9, s8
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu t6, 0(a1)
+; RV64I-NEXT: lbu s1, 1(a1)
+; RV64I-NEXT: lbu s7, 2(a1)
+; RV64I-NEXT: lbu s8, 3(a1)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, a0, s4
+; RV64I-NEXT: or t6, s1, t6
+; RV64I-NEXT: lbu a0, 4(a1)
+; RV64I-NEXT: lbu s1, 5(a1)
+; RV64I-NEXT: lbu s4, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s1, s1, a0
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or s4, a1, s4
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or a1, t1, a7
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or a0, t4, t0
+; RV64I-NEXT: slli t3, t3, 16
+; RV64I-NEXT: or t0, t3, t2
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: or t1, s6, s5
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t3, s0, t5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or t5, s3, s2
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or t6, s7, t6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s0, s4, s1
+; RV64I-NEXT: li a7, 64
+; RV64I-NEXT: slli t4, a5, 16
+; RV64I-NEXT: slli t2, a6, 16
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: or a6, t1, t0
+; RV64I-NEXT: or t0, t5, t3
+; RV64I-NEXT: or a5, s0, t6
+; RV64I-NEXT: slli a5, a5, 3
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sll t3, t0, t5
+; RV64I-NEXT: bltu a5, a7, .LBB12_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: srl t6, t0, t1
+; RV64I-NEXT: j .LBB12_3
+; RV64I-NEXT: .LBB12_2:
+; RV64I-NEXT: srl t6, a6, a5
+; RV64I-NEXT: or t6, t6, t3
+; RV64I-NEXT: .LBB12_3:
+; RV64I-NEXT: or a3, t4, a3
+; RV64I-NEXT: slli t4, a1, 32
+; RV64I-NEXT: or t2, t2, a4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mv a1, a6
+; RV64I-NEXT: beqz a5, .LBB12_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a1, t6
+; RV64I-NEXT: .LBB12_5:
+; RV64I-NEXT: or a4, t4, a3
+; RV64I-NEXT: or a3, a0, t2
+; RV64I-NEXT: bltu a5, a7, .LBB12_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: li a0, 0
+; RV64I-NEXT: srl t4, a3, t1
+; RV64I-NEXT: j .LBB12_8
+; RV64I-NEXT: .LBB12_7:
+; RV64I-NEXT: srl a0, t0, a5
+; RV64I-NEXT: srl t1, a4, a5
+; RV64I-NEXT: sll t2, a3, t5
+; RV64I-NEXT: or t4, t1, t2
+; RV64I-NEXT: .LBB12_8:
+; RV64I-NEXT: li t1, 128
+; RV64I-NEXT: mv t2, a4
+; RV64I-NEXT: beqz a5, .LBB12_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t2, t4
+; RV64I-NEXT: .LBB12_10:
+; RV64I-NEXT: sub t6, t1, a5
+; RV64I-NEXT: bltu a5, a7, .LBB12_13
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t4, 0
+; RV64I-NEXT: bgeu t6, a7, .LBB12_14
+; RV64I-NEXT: .LBB12_12:
+; RV64I-NEXT: sll t5, a6, t5
+; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: srl s0, a6, s0
+; RV64I-NEXT: or s1, s0, t3
+; RV64I-NEXT: j .LBB12_15
+; RV64I-NEXT: .LBB12_13:
+; RV64I-NEXT: srl t4, a3, a5
+; RV64I-NEXT: bltu t6, a7, .LBB12_12
+; RV64I-NEXT: .LBB12_14:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: sub t3, t6, a7
+; RV64I-NEXT: sll s1, a6, t3
+; RV64I-NEXT: .LBB12_15:
+; RV64I-NEXT: sub s0, a5, t1
+; RV64I-NEXT: mv t3, t0
+; RV64I-NEXT: beqz t6, .LBB12_17
+; RV64I-NEXT: # %bb.16:
+; RV64I-NEXT: mv t3, s1
+; RV64I-NEXT: .LBB12_17:
+; RV64I-NEXT: bltu s0, a7, .LBB12_19
+; RV64I-NEXT: # %bb.18:
+; RV64I-NEXT: sub t6, s0, a7
+; RV64I-NEXT: srl t6, t0, t6
+; RV64I-NEXT: bnez s0, .LBB12_20
+; RV64I-NEXT: j .LBB12_21
+; RV64I-NEXT: .LBB12_19:
+; RV64I-NEXT: srl t6, a6, s0
+; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: sll s1, t0, s1
+; RV64I-NEXT: or t6, t6, s1
+; RV64I-NEXT: beqz s0, .LBB12_21
+; RV64I-NEXT: .LBB12_20:
+; RV64I-NEXT: mv a6, t6
+; RV64I-NEXT: .LBB12_21:
+; RV64I-NEXT: bltu s0, a7, .LBB12_23
+; RV64I-NEXT: # %bb.22:
+; RV64I-NEXT: li a7, 0
+; RV64I-NEXT: bltu a5, t1, .LBB12_24
+; RV64I-NEXT: j .LBB12_25
+; RV64I-NEXT: .LBB12_23:
+; RV64I-NEXT: srl a7, t0, s0
+; RV64I-NEXT: bgeu a5, t1, .LBB12_25
+; RV64I-NEXT: .LBB12_24:
+; RV64I-NEXT: or a6, t2, t5
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: .LBB12_25:
+; RV64I-NEXT: bnez a5, .LBB12_29
+; RV64I-NEXT: # %bb.26:
+; RV64I-NEXT: bltu a5, t1, .LBB12_28
+; RV64I-NEXT: .LBB12_27:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: li a0, 0
+; RV64I-NEXT: .LBB12_28:
+; RV64I-NEXT: srli a5, a4, 32
+; RV64I-NEXT: srliw a6, a4, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a4, 24
+; RV64I-NEXT: srli t0, a4, 48
+; RV64I-NEXT: srli t5, a4, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a1, 32
+; RV64I-NEXT: srliw s2, a1, 16
+; RV64I-NEXT: srliw s6, a1, 24
+; RV64I-NEXT: srli s4, a1, 48
+; RV64I-NEXT: srli s7, a1, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a4, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a4, a5, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a4, a3, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a4, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a3, a7, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a3, a1, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 96
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB12_29:
+; RV64I-NEXT: mv a4, a6
+; RV64I-NEXT: mv a3, a7
+; RV64I-NEXT: bgeu a5, t1, .LBB12_27
+; RV64I-NEXT: j .LBB12_28
+;
+; RV32I-LABEL: lshr_32bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 16(a0)
+; RV32I-NEXT: lbu a4, 17(a0)
+; RV32I-NEXT: lbu a5, 18(a0)
+; RV32I-NEXT: lbu a6, 19(a0)
+; RV32I-NEXT: lbu a7, 20(a0)
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 22(a0)
+; RV32I-NEXT: lbu t2, 23(a0)
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t4, 25(a0)
+; RV32I-NEXT: lbu t5, 26(a0)
+; RV32I-NEXT: lbu t6, 27(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a6, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a4, t2, t1
+; RV32I-NEXT: lbu a7, 28(a0)
+; RV32I-NEXT: lbu t0, 29(a0)
+; RV32I-NEXT: lbu t1, 30(a0)
+; RV32I-NEXT: lbu t2, 31(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t4, t4, t3
+; RV32I-NEXT: or t5, t6, t5
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t3, 1(a1)
+; RV32I-NEXT: lbu t6, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or t0, t3, t0
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or t2, a1, t6
+; RV32I-NEXT: li t3, 32
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a1, a4, 16
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a4, t1, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: or t1, t5, t4
+; RV32I-NEXT: or t5, a4, a7
+; RV32I-NEXT: or a4, t2, t0
+; RV32I-NEXT: slli a4, a4, 3
+; RV32I-NEXT: srl s0, t1, a4
+; RV32I-NEXT: neg s6, a4
+; RV32I-NEXT: sll t4, t5, s6
+; RV32I-NEXT: bltu a4, t3, .LBB12_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a7, t5, a4
+; RV32I-NEXT: j .LBB12_3
+; RV32I-NEXT: .LBB12_2:
+; RV32I-NEXT: or a7, s0, t4
+; RV32I-NEXT: .LBB12_3:
+; RV32I-NEXT: or t0, a6, a3
+; RV32I-NEXT: or a6, a1, a5
+; RV32I-NEXT: mv a1, t1
+; RV32I-NEXT: beqz a4, .LBB12_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a1, a7
+; RV32I-NEXT: .LBB12_5:
+; RV32I-NEXT: srl a3, t0, a4
+; RV32I-NEXT: sll a5, a6, s6
+; RV32I-NEXT: sw a3, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t3, .LBB12_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: srl a3, a6, a4
+; RV32I-NEXT: j .LBB12_8
+; RV32I-NEXT: .LBB12_7:
+; RV32I-NEXT: srl ra, t5, a4
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: .LBB12_8:
+; RV32I-NEXT: li t6, 64
+; RV32I-NEXT: mv a7, t0
+; RV32I-NEXT: beqz a4, .LBB12_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv a7, a3
+; RV32I-NEXT: .LBB12_10:
+; RV32I-NEXT: sw a5, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s7, t6, a4
+; RV32I-NEXT: bltu a4, t3, .LBB12_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a5, 0
+; RV32I-NEXT: j .LBB12_13
+; RV32I-NEXT: .LBB12_12:
+; RV32I-NEXT: srl a5, a6, a4
+; RV32I-NEXT: .LBB12_13:
+; RV32I-NEXT: neg s10, s7
+; RV32I-NEXT: sw t4, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s7, t3, .LBB12_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li t2, 0
+; RV32I-NEXT: sll a3, t1, s7
+; RV32I-NEXT: j .LBB12_16
+; RV32I-NEXT: .LBB12_15:
+; RV32I-NEXT: sll t2, t1, s6
+; RV32I-NEXT: srl a3, t1, s10
+; RV32I-NEXT: or a3, a3, t4
+; RV32I-NEXT: .LBB12_16:
+; RV32I-NEXT: addi s9, a4, -64
+; RV32I-NEXT: mv t4, t5
+; RV32I-NEXT: beqz s7, .LBB12_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t4, a3
+; RV32I-NEXT: .LBB12_18:
+; RV32I-NEXT: neg s11, s9
+; RV32I-NEXT: sw s0, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t3, .LBB12_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: srl s2, t5, s9
+; RV32I-NEXT: j .LBB12_21
+; RV32I-NEXT: .LBB12_20:
+; RV32I-NEXT: sll a3, t5, s11
+; RV32I-NEXT: or s2, s0, a3
+; RV32I-NEXT: .LBB12_21:
+; RV32I-NEXT: lbu s1, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: mv s0, t1
+; RV32I-NEXT: beqz s9, .LBB12_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: mv s0, s2
+; RV32I-NEXT: .LBB12_23:
+; RV32I-NEXT: lbu s4, 9(a0)
+; RV32I-NEXT: lbu s2, 10(a0)
+; RV32I-NEXT: lbu s5, 13(a0)
+; RV32I-NEXT: lbu s8, 14(a0)
+; RV32I-NEXT: slli s3, s1, 8
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: sw ra, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t3, .LBB12_25
+; RV32I-NEXT: # %bb.24:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: j .LBB12_26
+; RV32I-NEXT: .LBB12_25:
+; RV32I-NEXT: srl s1, t5, a4
+; RV32I-NEXT: .LBB12_26:
+; RV32I-NEXT: or s2, s3, s2
+; RV32I-NEXT: lbu ra, 8(a0)
+; RV32I-NEXT: lbu s3, 12(a0)
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: or s8, a3, s8
+; RV32I-NEXT: bgeu a4, t6, .LBB12_28
+; RV32I-NEXT: # %bb.27:
+; RV32I-NEXT: or s0, a7, t2
+; RV32I-NEXT: or s1, a5, t4
+; RV32I-NEXT: .LBB12_28:
+; RV32I-NEXT: lbu a3, 3(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: or a5, s4, ra
+; RV32I-NEXT: slli t4, s2, 16
+; RV32I-NEXT: or s2, s5, s3
+; RV32I-NEXT: slli s3, s8, 16
+; RV32I-NEXT: mv s4, t0
+; RV32I-NEXT: mv a7, a6
+; RV32I-NEXT: beqz a4, .LBB12_30
+; RV32I-NEXT: # %bb.29:
+; RV32I-NEXT: mv s4, s0
+; RV32I-NEXT: mv a7, s1
+; RV32I-NEXT: .LBB12_30:
+; RV32I-NEXT: slli s5, a3, 8
+; RV32I-NEXT: lbu ra, 1(a0)
+; RV32I-NEXT: lbu a3, 2(a0)
+; RV32I-NEXT: lbu s1, 5(a0)
+; RV32I-NEXT: lbu s0, 6(a0)
+; RV32I-NEXT: slli s8, t2, 8
+; RV32I-NEXT: or t4, t4, a5
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: bltu a4, t6, .LBB12_32
+; RV32I-NEXT: # %bb.31:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB12_32:
+; RV32I-NEXT: slli s3, ra, 8
+; RV32I-NEXT: or a5, s5, a3
+; RV32I-NEXT: lbu s5, 0(a0)
+; RV32I-NEXT: lbu a0, 4(a0)
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or a3, s8, s0
+; RV32I-NEXT: srl s2, t4, a4
+; RV32I-NEXT: sll ra, t2, s6
+; RV32I-NEXT: bltu a4, t3, .LBB12_34
+; RV32I-NEXT: # %bb.33:
+; RV32I-NEXT: srl s0, t2, a4
+; RV32I-NEXT: j .LBB12_35
+; RV32I-NEXT: .LBB12_34:
+; RV32I-NEXT: or s0, s2, ra
+; RV32I-NEXT: .LBB12_35:
+; RV32I-NEXT: or s3, s3, s5
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: slli a3, a3, 16
+; RV32I-NEXT: mv s5, t4
+; RV32I-NEXT: beqz a4, .LBB12_37
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: mv s5, s0
+; RV32I-NEXT: .LBB12_37:
+; RV32I-NEXT: or s0, a5, s3
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: bltu a4, t3, .LBB12_39
+; RV32I-NEXT: # %bb.38:
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: srl a3, a0, a4
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: bnez a4, .LBB12_40
+; RV32I-NEXT: j .LBB12_41
+; RV32I-NEXT: .LBB12_39:
+; RV32I-NEXT: srl s8, t2, a4
+; RV32I-NEXT: srl a3, s0, a4
+; RV32I-NEXT: sll a5, a0, s6
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: beqz a4, .LBB12_41
+; RV32I-NEXT: .LBB12_40:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB12_41:
+; RV32I-NEXT: bltu a4, t3, .LBB12_44
+; RV32I-NEXT: # %bb.42:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bgeu s7, t3, .LBB12_45
+; RV32I-NEXT: .LBB12_43:
+; RV32I-NEXT: sll s3, t4, s6
+; RV32I-NEXT: srl a3, t4, s10
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: mv s10, t2
+; RV32I-NEXT: bnez s7, .LBB12_46
+; RV32I-NEXT: j .LBB12_47
+; RV32I-NEXT: .LBB12_44:
+; RV32I-NEXT: srl s1, a0, a4
+; RV32I-NEXT: bltu s7, t3, .LBB12_43
+; RV32I-NEXT: .LBB12_45:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t4, s7
+; RV32I-NEXT: mv s10, t2
+; RV32I-NEXT: beqz s7, .LBB12_47
+; RV32I-NEXT: .LBB12_46:
+; RV32I-NEXT: mv s10, a3
+; RV32I-NEXT: .LBB12_47:
+; RV32I-NEXT: bltu s9, t3, .LBB12_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: srl a3, t2, s9
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: bnez s9, .LBB12_50
+; RV32I-NEXT: j .LBB12_51
+; RV32I-NEXT: .LBB12_49:
+; RV32I-NEXT: sll a3, t2, s11
+; RV32I-NEXT: or a3, s2, a3
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: beqz s9, .LBB12_51
+; RV32I-NEXT: .LBB12_50:
+; RV32I-NEXT: mv s2, a3
+; RV32I-NEXT: .LBB12_51:
+; RV32I-NEXT: bltu s9, t3, .LBB12_53
+; RV32I-NEXT: # %bb.52:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bltu a4, t6, .LBB12_54
+; RV32I-NEXT: j .LBB12_55
+; RV32I-NEXT: .LBB12_53:
+; RV32I-NEXT: srl s7, t2, a4
+; RV32I-NEXT: bgeu a4, t6, .LBB12_55
+; RV32I-NEXT: .LBB12_54:
+; RV32I-NEXT: or s2, a5, s3
+; RV32I-NEXT: or s7, s1, s10
+; RV32I-NEXT: .LBB12_55:
+; RV32I-NEXT: li a3, 128
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: beqz a4, .LBB12_57
+; RV32I-NEXT: # %bb.56:
+; RV32I-NEXT: mv a5, s2
+; RV32I-NEXT: mv s1, s7
+; RV32I-NEXT: .LBB12_57:
+; RV32I-NEXT: sw a5, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s2, a3, a4
+; RV32I-NEXT: bltu a4, t6, .LBB12_59
+; RV32I-NEXT: # %bb.58:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: .LBB12_59:
+; RV32I-NEXT: neg s3, s2
+; RV32I-NEXT: srl a5, t0, s3
+; RV32I-NEXT: sw s8, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t3, .LBB12_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: li s10, 0
+; RV32I-NEXT: sll a3, t0, s2
+; RV32I-NEXT: j .LBB12_62
+; RV32I-NEXT: .LBB12_61:
+; RV32I-NEXT: sll s10, t0, s6
+; RV32I-NEXT: lw a3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: .LBB12_62:
+; RV32I-NEXT: sub s1, t6, s2
+; RV32I-NEXT: mv s8, a6
+; RV32I-NEXT: beqz s2, .LBB12_64
+; RV32I-NEXT: # %bb.63:
+; RV32I-NEXT: mv s8, a3
+; RV32I-NEXT: .LBB12_64:
+; RV32I-NEXT: bltu s1, t3, .LBB12_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: srl a3, a6, s1
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: bnez s1, .LBB12_67
+; RV32I-NEXT: j .LBB12_68
+; RV32I-NEXT: .LBB12_66:
+; RV32I-NEXT: neg a3, s1
+; RV32I-NEXT: sll a3, a6, a3
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: beqz s1, .LBB12_68
+; RV32I-NEXT: .LBB12_67:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB12_68:
+; RV32I-NEXT: bltu s1, t3, .LBB12_71
+; RV32I-NEXT: # %bb.69:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s2, t3, .LBB12_72
+; RV32I-NEXT: .LBB12_70:
+; RV32I-NEXT: sll s6, t1, s6
+; RV32I-NEXT: srl a3, t1, s3
+; RV32I-NEXT: lw s3, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB12_73
+; RV32I-NEXT: .LBB12_71:
+; RV32I-NEXT: srl s1, a6, s3
+; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t3, .LBB12_70
+; RV32I-NEXT: .LBB12_72:
+; RV32I-NEXT: li s6, 0
+; RV32I-NEXT: sll a3, t1, s2
+; RV32I-NEXT: .LBB12_73:
+; RV32I-NEXT: addi s9, s2, -64
+; RV32I-NEXT: mv s5, t5
+; RV32I-NEXT: beqz s2, .LBB12_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: mv s5, a3
+; RV32I-NEXT: .LBB12_75:
+; RV32I-NEXT: bltu s9, t3, .LBB12_77
+; RV32I-NEXT: # %bb.76:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t0, s9
+; RV32I-NEXT: mv s7, a6
+; RV32I-NEXT: bnez s9, .LBB12_78
+; RV32I-NEXT: j .LBB12_79
+; RV32I-NEXT: .LBB12_77:
+; RV32I-NEXT: sll s3, t0, s2
+; RV32I-NEXT: neg a3, s9
+; RV32I-NEXT: srl a3, t0, a3
+; RV32I-NEXT: sll s7, a6, s2
+; RV32I-NEXT: or a3, a3, s7
+; RV32I-NEXT: mv s7, a6
+; RV32I-NEXT: beqz s9, .LBB12_79
+; RV32I-NEXT: .LBB12_78:
+; RV32I-NEXT: mv s7, a3
+; RV32I-NEXT: .LBB12_79:
+; RV32I-NEXT: bltu s2, t6, .LBB12_81
+; RV32I-NEXT: # %bb.80:
+; RV32I-NEXT: sw zero, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: j .LBB12_82
+; RV32I-NEXT: .LBB12_81:
+; RV32I-NEXT: sw s10, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s3, a5, s6
+; RV32I-NEXT: or s7, s1, s5
+; RV32I-NEXT: .LBB12_82:
+; RV32I-NEXT: addi ra, a4, -128
+; RV32I-NEXT: mv s5, t1
+; RV32I-NEXT: mv s6, t5
+; RV32I-NEXT: beqz s2, .LBB12_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: mv s5, s3
+; RV32I-NEXT: mv s6, s7
+; RV32I-NEXT: .LBB12_84:
+; RV32I-NEXT: neg s9, ra
+; RV32I-NEXT: sll s3, t5, s9
+; RV32I-NEXT: bltu ra, t3, .LBB12_86
+; RV32I-NEXT: # %bb.85:
+; RV32I-NEXT: srl a3, t5, ra
+; RV32I-NEXT: mv s1, t1
+; RV32I-NEXT: bnez ra, .LBB12_87
+; RV32I-NEXT: j .LBB12_88
+; RV32I-NEXT: .LBB12_86:
+; RV32I-NEXT: lw a3, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: mv s1, t1
+; RV32I-NEXT: beqz ra, .LBB12_88
+; RV32I-NEXT: .LBB12_87:
+; RV32I-NEXT: mv s1, a3
+; RV32I-NEXT: .LBB12_88:
+; RV32I-NEXT: bltu ra, t3, .LBB12_90
+; RV32I-NEXT: # %bb.89:
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: srl a3, a6, ra
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: bnez ra, .LBB12_91
+; RV32I-NEXT: j .LBB12_92
+; RV32I-NEXT: .LBB12_90:
+; RV32I-NEXT: srl s2, t5, a4
+; RV32I-NEXT: sll a3, a6, s9
+; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: beqz ra, .LBB12_92
+; RV32I-NEXT: .LBB12_91:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB12_92:
+; RV32I-NEXT: sub s10, t6, ra
+; RV32I-NEXT: bltu ra, t3, .LBB12_95
+; RV32I-NEXT: # %bb.93:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bgeu s10, t3, .LBB12_96
+; RV32I-NEXT: .LBB12_94:
+; RV32I-NEXT: sll s9, t1, s9
+; RV32I-NEXT: neg a3, s10
+; RV32I-NEXT: srl a3, t1, a3
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB12_97
+; RV32I-NEXT: .LBB12_95:
+; RV32I-NEXT: srl s7, a6, a4
+; RV32I-NEXT: bltu s10, t3, .LBB12_94
+; RV32I-NEXT: .LBB12_96:
+; RV32I-NEXT: li s9, 0
+; RV32I-NEXT: sll a3, t1, s10
+; RV32I-NEXT: .LBB12_97:
+; RV32I-NEXT: addi s11, ra, -64
+; RV32I-NEXT: mv s3, t5
+; RV32I-NEXT: beqz s10, .LBB12_99
+; RV32I-NEXT: # %bb.98:
+; RV32I-NEXT: mv s3, a3
+; RV32I-NEXT: .LBB12_99:
+; RV32I-NEXT: bltu s11, t3, .LBB12_101
+; RV32I-NEXT: # %bb.100:
+; RV32I-NEXT: srl a3, t5, s11
+; RV32I-NEXT: bnez s11, .LBB12_102
+; RV32I-NEXT: j .LBB12_103
+; RV32I-NEXT: .LBB12_101:
+; RV32I-NEXT: srl a3, t1, ra
+; RV32I-NEXT: neg s10, s11
+; RV32I-NEXT: sll s10, t5, s10
+; RV32I-NEXT: or a3, a3, s10
+; RV32I-NEXT: beqz s11, .LBB12_103
+; RV32I-NEXT: .LBB12_102:
+; RV32I-NEXT: mv t1, a3
+; RV32I-NEXT: .LBB12_103:
+; RV32I-NEXT: bltu s11, t3, .LBB12_105
+; RV32I-NEXT: # %bb.104:
+; RV32I-NEXT: li t3, 0
+; RV32I-NEXT: bltu ra, t6, .LBB12_106
+; RV32I-NEXT: j .LBB12_107
+; RV32I-NEXT: .LBB12_105:
+; RV32I-NEXT: srl t3, t5, ra
+; RV32I-NEXT: bgeu ra, t6, .LBB12_107
+; RV32I-NEXT: .LBB12_106:
+; RV32I-NEXT: or t1, a5, s9
+; RV32I-NEXT: or t3, s7, s3
+; RV32I-NEXT: .LBB12_107:
+; RV32I-NEXT: li a5, 128
+; RV32I-NEXT: bnez ra, .LBB12_114
+; RV32I-NEXT: # %bb.108:
+; RV32I-NEXT: bgeu ra, t6, .LBB12_115
+; RV32I-NEXT: .LBB12_109:
+; RV32I-NEXT: bltu a4, a5, .LBB12_116
+; RV32I-NEXT: .LBB12_110:
+; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bnez a4, .LBB12_117
+; RV32I-NEXT: .LBB12_111:
+; RV32I-NEXT: bltu a4, a5, .LBB12_113
+; RV32I-NEXT: .LBB12_112:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: .LBB12_113:
+; RV32I-NEXT: srli a4, s0, 16
+; RV32I-NEXT: lui t1, 16
+; RV32I-NEXT: srli t0, s0, 24
+; RV32I-NEXT: srli a5, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: srli a6, t4, 16
+; RV32I-NEXT: srli s2, t4, 24
+; RV32I-NEXT: srli t3, t2, 16
+; RV32I-NEXT: srli s3, t2, 24
+; RV32I-NEXT: srli s1, s4, 16
+; RV32I-NEXT: srli a3, s4, 24
+; RV32I-NEXT: srli t6, a7, 16
+; RV32I-NEXT: srli s6, a7, 24
+; RV32I-NEXT: srli s5, a1, 16
+; RV32I-NEXT: srli s7, a1, 24
+; RV32I-NEXT: srli s8, ra, 16
+; RV32I-NEXT: srli s9, ra, 24
+; RV32I-NEXT: addi t1, t1, -1
+; RV32I-NEXT: and s10, s0, t1
+; RV32I-NEXT: and s11, a0, t1
+; RV32I-NEXT: srli s10, s10, 8
+; RV32I-NEXT: sb s0, 0(a2)
+; RV32I-NEXT: sb s10, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: and a4, t4, t1
+; RV32I-NEXT: srli t0, s11, 8
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb t0, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t5, 7(a2)
+; RV32I-NEXT: and a0, t2, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, s4, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t2, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, a7, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s4, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a3, a1, t1
+; RV32I-NEXT: and a4, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a7, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb a1, 24(a2)
+; RV32I-NEXT: sb a3, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s7, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb s8, 30(a2)
+; RV32I-NEXT: sb s9, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB12_114:
+; RV32I-NEXT: mv t0, t1
+; RV32I-NEXT: mv a6, t3
+; RV32I-NEXT: bltu ra, t6, .LBB12_109
+; RV32I-NEXT: .LBB12_115:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: bgeu a4, a5, .LBB12_110
+; RV32I-NEXT: .LBB12_116:
+; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw a6, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t0, a3, a6
+; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a6, a3, s8
+; RV32I-NEXT: lw a3, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s1, a3, s5
+; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s2, a3, s6
+; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: beqz a4, .LBB12_111
+; RV32I-NEXT: .LBB12_117:
+; RV32I-NEXT: mv s0, t0
+; RV32I-NEXT: mv a0, a6
+; RV32I-NEXT: mv t4, s1
+; RV32I-NEXT: mv t2, s2
+; RV32I-NEXT: bgeu a4, a5, .LBB12_112
+; RV32I-NEXT: j .LBB12_113
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -96
+; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: or a4, t4, t3
+; RV64I-NEXT: or a6, t6, t5
+; RV64I-NEXT: or t0, s1, s0
+; RV64I-NEXT: lbu t5, 24(a0)
+; RV64I-NEXT: lbu t6, 25(a0)
+; RV64I-NEXT: lbu s0, 26(a0)
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or t4, s3, s2
+; RV64I-NEXT: or t2, s5, s4
+; RV64I-NEXT: or t3, s7, s6
+; RV64I-NEXT: lbu s2, 28(a0)
+; RV64I-NEXT: lbu s3, 29(a0)
+; RV64I-NEXT: lbu s4, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s5, s9, s8
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu t6, 0(a1)
+; RV64I-NEXT: lbu s1, 1(a1)
+; RV64I-NEXT: lbu s7, 2(a1)
+; RV64I-NEXT: lbu s8, 3(a1)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, a0, s4
+; RV64I-NEXT: or t6, s1, t6
+; RV64I-NEXT: lbu a0, 4(a1)
+; RV64I-NEXT: lbu s1, 5(a1)
+; RV64I-NEXT: lbu s4, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s1, s1, a0
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or s4, a1, s4
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or a1, t1, a7
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or a0, t4, t0
+; RV64I-NEXT: slli t3, t3, 16
+; RV64I-NEXT: or t0, t3, t2
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: or t1, s6, s5
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t3, s0, t5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or t5, s3, s2
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or t6, s7, t6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s0, s4, s1
+; RV64I-NEXT: li a7, 64
+; RV64I-NEXT: slli t4, a5, 16
+; RV64I-NEXT: slli t2, a6, 16
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: or a6, t1, t0
+; RV64I-NEXT: or t0, t5, t3
+; RV64I-NEXT: or a5, s0, t6
+; RV64I-NEXT: slli a5, a5, 5
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sll t3, t0, t5
+; RV64I-NEXT: bltu a5, a7, .LBB13_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: srl t6, t0, t1
+; RV64I-NEXT: j .LBB13_3
+; RV64I-NEXT: .LBB13_2:
+; RV64I-NEXT: srl t6, a6, a5
+; RV64I-NEXT: or t6, t6, t3
+; RV64I-NEXT: .LBB13_3:
+; RV64I-NEXT: or a3, t4, a3
+; RV64I-NEXT: slli t4, a1, 32
+; RV64I-NEXT: or t2, t2, a4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mv a1, a6
+; RV64I-NEXT: beqz a5, .LBB13_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a1, t6
+; RV64I-NEXT: .LBB13_5:
+; RV64I-NEXT: or a4, t4, a3
+; RV64I-NEXT: or a3, a0, t2
+; RV64I-NEXT: bltu a5, a7, .LBB13_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: li a0, 0
+; RV64I-NEXT: srl t4, a3, t1
+; RV64I-NEXT: j .LBB13_8
+; RV64I-NEXT: .LBB13_7:
+; RV64I-NEXT: srl a0, t0, a5
+; RV64I-NEXT: srl t1, a4, a5
+; RV64I-NEXT: sll t2, a3, t5
+; RV64I-NEXT: or t4, t1, t2
+; RV64I-NEXT: .LBB13_8:
+; RV64I-NEXT: li t1, 128
+; RV64I-NEXT: mv t2, a4
+; RV64I-NEXT: beqz a5, .LBB13_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t2, t4
+; RV64I-NEXT: .LBB13_10:
+; RV64I-NEXT: sub t6, t1, a5
+; RV64I-NEXT: bltu a5, a7, .LBB13_13
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t4, 0
+; RV64I-NEXT: bgeu t6, a7, .LBB13_14
+; RV64I-NEXT: .LBB13_12:
+; RV64I-NEXT: sll t5, a6, t5
+; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: srl s0, a6, s0
+; RV64I-NEXT: or s1, s0, t3
+; RV64I-NEXT: j .LBB13_15
+; RV64I-NEXT: .LBB13_13:
+; RV64I-NEXT: srl t4, a3, a5
+; RV64I-NEXT: bltu t6, a7, .LBB13_12
+; RV64I-NEXT: .LBB13_14:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: sub t3, t6, a7
+; RV64I-NEXT: sll s1, a6, t3
+; RV64I-NEXT: .LBB13_15:
+; RV64I-NEXT: sub s0, a5, t1
+; RV64I-NEXT: mv t3, t0
+; RV64I-NEXT: beqz t6, .LBB13_17
+; RV64I-NEXT: # %bb.16:
+; RV64I-NEXT: mv t3, s1
+; RV64I-NEXT: .LBB13_17:
+; RV64I-NEXT: bltu s0, a7, .LBB13_19
+; RV64I-NEXT: # %bb.18:
+; RV64I-NEXT: sub t6, s0, a7
+; RV64I-NEXT: srl t6, t0, t6
+; RV64I-NEXT: bnez s0, .LBB13_20
+; RV64I-NEXT: j .LBB13_21
+; RV64I-NEXT: .LBB13_19:
+; RV64I-NEXT: srl t6, a6, s0
+; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: sll s1, t0, s1
+; RV64I-NEXT: or t6, t6, s1
+; RV64I-NEXT: beqz s0, .LBB13_21
+; RV64I-NEXT: .LBB13_20:
+; RV64I-NEXT: mv a6, t6
+; RV64I-NEXT: .LBB13_21:
+; RV64I-NEXT: bltu s0, a7, .LBB13_23
+; RV64I-NEXT: # %bb.22:
+; RV64I-NEXT: li a7, 0
+; RV64I-NEXT: bltu a5, t1, .LBB13_24
+; RV64I-NEXT: j .LBB13_25
+; RV64I-NEXT: .LBB13_23:
+; RV64I-NEXT: srl a7, t0, s0
+; RV64I-NEXT: bgeu a5, t1, .LBB13_25
+; RV64I-NEXT: .LBB13_24:
+; RV64I-NEXT: or a6, t2, t5
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: .LBB13_25:
+; RV64I-NEXT: bnez a5, .LBB13_29
+; RV64I-NEXT: # %bb.26:
+; RV64I-NEXT: bltu a5, t1, .LBB13_28
+; RV64I-NEXT: .LBB13_27:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: li a0, 0
+; RV64I-NEXT: .LBB13_28:
+; RV64I-NEXT: srli a5, a4, 32
+; RV64I-NEXT: srliw a6, a4, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a4, 24
+; RV64I-NEXT: srli t0, a4, 48
+; RV64I-NEXT: srli t5, a4, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a1, 32
+; RV64I-NEXT: srliw s2, a1, 16
+; RV64I-NEXT: srliw s6, a1, 24
+; RV64I-NEXT: srli s4, a1, 48
+; RV64I-NEXT: srli s7, a1, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a4, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a4, a5, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a4, a3, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a4, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a3, a7, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a3, a1, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 96
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB13_29:
+; RV64I-NEXT: mv a4, a6
+; RV64I-NEXT: mv a3, a7
+; RV64I-NEXT: bgeu a5, t1, .LBB13_27
+; RV64I-NEXT: j .LBB13_28
+;
+; RV32I-LABEL: lshr_32bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 16(a0)
+; RV32I-NEXT: lbu a4, 17(a0)
+; RV32I-NEXT: lbu a5, 18(a0)
+; RV32I-NEXT: lbu a6, 19(a0)
+; RV32I-NEXT: lbu a7, 20(a0)
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 22(a0)
+; RV32I-NEXT: lbu t2, 23(a0)
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t4, 25(a0)
+; RV32I-NEXT: lbu t5, 26(a0)
+; RV32I-NEXT: lbu t6, 27(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a6, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a4, t2, t1
+; RV32I-NEXT: lbu a7, 28(a0)
+; RV32I-NEXT: lbu t0, 29(a0)
+; RV32I-NEXT: lbu t1, 30(a0)
+; RV32I-NEXT: lbu t2, 31(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t4, t4, t3
+; RV32I-NEXT: or t5, t6, t5
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t3, 1(a1)
+; RV32I-NEXT: lbu t6, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or t0, t3, t0
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or t2, a1, t6
+; RV32I-NEXT: li t3, 32
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a1, a4, 16
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a4, t1, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: or t1, t5, t4
+; RV32I-NEXT: or t5, a4, a7
+; RV32I-NEXT: or a4, t2, t0
+; RV32I-NEXT: slli a4, a4, 5
+; RV32I-NEXT: srl s0, t1, a4
+; RV32I-NEXT: neg s6, a4
+; RV32I-NEXT: sll t4, t5, s6
+; RV32I-NEXT: bltu a4, t3, .LBB13_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a7, t5, a4
+; RV32I-NEXT: j .LBB13_3
+; RV32I-NEXT: .LBB13_2:
+; RV32I-NEXT: or a7, s0, t4
+; RV32I-NEXT: .LBB13_3:
+; RV32I-NEXT: or t0, a6, a3
+; RV32I-NEXT: or a6, a1, a5
+; RV32I-NEXT: mv a1, t1
+; RV32I-NEXT: beqz a4, .LBB13_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a1, a7
+; RV32I-NEXT: .LBB13_5:
+; RV32I-NEXT: srl a3, t0, a4
+; RV32I-NEXT: sll a5, a6, s6
+; RV32I-NEXT: sw a3, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t3, .LBB13_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: srl a3, a6, a4
+; RV32I-NEXT: j .LBB13_8
+; RV32I-NEXT: .LBB13_7:
+; RV32I-NEXT: srl ra, t5, a4
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: .LBB13_8:
+; RV32I-NEXT: li t6, 64
+; RV32I-NEXT: mv a7, t0
+; RV32I-NEXT: beqz a4, .LBB13_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv a7, a3
+; RV32I-NEXT: .LBB13_10:
+; RV32I-NEXT: sw a5, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s7, t6, a4
+; RV32I-NEXT: bltu a4, t3, .LBB13_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a5, 0
+; RV32I-NEXT: j .LBB13_13
+; RV32I-NEXT: .LBB13_12:
+; RV32I-NEXT: srl a5, a6, a4
+; RV32I-NEXT: .LBB13_13:
+; RV32I-NEXT: neg s10, s7
+; RV32I-NEXT: sw t4, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s7, t3, .LBB13_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li t2, 0
+; RV32I-NEXT: sll a3, t1, s7
+; RV32I-NEXT: j .LBB13_16
+; RV32I-NEXT: .LBB13_15:
+; RV32I-NEXT: sll t2, t1, s6
+; RV32I-NEXT: srl a3, t1, s10
+; RV32I-NEXT: or a3, a3, t4
+; RV32I-NEXT: .LBB13_16:
+; RV32I-NEXT: addi s9, a4, -64
+; RV32I-NEXT: mv t4, t5
+; RV32I-NEXT: beqz s7, .LBB13_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t4, a3
+; RV32I-NEXT: .LBB13_18:
+; RV32I-NEXT: neg s11, s9
+; RV32I-NEXT: sw s0, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t3, .LBB13_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: srl s2, t5, s9
+; RV32I-NEXT: j .LBB13_21
+; RV32I-NEXT: .LBB13_20:
+; RV32I-NEXT: sll a3, t5, s11
+; RV32I-NEXT: or s2, s0, a3
+; RV32I-NEXT: .LBB13_21:
+; RV32I-NEXT: lbu s1, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: mv s0, t1
+; RV32I-NEXT: beqz s9, .LBB13_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: mv s0, s2
+; RV32I-NEXT: .LBB13_23:
+; RV32I-NEXT: lbu s4, 9(a0)
+; RV32I-NEXT: lbu s2, 10(a0)
+; RV32I-NEXT: lbu s5, 13(a0)
+; RV32I-NEXT: lbu s8, 14(a0)
+; RV32I-NEXT: slli s3, s1, 8
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: sw ra, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t3, .LBB13_25
+; RV32I-NEXT: # %bb.24:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: j .LBB13_26
+; RV32I-NEXT: .LBB13_25:
+; RV32I-NEXT: srl s1, t5, a4
+; RV32I-NEXT: .LBB13_26:
+; RV32I-NEXT: or s2, s3, s2
+; RV32I-NEXT: lbu ra, 8(a0)
+; RV32I-NEXT: lbu s3, 12(a0)
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: or s8, a3, s8
+; RV32I-NEXT: bgeu a4, t6, .LBB13_28
+; RV32I-NEXT: # %bb.27:
+; RV32I-NEXT: or s0, a7, t2
+; RV32I-NEXT: or s1, a5, t4
+; RV32I-NEXT: .LBB13_28:
+; RV32I-NEXT: lbu a3, 3(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: or a5, s4, ra
+; RV32I-NEXT: slli t4, s2, 16
+; RV32I-NEXT: or s2, s5, s3
+; RV32I-NEXT: slli s3, s8, 16
+; RV32I-NEXT: mv s4, t0
+; RV32I-NEXT: mv a7, a6
+; RV32I-NEXT: beqz a4, .LBB13_30
+; RV32I-NEXT: # %bb.29:
+; RV32I-NEXT: mv s4, s0
+; RV32I-NEXT: mv a7, s1
+; RV32I-NEXT: .LBB13_30:
+; RV32I-NEXT: slli s5, a3, 8
+; RV32I-NEXT: lbu ra, 1(a0)
+; RV32I-NEXT: lbu a3, 2(a0)
+; RV32I-NEXT: lbu s1, 5(a0)
+; RV32I-NEXT: lbu s0, 6(a0)
+; RV32I-NEXT: slli s8, t2, 8
+; RV32I-NEXT: or t4, t4, a5
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: bltu a4, t6, .LBB13_32
+; RV32I-NEXT: # %bb.31:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB13_32:
+; RV32I-NEXT: slli s3, ra, 8
+; RV32I-NEXT: or a5, s5, a3
+; RV32I-NEXT: lbu s5, 0(a0)
+; RV32I-NEXT: lbu a0, 4(a0)
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or a3, s8, s0
+; RV32I-NEXT: srl s2, t4, a4
+; RV32I-NEXT: sll ra, t2, s6
+; RV32I-NEXT: bltu a4, t3, .LBB13_34
+; RV32I-NEXT: # %bb.33:
+; RV32I-NEXT: srl s0, t2, a4
+; RV32I-NEXT: j .LBB13_35
+; RV32I-NEXT: .LBB13_34:
+; RV32I-NEXT: or s0, s2, ra
+; RV32I-NEXT: .LBB13_35:
+; RV32I-NEXT: or s3, s3, s5
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: slli a3, a3, 16
+; RV32I-NEXT: mv s5, t4
+; RV32I-NEXT: beqz a4, .LBB13_37
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: mv s5, s0
+; RV32I-NEXT: .LBB13_37:
+; RV32I-NEXT: or s0, a5, s3
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: bltu a4, t3, .LBB13_39
+; RV32I-NEXT: # %bb.38:
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: srl a3, a0, a4
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: bnez a4, .LBB13_40
+; RV32I-NEXT: j .LBB13_41
+; RV32I-NEXT: .LBB13_39:
+; RV32I-NEXT: srl s8, t2, a4
+; RV32I-NEXT: srl a3, s0, a4
+; RV32I-NEXT: sll a5, a0, s6
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: beqz a4, .LBB13_41
+; RV32I-NEXT: .LBB13_40:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB13_41:
+; RV32I-NEXT: bltu a4, t3, .LBB13_44
+; RV32I-NEXT: # %bb.42:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bgeu s7, t3, .LBB13_45
+; RV32I-NEXT: .LBB13_43:
+; RV32I-NEXT: sll s3, t4, s6
+; RV32I-NEXT: srl a3, t4, s10
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: mv s10, t2
+; RV32I-NEXT: bnez s7, .LBB13_46
+; RV32I-NEXT: j .LBB13_47
+; RV32I-NEXT: .LBB13_44:
+; RV32I-NEXT: srl s1, a0, a4
+; RV32I-NEXT: bltu s7, t3, .LBB13_43
+; RV32I-NEXT: .LBB13_45:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t4, s7
+; RV32I-NEXT: mv s10, t2
+; RV32I-NEXT: beqz s7, .LBB13_47
+; RV32I-NEXT: .LBB13_46:
+; RV32I-NEXT: mv s10, a3
+; RV32I-NEXT: .LBB13_47:
+; RV32I-NEXT: bltu s9, t3, .LBB13_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: srl a3, t2, s9
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: bnez s9, .LBB13_50
+; RV32I-NEXT: j .LBB13_51
+; RV32I-NEXT: .LBB13_49:
+; RV32I-NEXT: sll a3, t2, s11
+; RV32I-NEXT: or a3, s2, a3
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: beqz s9, .LBB13_51
+; RV32I-NEXT: .LBB13_50:
+; RV32I-NEXT: mv s2, a3
+; RV32I-NEXT: .LBB13_51:
+; RV32I-NEXT: bltu s9, t3, .LBB13_53
+; RV32I-NEXT: # %bb.52:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bltu a4, t6, .LBB13_54
+; RV32I-NEXT: j .LBB13_55
+; RV32I-NEXT: .LBB13_53:
+; RV32I-NEXT: srl s7, t2, a4
+; RV32I-NEXT: bgeu a4, t6, .LBB13_55
+; RV32I-NEXT: .LBB13_54:
+; RV32I-NEXT: or s2, a5, s3
+; RV32I-NEXT: or s7, s1, s10
+; RV32I-NEXT: .LBB13_55:
+; RV32I-NEXT: li a3, 128
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: beqz a4, .LBB13_57
+; RV32I-NEXT: # %bb.56:
+; RV32I-NEXT: mv a5, s2
+; RV32I-NEXT: mv s1, s7
+; RV32I-NEXT: .LBB13_57:
+; RV32I-NEXT: sw a5, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s2, a3, a4
+; RV32I-NEXT: bltu a4, t6, .LBB13_59
+; RV32I-NEXT: # %bb.58:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: .LBB13_59:
+; RV32I-NEXT: neg s3, s2
+; RV32I-NEXT: srl a5, t0, s3
+; RV32I-NEXT: sw s8, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t3, .LBB13_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: li s10, 0
+; RV32I-NEXT: sll a3, t0, s2
+; RV32I-NEXT: j .LBB13_62
+; RV32I-NEXT: .LBB13_61:
+; RV32I-NEXT: sll s10, t0, s6
+; RV32I-NEXT: lw a3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: .LBB13_62:
+; RV32I-NEXT: sub s1, t6, s2
+; RV32I-NEXT: mv s8, a6
+; RV32I-NEXT: beqz s2, .LBB13_64
+; RV32I-NEXT: # %bb.63:
+; RV32I-NEXT: mv s8, a3
+; RV32I-NEXT: .LBB13_64:
+; RV32I-NEXT: bltu s1, t3, .LBB13_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: srl a3, a6, s1
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: bnez s1, .LBB13_67
+; RV32I-NEXT: j .LBB13_68
+; RV32I-NEXT: .LBB13_66:
+; RV32I-NEXT: neg a3, s1
+; RV32I-NEXT: sll a3, a6, a3
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: beqz s1, .LBB13_68
+; RV32I-NEXT: .LBB13_67:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB13_68:
+; RV32I-NEXT: bltu s1, t3, .LBB13_71
+; RV32I-NEXT: # %bb.69:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s2, t3, .LBB13_72
+; RV32I-NEXT: .LBB13_70:
+; RV32I-NEXT: sll s6, t1, s6
+; RV32I-NEXT: srl a3, t1, s3
+; RV32I-NEXT: lw s3, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB13_73
+; RV32I-NEXT: .LBB13_71:
+; RV32I-NEXT: srl s1, a6, s3
+; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t3, .LBB13_70
+; RV32I-NEXT: .LBB13_72:
+; RV32I-NEXT: li s6, 0
+; RV32I-NEXT: sll a3, t1, s2
+; RV32I-NEXT: .LBB13_73:
+; RV32I-NEXT: addi s9, s2, -64
+; RV32I-NEXT: mv s5, t5
+; RV32I-NEXT: beqz s2, .LBB13_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: mv s5, a3
+; RV32I-NEXT: .LBB13_75:
+; RV32I-NEXT: bltu s9, t3, .LBB13_77
+; RV32I-NEXT: # %bb.76:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t0, s9
+; RV32I-NEXT: mv s7, a6
+; RV32I-NEXT: bnez s9, .LBB13_78
+; RV32I-NEXT: j .LBB13_79
+; RV32I-NEXT: .LBB13_77:
+; RV32I-NEXT: sll s3, t0, s2
+; RV32I-NEXT: neg a3, s9
+; RV32I-NEXT: srl a3, t0, a3
+; RV32I-NEXT: sll s7, a6, s2
+; RV32I-NEXT: or a3, a3, s7
+; RV32I-NEXT: mv s7, a6
+; RV32I-NEXT: beqz s9, .LBB13_79
+; RV32I-NEXT: .LBB13_78:
+; RV32I-NEXT: mv s7, a3
+; RV32I-NEXT: .LBB13_79:
+; RV32I-NEXT: bltu s2, t6, .LBB13_81
+; RV32I-NEXT: # %bb.80:
+; RV32I-NEXT: sw zero, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: j .LBB13_82
+; RV32I-NEXT: .LBB13_81:
+; RV32I-NEXT: sw s10, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s3, a5, s6
+; RV32I-NEXT: or s7, s1, s5
+; RV32I-NEXT: .LBB13_82:
+; RV32I-NEXT: addi ra, a4, -128
+; RV32I-NEXT: mv s5, t1
+; RV32I-NEXT: mv s6, t5
+; RV32I-NEXT: beqz s2, .LBB13_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: mv s5, s3
+; RV32I-NEXT: mv s6, s7
+; RV32I-NEXT: .LBB13_84:
+; RV32I-NEXT: neg s9, ra
+; RV32I-NEXT: sll s3, t5, s9
+; RV32I-NEXT: bltu ra, t3, .LBB13_86
+; RV32I-NEXT: # %bb.85:
+; RV32I-NEXT: srl a3, t5, ra
+; RV32I-NEXT: mv s1, t1
+; RV32I-NEXT: bnez ra, .LBB13_87
+; RV32I-NEXT: j .LBB13_88
+; RV32I-NEXT: .LBB13_86:
+; RV32I-NEXT: lw a3, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: mv s1, t1
+; RV32I-NEXT: beqz ra, .LBB13_88
+; RV32I-NEXT: .LBB13_87:
+; RV32I-NEXT: mv s1, a3
+; RV32I-NEXT: .LBB13_88:
+; RV32I-NEXT: bltu ra, t3, .LBB13_90
+; RV32I-NEXT: # %bb.89:
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: srl a3, a6, ra
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: bnez ra, .LBB13_91
+; RV32I-NEXT: j .LBB13_92
+; RV32I-NEXT: .LBB13_90:
+; RV32I-NEXT: srl s2, t5, a4
+; RV32I-NEXT: sll a3, a6, s9
+; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: beqz ra, .LBB13_92
+; RV32I-NEXT: .LBB13_91:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB13_92:
+; RV32I-NEXT: sub s10, t6, ra
+; RV32I-NEXT: bltu ra, t3, .LBB13_95
+; RV32I-NEXT: # %bb.93:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bgeu s10, t3, .LBB13_96
+; RV32I-NEXT: .LBB13_94:
+; RV32I-NEXT: sll s9, t1, s9
+; RV32I-NEXT: neg a3, s10
+; RV32I-NEXT: srl a3, t1, a3
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB13_97
+; RV32I-NEXT: .LBB13_95:
+; RV32I-NEXT: srl s7, a6, a4
+; RV32I-NEXT: bltu s10, t3, .LBB13_94
+; RV32I-NEXT: .LBB13_96:
+; RV32I-NEXT: li s9, 0
+; RV32I-NEXT: sll a3, t1, s10
+; RV32I-NEXT: .LBB13_97:
+; RV32I-NEXT: addi s11, ra, -64
+; RV32I-NEXT: mv s3, t5
+; RV32I-NEXT: beqz s10, .LBB13_99
+; RV32I-NEXT: # %bb.98:
+; RV32I-NEXT: mv s3, a3
+; RV32I-NEXT: .LBB13_99:
+; RV32I-NEXT: bltu s11, t3, .LBB13_101
+; RV32I-NEXT: # %bb.100:
+; RV32I-NEXT: srl a3, t5, s11
+; RV32I-NEXT: bnez s11, .LBB13_102
+; RV32I-NEXT: j .LBB13_103
+; RV32I-NEXT: .LBB13_101:
+; RV32I-NEXT: srl a3, t1, ra
+; RV32I-NEXT: neg s10, s11
+; RV32I-NEXT: sll s10, t5, s10
+; RV32I-NEXT: or a3, a3, s10
+; RV32I-NEXT: beqz s11, .LBB13_103
+; RV32I-NEXT: .LBB13_102:
+; RV32I-NEXT: mv t1, a3
+; RV32I-NEXT: .LBB13_103:
+; RV32I-NEXT: bltu s11, t3, .LBB13_105
+; RV32I-NEXT: # %bb.104:
+; RV32I-NEXT: li t3, 0
+; RV32I-NEXT: bltu ra, t6, .LBB13_106
+; RV32I-NEXT: j .LBB13_107
+; RV32I-NEXT: .LBB13_105:
+; RV32I-NEXT: srl t3, t5, ra
+; RV32I-NEXT: bgeu ra, t6, .LBB13_107
+; RV32I-NEXT: .LBB13_106:
+; RV32I-NEXT: or t1, a5, s9
+; RV32I-NEXT: or t3, s7, s3
+; RV32I-NEXT: .LBB13_107:
+; RV32I-NEXT: li a5, 128
+; RV32I-NEXT: bnez ra, .LBB13_114
+; RV32I-NEXT: # %bb.108:
+; RV32I-NEXT: bgeu ra, t6, .LBB13_115
+; RV32I-NEXT: .LBB13_109:
+; RV32I-NEXT: bltu a4, a5, .LBB13_116
+; RV32I-NEXT: .LBB13_110:
+; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bnez a4, .LBB13_117
+; RV32I-NEXT: .LBB13_111:
+; RV32I-NEXT: bltu a4, a5, .LBB13_113
+; RV32I-NEXT: .LBB13_112:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: .LBB13_113:
+; RV32I-NEXT: srli a4, s0, 16
+; RV32I-NEXT: lui t1, 16
+; RV32I-NEXT: srli t0, s0, 24
+; RV32I-NEXT: srli a5, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: srli a6, t4, 16
+; RV32I-NEXT: srli s2, t4, 24
+; RV32I-NEXT: srli t3, t2, 16
+; RV32I-NEXT: srli s3, t2, 24
+; RV32I-NEXT: srli s1, s4, 16
+; RV32I-NEXT: srli a3, s4, 24
+; RV32I-NEXT: srli t6, a7, 16
+; RV32I-NEXT: srli s6, a7, 24
+; RV32I-NEXT: srli s5, a1, 16
+; RV32I-NEXT: srli s7, a1, 24
+; RV32I-NEXT: srli s8, ra, 16
+; RV32I-NEXT: srli s9, ra, 24
+; RV32I-NEXT: addi t1, t1, -1
+; RV32I-NEXT: and s10, s0, t1
+; RV32I-NEXT: and s11, a0, t1
+; RV32I-NEXT: srli s10, s10, 8
+; RV32I-NEXT: sb s0, 0(a2)
+; RV32I-NEXT: sb s10, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: and a4, t4, t1
+; RV32I-NEXT: srli t0, s11, 8
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb t0, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t5, 7(a2)
+; RV32I-NEXT: and a0, t2, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, s4, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t2, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, a7, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s4, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a3, a1, t1
+; RV32I-NEXT: and a4, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a7, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb a1, 24(a2)
+; RV32I-NEXT: sb a3, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s7, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb s8, 30(a2)
+; RV32I-NEXT: sb s9, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB13_114:
+; RV32I-NEXT: mv t0, t1
+; RV32I-NEXT: mv a6, t3
+; RV32I-NEXT: bltu ra, t6, .LBB13_109
+; RV32I-NEXT: .LBB13_115:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: bgeu a4, a5, .LBB13_110
+; RV32I-NEXT: .LBB13_116:
+; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw a6, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t0, a3, a6
+; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a6, a3, s8
+; RV32I-NEXT: lw a3, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s1, a3, s5
+; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s2, a3, s6
+; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: beqz a4, .LBB13_111
+; RV32I-NEXT: .LBB13_117:
+; RV32I-NEXT: mv s0, t0
+; RV32I-NEXT: mv a0, a6
+; RV32I-NEXT: mv t4, s1
+; RV32I-NEXT: mv t2, s2
+; RV32I-NEXT: bgeu a4, a5, .LBB13_112
+; RV32I-NEXT: j .LBB13_113
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes_dwordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -96
+; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: or a4, t4, t3
+; RV64I-NEXT: or a6, t6, t5
+; RV64I-NEXT: or t0, s1, s0
+; RV64I-NEXT: lbu t5, 24(a0)
+; RV64I-NEXT: lbu t6, 25(a0)
+; RV64I-NEXT: lbu s0, 26(a0)
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or t4, s3, s2
+; RV64I-NEXT: or t2, s5, s4
+; RV64I-NEXT: or t3, s7, s6
+; RV64I-NEXT: lbu s2, 28(a0)
+; RV64I-NEXT: lbu s3, 29(a0)
+; RV64I-NEXT: lbu s4, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s5, s9, s8
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu t6, 0(a1)
+; RV64I-NEXT: lbu s1, 1(a1)
+; RV64I-NEXT: lbu s7, 2(a1)
+; RV64I-NEXT: lbu s8, 3(a1)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, a0, s4
+; RV64I-NEXT: or t6, s1, t6
+; RV64I-NEXT: lbu a0, 4(a1)
+; RV64I-NEXT: lbu s1, 5(a1)
+; RV64I-NEXT: lbu s4, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s1, s1, a0
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or s4, a1, s4
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or a1, t1, a7
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or a0, t4, t0
+; RV64I-NEXT: slli t3, t3, 16
+; RV64I-NEXT: or t0, t3, t2
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: or t1, s6, s5
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t3, s0, t5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or t5, s3, s2
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or t6, s7, t6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s0, s4, s1
+; RV64I-NEXT: li a7, 64
+; RV64I-NEXT: slli t4, a5, 16
+; RV64I-NEXT: slli t2, a6, 16
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: or a6, t1, t0
+; RV64I-NEXT: or t0, t5, t3
+; RV64I-NEXT: or a5, s0, t6
+; RV64I-NEXT: slli a5, a5, 6
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sll t3, t0, t5
+; RV64I-NEXT: bltu a5, a7, .LBB14_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: srl t6, t0, t1
+; RV64I-NEXT: j .LBB14_3
+; RV64I-NEXT: .LBB14_2:
+; RV64I-NEXT: srl t6, a6, a5
+; RV64I-NEXT: or t6, t6, t3
+; RV64I-NEXT: .LBB14_3:
+; RV64I-NEXT: or a3, t4, a3
+; RV64I-NEXT: slli t4, a1, 32
+; RV64I-NEXT: or t2, t2, a4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mv a1, a6
+; RV64I-NEXT: beqz a5, .LBB14_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a1, t6
+; RV64I-NEXT: .LBB14_5:
+; RV64I-NEXT: or a4, t4, a3
+; RV64I-NEXT: or a3, a0, t2
+; RV64I-NEXT: bltu a5, a7, .LBB14_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: li a0, 0
+; RV64I-NEXT: srl t4, a3, t1
+; RV64I-NEXT: j .LBB14_8
+; RV64I-NEXT: .LBB14_7:
+; RV64I-NEXT: srl a0, t0, a5
+; RV64I-NEXT: srl t1, a4, a5
+; RV64I-NEXT: sll t2, a3, t5
+; RV64I-NEXT: or t4, t1, t2
+; RV64I-NEXT: .LBB14_8:
+; RV64I-NEXT: li t1, 128
+; RV64I-NEXT: mv t2, a4
+; RV64I-NEXT: beqz a5, .LBB14_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t2, t4
+; RV64I-NEXT: .LBB14_10:
+; RV64I-NEXT: sub t6, t1, a5
+; RV64I-NEXT: bltu a5, a7, .LBB14_13
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t4, 0
+; RV64I-NEXT: bgeu t6, a7, .LBB14_14
+; RV64I-NEXT: .LBB14_12:
+; RV64I-NEXT: sll t5, a6, t5
+; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: srl s0, a6, s0
+; RV64I-NEXT: or s1, s0, t3
+; RV64I-NEXT: j .LBB14_15
+; RV64I-NEXT: .LBB14_13:
+; RV64I-NEXT: srl t4, a3, a5
+; RV64I-NEXT: bltu t6, a7, .LBB14_12
+; RV64I-NEXT: .LBB14_14:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: sub t3, t6, a7
+; RV64I-NEXT: sll s1, a6, t3
+; RV64I-NEXT: .LBB14_15:
+; RV64I-NEXT: sub s0, a5, t1
+; RV64I-NEXT: mv t3, t0
+; RV64I-NEXT: beqz t6, .LBB14_17
+; RV64I-NEXT: # %bb.16:
+; RV64I-NEXT: mv t3, s1
+; RV64I-NEXT: .LBB14_17:
+; RV64I-NEXT: bltu s0, a7, .LBB14_19
+; RV64I-NEXT: # %bb.18:
+; RV64I-NEXT: sub t6, s0, a7
+; RV64I-NEXT: srl t6, t0, t6
+; RV64I-NEXT: bnez s0, .LBB14_20
+; RV64I-NEXT: j .LBB14_21
+; RV64I-NEXT: .LBB14_19:
+; RV64I-NEXT: srl t6, a6, s0
+; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: sll s1, t0, s1
+; RV64I-NEXT: or t6, t6, s1
+; RV64I-NEXT: beqz s0, .LBB14_21
+; RV64I-NEXT: .LBB14_20:
+; RV64I-NEXT: mv a6, t6
+; RV64I-NEXT: .LBB14_21:
+; RV64I-NEXT: bltu s0, a7, .LBB14_23
+; RV64I-NEXT: # %bb.22:
+; RV64I-NEXT: li a7, 0
+; RV64I-NEXT: bltu a5, t1, .LBB14_24
+; RV64I-NEXT: j .LBB14_25
+; RV64I-NEXT: .LBB14_23:
+; RV64I-NEXT: srl a7, t0, s0
+; RV64I-NEXT: bgeu a5, t1, .LBB14_25
+; RV64I-NEXT: .LBB14_24:
+; RV64I-NEXT: or a6, t2, t5
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: .LBB14_25:
+; RV64I-NEXT: bnez a5, .LBB14_29
+; RV64I-NEXT: # %bb.26:
+; RV64I-NEXT: bltu a5, t1, .LBB14_28
+; RV64I-NEXT: .LBB14_27:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: li a0, 0
+; RV64I-NEXT: .LBB14_28:
+; RV64I-NEXT: srli a5, a4, 32
+; RV64I-NEXT: srliw a6, a4, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a4, 24
+; RV64I-NEXT: srli t0, a4, 48
+; RV64I-NEXT: srli t5, a4, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a1, 32
+; RV64I-NEXT: srliw s2, a1, 16
+; RV64I-NEXT: srliw s6, a1, 24
+; RV64I-NEXT: srli s4, a1, 48
+; RV64I-NEXT: srli s7, a1, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a4, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a4, a5, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a4, a3, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a4, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a3, a7, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a3, a1, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 96
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB14_29:
+; RV64I-NEXT: mv a4, a6
+; RV64I-NEXT: mv a3, a7
+; RV64I-NEXT: bgeu a5, t1, .LBB14_27
+; RV64I-NEXT: j .LBB14_28
+;
+; RV32I-LABEL: lshr_32bytes_dwordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 16(a0)
+; RV32I-NEXT: lbu a4, 17(a0)
+; RV32I-NEXT: lbu a5, 18(a0)
+; RV32I-NEXT: lbu a6, 19(a0)
+; RV32I-NEXT: lbu a7, 20(a0)
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 22(a0)
+; RV32I-NEXT: lbu t2, 23(a0)
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t4, 25(a0)
+; RV32I-NEXT: lbu t5, 26(a0)
+; RV32I-NEXT: lbu t6, 27(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a6, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a4, t2, t1
+; RV32I-NEXT: lbu a7, 28(a0)
+; RV32I-NEXT: lbu t0, 29(a0)
+; RV32I-NEXT: lbu t1, 30(a0)
+; RV32I-NEXT: lbu t2, 31(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t4, t4, t3
+; RV32I-NEXT: or t5, t6, t5
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t3, 1(a1)
+; RV32I-NEXT: lbu t6, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or t0, t3, t0
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or t2, a1, t6
+; RV32I-NEXT: li t3, 32
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a1, a4, 16
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a4, t1, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: or t1, t5, t4
+; RV32I-NEXT: or t5, a4, a7
+; RV32I-NEXT: or a4, t2, t0
+; RV32I-NEXT: slli a4, a4, 6
+; RV32I-NEXT: srl s0, t1, a4
+; RV32I-NEXT: neg s6, a4
+; RV32I-NEXT: sll t4, t5, s6
+; RV32I-NEXT: bltu a4, t3, .LBB14_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a7, t5, a4
+; RV32I-NEXT: j .LBB14_3
+; RV32I-NEXT: .LBB14_2:
+; RV32I-NEXT: or a7, s0, t4
+; RV32I-NEXT: .LBB14_3:
+; RV32I-NEXT: or t0, a6, a3
+; RV32I-NEXT: or a6, a1, a5
+; RV32I-NEXT: mv a1, t1
+; RV32I-NEXT: beqz a4, .LBB14_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a1, a7
+; RV32I-NEXT: .LBB14_5:
+; RV32I-NEXT: srl a3, t0, a4
+; RV32I-NEXT: sll a5, a6, s6
+; RV32I-NEXT: sw a3, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t3, .LBB14_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: srl a3, a6, a4
+; RV32I-NEXT: j .LBB14_8
+; RV32I-NEXT: .LBB14_7:
+; RV32I-NEXT: srl ra, t5, a4
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: .LBB14_8:
+; RV32I-NEXT: li t6, 64
+; RV32I-NEXT: mv a7, t0
+; RV32I-NEXT: beqz a4, .LBB14_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv a7, a3
+; RV32I-NEXT: .LBB14_10:
+; RV32I-NEXT: sw a5, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s7, t6, a4
+; RV32I-NEXT: bltu a4, t3, .LBB14_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a5, 0
+; RV32I-NEXT: j .LBB14_13
+; RV32I-NEXT: .LBB14_12:
+; RV32I-NEXT: srl a5, a6, a4
+; RV32I-NEXT: .LBB14_13:
+; RV32I-NEXT: neg s10, s7
+; RV32I-NEXT: sw t4, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s7, t3, .LBB14_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li t2, 0
+; RV32I-NEXT: sll a3, t1, s7
+; RV32I-NEXT: j .LBB14_16
+; RV32I-NEXT: .LBB14_15:
+; RV32I-NEXT: sll t2, t1, s6
+; RV32I-NEXT: srl a3, t1, s10
+; RV32I-NEXT: or a3, a3, t4
+; RV32I-NEXT: .LBB14_16:
+; RV32I-NEXT: addi s9, a4, -64
+; RV32I-NEXT: mv t4, t5
+; RV32I-NEXT: beqz s7, .LBB14_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t4, a3
+; RV32I-NEXT: .LBB14_18:
+; RV32I-NEXT: neg s11, s9
+; RV32I-NEXT: sw s0, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t3, .LBB14_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: srl s2, t5, s9
+; RV32I-NEXT: j .LBB14_21
+; RV32I-NEXT: .LBB14_20:
+; RV32I-NEXT: sll a3, t5, s11
+; RV32I-NEXT: or s2, s0, a3
+; RV32I-NEXT: .LBB14_21:
+; RV32I-NEXT: lbu s1, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: mv s0, t1
+; RV32I-NEXT: beqz s9, .LBB14_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: mv s0, s2
+; RV32I-NEXT: .LBB14_23:
+; RV32I-NEXT: lbu s4, 9(a0)
+; RV32I-NEXT: lbu s2, 10(a0)
+; RV32I-NEXT: lbu s5, 13(a0)
+; RV32I-NEXT: lbu s8, 14(a0)
+; RV32I-NEXT: slli s3, s1, 8
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: sw ra, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t3, .LBB14_25
+; RV32I-NEXT: # %bb.24:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: j .LBB14_26
+; RV32I-NEXT: .LBB14_25:
+; RV32I-NEXT: srl s1, t5, a4
+; RV32I-NEXT: .LBB14_26:
+; RV32I-NEXT: or s2, s3, s2
+; RV32I-NEXT: lbu ra, 8(a0)
+; RV32I-NEXT: lbu s3, 12(a0)
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: or s8, a3, s8
+; RV32I-NEXT: bgeu a4, t6, .LBB14_28
+; RV32I-NEXT: # %bb.27:
+; RV32I-NEXT: or s0, a7, t2
+; RV32I-NEXT: or s1, a5, t4
+; RV32I-NEXT: .LBB14_28:
+; RV32I-NEXT: lbu a3, 3(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: or a5, s4, ra
+; RV32I-NEXT: slli t4, s2, 16
+; RV32I-NEXT: or s2, s5, s3
+; RV32I-NEXT: slli s3, s8, 16
+; RV32I-NEXT: mv s4, t0
+; RV32I-NEXT: mv a7, a6
+; RV32I-NEXT: beqz a4, .LBB14_30
+; RV32I-NEXT: # %bb.29:
+; RV32I-NEXT: mv s4, s0
+; RV32I-NEXT: mv a7, s1
+; RV32I-NEXT: .LBB14_30:
+; RV32I-NEXT: slli s5, a3, 8
+; RV32I-NEXT: lbu ra, 1(a0)
+; RV32I-NEXT: lbu a3, 2(a0)
+; RV32I-NEXT: lbu s1, 5(a0)
+; RV32I-NEXT: lbu s0, 6(a0)
+; RV32I-NEXT: slli s8, t2, 8
+; RV32I-NEXT: or t4, t4, a5
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: bltu a4, t6, .LBB14_32
+; RV32I-NEXT: # %bb.31:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB14_32:
+; RV32I-NEXT: slli s3, ra, 8
+; RV32I-NEXT: or a5, s5, a3
+; RV32I-NEXT: lbu s5, 0(a0)
+; RV32I-NEXT: lbu a0, 4(a0)
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or a3, s8, s0
+; RV32I-NEXT: srl s2, t4, a4
+; RV32I-NEXT: sll ra, t2, s6
+; RV32I-NEXT: bltu a4, t3, .LBB14_34
+; RV32I-NEXT: # %bb.33:
+; RV32I-NEXT: srl s0, t2, a4
+; RV32I-NEXT: j .LBB14_35
+; RV32I-NEXT: .LBB14_34:
+; RV32I-NEXT: or s0, s2, ra
+; RV32I-NEXT: .LBB14_35:
+; RV32I-NEXT: or s3, s3, s5
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: slli a3, a3, 16
+; RV32I-NEXT: mv s5, t4
+; RV32I-NEXT: beqz a4, .LBB14_37
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: mv s5, s0
+; RV32I-NEXT: .LBB14_37:
+; RV32I-NEXT: or s0, a5, s3
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: bltu a4, t3, .LBB14_39
+; RV32I-NEXT: # %bb.38:
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: srl a3, a0, a4
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: bnez a4, .LBB14_40
+; RV32I-NEXT: j .LBB14_41
+; RV32I-NEXT: .LBB14_39:
+; RV32I-NEXT: srl s8, t2, a4
+; RV32I-NEXT: srl a3, s0, a4
+; RV32I-NEXT: sll a5, a0, s6
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: beqz a4, .LBB14_41
+; RV32I-NEXT: .LBB14_40:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB14_41:
+; RV32I-NEXT: bltu a4, t3, .LBB14_44
+; RV32I-NEXT: # %bb.42:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bgeu s7, t3, .LBB14_45
+; RV32I-NEXT: .LBB14_43:
+; RV32I-NEXT: sll s3, t4, s6
+; RV32I-NEXT: srl a3, t4, s10
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: mv s10, t2
+; RV32I-NEXT: bnez s7, .LBB14_46
+; RV32I-NEXT: j .LBB14_47
+; RV32I-NEXT: .LBB14_44:
+; RV32I-NEXT: srl s1, a0, a4
+; RV32I-NEXT: bltu s7, t3, .LBB14_43
+; RV32I-NEXT: .LBB14_45:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t4, s7
+; RV32I-NEXT: mv s10, t2
+; RV32I-NEXT: beqz s7, .LBB14_47
+; RV32I-NEXT: .LBB14_46:
+; RV32I-NEXT: mv s10, a3
+; RV32I-NEXT: .LBB14_47:
+; RV32I-NEXT: bltu s9, t3, .LBB14_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: srl a3, t2, s9
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: bnez s9, .LBB14_50
+; RV32I-NEXT: j .LBB14_51
+; RV32I-NEXT: .LBB14_49:
+; RV32I-NEXT: sll a3, t2, s11
+; RV32I-NEXT: or a3, s2, a3
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: beqz s9, .LBB14_51
+; RV32I-NEXT: .LBB14_50:
+; RV32I-NEXT: mv s2, a3
+; RV32I-NEXT: .LBB14_51:
+; RV32I-NEXT: bltu s9, t3, .LBB14_53
+; RV32I-NEXT: # %bb.52:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bltu a4, t6, .LBB14_54
+; RV32I-NEXT: j .LBB14_55
+; RV32I-NEXT: .LBB14_53:
+; RV32I-NEXT: srl s7, t2, a4
+; RV32I-NEXT: bgeu a4, t6, .LBB14_55
+; RV32I-NEXT: .LBB14_54:
+; RV32I-NEXT: or s2, a5, s3
+; RV32I-NEXT: or s7, s1, s10
+; RV32I-NEXT: .LBB14_55:
+; RV32I-NEXT: li a3, 128
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: beqz a4, .LBB14_57
+; RV32I-NEXT: # %bb.56:
+; RV32I-NEXT: mv a5, s2
+; RV32I-NEXT: mv s1, s7
+; RV32I-NEXT: .LBB14_57:
+; RV32I-NEXT: sw a5, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s2, a3, a4
+; RV32I-NEXT: bltu a4, t6, .LBB14_59
+; RV32I-NEXT: # %bb.58:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: .LBB14_59:
+; RV32I-NEXT: neg s3, s2
+; RV32I-NEXT: srl a5, t0, s3
+; RV32I-NEXT: sw s8, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t3, .LBB14_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: li s10, 0
+; RV32I-NEXT: sll a3, t0, s2
+; RV32I-NEXT: j .LBB14_62
+; RV32I-NEXT: .LBB14_61:
+; RV32I-NEXT: sll s10, t0, s6
+; RV32I-NEXT: lw a3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: .LBB14_62:
+; RV32I-NEXT: sub s1, t6, s2
+; RV32I-NEXT: mv s8, a6
+; RV32I-NEXT: beqz s2, .LBB14_64
+; RV32I-NEXT: # %bb.63:
+; RV32I-NEXT: mv s8, a3
+; RV32I-NEXT: .LBB14_64:
+; RV32I-NEXT: bltu s1, t3, .LBB14_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: srl a3, a6, s1
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: bnez s1, .LBB14_67
+; RV32I-NEXT: j .LBB14_68
+; RV32I-NEXT: .LBB14_66:
+; RV32I-NEXT: neg a3, s1
+; RV32I-NEXT: sll a3, a6, a3
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: beqz s1, .LBB14_68
+; RV32I-NEXT: .LBB14_67:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB14_68:
+; RV32I-NEXT: bltu s1, t3, .LBB14_71
+; RV32I-NEXT: # %bb.69:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s2, t3, .LBB14_72
+; RV32I-NEXT: .LBB14_70:
+; RV32I-NEXT: sll s6, t1, s6
+; RV32I-NEXT: srl a3, t1, s3
+; RV32I-NEXT: lw s3, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB14_73
+; RV32I-NEXT: .LBB14_71:
+; RV32I-NEXT: srl s1, a6, s3
+; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t3, .LBB14_70
+; RV32I-NEXT: .LBB14_72:
+; RV32I-NEXT: li s6, 0
+; RV32I-NEXT: sll a3, t1, s2
+; RV32I-NEXT: .LBB14_73:
+; RV32I-NEXT: addi s9, s2, -64
+; RV32I-NEXT: mv s5, t5
+; RV32I-NEXT: beqz s2, .LBB14_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: mv s5, a3
+; RV32I-NEXT: .LBB14_75:
+; RV32I-NEXT: bltu s9, t3, .LBB14_77
+; RV32I-NEXT: # %bb.76:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t0, s9
+; RV32I-NEXT: mv s7, a6
+; RV32I-NEXT: bnez s9, .LBB14_78
+; RV32I-NEXT: j .LBB14_79
+; RV32I-NEXT: .LBB14_77:
+; RV32I-NEXT: sll s3, t0, s2
+; RV32I-NEXT: neg a3, s9
+; RV32I-NEXT: srl a3, t0, a3
+; RV32I-NEXT: sll s7, a6, s2
+; RV32I-NEXT: or a3, a3, s7
+; RV32I-NEXT: mv s7, a6
+; RV32I-NEXT: beqz s9, .LBB14_79
+; RV32I-NEXT: .LBB14_78:
+; RV32I-NEXT: mv s7, a3
+; RV32I-NEXT: .LBB14_79:
+; RV32I-NEXT: bltu s2, t6, .LBB14_81
+; RV32I-NEXT: # %bb.80:
+; RV32I-NEXT: sw zero, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: j .LBB14_82
+; RV32I-NEXT: .LBB14_81:
+; RV32I-NEXT: sw s10, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s3, a5, s6
+; RV32I-NEXT: or s7, s1, s5
+; RV32I-NEXT: .LBB14_82:
+; RV32I-NEXT: addi ra, a4, -128
+; RV32I-NEXT: mv s5, t1
+; RV32I-NEXT: mv s6, t5
+; RV32I-NEXT: beqz s2, .LBB14_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: mv s5, s3
+; RV32I-NEXT: mv s6, s7
+; RV32I-NEXT: .LBB14_84:
+; RV32I-NEXT: neg s9, ra
+; RV32I-NEXT: sll s3, t5, s9
+; RV32I-NEXT: bltu ra, t3, .LBB14_86
+; RV32I-NEXT: # %bb.85:
+; RV32I-NEXT: srl a3, t5, ra
+; RV32I-NEXT: mv s1, t1
+; RV32I-NEXT: bnez ra, .LBB14_87
+; RV32I-NEXT: j .LBB14_88
+; RV32I-NEXT: .LBB14_86:
+; RV32I-NEXT: lw a3, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: mv s1, t1
+; RV32I-NEXT: beqz ra, .LBB14_88
+; RV32I-NEXT: .LBB14_87:
+; RV32I-NEXT: mv s1, a3
+; RV32I-NEXT: .LBB14_88:
+; RV32I-NEXT: bltu ra, t3, .LBB14_90
+; RV32I-NEXT: # %bb.89:
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: srl a3, a6, ra
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: bnez ra, .LBB14_91
+; RV32I-NEXT: j .LBB14_92
+; RV32I-NEXT: .LBB14_90:
+; RV32I-NEXT: srl s2, t5, a4
+; RV32I-NEXT: sll a3, a6, s9
+; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: beqz ra, .LBB14_92
+; RV32I-NEXT: .LBB14_91:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB14_92:
+; RV32I-NEXT: sub s10, t6, ra
+; RV32I-NEXT: bltu ra, t3, .LBB14_95
+; RV32I-NEXT: # %bb.93:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bgeu s10, t3, .LBB14_96
+; RV32I-NEXT: .LBB14_94:
+; RV32I-NEXT: sll s9, t1, s9
+; RV32I-NEXT: neg a3, s10
+; RV32I-NEXT: srl a3, t1, a3
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB14_97
+; RV32I-NEXT: .LBB14_95:
+; RV32I-NEXT: srl s7, a6, a4
+; RV32I-NEXT: bltu s10, t3, .LBB14_94
+; RV32I-NEXT: .LBB14_96:
+; RV32I-NEXT: li s9, 0
+; RV32I-NEXT: sll a3, t1, s10
+; RV32I-NEXT: .LBB14_97:
+; RV32I-NEXT: addi s11, ra, -64
+; RV32I-NEXT: mv s3, t5
+; RV32I-NEXT: beqz s10, .LBB14_99
+; RV32I-NEXT: # %bb.98:
+; RV32I-NEXT: mv s3, a3
+; RV32I-NEXT: .LBB14_99:
+; RV32I-NEXT: bltu s11, t3, .LBB14_101
+; RV32I-NEXT: # %bb.100:
+; RV32I-NEXT: srl a3, t5, s11
+; RV32I-NEXT: bnez s11, .LBB14_102
+; RV32I-NEXT: j .LBB14_103
+; RV32I-NEXT: .LBB14_101:
+; RV32I-NEXT: srl a3, t1, ra
+; RV32I-NEXT: neg s10, s11
+; RV32I-NEXT: sll s10, t5, s10
+; RV32I-NEXT: or a3, a3, s10
+; RV32I-NEXT: beqz s11, .LBB14_103
+; RV32I-NEXT: .LBB14_102:
+; RV32I-NEXT: mv t1, a3
+; RV32I-NEXT: .LBB14_103:
+; RV32I-NEXT: bltu s11, t3, .LBB14_105
+; RV32I-NEXT: # %bb.104:
+; RV32I-NEXT: li t3, 0
+; RV32I-NEXT: bltu ra, t6, .LBB14_106
+; RV32I-NEXT: j .LBB14_107
+; RV32I-NEXT: .LBB14_105:
+; RV32I-NEXT: srl t3, t5, ra
+; RV32I-NEXT: bgeu ra, t6, .LBB14_107
+; RV32I-NEXT: .LBB14_106:
+; RV32I-NEXT: or t1, a5, s9
+; RV32I-NEXT: or t3, s7, s3
+; RV32I-NEXT: .LBB14_107:
+; RV32I-NEXT: li a5, 128
+; RV32I-NEXT: bnez ra, .LBB14_114
+; RV32I-NEXT: # %bb.108:
+; RV32I-NEXT: bgeu ra, t6, .LBB14_115
+; RV32I-NEXT: .LBB14_109:
+; RV32I-NEXT: bltu a4, a5, .LBB14_116
+; RV32I-NEXT: .LBB14_110:
+; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bnez a4, .LBB14_117
+; RV32I-NEXT: .LBB14_111:
+; RV32I-NEXT: bltu a4, a5, .LBB14_113
+; RV32I-NEXT: .LBB14_112:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: .LBB14_113:
+; RV32I-NEXT: srli a4, s0, 16
+; RV32I-NEXT: lui t1, 16
+; RV32I-NEXT: srli t0, s0, 24
+; RV32I-NEXT: srli a5, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: srli a6, t4, 16
+; RV32I-NEXT: srli s2, t4, 24
+; RV32I-NEXT: srli t3, t2, 16
+; RV32I-NEXT: srli s3, t2, 24
+; RV32I-NEXT: srli s1, s4, 16
+; RV32I-NEXT: srli a3, s4, 24
+; RV32I-NEXT: srli t6, a7, 16
+; RV32I-NEXT: srli s6, a7, 24
+; RV32I-NEXT: srli s5, a1, 16
+; RV32I-NEXT: srli s7, a1, 24
+; RV32I-NEXT: srli s8, ra, 16
+; RV32I-NEXT: srli s9, ra, 24
+; RV32I-NEXT: addi t1, t1, -1
+; RV32I-NEXT: and s10, s0, t1
+; RV32I-NEXT: and s11, a0, t1
+; RV32I-NEXT: srli s10, s10, 8
+; RV32I-NEXT: sb s0, 0(a2)
+; RV32I-NEXT: sb s10, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: and a4, t4, t1
+; RV32I-NEXT: srli t0, s11, 8
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb t0, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t5, 7(a2)
+; RV32I-NEXT: and a0, t2, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, s4, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t2, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, a7, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s4, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a3, a1, t1
+; RV32I-NEXT: and a4, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a7, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb a1, 24(a2)
+; RV32I-NEXT: sb a3, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s7, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb s8, 30(a2)
+; RV32I-NEXT: sb s9, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB14_114:
+; RV32I-NEXT: mv t0, t1
+; RV32I-NEXT: mv a6, t3
+; RV32I-NEXT: bltu ra, t6, .LBB14_109
+; RV32I-NEXT: .LBB14_115:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: bgeu a4, a5, .LBB14_110
+; RV32I-NEXT: .LBB14_116:
+; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw a6, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t0, a3, a6
+; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a6, a3, s8
+; RV32I-NEXT: lw a3, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s1, a3, s5
+; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s2, a3, s6
+; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: beqz a4, .LBB14_111
+; RV32I-NEXT: .LBB14_117:
+; RV32I-NEXT: mv s0, t0
+; RV32I-NEXT: mv a0, a6
+; RV32I-NEXT: mv t4, s1
+; RV32I-NEXT: mv t2, s2
+; RV32I-NEXT: bgeu a4, a5, .LBB14_112
+; RV32I-NEXT: j .LBB14_113
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -112
+; RV64I-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 12(a0)
+; RV64I-NEXT: lbu s0, 13(a0)
+; RV64I-NEXT: lbu s1, 14(a0)
+; RV64I-NEXT: lbu s2, 15(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or s3, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 1(a1)
+; RV64I-NEXT: lbu t2, 2(a1)
+; RV64I-NEXT: lbu t3, 3(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: or t6, t6, t5
+; RV64I-NEXT: or s0, s0, a4
+; RV64I-NEXT: or s1, s2, s1
+; RV64I-NEXT: lbu a4, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu t5, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: or t1, t3, t2
+; RV64I-NEXT: or t2, t4, a4
+; RV64I-NEXT: or a1, a1, t5
+; RV64I-NEXT: lbu t5, 19(a0)
+; RV64I-NEXT: lbu t4, 21(a0)
+; RV64I-NEXT: lbu a4, 22(a0)
+; RV64I-NEXT: lbu t3, 23(a0)
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t6, t6, 16
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: or s4, s3, a3
+; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a6, t6, a6
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: lbu t6, 29(a0)
+; RV64I-NEXT: lbu a3, 30(a0)
+; RV64I-NEXT: lbu s2, 31(a0)
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or s5, t1, t0
+; RV64I-NEXT: li a7, 128
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t2
+; RV64I-NEXT: li t0, 64
+; RV64I-NEXT: slli s3, t3, 8
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or t1, a5, s4
+; RV64I-NEXT: or a5, s0, a6
+; RV64I-NEXT: or a6, a1, s5
+; RV64I-NEXT: slli a6, a6, 3
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: srl s0, t1, t3
+; RV64I-NEXT: bltu a6, t0, .LBB15_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: sll s4, t1, t2
+; RV64I-NEXT: j .LBB15_3
+; RV64I-NEXT: .LBB15_2:
+; RV64I-NEXT: sll a1, t1, a6
+; RV64I-NEXT: sll s4, a5, a6
+; RV64I-NEXT: or s4, s0, s4
+; RV64I-NEXT: .LBB15_3:
+; RV64I-NEXT: slli t5, t5, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or s3, s3, a4
+; RV64I-NEXT: lbu ra, 17(a0)
+; RV64I-NEXT: lbu s11, 18(a0)
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s5, 25(a0)
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: lbu s7, 26(a0)
+; RV64I-NEXT: lbu s6, 28(a0)
+; RV64I-NEXT: slli s10, t6, 8
+; RV64I-NEXT: or s9, s2, a3
+; RV64I-NEXT: sub a4, a7, a6
+; RV64I-NEXT: mv a3, a5
+; RV64I-NEXT: beqz a6, .LBB15_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a3, s4
+; RV64I-NEXT: .LBB15_5:
+; RV64I-NEXT: slli t6, ra, 8
+; RV64I-NEXT: or t5, t5, s11
+; RV64I-NEXT: or t4, t4, s8
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: lbu s8, 16(a0)
+; RV64I-NEXT: lbu a0, 24(a0)
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: or s2, s1, s7
+; RV64I-NEXT: or s1, s10, s6
+; RV64I-NEXT: slli s4, s9, 16
+; RV64I-NEXT: bltu a4, t0, .LBB15_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: sub s0, a4, t0
+; RV64I-NEXT: srl s0, a5, s0
+; RV64I-NEXT: j .LBB15_8
+; RV64I-NEXT: .LBB15_7:
+; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: sll s6, a5, s6
+; RV64I-NEXT: or s0, s0, s6
+; RV64I-NEXT: .LBB15_8:
+; RV64I-NEXT: or t6, t6, s8
+; RV64I-NEXT: slli s6, t5, 16
+; RV64I-NEXT: or s3, s3, t4
+; RV64I-NEXT: or t5, s5, a0
+; RV64I-NEXT: slli s2, s2, 16
+; RV64I-NEXT: or s1, s4, s1
+; RV64I-NEXT: mv t4, t1
+; RV64I-NEXT: beqz a4, .LBB15_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t4, s0
+; RV64I-NEXT: .LBB15_10:
+; RV64I-NEXT: or a0, s6, t6
+; RV64I-NEXT: slli s0, s3, 32
+; RV64I-NEXT: or t6, s2, t5
+; RV64I-NEXT: slli s1, s1, 32
+; RV64I-NEXT: bltu a4, t0, .LBB15_12
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: j .LBB15_13
+; RV64I-NEXT: .LBB15_12:
+; RV64I-NEXT: srl t5, a5, t3
+; RV64I-NEXT: .LBB15_13:
+; RV64I-NEXT: or a4, s0, a0
+; RV64I-NEXT: or a0, s1, t6
+; RV64I-NEXT: bltu a6, t0, .LBB15_15
+; RV64I-NEXT: # %bb.14:
+; RV64I-NEXT: li t6, 0
+; RV64I-NEXT: sll t2, a4, t2
+; RV64I-NEXT: j .LBB15_16
+; RV64I-NEXT: .LBB15_15:
+; RV64I-NEXT: sll t6, a4, a6
+; RV64I-NEXT: srl t2, a4, t3
+; RV64I-NEXT: sll t3, a0, a6
+; RV64I-NEXT: or t2, t2, t3
+; RV64I-NEXT: .LBB15_16:
+; RV64I-NEXT: sub s0, a6, a7
+; RV64I-NEXT: mv t3, a0
+; RV64I-NEXT: beqz a6, .LBB15_18
+; RV64I-NEXT: # %bb.17:
+; RV64I-NEXT: mv t3, t2
+; RV64I-NEXT: .LBB15_18:
+; RV64I-NEXT: bltu s0, t0, .LBB15_20
+; RV64I-NEXT: # %bb.19:
+; RV64I-NEXT: li t2, 0
+; RV64I-NEXT: sub t0, s0, t0
+; RV64I-NEXT: sll t0, t1, t0
+; RV64I-NEXT: bnez s0, .LBB15_21
+; RV64I-NEXT: j .LBB15_22
+; RV64I-NEXT: .LBB15_20:
+; RV64I-NEXT: sll t2, t1, s0
+; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: srl t0, t1, t0
+; RV64I-NEXT: sll t1, a5, s0
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: beqz s0, .LBB15_22
+; RV64I-NEXT: .LBB15_21:
+; RV64I-NEXT: mv a5, t0
+; RV64I-NEXT: .LBB15_22:
+; RV64I-NEXT: bltu a6, a7, .LBB15_24
+; RV64I-NEXT: # %bb.23:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: li a3, 0
+; RV64I-NEXT: bnez a6, .LBB15_25
+; RV64I-NEXT: j .LBB15_26
+; RV64I-NEXT: .LBB15_24:
+; RV64I-NEXT: or t2, t4, t6
+; RV64I-NEXT: or a5, t5, t3
+; RV64I-NEXT: beqz a6, .LBB15_26
+; RV64I-NEXT: .LBB15_25:
+; RV64I-NEXT: mv a4, t2
+; RV64I-NEXT: mv a0, a5
+; RV64I-NEXT: .LBB15_26:
+; RV64I-NEXT: srli a5, a1, 32
+; RV64I-NEXT: srliw a6, a1, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a1, 24
+; RV64I-NEXT: srli t0, a1, 48
+; RV64I-NEXT: srli t5, a1, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a4, 32
+; RV64I-NEXT: srliw s2, a4, 16
+; RV64I-NEXT: srliw s6, a4, 24
+; RV64I-NEXT: srli s4, a4, 48
+; RV64I-NEXT: srli s7, a4, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a1, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a1, a5, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a1, a3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a1, a7, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a1, a4, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 112
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_32bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu t0, 4(a0)
+; RV32I-NEXT: lbu t1, 5(a0)
+; RV32I-NEXT: lbu t2, 6(a0)
+; RV32I-NEXT: lbu t3, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a3
+; RV32I-NEXT: lbu a6, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or a5, a7, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: lbu t3, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: li s9, 64
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t3
+; RV32I-NEXT: li t4, 32
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or t3, a5, a4
+; RV32I-NEXT: or a5, t2, a7
+; RV32I-NEXT: or a4, a1, t0
+; RV32I-NEXT: slli a4, a4, 3
+; RV32I-NEXT: neg s10, a4
+; RV32I-NEXT: srl t5, t3, s10
+; RV32I-NEXT: sll s5, a5, a4
+; RV32I-NEXT: bltu a4, t4, .LBB15_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: sll a7, t3, a4
+; RV32I-NEXT: j .LBB15_3
+; RV32I-NEXT: .LBB15_2:
+; RV32I-NEXT: sll s8, t3, a4
+; RV32I-NEXT: or a7, t5, s5
+; RV32I-NEXT: .LBB15_3:
+; RV32I-NEXT: lbu t2, 9(a0)
+; RV32I-NEXT: lbu a1, 10(a0)
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t6, a3, 8
+; RV32I-NEXT: sub s6, s9, a4
+; RV32I-NEXT: mv a3, a5
+; RV32I-NEXT: beqz a4, .LBB15_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, a7
+; RV32I-NEXT: .LBB15_5:
+; RV32I-NEXT: slli a7, t2, 8
+; RV32I-NEXT: or a6, a6, a1
+; RV32I-NEXT: lbu t2, 8(a0)
+; RV32I-NEXT: lbu a1, 12(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t6, t0
+; RV32I-NEXT: neg t6, s6
+; RV32I-NEXT: sw t6, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s6, t4, .LBB15_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl t6, a5, s6
+; RV32I-NEXT: j .LBB15_8
+; RV32I-NEXT: .LBB15_7:
+; RV32I-NEXT: sll t6, a5, t6
+; RV32I-NEXT: or t6, t5, t6
+; RV32I-NEXT: .LBB15_8:
+; RV32I-NEXT: or a7, a7, t2
+; RV32I-NEXT: slli t2, a6, 16
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: mv a6, t3
+; RV32I-NEXT: beqz s6, .LBB15_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv a6, t6
+; RV32I-NEXT: .LBB15_10:
+; RV32I-NEXT: or t1, t2, a7
+; RV32I-NEXT: or t2, t0, a1
+; RV32I-NEXT: bltu s6, t4, .LBB15_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: j .LBB15_13
+; RV32I-NEXT: .LBB15_12:
+; RV32I-NEXT: srl a7, a5, s10
+; RV32I-NEXT: .LBB15_13:
+; RV32I-NEXT: srl s0, t1, s10
+; RV32I-NEXT: sll a1, t2, a4
+; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t4, .LBB15_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sll a1, t1, a4
+; RV32I-NEXT: j .LBB15_16
+; RV32I-NEXT: .LBB15_15:
+; RV32I-NEXT: sll s1, t1, a4
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: .LBB15_16:
+; RV32I-NEXT: addi s7, a4, -64
+; RV32I-NEXT: mv s3, t2
+; RV32I-NEXT: beqz a4, .LBB15_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: .LBB15_18:
+; RV32I-NEXT: neg a1, s7
+; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s7, t4, .LBB15_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: sll a1, t3, s7
+; RV32I-NEXT: mv s4, a5
+; RV32I-NEXT: bnez s7, .LBB15_21
+; RV32I-NEXT: j .LBB15_22
+; RV32I-NEXT: .LBB15_20:
+; RV32I-NEXT: sll s2, t3, a4
+; RV32I-NEXT: srl a1, t3, a1
+; RV32I-NEXT: or a1, a1, s5
+; RV32I-NEXT: mv s4, a5
+; RV32I-NEXT: beqz s7, .LBB15_22
+; RV32I-NEXT: .LBB15_21:
+; RV32I-NEXT: mv s4, a1
+; RV32I-NEXT: .LBB15_22:
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: bltu a4, s9, .LBB15_24
+; RV32I-NEXT: # %bb.23:
+; RV32I-NEXT: sw zero, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: j .LBB15_25
+; RV32I-NEXT: .LBB15_24:
+; RV32I-NEXT: sw s8, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s2, a6, s1
+; RV32I-NEXT: or s4, a7, s3
+; RV32I-NEXT: .LBB15_25:
+; RV32I-NEXT: sub ra, a1, a4
+; RV32I-NEXT: mv a7, t1
+; RV32I-NEXT: mv a6, t2
+; RV32I-NEXT: beqz a4, .LBB15_27
+; RV32I-NEXT: # %bb.26:
+; RV32I-NEXT: mv a7, s2
+; RV32I-NEXT: mv a6, s4
+; RV32I-NEXT: .LBB15_27:
+; RV32I-NEXT: neg s1, ra
+; RV32I-NEXT: sll s2, t2, s1
+; RV32I-NEXT: bltu ra, t4, .LBB15_29
+; RV32I-NEXT: # %bb.28:
+; RV32I-NEXT: srl a1, t2, ra
+; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bnez ra, .LBB15_30
+; RV32I-NEXT: j .LBB15_31
+; RV32I-NEXT: .LBB15_29:
+; RV32I-NEXT: or a1, s0, s2
+; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: beqz ra, .LBB15_31
+; RV32I-NEXT: .LBB15_30:
+; RV32I-NEXT: sw a1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB15_31:
+; RV32I-NEXT: bltu ra, t4, .LBB15_33
+; RV32I-NEXT: # %bb.32:
+; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: srl a1, a5, ra
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: bnez ra, .LBB15_34
+; RV32I-NEXT: j .LBB15_35
+; RV32I-NEXT: .LBB15_33:
+; RV32I-NEXT: srl a1, t2, s10
+; RV32I-NEXT: sw a1, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sll a1, a5, s1
+; RV32I-NEXT: or a1, t5, a1
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: beqz ra, .LBB15_35
+; RV32I-NEXT: .LBB15_34:
+; RV32I-NEXT: mv t5, a1
+; RV32I-NEXT: .LBB15_35:
+; RV32I-NEXT: sub s3, s9, ra
+; RV32I-NEXT: bltu ra, t4, .LBB15_38
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s3, t4, .LBB15_39
+; RV32I-NEXT: .LBB15_37:
+; RV32I-NEXT: sll s1, t1, s1
+; RV32I-NEXT: neg a1, s3
+; RV32I-NEXT: srl a1, t1, a1
+; RV32I-NEXT: or a1, a1, s2
+; RV32I-NEXT: j .LBB15_40
+; RV32I-NEXT: .LBB15_38:
+; RV32I-NEXT: srl a1, a5, s10
+; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s3, t4, .LBB15_37
+; RV32I-NEXT: .LBB15_39:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sll a1, t1, s3
+; RV32I-NEXT: .LBB15_40:
+; RV32I-NEXT: addi s4, ra, -64
+; RV32I-NEXT: mv s2, t2
+; RV32I-NEXT: beqz s3, .LBB15_42
+; RV32I-NEXT: # %bb.41:
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: .LBB15_42:
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv s5, a7
+; RV32I-NEXT: bltu s4, t4, .LBB15_44
+; RV32I-NEXT: # %bb.43:
+; RV32I-NEXT: srl t0, t2, s4
+; RV32I-NEXT: j .LBB15_45
+; RV32I-NEXT: .LBB15_44:
+; RV32I-NEXT: srl a1, t1, ra
+; RV32I-NEXT: neg t0, s4
+; RV32I-NEXT: sll t0, t2, t0
+; RV32I-NEXT: or t0, a1, t0
+; RV32I-NEXT: .LBB15_45:
+; RV32I-NEXT: mv s0, s10
+; RV32I-NEXT: mv a7, a6
+; RV32I-NEXT: lbu s8, 19(a0)
+; RV32I-NEXT: lbu a1, 23(a0)
+; RV32I-NEXT: mv s3, t1
+; RV32I-NEXT: beqz s4, .LBB15_47
+; RV32I-NEXT: # %bb.46:
+; RV32I-NEXT: mv s3, t0
+; RV32I-NEXT: .LBB15_47:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: lbu s10, 17(a0)
+; RV32I-NEXT: lbu t0, 18(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu t6, 22(a0)
+; RV32I-NEXT: slli s8, s8, 8
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: li a3, 64
+; RV32I-NEXT: bltu s4, t4, .LBB15_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: j .LBB15_50
+; RV32I-NEXT: .LBB15_49:
+; RV32I-NEXT: srl s4, t2, ra
+; RV32I-NEXT: .LBB15_50:
+; RV32I-NEXT: or s11, s8, t0
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: slli s10, s10, 8
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t6, a1, t6
+; RV32I-NEXT: bgeu ra, a3, .LBB15_52
+; RV32I-NEXT: # %bb.51:
+; RV32I-NEXT: or s3, t5, s1
+; RV32I-NEXT: lw a1, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s4, a1, s2
+; RV32I-NEXT: .LBB15_52:
+; RV32I-NEXT: or a1, s10, t0
+; RV32I-NEXT: slli s11, s11, 16
+; RV32I-NEXT: or t0, s9, s8
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: mv s1, a5
+; RV32I-NEXT: mv a3, a6
+; RV32I-NEXT: beqz ra, .LBB15_54
+; RV32I-NEXT: # %bb.53:
+; RV32I-NEXT: mv t5, s3
+; RV32I-NEXT: mv s1, s4
+; RV32I-NEXT: .LBB15_54:
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s2, s11, a1
+; RV32I-NEXT: or s1, t6, t0
+; RV32I-NEXT: li a1, 64
+; RV32I-NEXT: mv a6, a7
+; RV32I-NEXT: mv a7, s0
+; RV32I-NEXT: bltu ra, a1, .LBB15_56
+; RV32I-NEXT: # %bb.55:
+; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB15_56:
+; RV32I-NEXT: srl s3, s2, a7
+; RV32I-NEXT: sll ra, s1, a4
+; RV32I-NEXT: mv a7, s5
+; RV32I-NEXT: sw t5, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t4, .LBB15_58
+; RV32I-NEXT: # %bb.57:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sll a1, s2, a4
+; RV32I-NEXT: j .LBB15_59
+; RV32I-NEXT: .LBB15_58:
+; RV32I-NEXT: sll a1, s2, a4
+; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or a1, s3, ra
+; RV32I-NEXT: .LBB15_59:
+; RV32I-NEXT: lbu s9, 27(a0)
+; RV32I-NEXT: lbu t6, 31(a0)
+; RV32I-NEXT: mv t5, s1
+; RV32I-NEXT: beqz a4, .LBB15_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: mv t5, a1
+; RV32I-NEXT: .LBB15_61:
+; RV32I-NEXT: lbu s8, 25(a0)
+; RV32I-NEXT: lbu s4, 26(a0)
+; RV32I-NEXT: lbu s11, 29(a0)
+; RV32I-NEXT: lbu s10, 30(a0)
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: bltu s6, t4, .LBB15_63
+; RV32I-NEXT: # %bb.62:
+; RV32I-NEXT: srl t0, s1, s6
+; RV32I-NEXT: j .LBB15_64
+; RV32I-NEXT: .LBB15_63:
+; RV32I-NEXT: lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: sll a1, s1, a1
+; RV32I-NEXT: or t0, s3, a1
+; RV32I-NEXT: .LBB15_64:
+; RV32I-NEXT: slli s8, s8, 8
+; RV32I-NEXT: lbu s3, 24(a0)
+; RV32I-NEXT: lbu a1, 28(a0)
+; RV32I-NEXT: or s4, s9, s4
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or t6, t6, s10
+; RV32I-NEXT: mv s9, s2
+; RV32I-NEXT: beqz s6, .LBB15_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: mv s9, t0
+; RV32I-NEXT: .LBB15_66:
+; RV32I-NEXT: or a0, s8, s3
+; RV32I-NEXT: slli t0, s4, 16
+; RV32I-NEXT: or a1, s11, a1
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: bltu s6, t4, .LBB15_68
+; RV32I-NEXT: # %bb.67:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: j .LBB15_69
+; RV32I-NEXT: .LBB15_68:
+; RV32I-NEXT: srl s4, s1, s0
+; RV32I-NEXT: .LBB15_69:
+; RV32I-NEXT: li s11, 64
+; RV32I-NEXT: or s6, t0, a0
+; RV32I-NEXT: or a0, t6, a1
+; RV32I-NEXT: bltu a4, t4, .LBB15_71
+; RV32I-NEXT: # %bb.70:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a1, s6, a4
+; RV32I-NEXT: mv s10, a0
+; RV32I-NEXT: bnez a4, .LBB15_72
+; RV32I-NEXT: j .LBB15_73
+; RV32I-NEXT: .LBB15_71:
+; RV32I-NEXT: sll s3, s6, a4
+; RV32I-NEXT: srl a1, s6, s0
+; RV32I-NEXT: sll t0, a0, a4
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: mv s10, a0
+; RV32I-NEXT: beqz a4, .LBB15_73
+; RV32I-NEXT: .LBB15_72:
+; RV32I-NEXT: mv s10, a1
+; RV32I-NEXT: .LBB15_73:
+; RV32I-NEXT: bltu s7, t4, .LBB15_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: sll a1, s2, s7
+; RV32I-NEXT: mv s0, s1
+; RV32I-NEXT: bnez s7, .LBB15_76
+; RV32I-NEXT: j .LBB15_77
+; RV32I-NEXT: .LBB15_75:
+; RV32I-NEXT: sll s5, s2, a4
+; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: srl a1, s2, a1
+; RV32I-NEXT: or a1, a1, ra
+; RV32I-NEXT: mv s0, s1
+; RV32I-NEXT: beqz s7, .LBB15_77
+; RV32I-NEXT: .LBB15_76:
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: .LBB15_77:
+; RV32I-NEXT: bltu a4, s11, .LBB15_79
+; RV32I-NEXT: # %bb.78:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: j .LBB15_80
+; RV32I-NEXT: .LBB15_79:
+; RV32I-NEXT: or s5, s9, s3
+; RV32I-NEXT: or s0, s4, s10
+; RV32I-NEXT: .LBB15_80:
+; RV32I-NEXT: addi s9, a4, -128
+; RV32I-NEXT: mv s7, s6
+; RV32I-NEXT: mv s8, a0
+; RV32I-NEXT: beqz a4, .LBB15_82
+; RV32I-NEXT: # %bb.81:
+; RV32I-NEXT: mv s7, s5
+; RV32I-NEXT: mv s8, s0
+; RV32I-NEXT: .LBB15_82:
+; RV32I-NEXT: neg s3, s9
+; RV32I-NEXT: srl s0, t3, s3
+; RV32I-NEXT: bltu s9, t4, .LBB15_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: sll a1, t3, s9
+; RV32I-NEXT: j .LBB15_85
+; RV32I-NEXT: .LBB15_84:
+; RV32I-NEXT: sll s5, t3, a4
+; RV32I-NEXT: lw a1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: .LBB15_85:
+; RV32I-NEXT: sub s4, s11, s9
+; RV32I-NEXT: mv t6, a5
+; RV32I-NEXT: beqz s9, .LBB15_87
+; RV32I-NEXT: # %bb.86:
+; RV32I-NEXT: mv t6, a1
+; RV32I-NEXT: .LBB15_87:
+; RV32I-NEXT: bltu s4, t4, .LBB15_89
+; RV32I-NEXT: # %bb.88:
+; RV32I-NEXT: srl a1, a5, s4
+; RV32I-NEXT: mv s0, t3
+; RV32I-NEXT: bnez s4, .LBB15_90
+; RV32I-NEXT: j .LBB15_91
+; RV32I-NEXT: .LBB15_89:
+; RV32I-NEXT: neg a1, s4
+; RV32I-NEXT: sll a1, a5, a1
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: mv s0, t3
+; RV32I-NEXT: beqz s4, .LBB15_91
+; RV32I-NEXT: .LBB15_90:
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: .LBB15_91:
+; RV32I-NEXT: bltu s4, t4, .LBB15_94
+; RV32I-NEXT: # %bb.92:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: li ra, 64
+; RV32I-NEXT: bgeu s9, t4, .LBB15_95
+; RV32I-NEXT: .LBB15_93:
+; RV32I-NEXT: sll s10, t1, a4
+; RV32I-NEXT: srl a1, t1, s3
+; RV32I-NEXT: lw t0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: j .LBB15_96
+; RV32I-NEXT: .LBB15_94:
+; RV32I-NEXT: srl s4, a5, s3
+; RV32I-NEXT: li ra, 64
+; RV32I-NEXT: bltu s9, t4, .LBB15_93
+; RV32I-NEXT: .LBB15_95:
+; RV32I-NEXT: li s10, 0
+; RV32I-NEXT: sll a1, t1, s9
+; RV32I-NEXT: .LBB15_96:
+; RV32I-NEXT: addi s11, s9, -64
+; RV32I-NEXT: mv s3, t2
+; RV32I-NEXT: beqz s9, .LBB15_98
+; RV32I-NEXT: # %bb.97:
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: .LBB15_98:
+; RV32I-NEXT: bltu s11, t4, .LBB15_100
+; RV32I-NEXT: # %bb.99:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: sll a1, t3, s11
+; RV32I-NEXT: bnez s11, .LBB15_101
+; RV32I-NEXT: j .LBB15_102
+; RV32I-NEXT: .LBB15_100:
+; RV32I-NEXT: sll t4, t3, s9
+; RV32I-NEXT: neg a1, s11
+; RV32I-NEXT: srl a1, t3, a1
+; RV32I-NEXT: sll t0, a5, s9
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: beqz s11, .LBB15_102
+; RV32I-NEXT: .LBB15_101:
+; RV32I-NEXT: mv a5, a1
+; RV32I-NEXT: .LBB15_102:
+; RV32I-NEXT: bltu s9, ra, .LBB15_104
+; RV32I-NEXT: # %bb.103:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: bnez s9, .LBB15_105
+; RV32I-NEXT: j .LBB15_106
+; RV32I-NEXT: .LBB15_104:
+; RV32I-NEXT: or t4, s0, s10
+; RV32I-NEXT: or a5, s4, s3
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: beqz s9, .LBB15_106
+; RV32I-NEXT: .LBB15_105:
+; RV32I-NEXT: mv t1, t4
+; RV32I-NEXT: mv t2, a5
+; RV32I-NEXT: .LBB15_106:
+; RV32I-NEXT: bltu a4, a1, .LBB15_108
+; RV32I-NEXT: # %bb.107:
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: bnez a4, .LBB15_109
+; RV32I-NEXT: j .LBB15_110
+; RV32I-NEXT: .LBB15_108:
+; RV32I-NEXT: lw a1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s5, a1, a5
+; RV32I-NEXT: lw a1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t6, a1, t5
+; RV32I-NEXT: lw a1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t1, a1, s7
+; RV32I-NEXT: lw a1, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t2, a1, s8
+; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: beqz a4, .LBB15_110
+; RV32I-NEXT: .LBB15_109:
+; RV32I-NEXT: mv s2, s5
+; RV32I-NEXT: mv s1, t6
+; RV32I-NEXT: mv s6, t1
+; RV32I-NEXT: mv a0, t2
+; RV32I-NEXT: .LBB15_110:
+; RV32I-NEXT: srli a4, ra, 16
+; RV32I-NEXT: lui t2, 16
+; RV32I-NEXT: srli t1, ra, 24
+; RV32I-NEXT: srli a5, a3, 16
+; RV32I-NEXT: srli t4, a3, 24
+; RV32I-NEXT: srli t0, a7, 16
+; RV32I-NEXT: srli s0, a7, 24
+; RV32I-NEXT: srli t3, a6, 16
+; RV32I-NEXT: srli s3, a6, 24
+; RV32I-NEXT: srli t6, s2, 16
+; RV32I-NEXT: srli a1, s2, 24
+; RV32I-NEXT: srli t5, s1, 16
+; RV32I-NEXT: srli s5, s1, 24
+; RV32I-NEXT: srli s4, s6, 16
+; RV32I-NEXT: srli s7, s6, 24
+; RV32I-NEXT: srli s8, a0, 16
+; RV32I-NEXT: srli s9, a0, 24
+; RV32I-NEXT: addi t2, t2, -1
+; RV32I-NEXT: and s10, ra, t2
+; RV32I-NEXT: and s11, a3, t2
+; RV32I-NEXT: srli s10, s10, 8
+; RV32I-NEXT: sb ra, 0(a2)
+; RV32I-NEXT: sb s10, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb t1, 3(a2)
+; RV32I-NEXT: and a4, a7, t2
+; RV32I-NEXT: srli t1, s11, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb t1, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t4, 7(a2)
+; RV32I-NEXT: and a3, a6, t2
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a7, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb t0, 10(a2)
+; RV32I-NEXT: sb s0, 11(a2)
+; RV32I-NEXT: and a4, s2, t2
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a6, 12(a2)
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a3, s1, t2
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s2, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb t6, 18(a2)
+; RV32I-NEXT: sb a1, 19(a2)
+; RV32I-NEXT: and a1, s6, t2
+; RV32I-NEXT: and a4, a0, t2
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s1, 20(a2)
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: sb t5, 22(a2)
+; RV32I-NEXT: sb s5, 23(a2)
+; RV32I-NEXT: sb s6, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s4, 26(a2)
+; RV32I-NEXT: sb s7, 27(a2)
+; RV32I-NEXT: sb a0, 28(a2)
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb s8, 30(a2)
+; RV32I-NEXT: sb s9, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
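+; Same pattern, but the shift amount is loaded as a word offset and scaled by 32
+; (shl 5) to a bit offset before the 256-bit shift left.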
+define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -112
+; RV64I-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 12(a0)
+; RV64I-NEXT: lbu s0, 13(a0)
+; RV64I-NEXT: lbu s1, 14(a0)
+; RV64I-NEXT: lbu s2, 15(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or s3, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 1(a1)
+; RV64I-NEXT: lbu t2, 2(a1)
+; RV64I-NEXT: lbu t3, 3(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: or t6, t6, t5
+; RV64I-NEXT: or s0, s0, a4
+; RV64I-NEXT: or s1, s2, s1
+; RV64I-NEXT: lbu a4, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu t5, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: or t1, t3, t2
+; RV64I-NEXT: or t2, t4, a4
+; RV64I-NEXT: or a1, a1, t5
+; RV64I-NEXT: lbu t5, 19(a0)
+; RV64I-NEXT: lbu t4, 21(a0)
+; RV64I-NEXT: lbu a4, 22(a0)
+; RV64I-NEXT: lbu t3, 23(a0)
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t6, t6, 16
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: or s4, s3, a3
+; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a6, t6, a6
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: lbu t6, 29(a0)
+; RV64I-NEXT: lbu a3, 30(a0)
+; RV64I-NEXT: lbu s2, 31(a0)
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or s5, t1, t0
+; RV64I-NEXT: li a7, 128
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t2
+; RV64I-NEXT: li t0, 64
+; RV64I-NEXT: slli s3, t3, 8
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or t1, a5, s4
+; RV64I-NEXT: or a5, s0, a6
+; RV64I-NEXT: or a6, a1, s5
+; RV64I-NEXT: slli a6, a6, 5
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: srl s0, t1, t3
+; RV64I-NEXT: bltu a6, t0, .LBB16_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: sll s4, t1, t2
+; RV64I-NEXT: j .LBB16_3
+; RV64I-NEXT: .LBB16_2:
+; RV64I-NEXT: sll a1, t1, a6
+; RV64I-NEXT: sll s4, a5, a6
+; RV64I-NEXT: or s4, s0, s4
+; RV64I-NEXT: .LBB16_3:
+; RV64I-NEXT: slli t5, t5, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or s3, s3, a4
+; RV64I-NEXT: lbu ra, 17(a0)
+; RV64I-NEXT: lbu s11, 18(a0)
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s5, 25(a0)
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: lbu s7, 26(a0)
+; RV64I-NEXT: lbu s6, 28(a0)
+; RV64I-NEXT: slli s10, t6, 8
+; RV64I-NEXT: or s9, s2, a3
+; RV64I-NEXT: sub a4, a7, a6
+; RV64I-NEXT: mv a3, a5
+; RV64I-NEXT: beqz a6, .LBB16_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a3, s4
+; RV64I-NEXT: .LBB16_5:
+; RV64I-NEXT: slli t6, ra, 8
+; RV64I-NEXT: or t5, t5, s11
+; RV64I-NEXT: or t4, t4, s8
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: lbu s8, 16(a0)
+; RV64I-NEXT: lbu a0, 24(a0)
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: or s2, s1, s7
+; RV64I-NEXT: or s1, s10, s6
+; RV64I-NEXT: slli s4, s9, 16
+; RV64I-NEXT: bltu a4, t0, .LBB16_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: sub s0, a4, t0
+; RV64I-NEXT: srl s0, a5, s0
+; RV64I-NEXT: j .LBB16_8
+; RV64I-NEXT: .LBB16_7:
+; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: sll s6, a5, s6
+; RV64I-NEXT: or s0, s0, s6
+; RV64I-NEXT: .LBB16_8:
+; RV64I-NEXT: or t6, t6, s8
+; RV64I-NEXT: slli s6, t5, 16
+; RV64I-NEXT: or s3, s3, t4
+; RV64I-NEXT: or t5, s5, a0
+; RV64I-NEXT: slli s2, s2, 16
+; RV64I-NEXT: or s1, s4, s1
+; RV64I-NEXT: mv t4, t1
+; RV64I-NEXT: beqz a4, .LBB16_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t4, s0
+; RV64I-NEXT: .LBB16_10:
+; RV64I-NEXT: or a0, s6, t6
+; RV64I-NEXT: slli s0, s3, 32
+; RV64I-NEXT: or t6, s2, t5
+; RV64I-NEXT: slli s1, s1, 32
+; RV64I-NEXT: bltu a4, t0, .LBB16_12
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: j .LBB16_13
+; RV64I-NEXT: .LBB16_12:
+; RV64I-NEXT: srl t5, a5, t3
+; RV64I-NEXT: .LBB16_13:
+; RV64I-NEXT: or a4, s0, a0
+; RV64I-NEXT: or a0, s1, t6
+; RV64I-NEXT: bltu a6, t0, .LBB16_15
+; RV64I-NEXT: # %bb.14:
+; RV64I-NEXT: li t6, 0
+; RV64I-NEXT: sll t2, a4, t2
+; RV64I-NEXT: j .LBB16_16
+; RV64I-NEXT: .LBB16_15:
+; RV64I-NEXT: sll t6, a4, a6
+; RV64I-NEXT: srl t2, a4, t3
+; RV64I-NEXT: sll t3, a0, a6
+; RV64I-NEXT: or t2, t2, t3
+; RV64I-NEXT: .LBB16_16:
+; RV64I-NEXT: sub s0, a6, a7
+; RV64I-NEXT: mv t3, a0
+; RV64I-NEXT: beqz a6, .LBB16_18
+; RV64I-NEXT: # %bb.17:
+; RV64I-NEXT: mv t3, t2
+; RV64I-NEXT: .LBB16_18:
+; RV64I-NEXT: bltu s0, t0, .LBB16_20
+; RV64I-NEXT: # %bb.19:
+; RV64I-NEXT: li t2, 0
+; RV64I-NEXT: sub t0, s0, t0
+; RV64I-NEXT: sll t0, t1, t0
+; RV64I-NEXT: bnez s0, .LBB16_21
+; RV64I-NEXT: j .LBB16_22
+; RV64I-NEXT: .LBB16_20:
+; RV64I-NEXT: sll t2, t1, s0
+; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: srl t0, t1, t0
+; RV64I-NEXT: sll t1, a5, s0
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: beqz s0, .LBB16_22
+; RV64I-NEXT: .LBB16_21:
+; RV64I-NEXT: mv a5, t0
+; RV64I-NEXT: .LBB16_22:
+; RV64I-NEXT: bltu a6, a7, .LBB16_24
+; RV64I-NEXT: # %bb.23:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: li a3, 0
+; RV64I-NEXT: bnez a6, .LBB16_25
+; RV64I-NEXT: j .LBB16_26
+; RV64I-NEXT: .LBB16_24:
+; RV64I-NEXT: or t2, t4, t6
+; RV64I-NEXT: or a5, t5, t3
+; RV64I-NEXT: beqz a6, .LBB16_26
+; RV64I-NEXT: .LBB16_25:
+; RV64I-NEXT: mv a4, t2
+; RV64I-NEXT: mv a0, a5
+; RV64I-NEXT: .LBB16_26:
+; RV64I-NEXT: srli a5, a1, 32
+; RV64I-NEXT: srliw a6, a1, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a1, 24
+; RV64I-NEXT: srli t0, a1, 48
+; RV64I-NEXT: srli t5, a1, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a4, 32
+; RV64I-NEXT: srliw s2, a4, 16
+; RV64I-NEXT: srliw s6, a4, 24
+; RV64I-NEXT: srli s4, a4, 48
+; RV64I-NEXT: srli s7, a4, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a1, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a1, a5, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a1, a3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a1, a7, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a1, a4, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 112
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_32bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu t0, 4(a0)
+; RV32I-NEXT: lbu t1, 5(a0)
+; RV32I-NEXT: lbu t2, 6(a0)
+; RV32I-NEXT: lbu t3, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a3
+; RV32I-NEXT: lbu a6, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or a5, a7, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: lbu t3, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: li s9, 64
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t3
+; RV32I-NEXT: li t4, 32
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or t3, a5, a4
+; RV32I-NEXT: or a5, t2, a7
+; RV32I-NEXT: or a4, a1, t0
+; RV32I-NEXT: slli a4, a4, 5
+; RV32I-NEXT: neg s10, a4
+; RV32I-NEXT: srl t5, t3, s10
+; RV32I-NEXT: sll s5, a5, a4
+; RV32I-NEXT: bltu a4, t4, .LBB16_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: sll a7, t3, a4
+; RV32I-NEXT: j .LBB16_3
+; RV32I-NEXT: .LBB16_2:
+; RV32I-NEXT: sll s8, t3, a4
+; RV32I-NEXT: or a7, t5, s5
+; RV32I-NEXT: .LBB16_3:
+; RV32I-NEXT: lbu t2, 9(a0)
+; RV32I-NEXT: lbu a1, 10(a0)
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t6, a3, 8
+; RV32I-NEXT: sub s6, s9, a4
+; RV32I-NEXT: mv a3, a5
+; RV32I-NEXT: beqz a4, .LBB16_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, a7
+; RV32I-NEXT: .LBB16_5:
+; RV32I-NEXT: slli a7, t2, 8
+; RV32I-NEXT: or a6, a6, a1
+; RV32I-NEXT: lbu t2, 8(a0)
+; RV32I-NEXT: lbu a1, 12(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t6, t0
+; RV32I-NEXT: neg t6, s6
+; RV32I-NEXT: sw t6, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s6, t4, .LBB16_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl t6, a5, s6
+; RV32I-NEXT: j .LBB16_8
+; RV32I-NEXT: .LBB16_7:
+; RV32I-NEXT: sll t6, a5, t6
+; RV32I-NEXT: or t6, t5, t6
+; RV32I-NEXT: .LBB16_8:
+; RV32I-NEXT: or a7, a7, t2
+; RV32I-NEXT: slli t2, a6, 16
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: mv a6, t3
+; RV32I-NEXT: beqz s6, .LBB16_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv a6, t6
+; RV32I-NEXT: .LBB16_10:
+; RV32I-NEXT: or t1, t2, a7
+; RV32I-NEXT: or t2, t0, a1
+; RV32I-NEXT: bltu s6, t4, .LBB16_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: j .LBB16_13
+; RV32I-NEXT: .LBB16_12:
+; RV32I-NEXT: srl a7, a5, s10
+; RV32I-NEXT: .LBB16_13:
+; RV32I-NEXT: srl s0, t1, s10
+; RV32I-NEXT: sll a1, t2, a4
+; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t4, .LBB16_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sll a1, t1, a4
+; RV32I-NEXT: j .LBB16_16
+; RV32I-NEXT: .LBB16_15:
+; RV32I-NEXT: sll s1, t1, a4
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: .LBB16_16:
+; RV32I-NEXT: addi s7, a4, -64
+; RV32I-NEXT: mv s3, t2
+; RV32I-NEXT: beqz a4, .LBB16_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: .LBB16_18:
+; RV32I-NEXT: neg a1, s7
+; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s7, t4, .LBB16_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: sll a1, t3, s7
+; RV32I-NEXT: mv s4, a5
+; RV32I-NEXT: bnez s7, .LBB16_21
+; RV32I-NEXT: j .LBB16_22
+; RV32I-NEXT: .LBB16_20:
+; RV32I-NEXT: sll s2, t3, a4
+; RV32I-NEXT: srl a1, t3, a1
+; RV32I-NEXT: or a1, a1, s5
+; RV32I-NEXT: mv s4, a5
+; RV32I-NEXT: beqz s7, .LBB16_22
+; RV32I-NEXT: .LBB16_21:
+; RV32I-NEXT: mv s4, a1
+; RV32I-NEXT: .LBB16_22:
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: bltu a4, s9, .LBB16_24
+; RV32I-NEXT: # %bb.23:
+; RV32I-NEXT: sw zero, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: j .LBB16_25
+; RV32I-NEXT: .LBB16_24:
+; RV32I-NEXT: sw s8, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s2, a6, s1
+; RV32I-NEXT: or s4, a7, s3
+; RV32I-NEXT: .LBB16_25:
+; RV32I-NEXT: sub ra, a1, a4
+; RV32I-NEXT: mv a7, t1
+; RV32I-NEXT: mv a6, t2
+; RV32I-NEXT: beqz a4, .LBB16_27
+; RV32I-NEXT: # %bb.26:
+; RV32I-NEXT: mv a7, s2
+; RV32I-NEXT: mv a6, s4
+; RV32I-NEXT: .LBB16_27:
+; RV32I-NEXT: neg s1, ra
+; RV32I-NEXT: sll s2, t2, s1
+; RV32I-NEXT: bltu ra, t4, .LBB16_29
+; RV32I-NEXT: # %bb.28:
+; RV32I-NEXT: srl a1, t2, ra
+; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bnez ra, .LBB16_30
+; RV32I-NEXT: j .LBB16_31
+; RV32I-NEXT: .LBB16_29:
+; RV32I-NEXT: or a1, s0, s2
+; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: beqz ra, .LBB16_31
+; RV32I-NEXT: .LBB16_30:
+; RV32I-NEXT: sw a1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB16_31:
+; RV32I-NEXT: bltu ra, t4, .LBB16_33
+; RV32I-NEXT: # %bb.32:
+; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: srl a1, a5, ra
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: bnez ra, .LBB16_34
+; RV32I-NEXT: j .LBB16_35
+; RV32I-NEXT: .LBB16_33:
+; RV32I-NEXT: srl a1, t2, s10
+; RV32I-NEXT: sw a1, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sll a1, a5, s1
+; RV32I-NEXT: or a1, t5, a1
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: beqz ra, .LBB16_35
+; RV32I-NEXT: .LBB16_34:
+; RV32I-NEXT: mv t5, a1
+; RV32I-NEXT: .LBB16_35:
+; RV32I-NEXT: sub s3, s9, ra
+; RV32I-NEXT: bltu ra, t4, .LBB16_38
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s3, t4, .LBB16_39
+; RV32I-NEXT: .LBB16_37:
+; RV32I-NEXT: sll s1, t1, s1
+; RV32I-NEXT: neg a1, s3
+; RV32I-NEXT: srl a1, t1, a1
+; RV32I-NEXT: or a1, a1, s2
+; RV32I-NEXT: j .LBB16_40
+; RV32I-NEXT: .LBB16_38:
+; RV32I-NEXT: srl a1, a5, s10
+; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s3, t4, .LBB16_37
+; RV32I-NEXT: .LBB16_39:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sll a1, t1, s3
+; RV32I-NEXT: .LBB16_40:
+; RV32I-NEXT: addi s4, ra, -64
+; RV32I-NEXT: mv s2, t2
+; RV32I-NEXT: beqz s3, .LBB16_42
+; RV32I-NEXT: # %bb.41:
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: .LBB16_42:
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv s5, a7
+; RV32I-NEXT: bltu s4, t4, .LBB16_44
+; RV32I-NEXT: # %bb.43:
+; RV32I-NEXT: srl t0, t2, s4
+; RV32I-NEXT: j .LBB16_45
+; RV32I-NEXT: .LBB16_44:
+; RV32I-NEXT: srl a1, t1, ra
+; RV32I-NEXT: neg t0, s4
+; RV32I-NEXT: sll t0, t2, t0
+; RV32I-NEXT: or t0, a1, t0
+; RV32I-NEXT: .LBB16_45:
+; RV32I-NEXT: mv s0, s10
+; RV32I-NEXT: mv a7, a6
+; RV32I-NEXT: lbu s8, 19(a0)
+; RV32I-NEXT: lbu a1, 23(a0)
+; RV32I-NEXT: mv s3, t1
+; RV32I-NEXT: beqz s4, .LBB16_47
+; RV32I-NEXT: # %bb.46:
+; RV32I-NEXT: mv s3, t0
+; RV32I-NEXT: .LBB16_47:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: lbu s10, 17(a0)
+; RV32I-NEXT: lbu t0, 18(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu t6, 22(a0)
+; RV32I-NEXT: slli s8, s8, 8
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: li a3, 64
+; RV32I-NEXT: bltu s4, t4, .LBB16_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: j .LBB16_50
+; RV32I-NEXT: .LBB16_49:
+; RV32I-NEXT: srl s4, t2, ra
+; RV32I-NEXT: .LBB16_50:
+; RV32I-NEXT: or s11, s8, t0
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: slli s10, s10, 8
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t6, a1, t6
+; RV32I-NEXT: bgeu ra, a3, .LBB16_52
+; RV32I-NEXT: # %bb.51:
+; RV32I-NEXT: or s3, t5, s1
+; RV32I-NEXT: lw a1, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s4, a1, s2
+; RV32I-NEXT: .LBB16_52:
+; RV32I-NEXT: or a1, s10, t0
+; RV32I-NEXT: slli s11, s11, 16
+; RV32I-NEXT: or t0, s9, s8
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: mv s1, a5
+; RV32I-NEXT: mv a3, a6
+; RV32I-NEXT: beqz ra, .LBB16_54
+; RV32I-NEXT: # %bb.53:
+; RV32I-NEXT: mv t5, s3
+; RV32I-NEXT: mv s1, s4
+; RV32I-NEXT: .LBB16_54:
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s2, s11, a1
+; RV32I-NEXT: or s1, t6, t0
+; RV32I-NEXT: li a1, 64
+; RV32I-NEXT: mv a6, a7
+; RV32I-NEXT: mv a7, s0
+; RV32I-NEXT: bltu ra, a1, .LBB16_56
+; RV32I-NEXT: # %bb.55:
+; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB16_56:
+; RV32I-NEXT: srl s3, s2, a7
+; RV32I-NEXT: sll ra, s1, a4
+; RV32I-NEXT: mv a7, s5
+; RV32I-NEXT: sw t5, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t4, .LBB16_58
+; RV32I-NEXT: # %bb.57:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sll a1, s2, a4
+; RV32I-NEXT: j .LBB16_59
+; RV32I-NEXT: .LBB16_58:
+; RV32I-NEXT: sll a1, s2, a4
+; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or a1, s3, ra
+; RV32I-NEXT: .LBB16_59:
+; RV32I-NEXT: lbu s9, 27(a0)
+; RV32I-NEXT: lbu t6, 31(a0)
+; RV32I-NEXT: mv t5, s1
+; RV32I-NEXT: beqz a4, .LBB16_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: mv t5, a1
+; RV32I-NEXT: .LBB16_61:
+; RV32I-NEXT: lbu s8, 25(a0)
+; RV32I-NEXT: lbu s4, 26(a0)
+; RV32I-NEXT: lbu s11, 29(a0)
+; RV32I-NEXT: lbu s10, 30(a0)
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: bltu s6, t4, .LBB16_63
+; RV32I-NEXT: # %bb.62:
+; RV32I-NEXT: srl t0, s1, s6
+; RV32I-NEXT: j .LBB16_64
+; RV32I-NEXT: .LBB16_63:
+; RV32I-NEXT: lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: sll a1, s1, a1
+; RV32I-NEXT: or t0, s3, a1
+; RV32I-NEXT: .LBB16_64:
+; RV32I-NEXT: slli s8, s8, 8
+; RV32I-NEXT: lbu s3, 24(a0)
+; RV32I-NEXT: lbu a1, 28(a0)
+; RV32I-NEXT: or s4, s9, s4
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or t6, t6, s10
+; RV32I-NEXT: mv s9, s2
+; RV32I-NEXT: beqz s6, .LBB16_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: mv s9, t0
+; RV32I-NEXT: .LBB16_66:
+; RV32I-NEXT: or a0, s8, s3
+; RV32I-NEXT: slli t0, s4, 16
+; RV32I-NEXT: or a1, s11, a1
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: bltu s6, t4, .LBB16_68
+; RV32I-NEXT: # %bb.67:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: j .LBB16_69
+; RV32I-NEXT: .LBB16_68:
+; RV32I-NEXT: srl s4, s1, s0
+; RV32I-NEXT: .LBB16_69:
+; RV32I-NEXT: li s11, 64
+; RV32I-NEXT: or s6, t0, a0
+; RV32I-NEXT: or a0, t6, a1
+; RV32I-NEXT: bltu a4, t4, .LBB16_71
+; RV32I-NEXT: # %bb.70:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a1, s6, a4
+; RV32I-NEXT: mv s10, a0
+; RV32I-NEXT: bnez a4, .LBB16_72
+; RV32I-NEXT: j .LBB16_73
+; RV32I-NEXT: .LBB16_71:
+; RV32I-NEXT: sll s3, s6, a4
+; RV32I-NEXT: srl a1, s6, s0
+; RV32I-NEXT: sll t0, a0, a4
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: mv s10, a0
+; RV32I-NEXT: beqz a4, .LBB16_73
+; RV32I-NEXT: .LBB16_72:
+; RV32I-NEXT: mv s10, a1
+; RV32I-NEXT: .LBB16_73:
+; RV32I-NEXT: bltu s7, t4, .LBB16_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: sll a1, s2, s7
+; RV32I-NEXT: mv s0, s1
+; RV32I-NEXT: bnez s7, .LBB16_76
+; RV32I-NEXT: j .LBB16_77
+; RV32I-NEXT: .LBB16_75:
+; RV32I-NEXT: sll s5, s2, a4
+; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: srl a1, s2, a1
+; RV32I-NEXT: or a1, a1, ra
+; RV32I-NEXT: mv s0, s1
+; RV32I-NEXT: beqz s7, .LBB16_77
+; RV32I-NEXT: .LBB16_76:
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: .LBB16_77:
+; RV32I-NEXT: bltu a4, s11, .LBB16_79
+; RV32I-NEXT: # %bb.78:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: j .LBB16_80
+; RV32I-NEXT: .LBB16_79:
+; RV32I-NEXT: or s5, s9, s3
+; RV32I-NEXT: or s0, s4, s10
+; RV32I-NEXT: .LBB16_80:
+; RV32I-NEXT: addi s9, a4, -128
+; RV32I-NEXT: mv s7, s6
+; RV32I-NEXT: mv s8, a0
+; RV32I-NEXT: beqz a4, .LBB16_82
+; RV32I-NEXT: # %bb.81:
+; RV32I-NEXT: mv s7, s5
+; RV32I-NEXT: mv s8, s0
+; RV32I-NEXT: .LBB16_82:
+; RV32I-NEXT: neg s3, s9
+; RV32I-NEXT: srl s0, t3, s3
+; RV32I-NEXT: bltu s9, t4, .LBB16_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: sll a1, t3, s9
+; RV32I-NEXT: j .LBB16_85
+; RV32I-NEXT: .LBB16_84:
+; RV32I-NEXT: sll s5, t3, a4
+; RV32I-NEXT: lw a1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: .LBB16_85:
+; RV32I-NEXT: sub s4, s11, s9
+; RV32I-NEXT: mv t6, a5
+; RV32I-NEXT: beqz s9, .LBB16_87
+; RV32I-NEXT: # %bb.86:
+; RV32I-NEXT: mv t6, a1
+; RV32I-NEXT: .LBB16_87:
+; RV32I-NEXT: bltu s4, t4, .LBB16_89
+; RV32I-NEXT: # %bb.88:
+; RV32I-NEXT: srl a1, a5, s4
+; RV32I-NEXT: mv s0, t3
+; RV32I-NEXT: bnez s4, .LBB16_90
+; RV32I-NEXT: j .LBB16_91
+; RV32I-NEXT: .LBB16_89:
+; RV32I-NEXT: neg a1, s4
+; RV32I-NEXT: sll a1, a5, a1
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: mv s0, t3
+; RV32I-NEXT: beqz s4, .LBB16_91
+; RV32I-NEXT: .LBB16_90:
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: .LBB16_91:
+; RV32I-NEXT: bltu s4, t4, .LBB16_94
+; RV32I-NEXT: # %bb.92:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: li ra, 64
+; RV32I-NEXT: bgeu s9, t4, .LBB16_95
+; RV32I-NEXT: .LBB16_93:
+; RV32I-NEXT: sll s10, t1, a4
+; RV32I-NEXT: srl a1, t1, s3
+; RV32I-NEXT: lw t0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: j .LBB16_96
+; RV32I-NEXT: .LBB16_94:
+; RV32I-NEXT: srl s4, a5, s3
+; RV32I-NEXT: li ra, 64
+; RV32I-NEXT: bltu s9, t4, .LBB16_93
+; RV32I-NEXT: .LBB16_95:
+; RV32I-NEXT: li s10, 0
+; RV32I-NEXT: sll a1, t1, s9
+; RV32I-NEXT: .LBB16_96:
+; RV32I-NEXT: addi s11, s9, -64
+; RV32I-NEXT: mv s3, t2
+; RV32I-NEXT: beqz s9, .LBB16_98
+; RV32I-NEXT: # %bb.97:
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: .LBB16_98:
+; RV32I-NEXT: bltu s11, t4, .LBB16_100
+; RV32I-NEXT: # %bb.99:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: sll a1, t3, s11
+; RV32I-NEXT: bnez s11, .LBB16_101
+; RV32I-NEXT: j .LBB16_102
+; RV32I-NEXT: .LBB16_100:
+; RV32I-NEXT: sll t4, t3, s9
+; RV32I-NEXT: neg a1, s11
+; RV32I-NEXT: srl a1, t3, a1
+; RV32I-NEXT: sll t0, a5, s9
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: beqz s11, .LBB16_102
+; RV32I-NEXT: .LBB16_101:
+; RV32I-NEXT: mv a5, a1
+; RV32I-NEXT: .LBB16_102:
+; RV32I-NEXT: bltu s9, ra, .LBB16_104
+; RV32I-NEXT: # %bb.103:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: bnez s9, .LBB16_105
+; RV32I-NEXT: j .LBB16_106
+; RV32I-NEXT: .LBB16_104:
+; RV32I-NEXT: or t4, s0, s10
+; RV32I-NEXT: or a5, s4, s3
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: beqz s9, .LBB16_106
+; RV32I-NEXT: .LBB16_105:
+; RV32I-NEXT: mv t1, t4
+; RV32I-NEXT: mv t2, a5
+; RV32I-NEXT: .LBB16_106:
+; RV32I-NEXT: bltu a4, a1, .LBB16_108
+; RV32I-NEXT: # %bb.107:
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: bnez a4, .LBB16_109
+; RV32I-NEXT: j .LBB16_110
+; RV32I-NEXT: .LBB16_108:
+; RV32I-NEXT: lw a1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s5, a1, a5
+; RV32I-NEXT: lw a1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t6, a1, t5
+; RV32I-NEXT: lw a1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t1, a1, s7
+; RV32I-NEXT: lw a1, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t2, a1, s8
+; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: beqz a4, .LBB16_110
+; RV32I-NEXT: .LBB16_109:
+; RV32I-NEXT: mv s2, s5
+; RV32I-NEXT: mv s1, t6
+; RV32I-NEXT: mv s6, t1
+; RV32I-NEXT: mv a0, t2
+; RV32I-NEXT: .LBB16_110:
+; RV32I-NEXT: srli a4, ra, 16
+; RV32I-NEXT: lui t2, 16
+; RV32I-NEXT: srli t1, ra, 24
+; RV32I-NEXT: srli a5, a3, 16
+; RV32I-NEXT: srli t4, a3, 24
+; RV32I-NEXT: srli t0, a7, 16
+; RV32I-NEXT: srli s0, a7, 24
+; RV32I-NEXT: srli t3, a6, 16
+; RV32I-NEXT: srli s3, a6, 24
+; RV32I-NEXT: srli t6, s2, 16
+; RV32I-NEXT: srli a1, s2, 24
+; RV32I-NEXT: srli t5, s1, 16
+; RV32I-NEXT: srli s5, s1, 24
+; RV32I-NEXT: srli s4, s6, 16
+; RV32I-NEXT: srli s7, s6, 24
+; RV32I-NEXT: srli s8, a0, 16
+; RV32I-NEXT: srli s9, a0, 24
+; RV32I-NEXT: addi t2, t2, -1
+; RV32I-NEXT: and s10, ra, t2
+; RV32I-NEXT: and s11, a3, t2
+; RV32I-NEXT: srli s10, s10, 8
+; RV32I-NEXT: sb ra, 0(a2)
+; RV32I-NEXT: sb s10, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb t1, 3(a2)
+; RV32I-NEXT: and a4, a7, t2
+; RV32I-NEXT: srli t1, s11, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb t1, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t4, 7(a2)
+; RV32I-NEXT: and a3, a6, t2
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a7, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb t0, 10(a2)
+; RV32I-NEXT: sb s0, 11(a2)
+; RV32I-NEXT: and a4, s2, t2
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a6, 12(a2)
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a3, s1, t2
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s2, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb t6, 18(a2)
+; RV32I-NEXT: sb a1, 19(a2)
+; RV32I-NEXT: and a1, s6, t2
+; RV32I-NEXT: and a4, a0, t2
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s1, 20(a2)
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: sb t5, 22(a2)
+; RV32I-NEXT: sb s5, 23(a2)
+; RV32I-NEXT: sb s6, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s4, 26(a2)
+; RV32I-NEXT: sb s7, 27(a2)
+; RV32I-NEXT: sb a0, 28(a2)
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb s8, 30(a2)
+; RV32I-NEXT: sb s9, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
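+; Same pattern, but the shift amount is loaded as a dword offset and scaled by 64
+; (shl 6) to a bit offset before the 256-bit shift left.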
+define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes_dwordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -112
+; RV64I-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 12(a0)
+; RV64I-NEXT: lbu s0, 13(a0)
+; RV64I-NEXT: lbu s1, 14(a0)
+; RV64I-NEXT: lbu s2, 15(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or s3, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 1(a1)
+; RV64I-NEXT: lbu t2, 2(a1)
+; RV64I-NEXT: lbu t3, 3(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: or t6, t6, t5
+; RV64I-NEXT: or s0, s0, a4
+; RV64I-NEXT: or s1, s2, s1
+; RV64I-NEXT: lbu a4, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu t5, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: or t1, t3, t2
+; RV64I-NEXT: or t2, t4, a4
+; RV64I-NEXT: or a1, a1, t5
+; RV64I-NEXT: lbu t5, 19(a0)
+; RV64I-NEXT: lbu t4, 21(a0)
+; RV64I-NEXT: lbu a4, 22(a0)
+; RV64I-NEXT: lbu t3, 23(a0)
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t6, t6, 16
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: or s4, s3, a3
+; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a6, t6, a6
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: lbu t6, 29(a0)
+; RV64I-NEXT: lbu a3, 30(a0)
+; RV64I-NEXT: lbu s2, 31(a0)
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or s5, t1, t0
+; RV64I-NEXT: li a7, 128
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t2
+; RV64I-NEXT: li t0, 64
+; RV64I-NEXT: slli s3, t3, 8
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or t1, a5, s4
+; RV64I-NEXT: or a5, s0, a6
+; RV64I-NEXT: or a6, a1, s5
+; RV64I-NEXT: slli a6, a6, 6
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: srl s0, t1, t3
+; RV64I-NEXT: bltu a6, t0, .LBB17_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: sll s4, t1, t2
+; RV64I-NEXT: j .LBB17_3
+; RV64I-NEXT: .LBB17_2:
+; RV64I-NEXT: sll a1, t1, a6
+; RV64I-NEXT: sll s4, a5, a6
+; RV64I-NEXT: or s4, s0, s4
+; RV64I-NEXT: .LBB17_3:
+; RV64I-NEXT: slli t5, t5, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or s3, s3, a4
+; RV64I-NEXT: lbu ra, 17(a0)
+; RV64I-NEXT: lbu s11, 18(a0)
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s5, 25(a0)
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: lbu s7, 26(a0)
+; RV64I-NEXT: lbu s6, 28(a0)
+; RV64I-NEXT: slli s10, t6, 8
+; RV64I-NEXT: or s9, s2, a3
+; RV64I-NEXT: sub a4, a7, a6
+; RV64I-NEXT: mv a3, a5
+; RV64I-NEXT: beqz a6, .LBB17_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a3, s4
+; RV64I-NEXT: .LBB17_5:
+; RV64I-NEXT: slli t6, ra, 8
+; RV64I-NEXT: or t5, t5, s11
+; RV64I-NEXT: or t4, t4, s8
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: lbu s8, 16(a0)
+; RV64I-NEXT: lbu a0, 24(a0)
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: or s2, s1, s7
+; RV64I-NEXT: or s1, s10, s6
+; RV64I-NEXT: slli s4, s9, 16
+; RV64I-NEXT: bltu a4, t0, .LBB17_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: sub s0, a4, t0
+; RV64I-NEXT: srl s0, a5, s0
+; RV64I-NEXT: j .LBB17_8
+; RV64I-NEXT: .LBB17_7:
+; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: sll s6, a5, s6
+; RV64I-NEXT: or s0, s0, s6
+; RV64I-NEXT: .LBB17_8:
+; RV64I-NEXT: or t6, t6, s8
+; RV64I-NEXT: slli s6, t5, 16
+; RV64I-NEXT: or s3, s3, t4
+; RV64I-NEXT: or t5, s5, a0
+; RV64I-NEXT: slli s2, s2, 16
+; RV64I-NEXT: or s1, s4, s1
+; RV64I-NEXT: mv t4, t1
+; RV64I-NEXT: beqz a4, .LBB17_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t4, s0
+; RV64I-NEXT: .LBB17_10:
+; RV64I-NEXT: or a0, s6, t6
+; RV64I-NEXT: slli s0, s3, 32
+; RV64I-NEXT: or t6, s2, t5
+; RV64I-NEXT: slli s1, s1, 32
+; RV64I-NEXT: bltu a4, t0, .LBB17_12
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: j .LBB17_13
+; RV64I-NEXT: .LBB17_12:
+; RV64I-NEXT: srl t5, a5, t3
+; RV64I-NEXT: .LBB17_13:
+; RV64I-NEXT: or a4, s0, a0
+; RV64I-NEXT: or a0, s1, t6
+; RV64I-NEXT: bltu a6, t0, .LBB17_15
+; RV64I-NEXT: # %bb.14:
+; RV64I-NEXT: li t6, 0
+; RV64I-NEXT: sll t2, a4, t2
+; RV64I-NEXT: j .LBB17_16
+; RV64I-NEXT: .LBB17_15:
+; RV64I-NEXT: sll t6, a4, a6
+; RV64I-NEXT: srl t2, a4, t3
+; RV64I-NEXT: sll t3, a0, a6
+; RV64I-NEXT: or t2, t2, t3
+; RV64I-NEXT: .LBB17_16:
+; RV64I-NEXT: sub s0, a6, a7
+; RV64I-NEXT: mv t3, a0
+; RV64I-NEXT: beqz a6, .LBB17_18
+; RV64I-NEXT: # %bb.17:
+; RV64I-NEXT: mv t3, t2
+; RV64I-NEXT: .LBB17_18:
+; RV64I-NEXT: bltu s0, t0, .LBB17_20
+; RV64I-NEXT: # %bb.19:
+; RV64I-NEXT: li t2, 0
+; RV64I-NEXT: sub t0, s0, t0
+; RV64I-NEXT: sll t0, t1, t0
+; RV64I-NEXT: bnez s0, .LBB17_21
+; RV64I-NEXT: j .LBB17_22
+; RV64I-NEXT: .LBB17_20:
+; RV64I-NEXT: sll t2, t1, s0
+; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: srl t0, t1, t0
+; RV64I-NEXT: sll t1, a5, s0
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: beqz s0, .LBB17_22
+; RV64I-NEXT: .LBB17_21:
+; RV64I-NEXT: mv a5, t0
+; RV64I-NEXT: .LBB17_22:
+; RV64I-NEXT: bltu a6, a7, .LBB17_24
+; RV64I-NEXT: # %bb.23:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: li a3, 0
+; RV64I-NEXT: bnez a6, .LBB17_25
+; RV64I-NEXT: j .LBB17_26
+; RV64I-NEXT: .LBB17_24:
+; RV64I-NEXT: or t2, t4, t6
+; RV64I-NEXT: or a5, t5, t3
+; RV64I-NEXT: beqz a6, .LBB17_26
+; RV64I-NEXT: .LBB17_25:
+; RV64I-NEXT: mv a4, t2
+; RV64I-NEXT: mv a0, a5
+; RV64I-NEXT: .LBB17_26:
+; RV64I-NEXT: srli a5, a1, 32
+; RV64I-NEXT: srliw a6, a1, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a1, 24
+; RV64I-NEXT: srli t0, a1, 48
+; RV64I-NEXT: srli t5, a1, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a4, 32
+; RV64I-NEXT: srliw s2, a4, 16
+; RV64I-NEXT: srliw s6, a4, 24
+; RV64I-NEXT: srli s4, a4, 48
+; RV64I-NEXT: srli s7, a4, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a1, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a1, a5, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a1, a3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a1, a7, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a1, a4, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 112
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_32bytes_dwordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu t0, 4(a0)
+; RV32I-NEXT: lbu t1, 5(a0)
+; RV32I-NEXT: lbu t2, 6(a0)
+; RV32I-NEXT: lbu t3, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a3
+; RV32I-NEXT: lbu a6, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or a5, a7, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: lbu t3, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: li s9, 64
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t3
+; RV32I-NEXT: li t4, 32
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or t3, a5, a4
+; RV32I-NEXT: or a5, t2, a7
+; RV32I-NEXT: or a4, a1, t0
+; RV32I-NEXT: slli a4, a4, 6
+; RV32I-NEXT: neg s10, a4
+; RV32I-NEXT: srl t5, t3, s10
+; RV32I-NEXT: sll s5, a5, a4
+; RV32I-NEXT: bltu a4, t4, .LBB17_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: sll a7, t3, a4
+; RV32I-NEXT: j .LBB17_3
+; RV32I-NEXT: .LBB17_2:
+; RV32I-NEXT: sll s8, t3, a4
+; RV32I-NEXT: or a7, t5, s5
+; RV32I-NEXT: .LBB17_3:
+; RV32I-NEXT: lbu t2, 9(a0)
+; RV32I-NEXT: lbu a1, 10(a0)
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t6, a3, 8
+; RV32I-NEXT: sub s6, s9, a4
+; RV32I-NEXT: mv a3, a5
+; RV32I-NEXT: beqz a4, .LBB17_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, a7
+; RV32I-NEXT: .LBB17_5:
+; RV32I-NEXT: slli a7, t2, 8
+; RV32I-NEXT: or a6, a6, a1
+; RV32I-NEXT: lbu t2, 8(a0)
+; RV32I-NEXT: lbu a1, 12(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t6, t0
+; RV32I-NEXT: neg t6, s6
+; RV32I-NEXT: sw t6, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s6, t4, .LBB17_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl t6, a5, s6
+; RV32I-NEXT: j .LBB17_8
+; RV32I-NEXT: .LBB17_7:
+; RV32I-NEXT: sll t6, a5, t6
+; RV32I-NEXT: or t6, t5, t6
+; RV32I-NEXT: .LBB17_8:
+; RV32I-NEXT: or a7, a7, t2
+; RV32I-NEXT: slli t2, a6, 16
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: mv a6, t3
+; RV32I-NEXT: beqz s6, .LBB17_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv a6, t6
+; RV32I-NEXT: .LBB17_10:
+; RV32I-NEXT: or t1, t2, a7
+; RV32I-NEXT: or t2, t0, a1
+; RV32I-NEXT: bltu s6, t4, .LBB17_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: j .LBB17_13
+; RV32I-NEXT: .LBB17_12:
+; RV32I-NEXT: srl a7, a5, s10
+; RV32I-NEXT: .LBB17_13:
+; RV32I-NEXT: srl s0, t1, s10
+; RV32I-NEXT: sll a1, t2, a4
+; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t4, .LBB17_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sll a1, t1, a4
+; RV32I-NEXT: j .LBB17_16
+; RV32I-NEXT: .LBB17_15:
+; RV32I-NEXT: sll s1, t1, a4
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: .LBB17_16:
+; RV32I-NEXT: addi s7, a4, -64
+; RV32I-NEXT: mv s3, t2
+; RV32I-NEXT: beqz a4, .LBB17_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: .LBB17_18:
+; RV32I-NEXT: neg a1, s7
+; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s7, t4, .LBB17_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: sll a1, t3, s7
+; RV32I-NEXT: mv s4, a5
+; RV32I-NEXT: bnez s7, .LBB17_21
+; RV32I-NEXT: j .LBB17_22
+; RV32I-NEXT: .LBB17_20:
+; RV32I-NEXT: sll s2, t3, a4
+; RV32I-NEXT: srl a1, t3, a1
+; RV32I-NEXT: or a1, a1, s5
+; RV32I-NEXT: mv s4, a5
+; RV32I-NEXT: beqz s7, .LBB17_22
+; RV32I-NEXT: .LBB17_21:
+; RV32I-NEXT: mv s4, a1
+; RV32I-NEXT: .LBB17_22:
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: bltu a4, s9, .LBB17_24
+; RV32I-NEXT: # %bb.23:
+; RV32I-NEXT: sw zero, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: j .LBB17_25
+; RV32I-NEXT: .LBB17_24:
+; RV32I-NEXT: sw s8, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s2, a6, s1
+; RV32I-NEXT: or s4, a7, s3
+; RV32I-NEXT: .LBB17_25:
+; RV32I-NEXT: sub ra, a1, a4
+; RV32I-NEXT: mv a7, t1
+; RV32I-NEXT: mv a6, t2
+; RV32I-NEXT: beqz a4, .LBB17_27
+; RV32I-NEXT: # %bb.26:
+; RV32I-NEXT: mv a7, s2
+; RV32I-NEXT: mv a6, s4
+; RV32I-NEXT: .LBB17_27:
+; RV32I-NEXT: neg s1, ra
+; RV32I-NEXT: sll s2, t2, s1
+; RV32I-NEXT: bltu ra, t4, .LBB17_29
+; RV32I-NEXT: # %bb.28:
+; RV32I-NEXT: srl a1, t2, ra
+; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bnez ra, .LBB17_30
+; RV32I-NEXT: j .LBB17_31
+; RV32I-NEXT: .LBB17_29:
+; RV32I-NEXT: or a1, s0, s2
+; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: beqz ra, .LBB17_31
+; RV32I-NEXT: .LBB17_30:
+; RV32I-NEXT: sw a1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB17_31:
+; RV32I-NEXT: bltu ra, t4, .LBB17_33
+; RV32I-NEXT: # %bb.32:
+; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: srl a1, a5, ra
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: bnez ra, .LBB17_34
+; RV32I-NEXT: j .LBB17_35
+; RV32I-NEXT: .LBB17_33:
+; RV32I-NEXT: srl a1, t2, s10
+; RV32I-NEXT: sw a1, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sll a1, a5, s1
+; RV32I-NEXT: or a1, t5, a1
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: beqz ra, .LBB17_35
+; RV32I-NEXT: .LBB17_34:
+; RV32I-NEXT: mv t5, a1
+; RV32I-NEXT: .LBB17_35:
+; RV32I-NEXT: sub s3, s9, ra
+; RV32I-NEXT: bltu ra, t4, .LBB17_38
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s3, t4, .LBB17_39
+; RV32I-NEXT: .LBB17_37:
+; RV32I-NEXT: sll s1, t1, s1
+; RV32I-NEXT: neg a1, s3
+; RV32I-NEXT: srl a1, t1, a1
+; RV32I-NEXT: or a1, a1, s2
+; RV32I-NEXT: j .LBB17_40
+; RV32I-NEXT: .LBB17_38:
+; RV32I-NEXT: srl a1, a5, s10
+; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s3, t4, .LBB17_37
+; RV32I-NEXT: .LBB17_39:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sll a1, t1, s3
+; RV32I-NEXT: .LBB17_40:
+; RV32I-NEXT: addi s4, ra, -64
+; RV32I-NEXT: mv s2, t2
+; RV32I-NEXT: beqz s3, .LBB17_42
+; RV32I-NEXT: # %bb.41:
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: .LBB17_42:
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv s5, a7
+; RV32I-NEXT: bltu s4, t4, .LBB17_44
+; RV32I-NEXT: # %bb.43:
+; RV32I-NEXT: srl t0, t2, s4
+; RV32I-NEXT: j .LBB17_45
+; RV32I-NEXT: .LBB17_44:
+; RV32I-NEXT: srl a1, t1, ra
+; RV32I-NEXT: neg t0, s4
+; RV32I-NEXT: sll t0, t2, t0
+; RV32I-NEXT: or t0, a1, t0
+; RV32I-NEXT: .LBB17_45:
+; RV32I-NEXT: mv s0, s10
+; RV32I-NEXT: mv a7, a6
+; RV32I-NEXT: lbu s8, 19(a0)
+; RV32I-NEXT: lbu a1, 23(a0)
+; RV32I-NEXT: mv s3, t1
+; RV32I-NEXT: beqz s4, .LBB17_47
+; RV32I-NEXT: # %bb.46:
+; RV32I-NEXT: mv s3, t0
+; RV32I-NEXT: .LBB17_47:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: lbu s10, 17(a0)
+; RV32I-NEXT: lbu t0, 18(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu t6, 22(a0)
+; RV32I-NEXT: slli s8, s8, 8
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: li a3, 64
+; RV32I-NEXT: bltu s4, t4, .LBB17_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: j .LBB17_50
+; RV32I-NEXT: .LBB17_49:
+; RV32I-NEXT: srl s4, t2, ra
+; RV32I-NEXT: .LBB17_50:
+; RV32I-NEXT: or s11, s8, t0
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: slli s10, s10, 8
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t6, a1, t6
+; RV32I-NEXT: bgeu ra, a3, .LBB17_52
+; RV32I-NEXT: # %bb.51:
+; RV32I-NEXT: or s3, t5, s1
+; RV32I-NEXT: lw a1, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s4, a1, s2
+; RV32I-NEXT: .LBB17_52:
+; RV32I-NEXT: or a1, s10, t0
+; RV32I-NEXT: slli s11, s11, 16
+; RV32I-NEXT: or t0, s9, s8
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: mv s1, a5
+; RV32I-NEXT: mv a3, a6
+; RV32I-NEXT: beqz ra, .LBB17_54
+; RV32I-NEXT: # %bb.53:
+; RV32I-NEXT: mv t5, s3
+; RV32I-NEXT: mv s1, s4
+; RV32I-NEXT: .LBB17_54:
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s2, s11, a1
+; RV32I-NEXT: or s1, t6, t0
+; RV32I-NEXT: li a1, 64
+; RV32I-NEXT: mv a6, a7
+; RV32I-NEXT: mv a7, s0
+; RV32I-NEXT: bltu ra, a1, .LBB17_56
+; RV32I-NEXT: # %bb.55:
+; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB17_56:
+; RV32I-NEXT: srl s3, s2, a7
+; RV32I-NEXT: sll ra, s1, a4
+; RV32I-NEXT: mv a7, s5
+; RV32I-NEXT: sw t5, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t4, .LBB17_58
+; RV32I-NEXT: # %bb.57:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sll a1, s2, a4
+; RV32I-NEXT: j .LBB17_59
+; RV32I-NEXT: .LBB17_58:
+; RV32I-NEXT: sll a1, s2, a4
+; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or a1, s3, ra
+; RV32I-NEXT: .LBB17_59:
+; RV32I-NEXT: lbu s9, 27(a0)
+; RV32I-NEXT: lbu t6, 31(a0)
+; RV32I-NEXT: mv t5, s1
+; RV32I-NEXT: beqz a4, .LBB17_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: mv t5, a1
+; RV32I-NEXT: .LBB17_61:
+; RV32I-NEXT: lbu s8, 25(a0)
+; RV32I-NEXT: lbu s4, 26(a0)
+; RV32I-NEXT: lbu s11, 29(a0)
+; RV32I-NEXT: lbu s10, 30(a0)
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: bltu s6, t4, .LBB17_63
+; RV32I-NEXT: # %bb.62:
+; RV32I-NEXT: srl t0, s1, s6
+; RV32I-NEXT: j .LBB17_64
+; RV32I-NEXT: .LBB17_63:
+; RV32I-NEXT: lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: sll a1, s1, a1
+; RV32I-NEXT: or t0, s3, a1
+; RV32I-NEXT: .LBB17_64:
+; RV32I-NEXT: slli s8, s8, 8
+; RV32I-NEXT: lbu s3, 24(a0)
+; RV32I-NEXT: lbu a1, 28(a0)
+; RV32I-NEXT: or s4, s9, s4
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or t6, t6, s10
+; RV32I-NEXT: mv s9, s2
+; RV32I-NEXT: beqz s6, .LBB17_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: mv s9, t0
+; RV32I-NEXT: .LBB17_66:
+; RV32I-NEXT: or a0, s8, s3
+; RV32I-NEXT: slli t0, s4, 16
+; RV32I-NEXT: or a1, s11, a1
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: bltu s6, t4, .LBB17_68
+; RV32I-NEXT: # %bb.67:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: j .LBB17_69
+; RV32I-NEXT: .LBB17_68:
+; RV32I-NEXT: srl s4, s1, s0
+; RV32I-NEXT: .LBB17_69:
+; RV32I-NEXT: li s11, 64
+; RV32I-NEXT: or s6, t0, a0
+; RV32I-NEXT: or a0, t6, a1
+; RV32I-NEXT: bltu a4, t4, .LBB17_71
+; RV32I-NEXT: # %bb.70:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a1, s6, a4
+; RV32I-NEXT: mv s10, a0
+; RV32I-NEXT: bnez a4, .LBB17_72
+; RV32I-NEXT: j .LBB17_73
+; RV32I-NEXT: .LBB17_71:
+; RV32I-NEXT: sll s3, s6, a4
+; RV32I-NEXT: srl a1, s6, s0
+; RV32I-NEXT: sll t0, a0, a4
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: mv s10, a0
+; RV32I-NEXT: beqz a4, .LBB17_73
+; RV32I-NEXT: .LBB17_72:
+; RV32I-NEXT: mv s10, a1
+; RV32I-NEXT: .LBB17_73:
+; RV32I-NEXT: bltu s7, t4, .LBB17_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: sll a1, s2, s7
+; RV32I-NEXT: mv s0, s1
+; RV32I-NEXT: bnez s7, .LBB17_76
+; RV32I-NEXT: j .LBB17_77
+; RV32I-NEXT: .LBB17_75:
+; RV32I-NEXT: sll s5, s2, a4
+; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: srl a1, s2, a1
+; RV32I-NEXT: or a1, a1, ra
+; RV32I-NEXT: mv s0, s1
+; RV32I-NEXT: beqz s7, .LBB17_77
+; RV32I-NEXT: .LBB17_76:
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: .LBB17_77:
+; RV32I-NEXT: bltu a4, s11, .LBB17_79
+; RV32I-NEXT: # %bb.78:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: j .LBB17_80
+; RV32I-NEXT: .LBB17_79:
+; RV32I-NEXT: or s5, s9, s3
+; RV32I-NEXT: or s0, s4, s10
+; RV32I-NEXT: .LBB17_80:
+; RV32I-NEXT: addi s9, a4, -128
+; RV32I-NEXT: mv s7, s6
+; RV32I-NEXT: mv s8, a0
+; RV32I-NEXT: beqz a4, .LBB17_82
+; RV32I-NEXT: # %bb.81:
+; RV32I-NEXT: mv s7, s5
+; RV32I-NEXT: mv s8, s0
+; RV32I-NEXT: .LBB17_82:
+; RV32I-NEXT: neg s3, s9
+; RV32I-NEXT: srl s0, t3, s3
+; RV32I-NEXT: bltu s9, t4, .LBB17_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: sll a1, t3, s9
+; RV32I-NEXT: j .LBB17_85
+; RV32I-NEXT: .LBB17_84:
+; RV32I-NEXT: sll s5, t3, a4
+; RV32I-NEXT: lw a1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: .LBB17_85:
+; RV32I-NEXT: sub s4, s11, s9
+; RV32I-NEXT: mv t6, a5
+; RV32I-NEXT: beqz s9, .LBB17_87
+; RV32I-NEXT: # %bb.86:
+; RV32I-NEXT: mv t6, a1
+; RV32I-NEXT: .LBB17_87:
+; RV32I-NEXT: bltu s4, t4, .LBB17_89
+; RV32I-NEXT: # %bb.88:
+; RV32I-NEXT: srl a1, a5, s4
+; RV32I-NEXT: mv s0, t3
+; RV32I-NEXT: bnez s4, .LBB17_90
+; RV32I-NEXT: j .LBB17_91
+; RV32I-NEXT: .LBB17_89:
+; RV32I-NEXT: neg a1, s4
+; RV32I-NEXT: sll a1, a5, a1
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: mv s0, t3
+; RV32I-NEXT: beqz s4, .LBB17_91
+; RV32I-NEXT: .LBB17_90:
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: .LBB17_91:
+; RV32I-NEXT: bltu s4, t4, .LBB17_94
+; RV32I-NEXT: # %bb.92:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: li ra, 64
+; RV32I-NEXT: bgeu s9, t4, .LBB17_95
+; RV32I-NEXT: .LBB17_93:
+; RV32I-NEXT: sll s10, t1, a4
+; RV32I-NEXT: srl a1, t1, s3
+; RV32I-NEXT: lw t0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: j .LBB17_96
+; RV32I-NEXT: .LBB17_94:
+; RV32I-NEXT: srl s4, a5, s3
+; RV32I-NEXT: li ra, 64
+; RV32I-NEXT: bltu s9, t4, .LBB17_93
+; RV32I-NEXT: .LBB17_95:
+; RV32I-NEXT: li s10, 0
+; RV32I-NEXT: sll a1, t1, s9
+; RV32I-NEXT: .LBB17_96:
+; RV32I-NEXT: addi s11, s9, -64
+; RV32I-NEXT: mv s3, t2
+; RV32I-NEXT: beqz s9, .LBB17_98
+; RV32I-NEXT: # %bb.97:
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: .LBB17_98:
+; RV32I-NEXT: bltu s11, t4, .LBB17_100
+; RV32I-NEXT: # %bb.99:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: sll a1, t3, s11
+; RV32I-NEXT: bnez s11, .LBB17_101
+; RV32I-NEXT: j .LBB17_102
+; RV32I-NEXT: .LBB17_100:
+; RV32I-NEXT: sll t4, t3, s9
+; RV32I-NEXT: neg a1, s11
+; RV32I-NEXT: srl a1, t3, a1
+; RV32I-NEXT: sll t0, a5, s9
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: beqz s11, .LBB17_102
+; RV32I-NEXT: .LBB17_101:
+; RV32I-NEXT: mv a5, a1
+; RV32I-NEXT: .LBB17_102:
+; RV32I-NEXT: bltu s9, ra, .LBB17_104
+; RV32I-NEXT: # %bb.103:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: bnez s9, .LBB17_105
+; RV32I-NEXT: j .LBB17_106
+; RV32I-NEXT: .LBB17_104:
+; RV32I-NEXT: or t4, s0, s10
+; RV32I-NEXT: or a5, s4, s3
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: beqz s9, .LBB17_106
+; RV32I-NEXT: .LBB17_105:
+; RV32I-NEXT: mv t1, t4
+; RV32I-NEXT: mv t2, a5
+; RV32I-NEXT: .LBB17_106:
+; RV32I-NEXT: bltu a4, a1, .LBB17_108
+; RV32I-NEXT: # %bb.107:
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: bnez a4, .LBB17_109
+; RV32I-NEXT: j .LBB17_110
+; RV32I-NEXT: .LBB17_108:
+; RV32I-NEXT: lw a1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s5, a1, a5
+; RV32I-NEXT: lw a1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t6, a1, t5
+; RV32I-NEXT: lw a1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t1, a1, s7
+; RV32I-NEXT: lw a1, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t2, a1, s8
+; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: beqz a4, .LBB17_110
+; RV32I-NEXT: .LBB17_109:
+; RV32I-NEXT: mv s2, s5
+; RV32I-NEXT: mv s1, t6
+; RV32I-NEXT: mv s6, t1
+; RV32I-NEXT: mv a0, t2
+; RV32I-NEXT: .LBB17_110:
+; RV32I-NEXT: srli a4, ra, 16
+; RV32I-NEXT: lui t2, 16
+; RV32I-NEXT: srli t1, ra, 24
+; RV32I-NEXT: srli a5, a3, 16
+; RV32I-NEXT: srli t4, a3, 24
+; RV32I-NEXT: srli t0, a7, 16
+; RV32I-NEXT: srli s0, a7, 24
+; RV32I-NEXT: srli t3, a6, 16
+; RV32I-NEXT: srli s3, a6, 24
+; RV32I-NEXT: srli t6, s2, 16
+; RV32I-NEXT: srli a1, s2, 24
+; RV32I-NEXT: srli t5, s1, 16
+; RV32I-NEXT: srli s5, s1, 24
+; RV32I-NEXT: srli s4, s6, 16
+; RV32I-NEXT: srli s7, s6, 24
+; RV32I-NEXT: srli s8, a0, 16
+; RV32I-NEXT: srli s9, a0, 24
+; RV32I-NEXT: addi t2, t2, -1
+; RV32I-NEXT: and s10, ra, t2
+; RV32I-NEXT: and s11, a3, t2
+; RV32I-NEXT: srli s10, s10, 8
+; RV32I-NEXT: sb ra, 0(a2)
+; RV32I-NEXT: sb s10, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb t1, 3(a2)
+; RV32I-NEXT: and a4, a7, t2
+; RV32I-NEXT: srli t1, s11, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb t1, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t4, 7(a2)
+; RV32I-NEXT: and a3, a6, t2
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a7, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb t0, 10(a2)
+; RV32I-NEXT: sb s0, 11(a2)
+; RV32I-NEXT: and a4, s2, t2
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a6, 12(a2)
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a3, s1, t2
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s2, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb t6, 18(a2)
+; RV32I-NEXT: sb a1, 19(a2)
+; RV32I-NEXT: and a1, s6, t2
+; RV32I-NEXT: and a4, a0, t2
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s1, 20(a2)
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: sb t5, 22(a2)
+; RV32I-NEXT: sb s5, 23(a2)
+; RV32I-NEXT: sb s6, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s4, 26(a2)
+; RV32I-NEXT: sb s7, 27(a2)
+; RV32I-NEXT: sb a0, 28(a2)
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb s8, 30(a2)
+; RV32I-NEXT: sb s9, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
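+; Arithmetic shift right of a 32-byte value, where the shift amount is given
+; as a byte offset (byteOff * 8 bits).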
+define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -96
+; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: or a4, t4, t3
+; RV64I-NEXT: or a6, t6, t5
+; RV64I-NEXT: or t0, s1, s0
+; RV64I-NEXT: lbu t5, 24(a0)
+; RV64I-NEXT: lbu t6, 25(a0)
+; RV64I-NEXT: lbu s0, 26(a0)
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or t4, s3, s2
+; RV64I-NEXT: or t2, s5, s4
+; RV64I-NEXT: or t3, s7, s6
+; RV64I-NEXT: lbu s2, 28(a0)
+; RV64I-NEXT: lbu s3, 29(a0)
+; RV64I-NEXT: lbu s4, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s5, s9, s8
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu t6, 0(a1)
+; RV64I-NEXT: lbu s1, 1(a1)
+; RV64I-NEXT: lbu s7, 2(a1)
+; RV64I-NEXT: lbu s8, 3(a1)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, a0, s4
+; RV64I-NEXT: or t6, s1, t6
+; RV64I-NEXT: lbu a0, 4(a1)
+; RV64I-NEXT: lbu s1, 5(a1)
+; RV64I-NEXT: lbu s4, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s1, s1, a0
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or s4, a1, s4
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or a1, t1, a7
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or a0, t4, t0
+; RV64I-NEXT: slli t3, t3, 16
+; RV64I-NEXT: or a7, t3, t2
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: or t1, s6, s5
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t4, s0, t5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or t5, s3, s2
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or t6, s7, t6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s0, s4, s1
+; RV64I-NEXT: li t0, 64
+; RV64I-NEXT: slli t3, a5, 16
+; RV64I-NEXT: slli t2, a6, 16
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: or a7, t1, a7
+; RV64I-NEXT: or a5, t5, t4
+; RV64I-NEXT: or a6, s0, t6
+; RV64I-NEXT: slli a6, a6, 3
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sll t4, a5, t5
+; RV64I-NEXT: bltu a6, t0, .LBB18_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sra t6, a5, t1
+; RV64I-NEXT: j .LBB18_3
+; RV64I-NEXT: .LBB18_2:
+; RV64I-NEXT: srl t6, a7, a6
+; RV64I-NEXT: or t6, t6, t4
+; RV64I-NEXT: .LBB18_3:
+; RV64I-NEXT: or a3, t3, a3
+; RV64I-NEXT: slli t3, a1, 32
+; RV64I-NEXT: or t2, t2, a4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mv a1, a7
+; RV64I-NEXT: beqz a6, .LBB18_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a1, t6
+; RV64I-NEXT: .LBB18_5:
+; RV64I-NEXT: or a4, t3, a3
+; RV64I-NEXT: or a3, a0, t2
+; RV64I-NEXT: bltu a6, t0, .LBB18_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: srai a0, a5, 63
+; RV64I-NEXT: srl t3, a3, t1
+; RV64I-NEXT: j .LBB18_8
+; RV64I-NEXT: .LBB18_7:
+; RV64I-NEXT: sra a0, a5, a6
+; RV64I-NEXT: srl t1, a4, a6
+; RV64I-NEXT: sll t2, a3, t5
+; RV64I-NEXT: or t3, t1, t2
+; RV64I-NEXT: .LBB18_8:
+; RV64I-NEXT: li t1, 128
+; RV64I-NEXT: mv t2, a4
+; RV64I-NEXT: beqz a6, .LBB18_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t2, t3
+; RV64I-NEXT: .LBB18_10:
+; RV64I-NEXT: sub t6, t1, a6
+; RV64I-NEXT: bltu a6, t0, .LBB18_13
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t3, 0
+; RV64I-NEXT: bgeu t6, t0, .LBB18_14
+; RV64I-NEXT: .LBB18_12:
+; RV64I-NEXT: sll t5, a7, t5
+; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: srl s0, a7, s0
+; RV64I-NEXT: or s1, s0, t4
+; RV64I-NEXT: j .LBB18_15
+; RV64I-NEXT: .LBB18_13:
+; RV64I-NEXT: srl t3, a3, a6
+; RV64I-NEXT: bltu t6, t0, .LBB18_12
+; RV64I-NEXT: .LBB18_14:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: sub t4, t6, t0
+; RV64I-NEXT: sll s1, a7, t4
+; RV64I-NEXT: .LBB18_15:
+; RV64I-NEXT: sub s0, a6, t1
+; RV64I-NEXT: mv t4, a5
+; RV64I-NEXT: beqz t6, .LBB18_17
+; RV64I-NEXT: # %bb.16:
+; RV64I-NEXT: mv t4, s1
+; RV64I-NEXT: .LBB18_17:
+; RV64I-NEXT: bltu s0, t0, .LBB18_19
+; RV64I-NEXT: # %bb.18:
+; RV64I-NEXT: sub t6, s0, t0
+; RV64I-NEXT: sra t6, a5, t6
+; RV64I-NEXT: bnez s0, .LBB18_20
+; RV64I-NEXT: j .LBB18_21
+; RV64I-NEXT: .LBB18_19:
+; RV64I-NEXT: srl t6, a7, s0
+; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: sll s1, a5, s1
+; RV64I-NEXT: or t6, t6, s1
+; RV64I-NEXT: beqz s0, .LBB18_21
+; RV64I-NEXT: .LBB18_20:
+; RV64I-NEXT: mv a7, t6
+; RV64I-NEXT: .LBB18_21:
+; RV64I-NEXT: bltu s0, t0, .LBB18_23
+; RV64I-NEXT: # %bb.22:
+; RV64I-NEXT: srai t0, a5, 63
+; RV64I-NEXT: bltu a6, t1, .LBB18_24
+; RV64I-NEXT: j .LBB18_25
+; RV64I-NEXT: .LBB18_23:
+; RV64I-NEXT: sra t0, a5, s0
+; RV64I-NEXT: bgeu a6, t1, .LBB18_25
+; RV64I-NEXT: .LBB18_24:
+; RV64I-NEXT: or a7, t2, t5
+; RV64I-NEXT: or t0, t3, t4
+; RV64I-NEXT: .LBB18_25:
+; RV64I-NEXT: bnez a6, .LBB18_29
+; RV64I-NEXT: # %bb.26:
+; RV64I-NEXT: bltu a6, t1, .LBB18_28
+; RV64I-NEXT: .LBB18_27:
+; RV64I-NEXT: srai a1, a5, 63
+; RV64I-NEXT: mv a0, a1
+; RV64I-NEXT: .LBB18_28:
+; RV64I-NEXT: srli a5, a4, 32
+; RV64I-NEXT: srliw a6, a4, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a4, 24
+; RV64I-NEXT: srli t0, a4, 48
+; RV64I-NEXT: srli t5, a4, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a1, 32
+; RV64I-NEXT: srliw s2, a1, 16
+; RV64I-NEXT: srliw s6, a1, 24
+; RV64I-NEXT: srli s4, a1, 48
+; RV64I-NEXT: srli s7, a1, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a4, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a4, a5, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a4, a3, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a4, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a3, a7, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a3, a1, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 96
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB18_29:
+; RV64I-NEXT: mv a4, a7
+; RV64I-NEXT: mv a3, t0
+; RV64I-NEXT: bgeu a6, t1, .LBB18_27
+; RV64I-NEXT: j .LBB18_28
+;
+; RV32I-LABEL: ashr_32bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 16(a0)
+; RV32I-NEXT: lbu a4, 17(a0)
+; RV32I-NEXT: lbu a5, 18(a0)
+; RV32I-NEXT: lbu a6, 19(a0)
+; RV32I-NEXT: lbu a7, 20(a0)
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 22(a0)
+; RV32I-NEXT: lbu t2, 23(a0)
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t4, 25(a0)
+; RV32I-NEXT: lbu t5, 26(a0)
+; RV32I-NEXT: lbu t6, 27(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, t2, t1
+; RV32I-NEXT: lbu a7, 28(a0)
+; RV32I-NEXT: lbu t0, 29(a0)
+; RV32I-NEXT: lbu t1, 30(a0)
+; RV32I-NEXT: lbu t2, 31(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t3, t4, t3
+; RV32I-NEXT: or t4, t6, t5
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t5, 1(a1)
+; RV32I-NEXT: lbu t6, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: or s0, t5, a7
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or t2, a1, t6
+; RV32I-NEXT: li t5, 32
+; RV32I-NEXT: slli a7, a4, 16
+; RV32I-NEXT: slli a1, a5, 16
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli a5, t2, 16
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or a4, t1, t0
+; RV32I-NEXT: or a5, a5, s0
+; RV32I-NEXT: slli a5, a5, 3
+; RV32I-NEXT: srl s0, t2, a5
+; RV32I-NEXT: neg s6, a5
+; RV32I-NEXT: sll s1, a4, s6
+; RV32I-NEXT: bltu a5, t5, .LBB18_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra t0, a4, a5
+; RV32I-NEXT: j .LBB18_3
+; RV32I-NEXT: .LBB18_2:
+; RV32I-NEXT: or t0, s0, s1
+; RV32I-NEXT: .LBB18_3:
+; RV32I-NEXT: or t1, a7, a3
+; RV32I-NEXT: or a7, a1, a6
+; RV32I-NEXT: mv t3, t2
+; RV32I-NEXT: beqz a5, .LBB18_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv t3, t0
+; RV32I-NEXT: .LBB18_5:
+; RV32I-NEXT: srl a3, t1, a5
+; RV32I-NEXT: sll a1, a7, s6
+; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a5, t5, .LBB18_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srai t4, a4, 31
+; RV32I-NEXT: srl a1, a7, a5
+; RV32I-NEXT: j .LBB18_8
+; RV32I-NEXT: .LBB18_7:
+; RV32I-NEXT: sra t4, a4, a5
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: .LBB18_8:
+; RV32I-NEXT: li t6, 64
+; RV32I-NEXT: mv t0, t1
+; RV32I-NEXT: beqz a5, .LBB18_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t0, a1
+; RV32I-NEXT: .LBB18_10:
+; RV32I-NEXT: sub s7, t6, a5
+; RV32I-NEXT: bltu a5, t5, .LBB18_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB18_13
+; RV32I-NEXT: .LBB18_12:
+; RV32I-NEXT: srl a1, a7, a5
+; RV32I-NEXT: .LBB18_13:
+; RV32I-NEXT: sw a3, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: neg s10, s7
+; RV32I-NEXT: bltu s7, t5, .LBB18_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: sll a3, t2, s7
+; RV32I-NEXT: j .LBB18_16
+; RV32I-NEXT: .LBB18_15:
+; RV32I-NEXT: sll a6, t2, s6
+; RV32I-NEXT: srl a3, t2, s10
+; RV32I-NEXT: or a3, a3, s1
+; RV32I-NEXT: .LBB18_16:
+; RV32I-NEXT: sw t3, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi s9, a5, -64
+; RV32I-NEXT: mv t3, a4
+; RV32I-NEXT: beqz s7, .LBB18_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t3, a3
+; RV32I-NEXT: .LBB18_18:
+; RV32I-NEXT: neg s11, s9
+; RV32I-NEXT: sw s0, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t5, .LBB18_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: sra s0, a4, s9
+; RV32I-NEXT: j .LBB18_21
+; RV32I-NEXT: .LBB18_20:
+; RV32I-NEXT: sll a3, a4, s11
+; RV32I-NEXT: or s0, s0, a3
+; RV32I-NEXT: .LBB18_21:
+; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw t4, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu s3, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: mv t4, t2
+; RV32I-NEXT: beqz s9, .LBB18_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: mv t4, s0
+; RV32I-NEXT: .LBB18_23:
+; RV32I-NEXT: lbu s2, 9(a0)
+; RV32I-NEXT: lbu s1, 10(a0)
+; RV32I-NEXT: lbu s8, 13(a0)
+; RV32I-NEXT: lbu ra, 14(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: bltu s9, t5, .LBB18_25
+; RV32I-NEXT: # %bb.24:
+; RV32I-NEXT: srai s0, a4, 31
+; RV32I-NEXT: j .LBB18_26
+; RV32I-NEXT: .LBB18_25:
+; RV32I-NEXT: sra s0, a4, a5
+; RV32I-NEXT: .LBB18_26:
+; RV32I-NEXT: or s1, s3, s1
+; RV32I-NEXT: lbu s5, 8(a0)
+; RV32I-NEXT: lbu s3, 12(a0)
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s4, s8, 8
+; RV32I-NEXT: or s8, a3, ra
+; RV32I-NEXT: bgeu a5, t6, .LBB18_28
+; RV32I-NEXT: # %bb.27:
+; RV32I-NEXT: or t4, t0, a6
+; RV32I-NEXT: or s0, a1, t3
+; RV32I-NEXT: .LBB18_28:
+; RV32I-NEXT: lbu a3, 3(a0)
+; RV32I-NEXT: lbu t3, 7(a0)
+; RV32I-NEXT: or a6, s2, s5
+; RV32I-NEXT: slli s2, s1, 16
+; RV32I-NEXT: or s1, s4, s3
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: mv a1, t1
+; RV32I-NEXT: mv t0, a7
+; RV32I-NEXT: beqz a5, .LBB18_30
+; RV32I-NEXT: # %bb.29:
+; RV32I-NEXT: mv a1, t4
+; RV32I-NEXT: mv t0, s0
+; RV32I-NEXT: .LBB18_30:
+; RV32I-NEXT: slli s5, a3, 8
+; RV32I-NEXT: lbu ra, 1(a0)
+; RV32I-NEXT: lbu a3, 2(a0)
+; RV32I-NEXT: lbu s3, 5(a0)
+; RV32I-NEXT: lbu s0, 6(a0)
+; RV32I-NEXT: slli s4, t3, 8
+; RV32I-NEXT: or t4, s2, a6
+; RV32I-NEXT: or t3, s8, s1
+; RV32I-NEXT: bltu a5, t6, .LBB18_32
+; RV32I-NEXT: # %bb.31:
+; RV32I-NEXT: srai a6, a4, 31
+; RV32I-NEXT: sw a6, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw a6, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB18_32:
+; RV32I-NEXT: slli a6, ra, 8
+; RV32I-NEXT: or a3, s5, a3
+; RV32I-NEXT: lbu s1, 0(a0)
+; RV32I-NEXT: lbu a0, 4(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s0, s4, s0
+; RV32I-NEXT: srl s2, t4, a5
+; RV32I-NEXT: sll ra, t3, s6
+; RV32I-NEXT: bltu a5, t5, .LBB18_34
+; RV32I-NEXT: # %bb.33:
+; RV32I-NEXT: srl s4, t3, a5
+; RV32I-NEXT: j .LBB18_35
+; RV32I-NEXT: .LBB18_34:
+; RV32I-NEXT: or s4, s2, ra
+; RV32I-NEXT: .LBB18_35:
+; RV32I-NEXT: or a6, a6, s1
+; RV32I-NEXT: slli a3, a3, 16
+; RV32I-NEXT: or a0, s3, a0
+; RV32I-NEXT: slli s1, s0, 16
+; RV32I-NEXT: mv s5, t4
+; RV32I-NEXT: beqz a5, .LBB18_37
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: mv s5, s4
+; RV32I-NEXT: .LBB18_37:
+; RV32I-NEXT: or s0, a3, a6
+; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: bltu a5, t5, .LBB18_39
+; RV32I-NEXT: # %bb.38:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: srl a3, a0, a5
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: bnez a5, .LBB18_40
+; RV32I-NEXT: j .LBB18_41
+; RV32I-NEXT: .LBB18_39:
+; RV32I-NEXT: srl s4, t3, a5
+; RV32I-NEXT: srl a3, s0, a5
+; RV32I-NEXT: sll a6, a0, s6
+; RV32I-NEXT: or a3, a3, a6
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: beqz a5, .LBB18_41
+; RV32I-NEXT: .LBB18_40:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB18_41:
+; RV32I-NEXT: bltu a5, t5, .LBB18_44
+; RV32I-NEXT: # %bb.42:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bgeu s7, t5, .LBB18_45
+; RV32I-NEXT: .LBB18_43:
+; RV32I-NEXT: sll s3, t4, s6
+; RV32I-NEXT: srl a3, t4, s10
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: mv s10, t3
+; RV32I-NEXT: bnez s7, .LBB18_46
+; RV32I-NEXT: j .LBB18_47
+; RV32I-NEXT: .LBB18_44:
+; RV32I-NEXT: srl s1, a0, a5
+; RV32I-NEXT: bltu s7, t5, .LBB18_43
+; RV32I-NEXT: .LBB18_45:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t4, s7
+; RV32I-NEXT: mv s10, t3
+; RV32I-NEXT: beqz s7, .LBB18_47
+; RV32I-NEXT: .LBB18_46:
+; RV32I-NEXT: mv s10, a3
+; RV32I-NEXT: .LBB18_47:
+; RV32I-NEXT: bltu s9, t5, .LBB18_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: srl a3, t3, s9
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: bnez s9, .LBB18_50
+; RV32I-NEXT: j .LBB18_51
+; RV32I-NEXT: .LBB18_49:
+; RV32I-NEXT: sll a3, t3, s11
+; RV32I-NEXT: or a3, s2, a3
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: beqz s9, .LBB18_51
+; RV32I-NEXT: .LBB18_50:
+; RV32I-NEXT: mv s2, a3
+; RV32I-NEXT: .LBB18_51:
+; RV32I-NEXT: bltu s9, t5, .LBB18_53
+; RV32I-NEXT: # %bb.52:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bltu a5, t6, .LBB18_54
+; RV32I-NEXT: j .LBB18_55
+; RV32I-NEXT: .LBB18_53:
+; RV32I-NEXT: srl s7, t3, a5
+; RV32I-NEXT: bgeu a5, t6, .LBB18_55
+; RV32I-NEXT: .LBB18_54:
+; RV32I-NEXT: or s2, a6, s3
+; RV32I-NEXT: or s7, s1, s10
+; RV32I-NEXT: .LBB18_55:
+; RV32I-NEXT: li a3, 128
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: beqz a5, .LBB18_57
+; RV32I-NEXT: # %bb.56:
+; RV32I-NEXT: mv a6, s2
+; RV32I-NEXT: mv s1, s7
+; RV32I-NEXT: .LBB18_57:
+; RV32I-NEXT: sw a6, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s2, a3, a5
+; RV32I-NEXT: bltu a5, t6, .LBB18_59
+; RV32I-NEXT: # %bb.58:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: .LBB18_59:
+; RV32I-NEXT: neg s3, s2
+; RV32I-NEXT: srl a6, t1, s3
+; RV32I-NEXT: sw s4, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t5, .LBB18_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: li s11, 0
+; RV32I-NEXT: sll a3, t1, s2
+; RV32I-NEXT: j .LBB18_62
+; RV32I-NEXT: .LBB18_61:
+; RV32I-NEXT: sll s11, t1, s6
+; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: .LBB18_62:
+; RV32I-NEXT: sub s1, t6, s2
+; RV32I-NEXT: mv s8, a7
+; RV32I-NEXT: beqz s2, .LBB18_64
+; RV32I-NEXT: # %bb.63:
+; RV32I-NEXT: mv s8, a3
+; RV32I-NEXT: .LBB18_64:
+; RV32I-NEXT: bltu s1, t5, .LBB18_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: srl a3, a7, s1
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: bnez s1, .LBB18_67
+; RV32I-NEXT: j .LBB18_68
+; RV32I-NEXT: .LBB18_66:
+; RV32I-NEXT: neg a3, s1
+; RV32I-NEXT: sll a3, a7, a3
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: beqz s1, .LBB18_68
+; RV32I-NEXT: .LBB18_67:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB18_68:
+; RV32I-NEXT: bltu s1, t5, .LBB18_71
+; RV32I-NEXT: # %bb.69:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s2, t5, .LBB18_72
+; RV32I-NEXT: .LBB18_70:
+; RV32I-NEXT: sll s6, t2, s6
+; RV32I-NEXT: srl a3, t2, s3
+; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB18_73
+; RV32I-NEXT: .LBB18_71:
+; RV32I-NEXT: srl s1, a7, s3
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t5, .LBB18_70
+; RV32I-NEXT: .LBB18_72:
+; RV32I-NEXT: li s6, 0
+; RV32I-NEXT: sll a3, t2, s2
+; RV32I-NEXT: .LBB18_73:
+; RV32I-NEXT: addi s9, s2, -64
+; RV32I-NEXT: mv s5, a4
+; RV32I-NEXT: beqz s2, .LBB18_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: mv s5, a3
+; RV32I-NEXT: .LBB18_75:
+; RV32I-NEXT: bltu s9, t5, .LBB18_77
+; RV32I-NEXT: # %bb.76:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t1, s9
+; RV32I-NEXT: mv s7, a7
+; RV32I-NEXT: bnez s9, .LBB18_78
+; RV32I-NEXT: j .LBB18_79
+; RV32I-NEXT: .LBB18_77:
+; RV32I-NEXT: sll s3, t1, s2
+; RV32I-NEXT: neg a3, s9
+; RV32I-NEXT: srl a3, t1, a3
+; RV32I-NEXT: sll s4, a7, s2
+; RV32I-NEXT: or a3, a3, s4
+; RV32I-NEXT: mv s7, a7
+; RV32I-NEXT: beqz s9, .LBB18_79
+; RV32I-NEXT: .LBB18_78:
+; RV32I-NEXT: mv s7, a3
+; RV32I-NEXT: .LBB18_79:
+; RV32I-NEXT: bltu s2, t6, .LBB18_81
+; RV32I-NEXT: # %bb.80:
+; RV32I-NEXT: li s11, 0
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: j .LBB18_82
+; RV32I-NEXT: .LBB18_81:
+; RV32I-NEXT: or s3, a6, s6
+; RV32I-NEXT: or s7, s1, s5
+; RV32I-NEXT: .LBB18_82:
+; RV32I-NEXT: addi ra, a5, -128
+; RV32I-NEXT: mv s4, t2
+; RV32I-NEXT: mv s6, a4
+; RV32I-NEXT: beqz s2, .LBB18_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: mv s4, s3
+; RV32I-NEXT: mv s6, s7
+; RV32I-NEXT: .LBB18_84:
+; RV32I-NEXT: neg s9, ra
+; RV32I-NEXT: sll s3, a4, s9
+; RV32I-NEXT: bltu ra, t5, .LBB18_86
+; RV32I-NEXT: # %bb.85:
+; RV32I-NEXT: sra a3, a4, ra
+; RV32I-NEXT: mv s1, t2
+; RV32I-NEXT: bnez ra, .LBB18_87
+; RV32I-NEXT: j .LBB18_88
+; RV32I-NEXT: .LBB18_86:
+; RV32I-NEXT: lw a3, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: mv s1, t2
+; RV32I-NEXT: beqz ra, .LBB18_88
+; RV32I-NEXT: .LBB18_87:
+; RV32I-NEXT: mv s1, a3
+; RV32I-NEXT: .LBB18_88:
+; RV32I-NEXT: bltu ra, t5, .LBB18_90
+; RV32I-NEXT: # %bb.89:
+; RV32I-NEXT: srai s2, a4, 31
+; RV32I-NEXT: srl a3, a7, ra
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: bnez ra, .LBB18_91
+; RV32I-NEXT: j .LBB18_92
+; RV32I-NEXT: .LBB18_90:
+; RV32I-NEXT: sra s2, a4, a5
+; RV32I-NEXT: sll a3, a7, s9
+; RV32I-NEXT: lw a6, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: beqz ra, .LBB18_92
+; RV32I-NEXT: .LBB18_91:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB18_92:
+; RV32I-NEXT: mv s5, t0
+; RV32I-NEXT: sub s10, t6, ra
+; RV32I-NEXT: li t0, 64
+; RV32I-NEXT: bltu ra, t5, .LBB18_94
+; RV32I-NEXT: # %bb.93:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: j .LBB18_95
+; RV32I-NEXT: .LBB18_94:
+; RV32I-NEXT: srl s7, a7, a5
+; RV32I-NEXT: .LBB18_95:
+; RV32I-NEXT: mv t6, s8
+; RV32I-NEXT: mv s8, s11
+; RV32I-NEXT: bltu s10, t5, .LBB18_97
+; RV32I-NEXT: # %bb.96:
+; RV32I-NEXT: li s9, 0
+; RV32I-NEXT: sll a3, t2, s10
+; RV32I-NEXT: j .LBB18_98
+; RV32I-NEXT: .LBB18_97:
+; RV32I-NEXT: sll s9, t2, s9
+; RV32I-NEXT: neg a3, s10
+; RV32I-NEXT: srl a3, t2, a3
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: .LBB18_98:
+; RV32I-NEXT: addi s11, ra, -64
+; RV32I-NEXT: mv s3, a4
+; RV32I-NEXT: beqz s10, .LBB18_100
+; RV32I-NEXT: # %bb.99:
+; RV32I-NEXT: mv s3, a3
+; RV32I-NEXT: .LBB18_100:
+; RV32I-NEXT: bltu s11, t5, .LBB18_102
+; RV32I-NEXT: # %bb.101:
+; RV32I-NEXT: sra a3, a4, s11
+; RV32I-NEXT: bnez s11, .LBB18_103
+; RV32I-NEXT: j .LBB18_104
+; RV32I-NEXT: .LBB18_102:
+; RV32I-NEXT: srl a3, t2, ra
+; RV32I-NEXT: mv s10, s4
+; RV32I-NEXT: neg s4, s11
+; RV32I-NEXT: sll s4, a4, s4
+; RV32I-NEXT: or a3, a3, s4
+; RV32I-NEXT: mv s4, s10
+; RV32I-NEXT: beqz s11, .LBB18_104
+; RV32I-NEXT: .LBB18_103:
+; RV32I-NEXT: mv t2, a3
+; RV32I-NEXT: .LBB18_104:
+; RV32I-NEXT: bltu s11, t5, .LBB18_106
+; RV32I-NEXT: # %bb.105:
+; RV32I-NEXT: srai t5, a4, 31
+; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bltu ra, t0, .LBB18_107
+; RV32I-NEXT: j .LBB18_108
+; RV32I-NEXT: .LBB18_106:
+; RV32I-NEXT: sra t5, a4, ra
+; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bgeu ra, t0, .LBB18_108
+; RV32I-NEXT: .LBB18_107:
+; RV32I-NEXT: or t2, a6, s9
+; RV32I-NEXT: or t5, s7, s3
+; RV32I-NEXT: .LBB18_108:
+; RV32I-NEXT: li a6, 128
+; RV32I-NEXT: bnez ra, .LBB18_117
+; RV32I-NEXT: # %bb.109:
+; RV32I-NEXT: bgeu ra, t0, .LBB18_118
+; RV32I-NEXT: .LBB18_110:
+; RV32I-NEXT: bgeu a5, a6, .LBB18_112
+; RV32I-NEXT: .LBB18_111:
+; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t1, a3, s8
+; RV32I-NEXT: lw a3, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a7, a3, t6
+; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s1, a3, s4
+; RV32I-NEXT: lw a3, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s2, a3, s6
+; RV32I-NEXT: .LBB18_112:
+; RV32I-NEXT: lw ra, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: mv t0, s5
+; RV32I-NEXT: beqz a5, .LBB18_114
+; RV32I-NEXT: # %bb.113:
+; RV32I-NEXT: mv s0, t1
+; RV32I-NEXT: mv a0, a7
+; RV32I-NEXT: mv t4, s1
+; RV32I-NEXT: mv t3, s2
+; RV32I-NEXT: .LBB18_114:
+; RV32I-NEXT: bltu a5, a6, .LBB18_116
+; RV32I-NEXT: # %bb.115:
+; RV32I-NEXT: srai a1, a4, 31
+; RV32I-NEXT: mv t0, a1
+; RV32I-NEXT: mv s11, a1
+; RV32I-NEXT: mv ra, a1
+; RV32I-NEXT: .LBB18_116:
+; RV32I-NEXT: srli a4, s0, 16
+; RV32I-NEXT: lui t1, 16
+; RV32I-NEXT: srli a7, s0, 24
+; RV32I-NEXT: srli a5, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: srli a6, t4, 16
+; RV32I-NEXT: srli s2, t4, 24
+; RV32I-NEXT: srli t2, t3, 16
+; RV32I-NEXT: srli s3, t3, 24
+; RV32I-NEXT: srli s1, a1, 16
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: srli t6, t0, 16
+; RV32I-NEXT: srli s6, t0, 24
+; RV32I-NEXT: srli s5, s11, 16
+; RV32I-NEXT: srli s4, s11, 24
+; RV32I-NEXT: srli s7, ra, 16
+; RV32I-NEXT: srli s8, ra, 24
+; RV32I-NEXT: addi t1, t1, -1
+; RV32I-NEXT: and s9, s0, t1
+; RV32I-NEXT: and s10, a0, t1
+; RV32I-NEXT: srli s9, s9, 8
+; RV32I-NEXT: sb s0, 0(a2)
+; RV32I-NEXT: sb s9, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, t4, t1
+; RV32I-NEXT: srli a7, s10, 8
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb a7, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t5, 7(a2)
+; RV32I-NEXT: and a0, t3, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, a1, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t3, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, t0, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a1, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a1, s11, t1
+; RV32I-NEXT: and a3, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb t0, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb s11, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s4, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a3, 29(a2)
+; RV32I-NEXT: sb s7, 30(a2)
+; RV32I-NEXT: sb s8, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB18_117:
+; RV32I-NEXT: mv t1, t2
+; RV32I-NEXT: mv a7, t5
+; RV32I-NEXT: bltu ra, t0, .LBB18_110
+; RV32I-NEXT: .LBB18_118:
+; RV32I-NEXT: srai s1, a4, 31
+; RV32I-NEXT: mv s2, s1
+; RV32I-NEXT: bltu a5, a6, .LBB18_111
+; RV32I-NEXT: j .LBB18_112
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
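+; Same as ashr_32bytes, but the shift amount is given as a 32-bit word offset
+; (wordOff * 32 bits).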
+define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -96
+; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: or a4, t4, t3
+; RV64I-NEXT: or a6, t6, t5
+; RV64I-NEXT: or t0, s1, s0
+; RV64I-NEXT: lbu t5, 24(a0)
+; RV64I-NEXT: lbu t6, 25(a0)
+; RV64I-NEXT: lbu s0, 26(a0)
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or t4, s3, s2
+; RV64I-NEXT: or t2, s5, s4
+; RV64I-NEXT: or t3, s7, s6
+; RV64I-NEXT: lbu s2, 28(a0)
+; RV64I-NEXT: lbu s3, 29(a0)
+; RV64I-NEXT: lbu s4, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s5, s9, s8
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu t6, 0(a1)
+; RV64I-NEXT: lbu s1, 1(a1)
+; RV64I-NEXT: lbu s7, 2(a1)
+; RV64I-NEXT: lbu s8, 3(a1)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, a0, s4
+; RV64I-NEXT: or t6, s1, t6
+; RV64I-NEXT: lbu a0, 4(a1)
+; RV64I-NEXT: lbu s1, 5(a1)
+; RV64I-NEXT: lbu s4, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s1, s1, a0
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or s4, a1, s4
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or a1, t1, a7
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or a0, t4, t0
+; RV64I-NEXT: slli t3, t3, 16
+; RV64I-NEXT: or a7, t3, t2
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: or t1, s6, s5
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t4, s0, t5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or t5, s3, s2
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or t6, s7, t6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s0, s4, s1
+; RV64I-NEXT: li t0, 64
+; RV64I-NEXT: slli t3, a5, 16
+; RV64I-NEXT: slli t2, a6, 16
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: or a7, t1, a7
+; RV64I-NEXT: or a5, t5, t4
+; RV64I-NEXT: or a6, s0, t6
+; RV64I-NEXT: slli a6, a6, 5
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sll t4, a5, t5
+; RV64I-NEXT: bltu a6, t0, .LBB19_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sra t6, a5, t1
+; RV64I-NEXT: j .LBB19_3
+; RV64I-NEXT: .LBB19_2:
+; RV64I-NEXT: srl t6, a7, a6
+; RV64I-NEXT: or t6, t6, t4
+; RV64I-NEXT: .LBB19_3:
+; RV64I-NEXT: or a3, t3, a3
+; RV64I-NEXT: slli t3, a1, 32
+; RV64I-NEXT: or t2, t2, a4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mv a1, a7
+; RV64I-NEXT: beqz a6, .LBB19_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a1, t6
+; RV64I-NEXT: .LBB19_5:
+; RV64I-NEXT: or a4, t3, a3
+; RV64I-NEXT: or a3, a0, t2
+; RV64I-NEXT: bltu a6, t0, .LBB19_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: srai a0, a5, 63
+; RV64I-NEXT: srl t3, a3, t1
+; RV64I-NEXT: j .LBB19_8
+; RV64I-NEXT: .LBB19_7:
+; RV64I-NEXT: sra a0, a5, a6
+; RV64I-NEXT: srl t1, a4, a6
+; RV64I-NEXT: sll t2, a3, t5
+; RV64I-NEXT: or t3, t1, t2
+; RV64I-NEXT: .LBB19_8:
+; RV64I-NEXT: li t1, 128
+; RV64I-NEXT: mv t2, a4
+; RV64I-NEXT: beqz a6, .LBB19_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t2, t3
+; RV64I-NEXT: .LBB19_10:
+; RV64I-NEXT: sub t6, t1, a6
+; RV64I-NEXT: bltu a6, t0, .LBB19_13
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t3, 0
+; RV64I-NEXT: bgeu t6, t0, .LBB19_14
+; RV64I-NEXT: .LBB19_12:
+; RV64I-NEXT: sll t5, a7, t5
+; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: srl s0, a7, s0
+; RV64I-NEXT: or s1, s0, t4
+; RV64I-NEXT: j .LBB19_15
+; RV64I-NEXT: .LBB19_13:
+; RV64I-NEXT: srl t3, a3, a6
+; RV64I-NEXT: bltu t6, t0, .LBB19_12
+; RV64I-NEXT: .LBB19_14:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: sub t4, t6, t0
+; RV64I-NEXT: sll s1, a7, t4
+; RV64I-NEXT: .LBB19_15:
+; RV64I-NEXT: sub s0, a6, t1
+; RV64I-NEXT: mv t4, a5
+; RV64I-NEXT: beqz t6, .LBB19_17
+; RV64I-NEXT: # %bb.16:
+; RV64I-NEXT: mv t4, s1
+; RV64I-NEXT: .LBB19_17:
+; RV64I-NEXT: bltu s0, t0, .LBB19_19
+; RV64I-NEXT: # %bb.18:
+; RV64I-NEXT: sub t6, s0, t0
+; RV64I-NEXT: sra t6, a5, t6
+; RV64I-NEXT: bnez s0, .LBB19_20
+; RV64I-NEXT: j .LBB19_21
+; RV64I-NEXT: .LBB19_19:
+; RV64I-NEXT: srl t6, a7, s0
+; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: sll s1, a5, s1
+; RV64I-NEXT: or t6, t6, s1
+; RV64I-NEXT: beqz s0, .LBB19_21
+; RV64I-NEXT: .LBB19_20:
+; RV64I-NEXT: mv a7, t6
+; RV64I-NEXT: .LBB19_21:
+; RV64I-NEXT: bltu s0, t0, .LBB19_23
+; RV64I-NEXT: # %bb.22:
+; RV64I-NEXT: srai t0, a5, 63
+; RV64I-NEXT: bltu a6, t1, .LBB19_24
+; RV64I-NEXT: j .LBB19_25
+; RV64I-NEXT: .LBB19_23:
+; RV64I-NEXT: sra t0, a5, s0
+; RV64I-NEXT: bgeu a6, t1, .LBB19_25
+; RV64I-NEXT: .LBB19_24:
+; RV64I-NEXT: or a7, t2, t5
+; RV64I-NEXT: or t0, t3, t4
+; RV64I-NEXT: .LBB19_25:
+; RV64I-NEXT: bnez a6, .LBB19_29
+; RV64I-NEXT: # %bb.26:
+; RV64I-NEXT: bltu a6, t1, .LBB19_28
+; RV64I-NEXT: .LBB19_27:
+; RV64I-NEXT: srai a1, a5, 63
+; RV64I-NEXT: mv a0, a1
+; RV64I-NEXT: .LBB19_28:
+; RV64I-NEXT: srli a5, a4, 32
+; RV64I-NEXT: srliw a6, a4, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a4, 24
+; RV64I-NEXT: srli t0, a4, 48
+; RV64I-NEXT: srli t5, a4, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a1, 32
+; RV64I-NEXT: srliw s2, a1, 16
+; RV64I-NEXT: srliw s6, a1, 24
+; RV64I-NEXT: srli s4, a1, 48
+; RV64I-NEXT: srli s7, a1, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a4, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a4, a5, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a4, a3, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a4, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a3, a7, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a3, a1, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 96
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB19_29:
+; RV64I-NEXT: mv a4, a7
+; RV64I-NEXT: mv a3, t0
+; RV64I-NEXT: bgeu a6, t1, .LBB19_27
+; RV64I-NEXT: j .LBB19_28
+;
+; RV32I-LABEL: ashr_32bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 16(a0)
+; RV32I-NEXT: lbu a4, 17(a0)
+; RV32I-NEXT: lbu a5, 18(a0)
+; RV32I-NEXT: lbu a6, 19(a0)
+; RV32I-NEXT: lbu a7, 20(a0)
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 22(a0)
+; RV32I-NEXT: lbu t2, 23(a0)
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t4, 25(a0)
+; RV32I-NEXT: lbu t5, 26(a0)
+; RV32I-NEXT: lbu t6, 27(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, t2, t1
+; RV32I-NEXT: lbu a7, 28(a0)
+; RV32I-NEXT: lbu t0, 29(a0)
+; RV32I-NEXT: lbu t1, 30(a0)
+; RV32I-NEXT: lbu t2, 31(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t3, t4, t3
+; RV32I-NEXT: or t4, t6, t5
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t5, 1(a1)
+; RV32I-NEXT: lbu t6, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: or s0, t5, a7
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or t2, a1, t6
+; RV32I-NEXT: li t5, 32
+; RV32I-NEXT: slli a7, a4, 16
+; RV32I-NEXT: slli a1, a5, 16
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli a5, t2, 16
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or a4, t1, t0
+; RV32I-NEXT: or a5, a5, s0
+; RV32I-NEXT: slli a5, a5, 5
+; RV32I-NEXT: srl s0, t2, a5
+; RV32I-NEXT: neg s6, a5
+; RV32I-NEXT: sll s1, a4, s6
+; RV32I-NEXT: bltu a5, t5, .LBB19_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra t0, a4, a5
+; RV32I-NEXT: j .LBB19_3
+; RV32I-NEXT: .LBB19_2:
+; RV32I-NEXT: or t0, s0, s1
+; RV32I-NEXT: .LBB19_3:
+; RV32I-NEXT: or t1, a7, a3
+; RV32I-NEXT: or a7, a1, a6
+; RV32I-NEXT: mv t3, t2
+; RV32I-NEXT: beqz a5, .LBB19_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv t3, t0
+; RV32I-NEXT: .LBB19_5:
+; RV32I-NEXT: srl a3, t1, a5
+; RV32I-NEXT: sll a1, a7, s6
+; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a5, t5, .LBB19_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srai t4, a4, 31
+; RV32I-NEXT: srl a1, a7, a5
+; RV32I-NEXT: j .LBB19_8
+; RV32I-NEXT: .LBB19_7:
+; RV32I-NEXT: sra t4, a4, a5
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: .LBB19_8:
+; RV32I-NEXT: li t6, 64
+; RV32I-NEXT: mv t0, t1
+; RV32I-NEXT: beqz a5, .LBB19_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t0, a1
+; RV32I-NEXT: .LBB19_10:
+; RV32I-NEXT: sub s7, t6, a5
+; RV32I-NEXT: bltu a5, t5, .LBB19_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB19_13
+; RV32I-NEXT: .LBB19_12:
+; RV32I-NEXT: srl a1, a7, a5
+; RV32I-NEXT: .LBB19_13:
+; RV32I-NEXT: sw a3, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: neg s10, s7
+; RV32I-NEXT: bltu s7, t5, .LBB19_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: sll a3, t2, s7
+; RV32I-NEXT: j .LBB19_16
+; RV32I-NEXT: .LBB19_15:
+; RV32I-NEXT: sll a6, t2, s6
+; RV32I-NEXT: srl a3, t2, s10
+; RV32I-NEXT: or a3, a3, s1
+; RV32I-NEXT: .LBB19_16:
+; RV32I-NEXT: sw t3, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi s9, a5, -64
+; RV32I-NEXT: mv t3, a4
+; RV32I-NEXT: beqz s7, .LBB19_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t3, a3
+; RV32I-NEXT: .LBB19_18:
+; RV32I-NEXT: neg s11, s9
+; RV32I-NEXT: sw s0, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t5, .LBB19_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: sra s0, a4, s9
+; RV32I-NEXT: j .LBB19_21
+; RV32I-NEXT: .LBB19_20:
+; RV32I-NEXT: sll a3, a4, s11
+; RV32I-NEXT: or s0, s0, a3
+; RV32I-NEXT: .LBB19_21:
+; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw t4, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu s3, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: mv t4, t2
+; RV32I-NEXT: beqz s9, .LBB19_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: mv t4, s0
+; RV32I-NEXT: .LBB19_23:
+; RV32I-NEXT: lbu s2, 9(a0)
+; RV32I-NEXT: lbu s1, 10(a0)
+; RV32I-NEXT: lbu s8, 13(a0)
+; RV32I-NEXT: lbu ra, 14(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: bltu s9, t5, .LBB19_25
+; RV32I-NEXT: # %bb.24:
+; RV32I-NEXT: srai s0, a4, 31
+; RV32I-NEXT: j .LBB19_26
+; RV32I-NEXT: .LBB19_25:
+; RV32I-NEXT: sra s0, a4, a5
+; RV32I-NEXT: .LBB19_26:
+; RV32I-NEXT: or s1, s3, s1
+; RV32I-NEXT: lbu s5, 8(a0)
+; RV32I-NEXT: lbu s3, 12(a0)
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s4, s8, 8
+; RV32I-NEXT: or s8, a3, ra
+; RV32I-NEXT: bgeu a5, t6, .LBB19_28
+; RV32I-NEXT: # %bb.27:
+; RV32I-NEXT: or t4, t0, a6
+; RV32I-NEXT: or s0, a1, t3
+; RV32I-NEXT: .LBB19_28:
+; RV32I-NEXT: lbu a3, 3(a0)
+; RV32I-NEXT: lbu t3, 7(a0)
+; RV32I-NEXT: or a6, s2, s5
+; RV32I-NEXT: slli s2, s1, 16
+; RV32I-NEXT: or s1, s4, s3
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: mv a1, t1
+; RV32I-NEXT: mv t0, a7
+; RV32I-NEXT: beqz a5, .LBB19_30
+; RV32I-NEXT: # %bb.29:
+; RV32I-NEXT: mv a1, t4
+; RV32I-NEXT: mv t0, s0
+; RV32I-NEXT: .LBB19_30:
+; RV32I-NEXT: slli s5, a3, 8
+; RV32I-NEXT: lbu ra, 1(a0)
+; RV32I-NEXT: lbu a3, 2(a0)
+; RV32I-NEXT: lbu s3, 5(a0)
+; RV32I-NEXT: lbu s0, 6(a0)
+; RV32I-NEXT: slli s4, t3, 8
+; RV32I-NEXT: or t4, s2, a6
+; RV32I-NEXT: or t3, s8, s1
+; RV32I-NEXT: bltu a5, t6, .LBB19_32
+; RV32I-NEXT: # %bb.31:
+; RV32I-NEXT: srai a6, a4, 31
+; RV32I-NEXT: sw a6, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw a6, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB19_32:
+; RV32I-NEXT: slli a6, ra, 8
+; RV32I-NEXT: or a3, s5, a3
+; RV32I-NEXT: lbu s1, 0(a0)
+; RV32I-NEXT: lbu a0, 4(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s0, s4, s0
+; RV32I-NEXT: srl s2, t4, a5
+; RV32I-NEXT: sll ra, t3, s6
+; RV32I-NEXT: bltu a5, t5, .LBB19_34
+; RV32I-NEXT: # %bb.33:
+; RV32I-NEXT: srl s4, t3, a5
+; RV32I-NEXT: j .LBB19_35
+; RV32I-NEXT: .LBB19_34:
+; RV32I-NEXT: or s4, s2, ra
+; RV32I-NEXT: .LBB19_35:
+; RV32I-NEXT: or a6, a6, s1
+; RV32I-NEXT: slli a3, a3, 16
+; RV32I-NEXT: or a0, s3, a0
+; RV32I-NEXT: slli s1, s0, 16
+; RV32I-NEXT: mv s5, t4
+; RV32I-NEXT: beqz a5, .LBB19_37
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: mv s5, s4
+; RV32I-NEXT: .LBB19_37:
+; RV32I-NEXT: or s0, a3, a6
+; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: bltu a5, t5, .LBB19_39
+; RV32I-NEXT: # %bb.38:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: srl a3, a0, a5
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: bnez a5, .LBB19_40
+; RV32I-NEXT: j .LBB19_41
+; RV32I-NEXT: .LBB19_39:
+; RV32I-NEXT: srl s4, t3, a5
+; RV32I-NEXT: srl a3, s0, a5
+; RV32I-NEXT: sll a6, a0, s6
+; RV32I-NEXT: or a3, a3, a6
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: beqz a5, .LBB19_41
+; RV32I-NEXT: .LBB19_40:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB19_41:
+; RV32I-NEXT: bltu a5, t5, .LBB19_44
+; RV32I-NEXT: # %bb.42:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bgeu s7, t5, .LBB19_45
+; RV32I-NEXT: .LBB19_43:
+; RV32I-NEXT: sll s3, t4, s6
+; RV32I-NEXT: srl a3, t4, s10
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: mv s10, t3
+; RV32I-NEXT: bnez s7, .LBB19_46
+; RV32I-NEXT: j .LBB19_47
+; RV32I-NEXT: .LBB19_44:
+; RV32I-NEXT: srl s1, a0, a5
+; RV32I-NEXT: bltu s7, t5, .LBB19_43
+; RV32I-NEXT: .LBB19_45:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t4, s7
+; RV32I-NEXT: mv s10, t3
+; RV32I-NEXT: beqz s7, .LBB19_47
+; RV32I-NEXT: .LBB19_46:
+; RV32I-NEXT: mv s10, a3
+; RV32I-NEXT: .LBB19_47:
+; RV32I-NEXT: bltu s9, t5, .LBB19_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: srl a3, t3, s9
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: bnez s9, .LBB19_50
+; RV32I-NEXT: j .LBB19_51
+; RV32I-NEXT: .LBB19_49:
+; RV32I-NEXT: sll a3, t3, s11
+; RV32I-NEXT: or a3, s2, a3
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: beqz s9, .LBB19_51
+; RV32I-NEXT: .LBB19_50:
+; RV32I-NEXT: mv s2, a3
+; RV32I-NEXT: .LBB19_51:
+; RV32I-NEXT: bltu s9, t5, .LBB19_53
+; RV32I-NEXT: # %bb.52:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bltu a5, t6, .LBB19_54
+; RV32I-NEXT: j .LBB19_55
+; RV32I-NEXT: .LBB19_53:
+; RV32I-NEXT: srl s7, t3, a5
+; RV32I-NEXT: bgeu a5, t6, .LBB19_55
+; RV32I-NEXT: .LBB19_54:
+; RV32I-NEXT: or s2, a6, s3
+; RV32I-NEXT: or s7, s1, s10
+; RV32I-NEXT: .LBB19_55:
+; RV32I-NEXT: li a3, 128
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: beqz a5, .LBB19_57
+; RV32I-NEXT: # %bb.56:
+; RV32I-NEXT: mv a6, s2
+; RV32I-NEXT: mv s1, s7
+; RV32I-NEXT: .LBB19_57:
+; RV32I-NEXT: sw a6, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s2, a3, a5
+; RV32I-NEXT: bltu a5, t6, .LBB19_59
+; RV32I-NEXT: # %bb.58:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: .LBB19_59:
+; RV32I-NEXT: neg s3, s2
+; RV32I-NEXT: srl a6, t1, s3
+; RV32I-NEXT: sw s4, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t5, .LBB19_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: li s11, 0
+; RV32I-NEXT: sll a3, t1, s2
+; RV32I-NEXT: j .LBB19_62
+; RV32I-NEXT: .LBB19_61:
+; RV32I-NEXT: sll s11, t1, s6
+; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: .LBB19_62:
+; RV32I-NEXT: sub s1, t6, s2
+; RV32I-NEXT: mv s8, a7
+; RV32I-NEXT: beqz s2, .LBB19_64
+; RV32I-NEXT: # %bb.63:
+; RV32I-NEXT: mv s8, a3
+; RV32I-NEXT: .LBB19_64:
+; RV32I-NEXT: bltu s1, t5, .LBB19_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: srl a3, a7, s1
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: bnez s1, .LBB19_67
+; RV32I-NEXT: j .LBB19_68
+; RV32I-NEXT: .LBB19_66:
+; RV32I-NEXT: neg a3, s1
+; RV32I-NEXT: sll a3, a7, a3
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: beqz s1, .LBB19_68
+; RV32I-NEXT: .LBB19_67:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB19_68:
+; RV32I-NEXT: bltu s1, t5, .LBB19_71
+; RV32I-NEXT: # %bb.69:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s2, t5, .LBB19_72
+; RV32I-NEXT: .LBB19_70:
+; RV32I-NEXT: sll s6, t2, s6
+; RV32I-NEXT: srl a3, t2, s3
+; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB19_73
+; RV32I-NEXT: .LBB19_71:
+; RV32I-NEXT: srl s1, a7, s3
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t5, .LBB19_70
+; RV32I-NEXT: .LBB19_72:
+; RV32I-NEXT: li s6, 0
+; RV32I-NEXT: sll a3, t2, s2
+; RV32I-NEXT: .LBB19_73:
+; RV32I-NEXT: addi s9, s2, -64
+; RV32I-NEXT: mv s5, a4
+; RV32I-NEXT: beqz s2, .LBB19_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: mv s5, a3
+; RV32I-NEXT: .LBB19_75:
+; RV32I-NEXT: bltu s9, t5, .LBB19_77
+; RV32I-NEXT: # %bb.76:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t1, s9
+; RV32I-NEXT: mv s7, a7
+; RV32I-NEXT: bnez s9, .LBB19_78
+; RV32I-NEXT: j .LBB19_79
+; RV32I-NEXT: .LBB19_77:
+; RV32I-NEXT: sll s3, t1, s2
+; RV32I-NEXT: neg a3, s9
+; RV32I-NEXT: srl a3, t1, a3
+; RV32I-NEXT: sll s4, a7, s2
+; RV32I-NEXT: or a3, a3, s4
+; RV32I-NEXT: mv s7, a7
+; RV32I-NEXT: beqz s9, .LBB19_79
+; RV32I-NEXT: .LBB19_78:
+; RV32I-NEXT: mv s7, a3
+; RV32I-NEXT: .LBB19_79:
+; RV32I-NEXT: bltu s2, t6, .LBB19_81
+; RV32I-NEXT: # %bb.80:
+; RV32I-NEXT: li s11, 0
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: j .LBB19_82
+; RV32I-NEXT: .LBB19_81:
+; RV32I-NEXT: or s3, a6, s6
+; RV32I-NEXT: or s7, s1, s5
+; RV32I-NEXT: .LBB19_82:
+; RV32I-NEXT: addi ra, a5, -128
+; RV32I-NEXT: mv s4, t2
+; RV32I-NEXT: mv s6, a4
+; RV32I-NEXT: beqz s2, .LBB19_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: mv s4, s3
+; RV32I-NEXT: mv s6, s7
+; RV32I-NEXT: .LBB19_84:
+; RV32I-NEXT: neg s9, ra
+; RV32I-NEXT: sll s3, a4, s9
+; RV32I-NEXT: bltu ra, t5, .LBB19_86
+; RV32I-NEXT: # %bb.85:
+; RV32I-NEXT: sra a3, a4, ra
+; RV32I-NEXT: mv s1, t2
+; RV32I-NEXT: bnez ra, .LBB19_87
+; RV32I-NEXT: j .LBB19_88
+; RV32I-NEXT: .LBB19_86:
+; RV32I-NEXT: lw a3, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: mv s1, t2
+; RV32I-NEXT: beqz ra, .LBB19_88
+; RV32I-NEXT: .LBB19_87:
+; RV32I-NEXT: mv s1, a3
+; RV32I-NEXT: .LBB19_88:
+; RV32I-NEXT: bltu ra, t5, .LBB19_90
+; RV32I-NEXT: # %bb.89:
+; RV32I-NEXT: srai s2, a4, 31
+; RV32I-NEXT: srl a3, a7, ra
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: bnez ra, .LBB19_91
+; RV32I-NEXT: j .LBB19_92
+; RV32I-NEXT: .LBB19_90:
+; RV32I-NEXT: sra s2, a4, a5
+; RV32I-NEXT: sll a3, a7, s9
+; RV32I-NEXT: lw a6, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: beqz ra, .LBB19_92
+; RV32I-NEXT: .LBB19_91:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB19_92:
+; RV32I-NEXT: mv s5, t0
+; RV32I-NEXT: sub s10, t6, ra
+; RV32I-NEXT: li t0, 64
+; RV32I-NEXT: bltu ra, t5, .LBB19_94
+; RV32I-NEXT: # %bb.93:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: j .LBB19_95
+; RV32I-NEXT: .LBB19_94:
+; RV32I-NEXT: srl s7, a7, a5
+; RV32I-NEXT: .LBB19_95:
+; RV32I-NEXT: mv t6, s8
+; RV32I-NEXT: mv s8, s11
+; RV32I-NEXT: bltu s10, t5, .LBB19_97
+; RV32I-NEXT: # %bb.96:
+; RV32I-NEXT: li s9, 0
+; RV32I-NEXT: sll a3, t2, s10
+; RV32I-NEXT: j .LBB19_98
+; RV32I-NEXT: .LBB19_97:
+; RV32I-NEXT: sll s9, t2, s9
+; RV32I-NEXT: neg a3, s10
+; RV32I-NEXT: srl a3, t2, a3
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: .LBB19_98:
+; RV32I-NEXT: addi s11, ra, -64
+; RV32I-NEXT: mv s3, a4
+; RV32I-NEXT: beqz s10, .LBB19_100
+; RV32I-NEXT: # %bb.99:
+; RV32I-NEXT: mv s3, a3
+; RV32I-NEXT: .LBB19_100:
+; RV32I-NEXT: bltu s11, t5, .LBB19_102
+; RV32I-NEXT: # %bb.101:
+; RV32I-NEXT: sra a3, a4, s11
+; RV32I-NEXT: bnez s11, .LBB19_103
+; RV32I-NEXT: j .LBB19_104
+; RV32I-NEXT: .LBB19_102:
+; RV32I-NEXT: srl a3, t2, ra
+; RV32I-NEXT: mv s10, s4
+; RV32I-NEXT: neg s4, s11
+; RV32I-NEXT: sll s4, a4, s4
+; RV32I-NEXT: or a3, a3, s4
+; RV32I-NEXT: mv s4, s10
+; RV32I-NEXT: beqz s11, .LBB19_104
+; RV32I-NEXT: .LBB19_103:
+; RV32I-NEXT: mv t2, a3
+; RV32I-NEXT: .LBB19_104:
+; RV32I-NEXT: bltu s11, t5, .LBB19_106
+; RV32I-NEXT: # %bb.105:
+; RV32I-NEXT: srai t5, a4, 31
+; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bltu ra, t0, .LBB19_107
+; RV32I-NEXT: j .LBB19_108
+; RV32I-NEXT: .LBB19_106:
+; RV32I-NEXT: sra t5, a4, ra
+; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bgeu ra, t0, .LBB19_108
+; RV32I-NEXT: .LBB19_107:
+; RV32I-NEXT: or t2, a6, s9
+; RV32I-NEXT: or t5, s7, s3
+; RV32I-NEXT: .LBB19_108:
+; RV32I-NEXT: li a6, 128
+; RV32I-NEXT: bnez ra, .LBB19_117
+; RV32I-NEXT: # %bb.109:
+; RV32I-NEXT: bgeu ra, t0, .LBB19_118
+; RV32I-NEXT: .LBB19_110:
+; RV32I-NEXT: bgeu a5, a6, .LBB19_112
+; RV32I-NEXT: .LBB19_111:
+; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t1, a3, s8
+; RV32I-NEXT: lw a3, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a7, a3, t6
+; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s1, a3, s4
+; RV32I-NEXT: lw a3, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s2, a3, s6
+; RV32I-NEXT: .LBB19_112:
+; RV32I-NEXT: lw ra, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: mv t0, s5
+; RV32I-NEXT: beqz a5, .LBB19_114
+; RV32I-NEXT: # %bb.113:
+; RV32I-NEXT: mv s0, t1
+; RV32I-NEXT: mv a0, a7
+; RV32I-NEXT: mv t4, s1
+; RV32I-NEXT: mv t3, s2
+; RV32I-NEXT: .LBB19_114:
+; RV32I-NEXT: bltu a5, a6, .LBB19_116
+; RV32I-NEXT: # %bb.115:
+; RV32I-NEXT: srai a1, a4, 31
+; RV32I-NEXT: mv t0, a1
+; RV32I-NEXT: mv s11, a1
+; RV32I-NEXT: mv ra, a1
+; RV32I-NEXT: .LBB19_116:
+; RV32I-NEXT: srli a4, s0, 16
+; RV32I-NEXT: lui t1, 16
+; RV32I-NEXT: srli a7, s0, 24
+; RV32I-NEXT: srli a5, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: srli a6, t4, 16
+; RV32I-NEXT: srli s2, t4, 24
+; RV32I-NEXT: srli t2, t3, 16
+; RV32I-NEXT: srli s3, t3, 24
+; RV32I-NEXT: srli s1, a1, 16
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: srli t6, t0, 16
+; RV32I-NEXT: srli s6, t0, 24
+; RV32I-NEXT: srli s5, s11, 16
+; RV32I-NEXT: srli s4, s11, 24
+; RV32I-NEXT: srli s7, ra, 16
+; RV32I-NEXT: srli s8, ra, 24
+; RV32I-NEXT: addi t1, t1, -1
+; RV32I-NEXT: and s9, s0, t1
+; RV32I-NEXT: and s10, a0, t1
+; RV32I-NEXT: srli s9, s9, 8
+; RV32I-NEXT: sb s0, 0(a2)
+; RV32I-NEXT: sb s9, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, t4, t1
+; RV32I-NEXT: srli a7, s10, 8
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb a7, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t5, 7(a2)
+; RV32I-NEXT: and a0, t3, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, a1, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t3, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, t0, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a1, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a1, s11, t1
+; RV32I-NEXT: and a3, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb t0, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb s11, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s4, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a3, 29(a2)
+; RV32I-NEXT: sb s7, 30(a2)
+; RV32I-NEXT: sb s8, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB19_117:
+; RV32I-NEXT: mv t1, t2
+; RV32I-NEXT: mv a7, t5
+; RV32I-NEXT: bltu ra, t0, .LBB19_110
+; RV32I-NEXT: .LBB19_118:
+; RV32I-NEXT: srai s1, a4, 31
+; RV32I-NEXT: mv s2, s1
+; RV32I-NEXT: bltu a5, a6, .LBB19_111
+; RV32I-NEXT: j .LBB19_112
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes_dwordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -96
+; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: or a4, t4, t3
+; RV64I-NEXT: or a6, t6, t5
+; RV64I-NEXT: or t0, s1, s0
+; RV64I-NEXT: lbu t5, 24(a0)
+; RV64I-NEXT: lbu t6, 25(a0)
+; RV64I-NEXT: lbu s0, 26(a0)
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or t4, s3, s2
+; RV64I-NEXT: or t2, s5, s4
+; RV64I-NEXT: or t3, s7, s6
+; RV64I-NEXT: lbu s2, 28(a0)
+; RV64I-NEXT: lbu s3, 29(a0)
+; RV64I-NEXT: lbu s4, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s5, s9, s8
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu t6, 0(a1)
+; RV64I-NEXT: lbu s1, 1(a1)
+; RV64I-NEXT: lbu s7, 2(a1)
+; RV64I-NEXT: lbu s8, 3(a1)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, a0, s4
+; RV64I-NEXT: or t6, s1, t6
+; RV64I-NEXT: lbu a0, 4(a1)
+; RV64I-NEXT: lbu s1, 5(a1)
+; RV64I-NEXT: lbu s4, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s1, s1, a0
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or s4, a1, s4
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or a1, t1, a7
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or a0, t4, t0
+; RV64I-NEXT: slli t3, t3, 16
+; RV64I-NEXT: or a7, t3, t2
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: or t1, s6, s5
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t4, s0, t5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or t5, s3, s2
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or t6, s7, t6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s0, s4, s1
+; RV64I-NEXT: li t0, 64
+; RV64I-NEXT: slli t3, a5, 16
+; RV64I-NEXT: slli t2, a6, 16
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: or a7, t1, a7
+; RV64I-NEXT: or a5, t5, t4
+; RV64I-NEXT: or a6, s0, t6
+; RV64I-NEXT: slli a6, a6, 6
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sll t4, a5, t5
+; RV64I-NEXT: bltu a6, t0, .LBB20_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sra t6, a5, t1
+; RV64I-NEXT: j .LBB20_3
+; RV64I-NEXT: .LBB20_2:
+; RV64I-NEXT: srl t6, a7, a6
+; RV64I-NEXT: or t6, t6, t4
+; RV64I-NEXT: .LBB20_3:
+; RV64I-NEXT: or a3, t3, a3
+; RV64I-NEXT: slli t3, a1, 32
+; RV64I-NEXT: or t2, t2, a4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mv a1, a7
+; RV64I-NEXT: beqz a6, .LBB20_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a1, t6
+; RV64I-NEXT: .LBB20_5:
+; RV64I-NEXT: or a4, t3, a3
+; RV64I-NEXT: or a3, a0, t2
+; RV64I-NEXT: bltu a6, t0, .LBB20_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: srai a0, a5, 63
+; RV64I-NEXT: srl t3, a3, t1
+; RV64I-NEXT: j .LBB20_8
+; RV64I-NEXT: .LBB20_7:
+; RV64I-NEXT: sra a0, a5, a6
+; RV64I-NEXT: srl t1, a4, a6
+; RV64I-NEXT: sll t2, a3, t5
+; RV64I-NEXT: or t3, t1, t2
+; RV64I-NEXT: .LBB20_8:
+; RV64I-NEXT: li t1, 128
+; RV64I-NEXT: mv t2, a4
+; RV64I-NEXT: beqz a6, .LBB20_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t2, t3
+; RV64I-NEXT: .LBB20_10:
+; RV64I-NEXT: sub t6, t1, a6
+; RV64I-NEXT: bltu a6, t0, .LBB20_13
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t3, 0
+; RV64I-NEXT: bgeu t6, t0, .LBB20_14
+; RV64I-NEXT: .LBB20_12:
+; RV64I-NEXT: sll t5, a7, t5
+; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: srl s0, a7, s0
+; RV64I-NEXT: or s1, s0, t4
+; RV64I-NEXT: j .LBB20_15
+; RV64I-NEXT: .LBB20_13:
+; RV64I-NEXT: srl t3, a3, a6
+; RV64I-NEXT: bltu t6, t0, .LBB20_12
+; RV64I-NEXT: .LBB20_14:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: sub t4, t6, t0
+; RV64I-NEXT: sll s1, a7, t4
+; RV64I-NEXT: .LBB20_15:
+; RV64I-NEXT: sub s0, a6, t1
+; RV64I-NEXT: mv t4, a5
+; RV64I-NEXT: beqz t6, .LBB20_17
+; RV64I-NEXT: # %bb.16:
+; RV64I-NEXT: mv t4, s1
+; RV64I-NEXT: .LBB20_17:
+; RV64I-NEXT: bltu s0, t0, .LBB20_19
+; RV64I-NEXT: # %bb.18:
+; RV64I-NEXT: sub t6, s0, t0
+; RV64I-NEXT: sra t6, a5, t6
+; RV64I-NEXT: bnez s0, .LBB20_20
+; RV64I-NEXT: j .LBB20_21
+; RV64I-NEXT: .LBB20_19:
+; RV64I-NEXT: srl t6, a7, s0
+; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: sll s1, a5, s1
+; RV64I-NEXT: or t6, t6, s1
+; RV64I-NEXT: beqz s0, .LBB20_21
+; RV64I-NEXT: .LBB20_20:
+; RV64I-NEXT: mv a7, t6
+; RV64I-NEXT: .LBB20_21:
+; RV64I-NEXT: bltu s0, t0, .LBB20_23
+; RV64I-NEXT: # %bb.22:
+; RV64I-NEXT: srai t0, a5, 63
+; RV64I-NEXT: bltu a6, t1, .LBB20_24
+; RV64I-NEXT: j .LBB20_25
+; RV64I-NEXT: .LBB20_23:
+; RV64I-NEXT: sra t0, a5, s0
+; RV64I-NEXT: bgeu a6, t1, .LBB20_25
+; RV64I-NEXT: .LBB20_24:
+; RV64I-NEXT: or a7, t2, t5
+; RV64I-NEXT: or t0, t3, t4
+; RV64I-NEXT: .LBB20_25:
+; RV64I-NEXT: bnez a6, .LBB20_29
+; RV64I-NEXT: # %bb.26:
+; RV64I-NEXT: bltu a6, t1, .LBB20_28
+; RV64I-NEXT: .LBB20_27:
+; RV64I-NEXT: srai a1, a5, 63
+; RV64I-NEXT: mv a0, a1
+; RV64I-NEXT: .LBB20_28:
+; RV64I-NEXT: srli a5, a4, 32
+; RV64I-NEXT: srliw a6, a4, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a4, 24
+; RV64I-NEXT: srli t0, a4, 48
+; RV64I-NEXT: srli t5, a4, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a1, 32
+; RV64I-NEXT: srliw s2, a1, 16
+; RV64I-NEXT: srliw s6, a1, 24
+; RV64I-NEXT: srli s4, a1, 48
+; RV64I-NEXT: srli s7, a1, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a4, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a4, a5, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a4, a3, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a4, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a3, a7, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a3, a1, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 96
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB20_29:
+; RV64I-NEXT: mv a4, a7
+; RV64I-NEXT: mv a3, t0
+; RV64I-NEXT: bgeu a6, t1, .LBB20_27
+; RV64I-NEXT: j .LBB20_28
+;
+; RV32I-LABEL: ashr_32bytes_dwordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 16(a0)
+; RV32I-NEXT: lbu a4, 17(a0)
+; RV32I-NEXT: lbu a5, 18(a0)
+; RV32I-NEXT: lbu a6, 19(a0)
+; RV32I-NEXT: lbu a7, 20(a0)
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 22(a0)
+; RV32I-NEXT: lbu t2, 23(a0)
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t4, 25(a0)
+; RV32I-NEXT: lbu t5, 26(a0)
+; RV32I-NEXT: lbu t6, 27(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, t2, t1
+; RV32I-NEXT: lbu a7, 28(a0)
+; RV32I-NEXT: lbu t0, 29(a0)
+; RV32I-NEXT: lbu t1, 30(a0)
+; RV32I-NEXT: lbu t2, 31(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t3, t4, t3
+; RV32I-NEXT: or t4, t6, t5
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t5, 1(a1)
+; RV32I-NEXT: lbu t6, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: or s0, t5, a7
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or t2, a1, t6
+; RV32I-NEXT: li t5, 32
+; RV32I-NEXT: slli a7, a4, 16
+; RV32I-NEXT: slli a1, a5, 16
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli a5, t2, 16
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or a4, t1, t0
+; RV32I-NEXT: or a5, a5, s0
+; RV32I-NEXT: slli a5, a5, 6
+; RV32I-NEXT: srl s0, t2, a5
+; RV32I-NEXT: neg s6, a5
+; RV32I-NEXT: sll s1, a4, s6
+; RV32I-NEXT: bltu a5, t5, .LBB20_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra t0, a4, a5
+; RV32I-NEXT: j .LBB20_3
+; RV32I-NEXT: .LBB20_2:
+; RV32I-NEXT: or t0, s0, s1
+; RV32I-NEXT: .LBB20_3:
+; RV32I-NEXT: or t1, a7, a3
+; RV32I-NEXT: or a7, a1, a6
+; RV32I-NEXT: mv t3, t2
+; RV32I-NEXT: beqz a5, .LBB20_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv t3, t0
+; RV32I-NEXT: .LBB20_5:
+; RV32I-NEXT: srl a3, t1, a5
+; RV32I-NEXT: sll a1, a7, s6
+; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a5, t5, .LBB20_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srai t4, a4, 31
+; RV32I-NEXT: srl a1, a7, a5
+; RV32I-NEXT: j .LBB20_8
+; RV32I-NEXT: .LBB20_7:
+; RV32I-NEXT: sra t4, a4, a5
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: .LBB20_8:
+; RV32I-NEXT: li t6, 64
+; RV32I-NEXT: mv t0, t1
+; RV32I-NEXT: beqz a5, .LBB20_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t0, a1
+; RV32I-NEXT: .LBB20_10:
+; RV32I-NEXT: sub s7, t6, a5
+; RV32I-NEXT: bltu a5, t5, .LBB20_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB20_13
+; RV32I-NEXT: .LBB20_12:
+; RV32I-NEXT: srl a1, a7, a5
+; RV32I-NEXT: .LBB20_13:
+; RV32I-NEXT: sw a3, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: neg s10, s7
+; RV32I-NEXT: bltu s7, t5, .LBB20_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: sll a3, t2, s7
+; RV32I-NEXT: j .LBB20_16
+; RV32I-NEXT: .LBB20_15:
+; RV32I-NEXT: sll a6, t2, s6
+; RV32I-NEXT: srl a3, t2, s10
+; RV32I-NEXT: or a3, a3, s1
+; RV32I-NEXT: .LBB20_16:
+; RV32I-NEXT: sw t3, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi s9, a5, -64
+; RV32I-NEXT: mv t3, a4
+; RV32I-NEXT: beqz s7, .LBB20_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t3, a3
+; RV32I-NEXT: .LBB20_18:
+; RV32I-NEXT: neg s11, s9
+; RV32I-NEXT: sw s0, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t5, .LBB20_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: sra s0, a4, s9
+; RV32I-NEXT: j .LBB20_21
+; RV32I-NEXT: .LBB20_20:
+; RV32I-NEXT: sll a3, a4, s11
+; RV32I-NEXT: or s0, s0, a3
+; RV32I-NEXT: .LBB20_21:
+; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw t4, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu s3, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: mv t4, t2
+; RV32I-NEXT: beqz s9, .LBB20_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: mv t4, s0
+; RV32I-NEXT: .LBB20_23:
+; RV32I-NEXT: lbu s2, 9(a0)
+; RV32I-NEXT: lbu s1, 10(a0)
+; RV32I-NEXT: lbu s8, 13(a0)
+; RV32I-NEXT: lbu ra, 14(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: bltu s9, t5, .LBB20_25
+; RV32I-NEXT: # %bb.24:
+; RV32I-NEXT: srai s0, a4, 31
+; RV32I-NEXT: j .LBB20_26
+; RV32I-NEXT: .LBB20_25:
+; RV32I-NEXT: sra s0, a4, a5
+; RV32I-NEXT: .LBB20_26:
+; RV32I-NEXT: or s1, s3, s1
+; RV32I-NEXT: lbu s5, 8(a0)
+; RV32I-NEXT: lbu s3, 12(a0)
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s4, s8, 8
+; RV32I-NEXT: or s8, a3, ra
+; RV32I-NEXT: bgeu a5, t6, .LBB20_28
+; RV32I-NEXT: # %bb.27:
+; RV32I-NEXT: or t4, t0, a6
+; RV32I-NEXT: or s0, a1, t3
+; RV32I-NEXT: .LBB20_28:
+; RV32I-NEXT: lbu a3, 3(a0)
+; RV32I-NEXT: lbu t3, 7(a0)
+; RV32I-NEXT: or a6, s2, s5
+; RV32I-NEXT: slli s2, s1, 16
+; RV32I-NEXT: or s1, s4, s3
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: mv a1, t1
+; RV32I-NEXT: mv t0, a7
+; RV32I-NEXT: beqz a5, .LBB20_30
+; RV32I-NEXT: # %bb.29:
+; RV32I-NEXT: mv a1, t4
+; RV32I-NEXT: mv t0, s0
+; RV32I-NEXT: .LBB20_30:
+; RV32I-NEXT: slli s5, a3, 8
+; RV32I-NEXT: lbu ra, 1(a0)
+; RV32I-NEXT: lbu a3, 2(a0)
+; RV32I-NEXT: lbu s3, 5(a0)
+; RV32I-NEXT: lbu s0, 6(a0)
+; RV32I-NEXT: slli s4, t3, 8
+; RV32I-NEXT: or t4, s2, a6
+; RV32I-NEXT: or t3, s8, s1
+; RV32I-NEXT: bltu a5, t6, .LBB20_32
+; RV32I-NEXT: # %bb.31:
+; RV32I-NEXT: srai a6, a4, 31
+; RV32I-NEXT: sw a6, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw a6, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB20_32:
+; RV32I-NEXT: slli a6, ra, 8
+; RV32I-NEXT: or a3, s5, a3
+; RV32I-NEXT: lbu s1, 0(a0)
+; RV32I-NEXT: lbu a0, 4(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s0, s4, s0
+; RV32I-NEXT: srl s2, t4, a5
+; RV32I-NEXT: sll ra, t3, s6
+; RV32I-NEXT: bltu a5, t5, .LBB20_34
+; RV32I-NEXT: # %bb.33:
+; RV32I-NEXT: srl s4, t3, a5
+; RV32I-NEXT: j .LBB20_35
+; RV32I-NEXT: .LBB20_34:
+; RV32I-NEXT: or s4, s2, ra
+; RV32I-NEXT: .LBB20_35:
+; RV32I-NEXT: or a6, a6, s1
+; RV32I-NEXT: slli a3, a3, 16
+; RV32I-NEXT: or a0, s3, a0
+; RV32I-NEXT: slli s1, s0, 16
+; RV32I-NEXT: mv s5, t4
+; RV32I-NEXT: beqz a5, .LBB20_37
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: mv s5, s4
+; RV32I-NEXT: .LBB20_37:
+; RV32I-NEXT: or s0, a3, a6
+; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: bltu a5, t5, .LBB20_39
+; RV32I-NEXT: # %bb.38:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: srl a3, a0, a5
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: bnez a5, .LBB20_40
+; RV32I-NEXT: j .LBB20_41
+; RV32I-NEXT: .LBB20_39:
+; RV32I-NEXT: srl s4, t3, a5
+; RV32I-NEXT: srl a3, s0, a5
+; RV32I-NEXT: sll a6, a0, s6
+; RV32I-NEXT: or a3, a3, a6
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: beqz a5, .LBB20_41
+; RV32I-NEXT: .LBB20_40:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB20_41:
+; RV32I-NEXT: bltu a5, t5, .LBB20_44
+; RV32I-NEXT: # %bb.42:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bgeu s7, t5, .LBB20_45
+; RV32I-NEXT: .LBB20_43:
+; RV32I-NEXT: sll s3, t4, s6
+; RV32I-NEXT: srl a3, t4, s10
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: mv s10, t3
+; RV32I-NEXT: bnez s7, .LBB20_46
+; RV32I-NEXT: j .LBB20_47
+; RV32I-NEXT: .LBB20_44:
+; RV32I-NEXT: srl s1, a0, a5
+; RV32I-NEXT: bltu s7, t5, .LBB20_43
+; RV32I-NEXT: .LBB20_45:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t4, s7
+; RV32I-NEXT: mv s10, t3
+; RV32I-NEXT: beqz s7, .LBB20_47
+; RV32I-NEXT: .LBB20_46:
+; RV32I-NEXT: mv s10, a3
+; RV32I-NEXT: .LBB20_47:
+; RV32I-NEXT: bltu s9, t5, .LBB20_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: srl a3, t3, s9
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: bnez s9, .LBB20_50
+; RV32I-NEXT: j .LBB20_51
+; RV32I-NEXT: .LBB20_49:
+; RV32I-NEXT: sll a3, t3, s11
+; RV32I-NEXT: or a3, s2, a3
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: beqz s9, .LBB20_51
+; RV32I-NEXT: .LBB20_50:
+; RV32I-NEXT: mv s2, a3
+; RV32I-NEXT: .LBB20_51:
+; RV32I-NEXT: bltu s9, t5, .LBB20_53
+; RV32I-NEXT: # %bb.52:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bltu a5, t6, .LBB20_54
+; RV32I-NEXT: j .LBB20_55
+; RV32I-NEXT: .LBB20_53:
+; RV32I-NEXT: srl s7, t3, a5
+; RV32I-NEXT: bgeu a5, t6, .LBB20_55
+; RV32I-NEXT: .LBB20_54:
+; RV32I-NEXT: or s2, a6, s3
+; RV32I-NEXT: or s7, s1, s10
+; RV32I-NEXT: .LBB20_55:
+; RV32I-NEXT: li a3, 128
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: beqz a5, .LBB20_57
+; RV32I-NEXT: # %bb.56:
+; RV32I-NEXT: mv a6, s2
+; RV32I-NEXT: mv s1, s7
+; RV32I-NEXT: .LBB20_57:
+; RV32I-NEXT: sw a6, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s2, a3, a5
+; RV32I-NEXT: bltu a5, t6, .LBB20_59
+; RV32I-NEXT: # %bb.58:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: .LBB20_59:
+; RV32I-NEXT: neg s3, s2
+; RV32I-NEXT: srl a6, t1, s3
+; RV32I-NEXT: sw s4, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t5, .LBB20_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: li s11, 0
+; RV32I-NEXT: sll a3, t1, s2
+; RV32I-NEXT: j .LBB20_62
+; RV32I-NEXT: .LBB20_61:
+; RV32I-NEXT: sll s11, t1, s6
+; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: .LBB20_62:
+; RV32I-NEXT: sub s1, t6, s2
+; RV32I-NEXT: mv s8, a7
+; RV32I-NEXT: beqz s2, .LBB20_64
+; RV32I-NEXT: # %bb.63:
+; RV32I-NEXT: mv s8, a3
+; RV32I-NEXT: .LBB20_64:
+; RV32I-NEXT: bltu s1, t5, .LBB20_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: srl a3, a7, s1
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: bnez s1, .LBB20_67
+; RV32I-NEXT: j .LBB20_68
+; RV32I-NEXT: .LBB20_66:
+; RV32I-NEXT: neg a3, s1
+; RV32I-NEXT: sll a3, a7, a3
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: beqz s1, .LBB20_68
+; RV32I-NEXT: .LBB20_67:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB20_68:
+; RV32I-NEXT: bltu s1, t5, .LBB20_71
+; RV32I-NEXT: # %bb.69:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s2, t5, .LBB20_72
+; RV32I-NEXT: .LBB20_70:
+; RV32I-NEXT: sll s6, t2, s6
+; RV32I-NEXT: srl a3, t2, s3
+; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB20_73
+; RV32I-NEXT: .LBB20_71:
+; RV32I-NEXT: srl s1, a7, s3
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t5, .LBB20_70
+; RV32I-NEXT: .LBB20_72:
+; RV32I-NEXT: li s6, 0
+; RV32I-NEXT: sll a3, t2, s2
+; RV32I-NEXT: .LBB20_73:
+; RV32I-NEXT: addi s9, s2, -64
+; RV32I-NEXT: mv s5, a4
+; RV32I-NEXT: beqz s2, .LBB20_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: mv s5, a3
+; RV32I-NEXT: .LBB20_75:
+; RV32I-NEXT: bltu s9, t5, .LBB20_77
+; RV32I-NEXT: # %bb.76:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t1, s9
+; RV32I-NEXT: mv s7, a7
+; RV32I-NEXT: bnez s9, .LBB20_78
+; RV32I-NEXT: j .LBB20_79
+; RV32I-NEXT: .LBB20_77:
+; RV32I-NEXT: sll s3, t1, s2
+; RV32I-NEXT: neg a3, s9
+; RV32I-NEXT: srl a3, t1, a3
+; RV32I-NEXT: sll s4, a7, s2
+; RV32I-NEXT: or a3, a3, s4
+; RV32I-NEXT: mv s7, a7
+; RV32I-NEXT: beqz s9, .LBB20_79
+; RV32I-NEXT: .LBB20_78:
+; RV32I-NEXT: mv s7, a3
+; RV32I-NEXT: .LBB20_79:
+; RV32I-NEXT: bltu s2, t6, .LBB20_81
+; RV32I-NEXT: # %bb.80:
+; RV32I-NEXT: li s11, 0
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: j .LBB20_82
+; RV32I-NEXT: .LBB20_81:
+; RV32I-NEXT: or s3, a6, s6
+; RV32I-NEXT: or s7, s1, s5
+; RV32I-NEXT: .LBB20_82:
+; RV32I-NEXT: addi ra, a5, -128
+; RV32I-NEXT: mv s4, t2
+; RV32I-NEXT: mv s6, a4
+; RV32I-NEXT: beqz s2, .LBB20_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: mv s4, s3
+; RV32I-NEXT: mv s6, s7
+; RV32I-NEXT: .LBB20_84:
+; RV32I-NEXT: neg s9, ra
+; RV32I-NEXT: sll s3, a4, s9
+; RV32I-NEXT: bltu ra, t5, .LBB20_86
+; RV32I-NEXT: # %bb.85:
+; RV32I-NEXT: sra a3, a4, ra
+; RV32I-NEXT: mv s1, t2
+; RV32I-NEXT: bnez ra, .LBB20_87
+; RV32I-NEXT: j .LBB20_88
+; RV32I-NEXT: .LBB20_86:
+; RV32I-NEXT: lw a3, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: mv s1, t2
+; RV32I-NEXT: beqz ra, .LBB20_88
+; RV32I-NEXT: .LBB20_87:
+; RV32I-NEXT: mv s1, a3
+; RV32I-NEXT: .LBB20_88:
+; RV32I-NEXT: bltu ra, t5, .LBB20_90
+; RV32I-NEXT: # %bb.89:
+; RV32I-NEXT: srai s2, a4, 31
+; RV32I-NEXT: srl a3, a7, ra
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: bnez ra, .LBB20_91
+; RV32I-NEXT: j .LBB20_92
+; RV32I-NEXT: .LBB20_90:
+; RV32I-NEXT: sra s2, a4, a5
+; RV32I-NEXT: sll a3, a7, s9
+; RV32I-NEXT: lw a6, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: beqz ra, .LBB20_92
+; RV32I-NEXT: .LBB20_91:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB20_92:
+; RV32I-NEXT: mv s5, t0
+; RV32I-NEXT: sub s10, t6, ra
+; RV32I-NEXT: li t0, 64
+; RV32I-NEXT: bltu ra, t5, .LBB20_94
+; RV32I-NEXT: # %bb.93:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: j .LBB20_95
+; RV32I-NEXT: .LBB20_94:
+; RV32I-NEXT: srl s7, a7, a5
+; RV32I-NEXT: .LBB20_95:
+; RV32I-NEXT: mv t6, s8
+; RV32I-NEXT: mv s8, s11
+; RV32I-NEXT: bltu s10, t5, .LBB20_97
+; RV32I-NEXT: # %bb.96:
+; RV32I-NEXT: li s9, 0
+; RV32I-NEXT: sll a3, t2, s10
+; RV32I-NEXT: j .LBB20_98
+; RV32I-NEXT: .LBB20_97:
+; RV32I-NEXT: sll s9, t2, s9
+; RV32I-NEXT: neg a3, s10
+; RV32I-NEXT: srl a3, t2, a3
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: .LBB20_98:
+; RV32I-NEXT: addi s11, ra, -64
+; RV32I-NEXT: mv s3, a4
+; RV32I-NEXT: beqz s10, .LBB20_100
+; RV32I-NEXT: # %bb.99:
+; RV32I-NEXT: mv s3, a3
+; RV32I-NEXT: .LBB20_100:
+; RV32I-NEXT: bltu s11, t5, .LBB20_102
+; RV32I-NEXT: # %bb.101:
+; RV32I-NEXT: sra a3, a4, s11
+; RV32I-NEXT: bnez s11, .LBB20_103
+; RV32I-NEXT: j .LBB20_104
+; RV32I-NEXT: .LBB20_102:
+; RV32I-NEXT: srl a3, t2, ra
+; RV32I-NEXT: mv s10, s4
+; RV32I-NEXT: neg s4, s11
+; RV32I-NEXT: sll s4, a4, s4
+; RV32I-NEXT: or a3, a3, s4
+; RV32I-NEXT: mv s4, s10
+; RV32I-NEXT: beqz s11, .LBB20_104
+; RV32I-NEXT: .LBB20_103:
+; RV32I-NEXT: mv t2, a3
+; RV32I-NEXT: .LBB20_104:
+; RV32I-NEXT: bltu s11, t5, .LBB20_106
+; RV32I-NEXT: # %bb.105:
+; RV32I-NEXT: srai t5, a4, 31
+; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bltu ra, t0, .LBB20_107
+; RV32I-NEXT: j .LBB20_108
+; RV32I-NEXT: .LBB20_106:
+; RV32I-NEXT: sra t5, a4, ra
+; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bgeu ra, t0, .LBB20_108
+; RV32I-NEXT: .LBB20_107:
+; RV32I-NEXT: or t2, a6, s9
+; RV32I-NEXT: or t5, s7, s3
+; RV32I-NEXT: .LBB20_108:
+; RV32I-NEXT: li a6, 128
+; RV32I-NEXT: bnez ra, .LBB20_117
+; RV32I-NEXT: # %bb.109:
+; RV32I-NEXT: bgeu ra, t0, .LBB20_118
+; RV32I-NEXT: .LBB20_110:
+; RV32I-NEXT: bgeu a5, a6, .LBB20_112
+; RV32I-NEXT: .LBB20_111:
+; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t1, a3, s8
+; RV32I-NEXT: lw a3, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a7, a3, t6
+; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s1, a3, s4
+; RV32I-NEXT: lw a3, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s2, a3, s6
+; RV32I-NEXT: .LBB20_112:
+; RV32I-NEXT: lw ra, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: mv t0, s5
+; RV32I-NEXT: beqz a5, .LBB20_114
+; RV32I-NEXT: # %bb.113:
+; RV32I-NEXT: mv s0, t1
+; RV32I-NEXT: mv a0, a7
+; RV32I-NEXT: mv t4, s1
+; RV32I-NEXT: mv t3, s2
+; RV32I-NEXT: .LBB20_114:
+; RV32I-NEXT: bltu a5, a6, .LBB20_116
+; RV32I-NEXT: # %bb.115:
+; RV32I-NEXT: srai a1, a4, 31
+; RV32I-NEXT: mv t0, a1
+; RV32I-NEXT: mv s11, a1
+; RV32I-NEXT: mv ra, a1
+; RV32I-NEXT: .LBB20_116:
+; RV32I-NEXT: srli a4, s0, 16
+; RV32I-NEXT: lui t1, 16
+; RV32I-NEXT: srli a7, s0, 24
+; RV32I-NEXT: srli a5, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: srli a6, t4, 16
+; RV32I-NEXT: srli s2, t4, 24
+; RV32I-NEXT: srli t2, t3, 16
+; RV32I-NEXT: srli s3, t3, 24
+; RV32I-NEXT: srli s1, a1, 16
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: srli t6, t0, 16
+; RV32I-NEXT: srli s6, t0, 24
+; RV32I-NEXT: srli s5, s11, 16
+; RV32I-NEXT: srli s4, s11, 24
+; RV32I-NEXT: srli s7, ra, 16
+; RV32I-NEXT: srli s8, ra, 24
+; RV32I-NEXT: addi t1, t1, -1
+; RV32I-NEXT: and s9, s0, t1
+; RV32I-NEXT: and s10, a0, t1
+; RV32I-NEXT: srli s9, s9, 8
+; RV32I-NEXT: sb s0, 0(a2)
+; RV32I-NEXT: sb s9, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, t4, t1
+; RV32I-NEXT: srli a7, s10, 8
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb a7, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t5, 7(a2)
+; RV32I-NEXT: and a0, t3, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, a1, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t3, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, t0, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a1, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a1, s11, t1
+; RV32I-NEXT: and a3, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb t0, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb s11, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s4, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a3, 29(a2)
+; RV32I-NEXT: sb s7, 30(a2)
+; RV32I-NEXT: sb s8, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB20_117:
+; RV32I-NEXT: mv t1, t2
+; RV32I-NEXT: mv a7, t5
+; RV32I-NEXT: bltu ra, t0, .LBB20_110
+; RV32I-NEXT: .LBB20_118:
+; RV32I-NEXT: srai s1, a4, 31
+; RV32I-NEXT: mv s2, s1
+; RV32I-NEXT: bltu a5, a6, .LBB20_111
+; RV32I-NEXT: j .LBB20_112
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
>From 40d2d951cf493a5e2509a4699b9673c4638f5da3 Mon Sep 17 00:00:00 2001
From: Luke Quinn <quic_lquinn at quicinc.com>
Date: Wed, 22 Jan 2025 08:30:40 -0800
Subject: [PATCH 2/3] [RISCV] Add GISelPredicateCode TD bindings and
hasAllNBitUsers prototypes as a staging step toward full support
Signed-off-by: Luke Quinn <quic_lquinn at quicinc.com>
---
.../Target/RISCV/GISel/RISCVInstructionSelector.cpp | 10 ++++++++++
llvm/lib/Target/RISCV/RISCVInstrInfo.td | 8 ++++++--
2 files changed, 16 insertions(+), 2 deletions(-)
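Note (illustrative, not part of the patch): the staging idea in this commit is
to declare the width-specific helpers in terms of a single hasAllNBitUsers
entry point and land a conservative stub first, so the GISelPredicateCode
bindings in the .td file compile but stay inert until the real analysis
arrives in patch 3/3. A minimal self-contained C++ sketch of that shape, using
a stand-in type instead of llvm::MachineInstr (all names here are hypothetical):

  struct MachineInstrStub {}; // stand-in for llvm::MachineInstr

  class SelectorSketch {
  public:
    bool hasAllBUsers(const MachineInstrStub &MI) const { return hasAllNBitUsers(MI, 8); }
    bool hasAllHUsers(const MachineInstrStub &MI) const { return hasAllNBitUsers(MI, 16); }
    bool hasAllWUsers(const MachineInstrStub &MI) const { return hasAllNBitUsers(MI, 32); }

  private:
    // Conservative stub: never claim narrow uses, so codegen is unchanged
    // until the full use-walk is implemented.
    bool hasAllNBitUsers(const MachineInstrStub &, unsigned /*Bits*/,
                         unsigned /*Depth*/ = 0) const {
      return false;
    }
  };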
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index 3f1539da4a9c848..5ea9f728ff995eb 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -57,6 +57,12 @@ class RISCVInstructionSelector : public InstructionSelector {
const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) const;
+ // const MachineInstr &MI
+ bool hasAllNBitUsers(const MachineInstr &MI, unsigned Bits, const unsigned Depth = 0) const;
+ bool hasAllBUsers(const MachineInstr &MI) const { return hasAllNBitUsers(MI, 8); }
+ bool hasAllHUsers(const MachineInstr &MI) const { return hasAllNBitUsers(MI, 16); }
+ bool hasAllWUsers(const MachineInstr &MI) const { return hasAllNBitUsers(MI, 32); }
+
bool isRegInGprb(Register Reg) const;
bool isRegInFprb(Register Reg) const;
@@ -186,6 +192,10 @@ RISCVInstructionSelector::RISCVInstructionSelector(
{
}
+bool RISCVInstructionSelector::hasAllNBitUsers(const MachineInstr &MI, unsigned Bits, const unsigned Depth) const {
+ return false;
+};
+
InstructionSelector::ComplexRendererFns
RISCVInstructionSelector::selectShiftMask(MachineOperand &Root,
unsigned ShiftWidth) const {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index bb5bb6352c32a5a..fbfc354daa2f29a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1945,7 +1945,9 @@ class binop_allhusers<SDPatternOperator operator>
: PatFrag<(ops node:$lhs, node:$rhs),
(XLenVT (operator node:$lhs, node:$rhs)), [{
return hasAllHUsers(Node);
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return hasAllHUsers(MI); }];
+}
// PatFrag to allow ADDW/SUBW/MULW/SLLW to be selected from i64 add/sub/mul/shl
// if only the lower 32 bits of their result is used.
@@ -1953,7 +1955,9 @@ class binop_allwusers<SDPatternOperator operator>
: PatFrag<(ops node:$lhs, node:$rhs),
(i64 (operator node:$lhs, node:$rhs)), [{
return hasAllWUsers(Node);
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return hasAllWUsers(MI); }];
+}
def sexti32_allwusers : PatFrag<(ops node:$src),
(sext_inreg node:$src, i32), [{
>From 9cdd863951e3a21babb7a5b6e803a32e3c8ef25d Mon Sep 17 00:00:00 2001
From: Luke Quinn <quic_lquinn at quicinc.com>
Date: Mon, 27 Jan 2025 18:05:35 -0800
Subject: [PATCH 3/3] [RISCV] Implement hasAllNBitUsers (functional change);
this allows packw instructions to be generated, along with other generic
instructions of narrow W type.
Signed-off-by: Luke Quinn <quic_lquinn at quicinc.com>
---
.../RISCV/GISel/RISCVInstructionSelector.cpp | 90 +++++++++++++-
llvm/lib/Target/RISCV/RISCVInstrInfo.td | 8 +-
llvm/test/CodeGen/RISCV/GlobalISel/combine.ll | 2 +-
.../RISCV/GlobalISel/div-by-constant.ll | 6 +-
.../CodeGen/RISCV/GlobalISel/rotl-rotr.ll | 116 +++++++++---------
.../CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll | 16 +--
llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll | 76 ++++++------
.../test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll | 15 +--
llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll | 14 +--
...lar-shift-by-byte-multiple-legalization.ll | 66 +++++-----
10 files changed, 240 insertions(+), 169 deletions(-)
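Note (illustrative, not part of the patch): the new hasAllNBitUsers asks
whether every transitive user of a value demands only its low Bits bits, so
the defining instruction can safely be selected as a narrow W-form (e.g.
ADDW, or PACKW as named in the subject). Below is a minimal self-contained
sketch of that use-walk on toy types, covering a subset of the cases handled
in the diff; ToyInst, the opcode list, and bitWidth are stand-ins for
MachineInstr/MachineRegisterInfo and llvm::bit_width, not the real API:

  #include <cstdint>
  #include <utility>
  #include <vector>

  enum Opcode { ADDW, SUBW, SLL, SRLI, ANDI, AND, OR, XOR, OTHER };

  struct ToyInst {
    Opcode Opc;
    uint64_t Imm = 0;                                        // SRLI/ANDI immediate
    std::vector<std::pair<const ToyInst *, unsigned>> Users; // (user, operand index)
  };

  static unsigned bitWidth(uint64_t V) {                     // like llvm::bit_width
    unsigned W = 0;
    for (; V; V >>= 1)
      ++W;
    return W;
  }

  bool hasAllNBitUsers(const ToyInst &MI, unsigned Bits, unsigned Depth = 0) {
    if (Depth >= 6)                        // same recursion cap as the patch
      return false;
    for (auto [User, OpIdx] : MI.Users) {
      switch (User->Opc) {
      case ADDW:
      case SUBW:                           // W-form users read only the low 32 bits
        if (Bits >= 32)
          break;
        return false;
      case SLL:                            // a shift-amount operand uses log2(XLen) bits
        if (OpIdx == 2 && Bits >= 6)       // 6 == log2(64) for RV64
          break;
        return false;
      case SRLI: {                         // users past the shift see Bits - ShAmt
        unsigned ShAmt = unsigned(User->Imm);
        if (ShAmt < Bits && hasAllNBitUsers(*User, Bits - ShAmt, Depth + 1))
          break;
        return false;
      }
      case ANDI:                           // the mask may already discard the high bits
        if (Bits >= bitWidth(User->Imm))
          break;
        [[fallthrough]];
      case AND:
      case OR:
      case XOR:                            // bitwise ops just forward the demanded bits
        if (hasAllNBitUsers(*User, Bits, Depth + 1))
          break;
        return false;
      default:
        return false;                      // unknown user: assume every bit is needed
      }
    }
    return true;
  }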
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index 5ea9f728ff995eb..e42aa5bf1b66562 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -57,11 +57,20 @@ class RISCVInstructionSelector : public InstructionSelector {
const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) const;
+ static constexpr unsigned MaxRecursionDepth = 6;
+
// const MachineInstr &MI
- bool hasAllNBitUsers(const MachineInstr &MI, unsigned Bits, const unsigned Depth = 0) const;
- bool hasAllBUsers(const MachineInstr &MI) const { return hasAllNBitUsers(MI, 8); }
- bool hasAllHUsers(const MachineInstr &MI) const { return hasAllNBitUsers(MI, 16); }
- bool hasAllWUsers(const MachineInstr &MI) const { return hasAllNBitUsers(MI, 32); }
+ bool hasAllNBitUsers(const MachineInstr &MI, unsigned Bits,
+ const unsigned Depth = 0) const;
+ bool hasAllBUsers(const MachineInstr &MI) const {
+ return hasAllNBitUsers(MI, 8);
+ }
+ bool hasAllHUsers(const MachineInstr &MI) const {
+ return hasAllNBitUsers(MI, 16);
+ }
+ bool hasAllWUsers(const MachineInstr &MI) const {
+ return hasAllNBitUsers(MI, 32);
+ }
bool isRegInGprb(Register Reg) const;
bool isRegInFprb(Register Reg) const;
@@ -192,9 +201,78 @@ RISCVInstructionSelector::RISCVInstructionSelector(
{
}
-bool RISCVInstructionSelector::hasAllNBitUsers(const MachineInstr &MI, unsigned Bits, const unsigned Depth) const {
+// Mimics optimizations in ISel and RISCVOptWInst Pass
+bool RISCVInstructionSelector::hasAllNBitUsers(const MachineInstr &MI,
+ unsigned Bits,
+ const unsigned Depth) const {
+
+ assert((MI.getOpcode() == TargetOpcode::G_ADD ||
+ MI.getOpcode() == TargetOpcode::G_SUB ||
+ MI.getOpcode() == TargetOpcode::G_MUL ||
+ MI.getOpcode() == TargetOpcode::G_SHL ||
+ MI.getOpcode() == TargetOpcode::G_LSHR ||
+ MI.getOpcode() == TargetOpcode::G_AND ||
+ MI.getOpcode() == TargetOpcode::G_OR ||
+ MI.getOpcode() == TargetOpcode::G_XOR ||
+ MI.getOpcode() == TargetOpcode::G_SEXT_INREG || Depth != 0) &&
+ "Unexpected opcode");
+
+ if (Depth >= RISCVInstructionSelector::MaxRecursionDepth)
return false;
-};
+
+ auto DestReg = MI.getOperand(0).getReg();
+ for (auto &UserOp : MRI->use_nodbg_operands(DestReg)) {
+ assert(UserOp.getParent() && "UserOp must have a parent");
+ const MachineInstr &UserMI = *UserOp.getParent();
+ unsigned OpIdx = UserOp.getOperandNo();
+
+ switch (UserMI.getOpcode()) {
+ default:
+ return false;
+ case RISCV::ADDW:
+ case RISCV::ADDIW:
+ case RISCV::SUBW:
+ if (Bits >= 32)
+ break;
+ return false;
+ case RISCV::SLL:
+ case RISCV::SRA:
+ case RISCV::SRL:
+ // Shift amount operands only use log2(Xlen) bits.
+ if (OpIdx == 2 && Bits >= Log2_32(Subtarget->getXLen()))
+ break;
+ return false;
+ case RISCV::SLLI:
+ // SLLI only uses the lower (XLen - ShAmt) bits.
+ if (Bits >= Subtarget->getXLen() - UserMI.getOperand(2).getImm())
+ break;
+ return false;
+ case RISCV::ANDI:
+ if (Bits >= (unsigned)llvm::bit_width<uint64_t>(
+ (uint64_t)UserMI.getOperand(2).getImm()))
+ break;
+ goto RecCheck;
+ case RISCV::AND:
+ case RISCV::OR:
+ case RISCV::XOR:
+ RecCheck:
+ if (hasAllNBitUsers(UserMI, Bits, Depth + 1))
+ break;
+ return false;
+ case RISCV::SRLI: {
+ unsigned ShAmt = UserMI.getOperand(2).getImm();
+ // If we are shifting right by less than Bits, and users don't demand any
+ // bits that were shifted into [Bits-1:0], then we can consider this as an
+ // N-Bit user.
+ if (Bits > ShAmt && hasAllNBitUsers(UserMI, Bits - ShAmt, Depth + 1))
+ break;
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
InstructionSelector::ComplexRendererFns
RISCVInstructionSelector::selectShiftMask(MachineOperand &Root,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index fbfc354daa2f29a..00458a3b54c82a7 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1943,17 +1943,17 @@ def : Pat<(i64 (shl (and GPR:$rs1, 0xffffffff), uimm5:$shamt)),
class binop_allhusers<SDPatternOperator operator>
: PatFrag<(ops node:$lhs, node:$rhs),
- (XLenVT (operator node:$lhs, node:$rhs)), [{
+ (XLenVT(operator node:$lhs, node:$rhs)), [{
return hasAllHUsers(Node);
}]> {
- let GISelPredicateCode = [{ return hasAllHUsers(MI); }];
+ let GISelPredicateCode = [{ return hasAllHUsers(MI); }];
}
// PatFrag to allow ADDW/SUBW/MULW/SLLW to be selected from i64 add/sub/mul/shl
// if only the lower 32 bits of their result is used.
class binop_allwusers<SDPatternOperator operator>
- : PatFrag<(ops node:$lhs, node:$rhs),
- (i64 (operator node:$lhs, node:$rhs)), [{
+ : PatFrag<(ops node:$lhs, node:$rhs), (i64(operator node:$lhs, node:$rhs)),
+ [{
return hasAllWUsers(Node);
}]> {
let GISelPredicateCode = [{ return hasAllWUsers(MI); }];
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll b/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
index 360e84d37ec8584..61d1fa5a5b9f4b7 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
@@ -20,7 +20,7 @@ define i32 @constant_to_rhs(i32 %x) {
; RV64-O0: # %bb.0:
; RV64-O0-NEXT: mv a1, a0
; RV64-O0-NEXT: li a0, 1
-; RV64-O0-NEXT: add a0, a0, a1
+; RV64-O0-NEXT: addw a0, a0, a1
; RV64-O0-NEXT: sext.w a0, a0
; RV64-O0-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
index e3616a79add9f29..f62902cdd14d9e6 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
@@ -66,7 +66,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV64IM-NEXT: srli a2, a2, 32
; RV64IM-NEXT: mul a1, a2, a1
; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: sub a0, a0, a1
+; RV64IM-NEXT: subw a0, a0, a1
; RV64IM-NEXT: srliw a0, a0, 1
; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: srliw a0, a0, 2
@@ -79,7 +79,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV64IMZB-NEXT: zext.w a2, a0
; RV64IMZB-NEXT: mul a1, a2, a1
; RV64IMZB-NEXT: srli a1, a1, 32
-; RV64IMZB-NEXT: sub a0, a0, a1
+; RV64IMZB-NEXT: subw a0, a0, a1
; RV64IMZB-NEXT: srliw a0, a0, 1
; RV64IMZB-NEXT: add a0, a0, a1
; RV64IMZB-NEXT: srliw a0, a0, 2
@@ -265,7 +265,7 @@ define i8 @udiv8_constant_add(i8 %a) nounwind {
; RV64-NEXT: andi a2, a0, 255
; RV64-NEXT: mul a1, a2, a1
; RV64-NEXT: srli a1, a1, 8
-; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: subw a0, a0, a1
; RV64-NEXT: andi a0, a0, 255
; RV64-NEXT: srli a0, a0, 1
; RV64-NEXT: add a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
index 46d1661983c6acb..8a786fc9993d2f5 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
@@ -29,7 +29,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotl_32:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: sllw a1, a0, a1
; RV64I-NEXT: srlw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -55,7 +55,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotl_32:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: sllw a1, a0, a1
; RV64XTHEADBB-NEXT: srlw a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -78,7 +78,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotr_32:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: srlw a1, a0, a1
; RV64I-NEXT: sllw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -104,7 +104,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotr_32:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: srlw a1, a0, a1
; RV64XTHEADBB-NEXT: sllw a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -167,7 +167,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotl_64:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -276,7 +276,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotl_64:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: sll a1, a0, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -340,7 +340,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotr_64:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -451,7 +451,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotr_64:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: srl a1, a0, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -474,7 +474,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotl_32_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: sllw a1, a0, a1
; RV64I-NEXT: srlw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -490,7 +490,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64ZBB-LABEL: rotl_32_mask:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: negw a2, a1
; RV64ZBB-NEXT: sllw a1, a0, a1
; RV64ZBB-NEXT: srlw a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -506,7 +506,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotl_32_mask:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: sllw a1, a0, a1
; RV64XTHEADBB-NEXT: srlw a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -531,7 +531,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64I-LABEL: rotl_32_mask_and_63_and_31:
; RV64I: # %bb.0:
; RV64I-NEXT: sllw a2, a0, a1
-; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: negw a1, a1
; RV64I-NEXT: srlw a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -547,7 +547,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64ZBB-LABEL: rotl_32_mask_and_63_and_31:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: sllw a2, a0, a1
-; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: negw a1, a1
; RV64ZBB-NEXT: srlw a0, a0, a1
; RV64ZBB-NEXT: or a0, a2, a0
; RV64ZBB-NEXT: ret
@@ -563,7 +563,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_32_mask_and_63_and_31:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sllw a2, a0, a1
-; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: negw a1, a1
; RV64XTHEADBB-NEXT: srlw a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -632,7 +632,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotr_32_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: srlw a1, a0, a1
; RV64I-NEXT: sllw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -648,7 +648,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64ZBB-LABEL: rotr_32_mask:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: negw a2, a1
; RV64ZBB-NEXT: srlw a1, a0, a1
; RV64ZBB-NEXT: sllw a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -664,7 +664,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotr_32_mask:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: srlw a1, a0, a1
; RV64XTHEADBB-NEXT: sllw a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -689,7 +689,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64I-LABEL: rotr_32_mask_and_63_and_31:
; RV64I: # %bb.0:
; RV64I-NEXT: srlw a2, a0, a1
-; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: negw a1, a1
; RV64I-NEXT: sllw a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -705,7 +705,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64ZBB-LABEL: rotr_32_mask_and_63_and_31:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: srlw a2, a0, a1
-; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: negw a1, a1
; RV64ZBB-NEXT: sllw a0, a0, a1
; RV64ZBB-NEXT: or a0, a2, a0
; RV64ZBB-NEXT: ret
@@ -721,7 +721,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_32_mask_and_63_and_31:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srlw a2, a0, a1
-; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: negw a1, a1
; RV64XTHEADBB-NEXT: sllw a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -829,7 +829,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotl_64_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -884,7 +884,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64ZBB-LABEL: rotl_64_mask:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: negw a2, a1
; RV64ZBB-NEXT: sll a1, a0, a1
; RV64ZBB-NEXT: srl a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -939,7 +939,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotl_64_mask:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: sll a1, a0, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -1005,7 +1005,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64I-LABEL: rotl_64_mask_and_127_and_63:
; RV64I: # %bb.0:
; RV64I-NEXT: sll a2, a0, a1
-; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: negw a1, a1
; RV64I-NEXT: srl a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -1062,7 +1062,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64ZBB-LABEL: rotl_64_mask_and_127_and_63:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: sll a2, a0, a1
-; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: negw a1, a1
; RV64ZBB-NEXT: srl a0, a0, a1
; RV64ZBB-NEXT: or a0, a2, a0
; RV64ZBB-NEXT: ret
@@ -1119,7 +1119,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_64_mask_and_127_and_63:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sll a2, a0, a1
-; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: negw a1, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -1277,7 +1277,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotr_64_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -1331,7 +1331,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64ZBB-LABEL: rotr_64_mask:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: negw a2, a1
; RV64ZBB-NEXT: srl a1, a0, a1
; RV64ZBB-NEXT: sll a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -1385,7 +1385,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotr_64_mask:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: srl a1, a0, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -1451,7 +1451,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64I-LABEL: rotr_64_mask_and_127_and_63:
; RV64I: # %bb.0:
; RV64I-NEXT: srl a2, a0, a1
-; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: negw a1, a1
; RV64I-NEXT: sll a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -1508,7 +1508,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64ZBB-LABEL: rotr_64_mask_and_127_and_63:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: srl a2, a0, a1
-; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: negw a1, a1
; RV64ZBB-NEXT: sll a0, a0, a1
; RV64ZBB-NEXT: or a0, a2, a0
; RV64ZBB-NEXT: ret
@@ -1565,7 +1565,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_64_mask_and_127_and_63:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srl a2, a0, a1
-; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: negw a1, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -1701,7 +1701,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64I: # %bb.0:
; RV64I-NEXT: andi a3, a2, 31
; RV64I-NEXT: sllw a4, a0, a2
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: srlw a0, a0, a3
; RV64I-NEXT: or a0, a4, a0
; RV64I-NEXT: sllw a1, a1, a2
@@ -1737,7 +1737,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: andi a3, a2, 31
; RV64XTHEADBB-NEXT: sllw a4, a0, a2
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: srlw a0, a0, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
; RV64XTHEADBB-NEXT: sllw a1, a1, a2
@@ -1822,7 +1822,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64I: # %bb.0:
; RV64I-NEXT: andi a3, a2, 63
; RV64I-NEXT: sll a4, a0, a2
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: srl a0, a0, a3
; RV64I-NEXT: or a0, a4, a0
; RV64I-NEXT: sll a1, a1, a2
@@ -1972,7 +1972,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: andi a3, a2, 63
; RV64XTHEADBB-NEXT: sll a4, a0, a2
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: srl a0, a0, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
; RV64XTHEADBB-NEXT: sll a1, a1, a2
@@ -2002,7 +2002,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64I: # %bb.0:
; RV64I-NEXT: andi a3, a2, 31
; RV64I-NEXT: srlw a4, a0, a2
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: sllw a0, a0, a3
; RV64I-NEXT: or a0, a4, a0
; RV64I-NEXT: sllw a1, a1, a2
@@ -2038,7 +2038,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: andi a3, a2, 31
; RV64XTHEADBB-NEXT: srlw a4, a0, a2
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: sllw a0, a0, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
; RV64XTHEADBB-NEXT: sllw a1, a1, a2
@@ -2125,7 +2125,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64I: # %bb.0:
; RV64I-NEXT: andi a3, a2, 63
; RV64I-NEXT: srl a4, a0, a2
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: sll a0, a0, a3
; RV64I-NEXT: or a0, a4, a0
; RV64I-NEXT: sll a1, a1, a2
@@ -2279,7 +2279,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: andi a3, a2, 63
; RV64XTHEADBB-NEXT: srl a4, a0, a2
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: sll a0, a0, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
; RV64XTHEADBB-NEXT: sll a1, a1, a2
@@ -2312,8 +2312,8 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64I-NEXT: andi a3, a2, 31
; RV64I-NEXT: sllw a4, a0, a2
; RV64I-NEXT: sllw a2, a1, a2
-; RV64I-NEXT: neg a5, a3
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a5, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: srlw a0, a0, a5
; RV64I-NEXT: srlw a1, a1, a3
; RV64I-NEXT: or a0, a4, a0
@@ -2353,8 +2353,8 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64XTHEADBB-NEXT: andi a3, a2, 31
; RV64XTHEADBB-NEXT: sllw a4, a0, a2
; RV64XTHEADBB-NEXT: sllw a2, a1, a2
-; RV64XTHEADBB-NEXT: neg a5, a3
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a5, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: srlw a0, a0, a5
; RV64XTHEADBB-NEXT: srlw a1, a1, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
@@ -2464,7 +2464,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64I-NEXT: andi a3, a2, 63
; RV64I-NEXT: sll a4, a0, a2
; RV64I-NEXT: sll a2, a1, a2
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: srl a0, a0, a3
; RV64I-NEXT: srl a1, a1, a3
; RV64I-NEXT: or a0, a4, a0
@@ -2664,7 +2664,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64XTHEADBB-NEXT: andi a3, a2, 63
; RV64XTHEADBB-NEXT: sll a4, a0, a2
; RV64XTHEADBB-NEXT: sll a2, a1, a2
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: srl a0, a0, a3
; RV64XTHEADBB-NEXT: srl a1, a1, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
@@ -2697,8 +2697,8 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64I-NEXT: andi a3, a2, 31
; RV64I-NEXT: srlw a4, a0, a2
; RV64I-NEXT: srlw a2, a1, a2
-; RV64I-NEXT: neg a5, a3
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a5, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: sllw a0, a0, a5
; RV64I-NEXT: sllw a1, a1, a3
; RV64I-NEXT: or a0, a4, a0
@@ -2738,8 +2738,8 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64XTHEADBB-NEXT: andi a3, a2, 31
; RV64XTHEADBB-NEXT: srlw a4, a0, a2
; RV64XTHEADBB-NEXT: srlw a2, a1, a2
-; RV64XTHEADBB-NEXT: neg a5, a3
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a5, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: sllw a0, a0, a5
; RV64XTHEADBB-NEXT: sllw a1, a1, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
@@ -2850,7 +2850,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64I-NEXT: andi a3, a2, 63
; RV64I-NEXT: srl a4, a0, a2
; RV64I-NEXT: srl a2, a1, a2
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: sll a0, a0, a3
; RV64I-NEXT: sll a1, a1, a3
; RV64I-NEXT: or a0, a4, a0
@@ -3052,7 +3052,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64XTHEADBB-NEXT: andi a3, a2, 63
; RV64XTHEADBB-NEXT: srl a4, a0, a2
; RV64XTHEADBB-NEXT: srl a2, a1, a2
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: sll a0, a0, a3
; RV64XTHEADBB-NEXT: sll a1, a1, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
@@ -3116,7 +3116,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
; RV64I-LABEL: rotl_64_zext:
; RV64I: # %bb.0:
; RV64I-NEXT: li a2, 64
-; RV64I-NEXT: sub a2, a2, a1
+; RV64I-NEXT: subw a2, a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -3171,7 +3171,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
; RV64ZBB-LABEL: rotl_64_zext:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: li a2, 64
-; RV64ZBB-NEXT: sub a2, a2, a1
+; RV64ZBB-NEXT: subw a2, a2, a1
; RV64ZBB-NEXT: sll a1, a0, a1
; RV64ZBB-NEXT: srl a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -3226,7 +3226,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_64_zext:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: li a2, 64
-; RV64XTHEADBB-NEXT: sub a2, a2, a1
+; RV64XTHEADBB-NEXT: subw a2, a2, a1
; RV64XTHEADBB-NEXT: sll a1, a0, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -3289,7 +3289,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
; RV64I-LABEL: rotr_64_zext:
; RV64I: # %bb.0:
; RV64I-NEXT: li a2, 64
-; RV64I-NEXT: sub a2, a2, a1
+; RV64I-NEXT: subw a2, a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -3343,7 +3343,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
; RV64ZBB-LABEL: rotr_64_zext:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: li a2, 64
-; RV64ZBB-NEXT: sub a2, a2, a1
+; RV64ZBB-NEXT: subw a2, a2, a1
; RV64ZBB-NEXT: srl a1, a0, a1
; RV64ZBB-NEXT: sll a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -3397,7 +3397,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_64_zext:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: li a2, 64
-; RV64XTHEADBB-NEXT: sub a2, a2, a1
+; RV64XTHEADBB-NEXT: subw a2, a2, a1
; RV64XTHEADBB-NEXT: srl a1, a0, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll
index a29219bfde06bb6..79d08772e885320 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll
@@ -107,7 +107,7 @@ declare i32 @llvm.fshl.i32(i32, i32, i32)
define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: rol_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: sllw a1, a0, a1
; RV64I-NEXT: srlw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -125,7 +125,7 @@ define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind {
define void @rol_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind {
; RV64I-LABEL: rol_i32_nosext:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a3, a1
+; RV64I-NEXT: negw a3, a1
; RV64I-NEXT: sllw a1, a0, a1
; RV64I-NEXT: srlw a0, a0, a3
; RV64I-NEXT: or a0, a1, a0
@@ -146,7 +146,7 @@ define signext i32 @rol_i32_neg_constant_rhs(i32 signext %a) nounwind {
; RV64I-LABEL: rol_i32_neg_constant_rhs:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, -2
-; RV64I-NEXT: neg a2, a0
+; RV64I-NEXT: negw a2, a0
; RV64I-NEXT: sllw a0, a1, a0
; RV64I-NEXT: srlw a1, a1, a2
; RV64I-NEXT: or a0, a0, a1
@@ -166,7 +166,7 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
define i64 @rol_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: rol_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -185,7 +185,7 @@ declare i32 @llvm.fshr.i32(i32, i32, i32)
define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: ror_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: srlw a1, a0, a1
; RV64I-NEXT: sllw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -203,7 +203,7 @@ define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind {
define void @ror_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind {
; RV64I-LABEL: ror_i32_nosext:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a3, a1
+; RV64I-NEXT: negw a3, a1
; RV64I-NEXT: srlw a1, a0, a1
; RV64I-NEXT: sllw a0, a0, a3
; RV64I-NEXT: or a0, a1, a0
@@ -224,7 +224,7 @@ define signext i32 @ror_i32_neg_constant_rhs(i32 signext %a) nounwind {
; RV64I-LABEL: ror_i32_neg_constant_rhs:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, -2
-; RV64I-NEXT: neg a2, a0
+; RV64I-NEXT: negw a2, a0
; RV64I-NEXT: srlw a0, a1, a0
; RV64I-NEXT: sllw a1, a1, a2
; RV64I-NEXT: or a0, a0, a1
@@ -244,7 +244,7 @@ declare i64 @llvm.fshr.i64(i64, i64, i64)
define i64 @ror_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: ror_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
index 9df319e73a11a3d..9a6c718703a27aa 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
@@ -31,13 +31,13 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -88,13 +88,13 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -103,7 +103,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srliw a0, a0, 24
; RV64I-NEXT: li a1, 32
-; RV64I-NEXT: sub a0, a1, a0
+; RV64I-NEXT: subw a0, a1, a0
; RV64I-NEXT: j .LBB1_3
; RV64I-NEXT: .LBB1_2:
; RV64I-NEXT: li a0, 32
@@ -153,13 +153,13 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -168,7 +168,7 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srliw a0, a0, 24
; RV64I-NEXT: li a1, 32
-; RV64I-NEXT: sub a1, a1, a0
+; RV64I-NEXT: subw a1, a1, a0
; RV64I-NEXT: .LBB2_2: # %cond.end
; RV64I-NEXT: subw a0, s0, a1
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
@@ -212,13 +212,13 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -283,13 +283,13 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -412,13 +412,13 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -455,13 +455,13 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -497,13 +497,13 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -553,13 +553,13 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -672,13 +672,13 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -709,13 +709,13 @@ define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -750,13 +750,13 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
index bf430c618afca25..558424b53be951d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
@@ -16,9 +16,7 @@ define signext i32 @pack_i32(i32 signext %a, i32 signext %b) nounwind {
;
; RV64ZBKB-LABEL: pack_i32:
; RV64ZBKB: # %bb.0:
-; RV64ZBKB-NEXT: zext.h a0, a0
-; RV64ZBKB-NEXT: slliw a1, a1, 16
-; RV64ZBKB-NEXT: or a0, a1, a0
+; RV64ZBKB-NEXT: packw a0, a0, a1
; RV64ZBKB-NEXT: ret
%shl = and i32 %a, 65535
%shl1 = shl i32 %b, 16
@@ -37,9 +35,7 @@ define signext i32 @pack_i32_2(i16 zeroext %a, i16 zeroext %b) nounwind {
;
; RV64ZBKB-LABEL: pack_i32_2:
; RV64ZBKB: # %bb.0:
-; RV64ZBKB-NEXT: slli a1, a1, 16
-; RV64ZBKB-NEXT: or a0, a1, a0
-; RV64ZBKB-NEXT: sext.w a0, a0
+; RV64ZBKB-NEXT: packw a0, a0, a1
; RV64ZBKB-NEXT: ret
%zexta = zext i16 %a to i32
%zextb = zext i16 %b to i32
@@ -60,8 +56,7 @@ define signext i32 @pack_i32_3(i16 zeroext %0, i16 zeroext %1, i32 signext %2) {
;
; RV64ZBKB-LABEL: pack_i32_3:
; RV64ZBKB: # %bb.0:
-; RV64ZBKB-NEXT: slli a0, a0, 16
-; RV64ZBKB-NEXT: or a0, a0, a1
+; RV64ZBKB-NEXT: packw a0, a1, a0
; RV64ZBKB-NEXT: addw a0, a0, a2
; RV64ZBKB-NEXT: ret
%4 = zext i16 %0 to i32
@@ -343,9 +338,7 @@ define signext i32 @pack_i32_allWUsers(i16 zeroext %0, i16 zeroext %1, i16 zeroe
; RV64ZBKB: # %bb.0:
; RV64ZBKB-NEXT: add a0, a1, a0
; RV64ZBKB-NEXT: zext.h a0, a0
-; RV64ZBKB-NEXT: slli a0, a0, 16
-; RV64ZBKB-NEXT: or a0, a0, a2
-; RV64ZBKB-NEXT: sext.w a0, a0
+; RV64ZBKB-NEXT: packw a0, a2, a0
; RV64ZBKB-NEXT: ret
%4 = add i16 %1, %0
%5 = zext i16 %4 to i32
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
index 71a5ecc77a1b08b..8b262db56ccd278 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
@@ -330,7 +330,7 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: li a3, 64
; RV64I-NEXT: bltu a2, a3, .LBB6_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sub a4, a2, a3
+; RV64I-NEXT: subw a4, a2, a3
; RV64I-NEXT: srl a4, a1, a4
; RV64I-NEXT: bnez a2, .LBB6_3
; RV64I-NEXT: j .LBB6_4
@@ -476,7 +476,7 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: li a3, 64
; RV64I-NEXT: bltu a2, a3, .LBB7_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sub a4, a2, a3
+; RV64I-NEXT: subw a4, a2, a3
; RV64I-NEXT: sra a4, a1, a4
; RV64I-NEXT: bnez a2, .LBB7_3
; RV64I-NEXT: j .LBB7_4
@@ -615,7 +615,7 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: bltu a2, a4, .LBB8_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a0, 0
-; RV64I-NEXT: sub a4, a2, a4
+; RV64I-NEXT: subw a4, a2, a4
; RV64I-NEXT: sll a3, a3, a4
; RV64I-NEXT: bnez a2, .LBB8_3
; RV64I-NEXT: j .LBB8_4
@@ -685,7 +685,7 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind {
;
; RV64I-LABEL: fshr64_minsize:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -914,7 +914,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: li a4, 64
; RV64I-NEXT: bltu a5, a4, .LBB10_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sub a3, a5, a4
+; RV64I-NEXT: subw a3, a5, a4
; RV64I-NEXT: srl a6, a1, a3
; RV64I-NEXT: j .LBB10_3
; RV64I-NEXT: .LBB10_2:
@@ -928,7 +928,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: # %bb.4:
; RV64I-NEXT: mv a3, a6
; RV64I-NEXT: .LBB10_5:
-; RV64I-NEXT: neg a7, a2
+; RV64I-NEXT: negw a7, a2
; RV64I-NEXT: bltu a5, a4, .LBB10_7
; RV64I-NEXT: # %bb.6:
; RV64I-NEXT: li a2, 0
@@ -940,7 +940,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: bltu a6, a4, .LBB10_10
; RV64I-NEXT: # %bb.9:
; RV64I-NEXT: li a5, 0
-; RV64I-NEXT: sub a4, a6, a4
+; RV64I-NEXT: subw a4, a6, a4
; RV64I-NEXT: sll a0, a0, a4
; RV64I-NEXT: bnez a6, .LBB10_11
; RV64I-NEXT: j .LBB10_12
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
index 4ede69324289895..bc002fee4417c53 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -758,7 +758,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB6_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: subw a5, a1, a4
; RV64I-NEXT: srl a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB6_3
; RV64I-NEXT: j .LBB6_4
@@ -1091,7 +1091,7 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB7_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: subw a5, a1, a4
; RV64I-NEXT: srl a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB7_3
; RV64I-NEXT: j .LBB7_4
@@ -1425,7 +1425,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu a3, a5, .LBB8_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a1, 0
-; RV64I-NEXT: sub a5, a3, a5
+; RV64I-NEXT: subw a5, a3, a5
; RV64I-NEXT: sll a4, a4, a5
; RV64I-NEXT: bnez a3, .LBB8_3
; RV64I-NEXT: j .LBB8_4
@@ -1754,7 +1754,7 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: bltu a3, a5, .LBB9_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a1, 0
-; RV64I-NEXT: sub a5, a3, a5
+; RV64I-NEXT: subw a5, a3, a5
; RV64I-NEXT: sll a4, a4, a5
; RV64I-NEXT: bnez a3, .LBB9_3
; RV64I-NEXT: j .LBB9_4
@@ -2083,7 +2083,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB10_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: subw a5, a1, a4
; RV64I-NEXT: sra a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB10_3
; RV64I-NEXT: j .LBB10_4
@@ -2416,7 +2416,7 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB11_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: subw a5, a1, a4
; RV64I-NEXT: sra a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB11_3
; RV64I-NEXT: j .LBB11_4
@@ -2796,7 +2796,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 3
-; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: subw t1, a5, a7
; RV64I-NEXT: negw t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB12_2
@@ -2851,7 +2851,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu t6, a7, .LBB12_12
; RV64I-NEXT: .LBB12_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: sub t3, t6, a7
+; RV64I-NEXT: subw t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB12_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -2862,7 +2862,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: .LBB12_17:
; RV64I-NEXT: bltu s0, a7, .LBB12_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: sub t6, s0, a7
+; RV64I-NEXT: subw t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB12_20
; RV64I-NEXT: j .LBB12_21
@@ -3720,7 +3720,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 5
-; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: subw t1, a5, a7
; RV64I-NEXT: negw t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB13_2
@@ -3775,7 +3775,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bltu t6, a7, .LBB13_12
; RV64I-NEXT: .LBB13_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: sub t3, t6, a7
+; RV64I-NEXT: subw t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB13_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -3786,7 +3786,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: .LBB13_17:
; RV64I-NEXT: bltu s0, a7, .LBB13_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: sub t6, s0, a7
+; RV64I-NEXT: subw t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB13_20
; RV64I-NEXT: j .LBB13_21
@@ -4644,7 +4644,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 6
-; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: subw t1, a5, a7
; RV64I-NEXT: negw t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB14_2
@@ -4699,7 +4699,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bltu t6, a7, .LBB14_12
; RV64I-NEXT: .LBB14_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: sub t3, t6, a7
+; RV64I-NEXT: subw t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB14_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -4710,7 +4710,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: .LBB14_17:
; RV64I-NEXT: bltu s0, a7, .LBB14_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: sub t6, s0, a7
+; RV64I-NEXT: subw t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB14_20
; RV64I-NEXT: j .LBB14_21
@@ -5542,7 +5542,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 3
-; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: subw t2, a6, t0
; RV64I-NEXT: negw t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB15_2
@@ -5585,7 +5585,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB15_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: sub s0, a4, t0
+; RV64I-NEXT: subw s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB15_8
; RV64I-NEXT: .LBB15_7:
@@ -5637,7 +5637,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu s0, t0, .LBB15_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: sub t0, s0, t0
+; RV64I-NEXT: subw t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB15_21
; RV64I-NEXT: j .LBB15_22
@@ -6456,7 +6456,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 5
-; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: subw t2, a6, t0
; RV64I-NEXT: negw t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB16_2
@@ -6499,7 +6499,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB16_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: sub s0, a4, t0
+; RV64I-NEXT: subw s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB16_8
; RV64I-NEXT: .LBB16_7:
@@ -6551,7 +6551,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: bltu s0, t0, .LBB16_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: sub t0, s0, t0
+; RV64I-NEXT: subw t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB16_21
; RV64I-NEXT: j .LBB16_22
@@ -7370,7 +7370,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 6
-; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: subw t2, a6, t0
; RV64I-NEXT: negw t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB17_2
@@ -7413,7 +7413,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB17_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: sub s0, a4, t0
+; RV64I-NEXT: subw s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB17_8
; RV64I-NEXT: .LBB17_7:
@@ -7465,7 +7465,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: bltu s0, t0, .LBB17_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: sub t0, s0, t0
+; RV64I-NEXT: subw t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB17_21
; RV64I-NEXT: j .LBB17_22
@@ -8310,7 +8310,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 3
-; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: subw t1, a6, t0
; RV64I-NEXT: negw t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB18_2
@@ -8365,7 +8365,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu t6, t0, .LBB18_12
; RV64I-NEXT: .LBB18_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: sub t4, t6, t0
+; RV64I-NEXT: subw t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB18_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -8376,7 +8376,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: .LBB18_17:
; RV64I-NEXT: bltu s0, t0, .LBB18_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: sub t6, s0, t0
+; RV64I-NEXT: subw t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB18_20
; RV64I-NEXT: j .LBB18_21
@@ -9241,7 +9241,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 5
-; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: subw t1, a6, t0
; RV64I-NEXT: negw t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB19_2
@@ -9296,7 +9296,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bltu t6, t0, .LBB19_12
; RV64I-NEXT: .LBB19_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: sub t4, t6, t0
+; RV64I-NEXT: subw t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB19_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -9307,7 +9307,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: .LBB19_17:
; RV64I-NEXT: bltu s0, t0, .LBB19_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: sub t6, s0, t0
+; RV64I-NEXT: subw t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB19_20
; RV64I-NEXT: j .LBB19_21
@@ -10172,7 +10172,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 6
-; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: subw t1, a6, t0
; RV64I-NEXT: negw t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB20_2
@@ -10227,7 +10227,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bltu t6, t0, .LBB20_12
; RV64I-NEXT: .LBB20_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: sub t4, t6, t0
+; RV64I-NEXT: subw t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB20_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -10238,7 +10238,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: .LBB20_17:
; RV64I-NEXT: bltu s0, t0, .LBB20_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: sub t6, s0, t0
+; RV64I-NEXT: subw t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB20_20
; RV64I-NEXT: j .LBB20_21