[llvm] [RISCV] Porting hasAllNBitUsers to RISCV GISel for instruction select (PR #125795)
Luke Quinn via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 4 18:10:19 PST 2025
https://github.com/lquinn2015 created https://github.com/llvm/llvm-project/pull/125795
Ported hasAllNBitUsers to the RISCV GISel side. GISelPredicate code is added for each of the 16-, 32-, and 64-bit word sizes. This allows generation of optimized packw sequences along with other transparent narrowing operations. A few new .ll files are included to expand testing, and the OptW pass is limited to fewer optimizations until GISel is ready for more code-generation paths.
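
For context, the idea behind hasAllNBitUsers is to walk the users of a value and prove that only the low N bits are ever observed, in which case selecting a narrower W-form instruction is safe. Below is a minimal sketch of that check over generic MIR; the helper name and the handled opcodes are illustrative assumptions, not the predicate added by this PR, which covers many more opcodes and recurses through users such as shifts and phis.

#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/MathExtras.h"

using namespace llvm;

// Illustrative sketch only -- not the code in this PR. Returns true if every
// non-debug user of Reg observes at most the low Bits bits of the value, in
// which case selecting a narrower W-form instruction (e.g. packw) is safe.
static bool allUsersNeedOnlyLowBits(Register Reg, unsigned Bits,
                                    const MachineRegisterInfo &MRI) {
  for (const MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
    switch (UseMI.getOpcode()) {
    case TargetOpcode::G_TRUNC:
      // A truncate to Bits or fewer bits only reads the low bits.
      if (MRI.getType(UseMI.getOperand(0).getReg()).getScalarSizeInBits() >
          Bits)
        return false;
      break;
    case TargetOpcode::G_AND: {
      // An AND whose mask fits in the low Bits bits also qualifies.
      auto Mask = getIConstantVRegValWithLookThrough(
          UseMI.getOperand(2).getReg(), MRI);
      if (!Mask || !isUIntN(Bits, Mask->Value.getZExtValue()))
        return false;
      break;
    }
    default:
      // Conservatively assume any other user needs the full width.
      return false;
    }
  }
  return true;
}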
>From 99b2489b88e48d3f63015a29ba669641f8f06e64 Mon Sep 17 00:00:00 2001
From: Luke Quinn <quic_lquinn at quicinc.com>
Date: Mon, 3 Feb 2025 05:54:51 -0800
Subject: [PATCH 1/3] [RISCV] PreTest: RISCVOptWInstrs pass for GISel; add
tests required to show pre-checks
Signed-off-by: Luke Quinn <quic_lquinn at quicinc.com>
---
.../RISCV/GlobalISel/div-by-constant.ll | 811 ++
.../CodeGen/RISCV/GlobalISel/rotl-rotr.ll | 3412 +++++
llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll | 962 ++
...lar-shift-by-byte-multiple-legalization.ll | 10982 ++++++++++++++++
4 files changed, 16167 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
new file mode 100644
index 00000000000000..e3616a79add9f2
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
@@ -0,0 +1,811 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck -check-prefixes=RV32,RV32IM %s
+; RUN: llc -mtriple=riscv32 -mattr=+m,+zba,+zbb \
+; RUN: -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck -check-prefixes=RV32,RV32IMZB %s
+; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck -check-prefixes=RV64,RV64IM %s
+; RUN: llc -mtriple=riscv64 -mattr=+m,+zba,+zbb \
+; RUN: -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck -check-prefixes=RV64,RV64IMZB %s
+
+; Test that there is a single shift after the mul and no addition.
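+; (For reference: this is the magic-number transform. For unsigned 32-bit x,
+; x/5 == (zext(x) * 0xCCCCCCCD) >> 34 with 0xCCCCCCCD == ceil(2^34 / 5); the
+; lui/addi pairs below materialize that constant.)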
+define i32 @udiv_constant_no_add(i32 %a) nounwind {
+; RV32-LABEL: udiv_constant_no_add:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a1, 838861
+; RV32-NEXT: addi a1, a1, -819
+; RV32-NEXT: mulhu a0, a0, a1
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: ret
+;
+; RV64IM-LABEL: udiv_constant_no_add:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a0, a0, 32
+; RV64IM-NEXT: lui a1, 205
+; RV64IM-NEXT: srli a0, a0, 32
+; RV64IM-NEXT: addiw a1, a1, -819
+; RV64IM-NEXT: slli a1, a1, 12
+; RV64IM-NEXT: addi a1, a1, -819
+; RV64IM-NEXT: mul a0, a0, a1
+; RV64IM-NEXT: srli a0, a0, 34
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: udiv_constant_no_add:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: zext.w a0, a0
+; RV64IMZB-NEXT: lui a1, 838861
+; RV64IMZB-NEXT: addi a1, a1, -819
+; RV64IMZB-NEXT: zext.w a1, a1
+; RV64IMZB-NEXT: mul a0, a0, a1
+; RV64IMZB-NEXT: srli a0, a0, 34
+; RV64IMZB-NEXT: ret
+ %1 = udiv i32 %a, 5
+ ret i32 %1
+}
+
+; This constant requires a sub, srli, add sequence after the mul.
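+; (For reference: the magic constant here, 0x24924925, is not an exact
+; reciprocal on its own; with t = mulhu(x, 0x24924925) the quotient is
+; computed as (((x - t) >> 1) + t) >> 2, matching the checks below.)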
+define i32 @udiv_constant_add(i32 %a) nounwind {
+; RV32-LABEL: udiv_constant_add:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a1, 149797
+; RV32-NEXT: addi a1, a1, -1755
+; RV32-NEXT: mulhu a1, a0, a1
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: srli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: ret
+;
+; RV64IM-LABEL: udiv_constant_add:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: lui a1, 149797
+; RV64IM-NEXT: slli a2, a0, 32
+; RV64IM-NEXT: addiw a1, a1, -1755
+; RV64IM-NEXT: srli a2, a2, 32
+; RV64IM-NEXT: mul a1, a2, a1
+; RV64IM-NEXT: srli a1, a1, 32
+; RV64IM-NEXT: sub a0, a0, a1
+; RV64IM-NEXT: srliw a0, a0, 1
+; RV64IM-NEXT: add a0, a0, a1
+; RV64IM-NEXT: srliw a0, a0, 2
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: udiv_constant_add:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: lui a1, 149797
+; RV64IMZB-NEXT: addiw a1, a1, -1755
+; RV64IMZB-NEXT: zext.w a2, a0
+; RV64IMZB-NEXT: mul a1, a2, a1
+; RV64IMZB-NEXT: srli a1, a1, 32
+; RV64IMZB-NEXT: sub a0, a0, a1
+; RV64IMZB-NEXT: srliw a0, a0, 1
+; RV64IMZB-NEXT: add a0, a0, a1
+; RV64IMZB-NEXT: srliw a0, a0, 2
+; RV64IMZB-NEXT: ret
+ %1 = udiv i32 %a, 7
+ ret i32 %1
+}
+
+define i64 @udiv64_constant_no_add(i64 %a) nounwind {
+; RV32-LABEL: udiv64_constant_no_add:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a2, 838861
+; RV32-NEXT: mulhu a3, a0, zero
+; RV32-NEXT: addi a4, a2, -819
+; RV32-NEXT: addi a2, a2, -820
+; RV32-NEXT: mul a5, a1, a4
+; RV32-NEXT: mul a6, a0, a2
+; RV32-NEXT: mulhu a7, a0, a4
+; RV32-NEXT: mul t0, zero, a4
+; RV32-NEXT: mul t1, a1, a2
+; RV32-NEXT: mulhu t2, a1, a4
+; RV32-NEXT: mulhu a0, a0, a2
+; RV32-NEXT: mulhu a1, a1, a2
+; RV32-NEXT: mul a2, zero, a2
+; RV32-NEXT: mulhu a4, zero, a4
+; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: add a2, t0, a2
+; RV32-NEXT: add t0, t0, t1
+; RV32-NEXT: add a1, a4, a1
+; RV32-NEXT: sltu a4, a5, a6
+; RV32-NEXT: add a5, a5, a7
+; RV32-NEXT: sltu a6, t0, t1
+; RV32-NEXT: sltiu t1, t0, 0
+; RV32-NEXT: add t0, t0, t2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: sltu a2, a5, a7
+; RV32-NEXT: add a6, a6, t1
+; RV32-NEXT: sltu a5, t0, t2
+; RV32-NEXT: add t0, t0, a0
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: add a2, a4, a2
+; RV32-NEXT: add a5, a6, a5
+; RV32-NEXT: sltu a0, t0, a0
+; RV32-NEXT: add a0, a5, a0
+; RV32-NEXT: add t0, t0, a2
+; RV32-NEXT: sltu a2, t0, a2
+; RV32-NEXT: srli a3, t0, 2
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a1, 30
+; RV32-NEXT: or a0, a3, a0
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv64_constant_no_add:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 1035469
+; RV64-NEXT: addi a1, a1, -819
+; RV64-NEXT: slli a1, a1, 12
+; RV64-NEXT: addi a1, a1, -819
+; RV64-NEXT: slli a1, a1, 12
+; RV64-NEXT: addi a1, a1, -819
+; RV64-NEXT: slli a1, a1, 12
+; RV64-NEXT: addi a1, a1, -819
+; RV64-NEXT: mulhu a0, a0, a1
+; RV64-NEXT: srli a0, a0, 2
+; RV64-NEXT: ret
+ %1 = udiv i64 %a, 5
+ ret i64 %1
+}
+
+define i64 @udiv64_constant_add(i64 %a) nounwind {
+; RV32-LABEL: udiv64_constant_add:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a2, 599186
+; RV32-NEXT: lui a3, 149797
+; RV32-NEXT: mulhu a4, a0, zero
+; RV32-NEXT: addi a2, a2, 1171
+; RV32-NEXT: addi a3, a3, -1756
+; RV32-NEXT: mul a5, a1, a2
+; RV32-NEXT: mul a6, a0, a3
+; RV32-NEXT: mulhu a7, a0, a2
+; RV32-NEXT: mul t0, zero, a2
+; RV32-NEXT: mulhu t1, zero, a2
+; RV32-NEXT: mulhu t2, a1, a3
+; RV32-NEXT: add t1, t1, t2
+; RV32-NEXT: mul t2, zero, a3
+; RV32-NEXT: add t2, t0, t2
+; RV32-NEXT: add t1, t2, t1
+; RV32-NEXT: mul t2, a1, a3
+; RV32-NEXT: mulhu a2, a1, a2
+; RV32-NEXT: mulhu a3, a0, a3
+; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: add t0, t0, t2
+; RV32-NEXT: sltu a6, a5, a6
+; RV32-NEXT: add a5, a5, a7
+; RV32-NEXT: sltu t2, t0, t2
+; RV32-NEXT: sltu a5, a5, a7
+; RV32-NEXT: sltiu a7, t0, 0
+; RV32-NEXT: add t0, t0, a2
+; RV32-NEXT: add a7, t2, a7
+; RV32-NEXT: sltu a2, t0, a2
+; RV32-NEXT: add t0, t0, a3
+; RV32-NEXT: add a4, t1, a4
+; RV32-NEXT: add a5, a6, a5
+; RV32-NEXT: add a2, a7, a2
+; RV32-NEXT: sltu a3, t0, a3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add t0, t0, a5
+; RV32-NEXT: sltu a3, t0, a5
+; RV32-NEXT: sub a5, a0, t0
+; RV32-NEXT: sltu a0, a0, t0
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: sub a1, a1, a0
+; RV32-NEXT: srli a5, a5, 1
+; RV32-NEXT: add a2, a4, a2
+; RV32-NEXT: sub a1, a1, a2
+; RV32-NEXT: slli a0, a1, 31
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: or a0, a5, a0
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a0, a0, t0
+; RV32-NEXT: sltu a2, a0, t0
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: slli a2, a1, 30
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv64_constant_add:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a1, %hi(.LCPI3_0)
+; RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1)
+; RV64-NEXT: mulhu a1, a0, a1
+; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: srli a0, a0, 1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: srli a0, a0, 2
+; RV64-NEXT: ret
+ %1 = udiv i64 %a, 7
+ ret i64 %1
+}
+
+define i8 @udiv8_constant_no_add(i8 %a) nounwind {
+; RV32-LABEL: udiv8_constant_no_add:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: li a1, 205
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: srli a0, a0, 10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv8_constant_no_add:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: li a1, 205
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: srli a0, a0, 10
+; RV64-NEXT: ret
+ %1 = udiv i8 %a, 5
+ ret i8 %1
+}
+
+define i8 @udiv8_constant_add(i8 %a) nounwind {
+; RV32-LABEL: udiv8_constant_add:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 37
+; RV32-NEXT: andi a2, a0, 255
+; RV32-NEXT: mul a1, a2, a1
+; RV32-NEXT: srli a1, a1, 8
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: srli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv8_constant_add:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 37
+; RV64-NEXT: andi a2, a0, 255
+; RV64-NEXT: mul a1, a2, a1
+; RV64-NEXT: srli a1, a1, 8
+; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: srli a0, a0, 1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: andi a0, a0, 255
+; RV64-NEXT: srli a0, a0, 2
+; RV64-NEXT: ret
+ %1 = udiv i8 %a, 7
+ ret i8 %1
+}
+
+define i16 @udiv16_constant_no_add(i16 %a) nounwind {
+; RV32IM-LABEL: udiv16_constant_no_add:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: lui a1, 13
+; RV32IM-NEXT: srli a0, a0, 16
+; RV32IM-NEXT: addi a1, a1, -819
+; RV32IM-NEXT: mul a0, a0, a1
+; RV32IM-NEXT: srli a0, a0, 18
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: udiv16_constant_no_add:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: zext.h a0, a0
+; RV32IMZB-NEXT: lui a1, 13
+; RV32IMZB-NEXT: addi a1, a1, -819
+; RV32IMZB-NEXT: mul a0, a0, a1
+; RV32IMZB-NEXT: srli a0, a0, 18
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: udiv16_constant_no_add:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: lui a1, 13
+; RV64IM-NEXT: srli a0, a0, 48
+; RV64IM-NEXT: addiw a1, a1, -819
+; RV64IM-NEXT: mul a0, a0, a1
+; RV64IM-NEXT: srli a0, a0, 18
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: udiv16_constant_no_add:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: zext.h a0, a0
+; RV64IMZB-NEXT: lui a1, 13
+; RV64IMZB-NEXT: addiw a1, a1, -819
+; RV64IMZB-NEXT: mul a0, a0, a1
+; RV64IMZB-NEXT: srli a0, a0, 18
+; RV64IMZB-NEXT: ret
+ %1 = udiv i16 %a, 5
+ ret i16 %1
+}
+
+define i16 @udiv16_constant_add(i16 %a) nounwind {
+; RV32IM-LABEL: udiv16_constant_add:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: lui a1, 2
+; RV32IM-NEXT: lui a2, 16
+; RV32IM-NEXT: addi a1, a1, 1171
+; RV32IM-NEXT: addi a2, a2, -1
+; RV32IM-NEXT: and a3, a0, a2
+; RV32IM-NEXT: mul a1, a3, a1
+; RV32IM-NEXT: srli a1, a1, 16
+; RV32IM-NEXT: sub a0, a0, a1
+; RV32IM-NEXT: and a0, a0, a2
+; RV32IM-NEXT: srli a0, a0, 1
+; RV32IM-NEXT: add a0, a0, a1
+; RV32IM-NEXT: and a0, a0, a2
+; RV32IM-NEXT: srli a0, a0, 2
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: udiv16_constant_add:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: lui a1, 2
+; RV32IMZB-NEXT: addi a1, a1, 1171
+; RV32IMZB-NEXT: zext.h a2, a0
+; RV32IMZB-NEXT: mul a1, a2, a1
+; RV32IMZB-NEXT: srli a1, a1, 16
+; RV32IMZB-NEXT: sub a0, a0, a1
+; RV32IMZB-NEXT: zext.h a0, a0
+; RV32IMZB-NEXT: srli a0, a0, 1
+; RV32IMZB-NEXT: add a0, a0, a1
+; RV32IMZB-NEXT: zext.h a0, a0
+; RV32IMZB-NEXT: srli a0, a0, 2
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: udiv16_constant_add:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: lui a1, 2
+; RV64IM-NEXT: lui a2, 16
+; RV64IM-NEXT: addiw a1, a1, 1171
+; RV64IM-NEXT: addiw a2, a2, -1
+; RV64IM-NEXT: and a3, a0, a2
+; RV64IM-NEXT: mul a1, a3, a1
+; RV64IM-NEXT: srli a1, a1, 16
+; RV64IM-NEXT: sub a0, a0, a1
+; RV64IM-NEXT: and a0, a0, a2
+; RV64IM-NEXT: srli a0, a0, 1
+; RV64IM-NEXT: add a0, a0, a1
+; RV64IM-NEXT: and a0, a0, a2
+; RV64IM-NEXT: srli a0, a0, 2
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: udiv16_constant_add:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: lui a1, 2
+; RV64IMZB-NEXT: addi a1, a1, 1171
+; RV64IMZB-NEXT: zext.h a2, a0
+; RV64IMZB-NEXT: mul a1, a2, a1
+; RV64IMZB-NEXT: srli a1, a1, 16
+; RV64IMZB-NEXT: sub a0, a0, a1
+; RV64IMZB-NEXT: zext.h a0, a0
+; RV64IMZB-NEXT: srli a0, a0, 1
+; RV64IMZB-NEXT: add a0, a0, a1
+; RV64IMZB-NEXT: zext.h a0, a0
+; RV64IMZB-NEXT: srli a0, a0, 2
+; RV64IMZB-NEXT: ret
+ %1 = udiv i16 %a, 7
+ ret i16 %1
+}
+
+; Test the simplest case: an srli and an add after the mul. No srai.
+define i32 @sdiv_constant_no_srai(i32 %a) nounwind {
+; RV32-LABEL: sdiv_constant_no_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 3
+; RV32-NEXT: div a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv_constant_no_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 3
+; RV64-NEXT: divw a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i32 %a, 3
+ ret i32 %1
+}
+
+; This constant requires an srai between the mul and the add.
+define i32 @sdiv_constant_srai(i32 %a) nounwind {
+; RV32-LABEL: sdiv_constant_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 5
+; RV32-NEXT: div a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv_constant_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 5
+; RV64-NEXT: divw a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i32 %a, 5
+ ret i32 %1
+}
+
+; This constant requires an add and an srai after the mul.
+define i32 @sdiv_constant_add_srai(i32 %a) nounwind {
+; RV32-LABEL: sdiv_constant_add_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 7
+; RV32-NEXT: div a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv_constant_add_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 7
+; RV64-NEXT: divw a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i32 %a, 7
+ ret i32 %1
+}
+
+; This constant requires a sub and an srai after the mul.
+define i32 @sdiv_constant_sub_srai(i32 %a) nounwind {
+; RV32-LABEL: sdiv_constant_sub_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, -7
+; RV32-NEXT: div a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv_constant_sub_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, -7
+; RV64-NEXT: divw a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i32 %a, -7
+ ret i32 %1
+}
+
+define i64 @sdiv64_constant_no_srai(i64 %a) nounwind {
+; RV32-LABEL: sdiv64_constant_no_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: li a2, 3
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __divdi3
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv64_constant_no_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 3
+; RV64-NEXT: div a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i64 %a, 3
+ ret i64 %1
+}
+
+define i64 @sdiv64_constant_srai(i64 %a) nounwind {
+; RV32-LABEL: sdiv64_constant_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: li a2, 5
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __divdi3
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv64_constant_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 5
+; RV64-NEXT: div a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i64 %a, 5
+ ret i64 %1
+}
+
+define i64 @sdiv64_constant_add_srai(i64 %a) nounwind {
+; RV32-LABEL: sdiv64_constant_add_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: li a2, 15
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __divdi3
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv64_constant_add_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 15
+; RV64-NEXT: div a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i64 %a, 15
+ ret i64 %1
+}
+
+define i64 @sdiv64_constant_sub_srai(i64 %a) nounwind {
+; RV32-LABEL: sdiv64_constant_sub_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: li a2, -3
+; RV32-NEXT: li a3, -1
+; RV32-NEXT: call __divdi3
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv64_constant_sub_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, -3
+; RV64-NEXT: div a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i64 %a, -3
+ ret i64 %1
+}
+
+define i8 @sdiv8_constant_no_srai(i8 %a) nounwind {
+; RV32IM-LABEL: sdiv8_constant_no_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, 3
+; RV32IM-NEXT: slli a0, a0, 24
+; RV32IM-NEXT: srai a0, a0, 24
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv8_constant_no_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, 3
+; RV32IMZB-NEXT: sext.b a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv8_constant_no_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, 3
+; RV64IM-NEXT: slli a0, a0, 56
+; RV64IM-NEXT: srai a0, a0, 56
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv8_constant_no_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, 3
+; RV64IMZB-NEXT: sext.b a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i8 %a, 3
+ ret i8 %1
+}
+
+define i8 @sdiv8_constant_srai(i8 %a) nounwind {
+; RV32IM-LABEL: sdiv8_constant_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, 5
+; RV32IM-NEXT: slli a0, a0, 24
+; RV32IM-NEXT: srai a0, a0, 24
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv8_constant_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, 5
+; RV32IMZB-NEXT: sext.b a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv8_constant_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, 5
+; RV64IM-NEXT: slli a0, a0, 56
+; RV64IM-NEXT: srai a0, a0, 56
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv8_constant_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, 5
+; RV64IMZB-NEXT: sext.b a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i8 %a, 5
+ ret i8 %1
+}
+
+define i8 @sdiv8_constant_add_srai(i8 %a) nounwind {
+; RV32IM-LABEL: sdiv8_constant_add_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, 7
+; RV32IM-NEXT: slli a0, a0, 24
+; RV32IM-NEXT: srai a0, a0, 24
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv8_constant_add_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, 7
+; RV32IMZB-NEXT: sext.b a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv8_constant_add_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, 7
+; RV64IM-NEXT: slli a0, a0, 56
+; RV64IM-NEXT: srai a0, a0, 56
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv8_constant_add_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, 7
+; RV64IMZB-NEXT: sext.b a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i8 %a, 7
+ ret i8 %1
+}
+
+define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind {
+; RV32IM-LABEL: sdiv8_constant_sub_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, -7
+; RV32IM-NEXT: slli a0, a0, 24
+; RV32IM-NEXT: srai a0, a0, 24
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv8_constant_sub_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, -7
+; RV32IMZB-NEXT: sext.b a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv8_constant_sub_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, -7
+; RV64IM-NEXT: slli a0, a0, 56
+; RV64IM-NEXT: srai a0, a0, 56
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv8_constant_sub_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, -7
+; RV64IMZB-NEXT: sext.b a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i8 %a, -7
+ ret i8 %1
+}
+
+define i16 @sdiv16_constant_no_srai(i16 %a) nounwind {
+; RV32IM-LABEL: sdiv16_constant_no_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, 3
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: srai a0, a0, 16
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv16_constant_no_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, 3
+; RV32IMZB-NEXT: sext.h a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv16_constant_no_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, 3
+; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: srai a0, a0, 48
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv16_constant_no_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, 3
+; RV64IMZB-NEXT: sext.h a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i16 %a, 3
+ ret i16 %1
+}
+
+define i16 @sdiv16_constant_srai(i16 %a) nounwind {
+; RV32IM-LABEL: sdiv16_constant_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, 5
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: srai a0, a0, 16
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv16_constant_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, 5
+; RV32IMZB-NEXT: sext.h a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv16_constant_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, 5
+; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: srai a0, a0, 48
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv16_constant_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, 5
+; RV64IMZB-NEXT: sext.h a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i16 %a, 5
+ ret i16 %1
+}
+
+define i16 @sdiv16_constant_add_srai(i16 %a) nounwind {
+; RV32IM-LABEL: sdiv16_constant_add_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, 15
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: srai a0, a0, 16
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv16_constant_add_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, 15
+; RV32IMZB-NEXT: sext.h a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv16_constant_add_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, 15
+; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: srai a0, a0, 48
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv16_constant_add_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, 15
+; RV64IMZB-NEXT: sext.h a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i16 %a, 15
+ ret i16 %1
+}
+
+define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind {
+; RV32IM-LABEL: sdiv16_constant_sub_srai:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a1, -15
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: srai a0, a0, 16
+; RV32IM-NEXT: div a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: sdiv16_constant_sub_srai:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: li a1, -15
+; RV32IMZB-NEXT: sext.h a0, a0
+; RV32IMZB-NEXT: div a0, a0, a1
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: sdiv16_constant_sub_srai:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, -15
+; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: srai a0, a0, 48
+; RV64IM-NEXT: divw a0, a0, a1
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: sdiv16_constant_sub_srai:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: li a1, -15
+; RV64IMZB-NEXT: sext.h a0, a0
+; RV64IMZB-NEXT: divw a0, a0, a1
+; RV64IMZB-NEXT: ret
+ %1 = sdiv i16 %a, -15
+ ret i16 %1
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
new file mode 100644
index 00000000000000..46d1661983c6ac
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
@@ -0,0 +1,3412 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV32I
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs -enable-legalize-types-checking -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV64I
+; RUN: llc -mtriple=riscv32 -mattr=+zbb -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV32ZBB
+; RUN: llc -mtriple=riscv64 -mattr=+zbb -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV64ZBB
+; RUN: llc -mtriple=riscv32 -mattr=+xtheadbb -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV32XTHEADBB
+; RUN: llc -mtriple=riscv64 -mattr=+xtheadbb -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV64XTHEADBB
+
+; NOTE: -enable-legalize-types-checking is on one command line due to a previous
+; assertion failure on an expensive checks build for @rotr_32_mask_multiple.
+
+; These IR sequences are idioms for rotates. If rotate instructions are
+; supported, they will be turned into ISD::ROTL or ISD::ROTR.
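+; For a W-bit value, rotl(x, y) == (x << y) | (x >> (W - y)) and
+; rotr(x, y) == (x >> y) | (x << (W - y)); the functions below spell out
+; those idioms directly in IR.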
+
+define i32 @rotl_32(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotl_32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: neg a2, a1
+; RV32I-NEXT: sll a1, a0, a1
+; RV32I-NEXT: srl a0, a0, a2
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: sllw a1, a0, a1
+; RV64I-NEXT: srlw a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_32:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: rol a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_32:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rolw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_32:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: neg a2, a1
+; RV32XTHEADBB-NEXT: sll a1, a0, a1
+; RV32XTHEADBB-NEXT: srl a0, a0, a2
+; RV32XTHEADBB-NEXT: or a0, a1, a0
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_32:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: sllw a1, a0, a1
+; RV64XTHEADBB-NEXT: srlw a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i32 32, %y
+ %b = shl i32 %x, %y
+ %c = lshr i32 %x, %z
+ %d = or i32 %b, %c
+ ret i32 %d
+}
+
+define i32 @rotr_32(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotr_32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: neg a2, a1
+; RV32I-NEXT: srl a1, a0, a1
+; RV32I-NEXT: sll a0, a0, a2
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: srlw a1, a0, a1
+; RV64I-NEXT: sllw a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_32:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: ror a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_32:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rorw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_32:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: neg a2, a1
+; RV32XTHEADBB-NEXT: srl a1, a0, a1
+; RV32XTHEADBB-NEXT: sll a0, a0, a2
+; RV32XTHEADBB-NEXT: or a0, a1, a0
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_32:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: srlw a1, a0, a1
+; RV64XTHEADBB-NEXT: sllw a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i32 32, %y
+ %b = lshr i32 %x, %y
+ %c = shl i32 %x, %z
+ %d = or i32 %b, %c
+ ret i32 %d
+}
+
+define i64 @rotl_64(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotl_64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a6, a2, 63
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: bltu a6, a4, .LBB2_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: sll a7, a0, a6
+; RV32I-NEXT: j .LBB2_3
+; RV32I-NEXT: .LBB2_2:
+; RV32I-NEXT: sll a3, a0, a2
+; RV32I-NEXT: neg a5, a6
+; RV32I-NEXT: srl a5, a0, a5
+; RV32I-NEXT: sll a7, a1, a2
+; RV32I-NEXT: or a7, a5, a7
+; RV32I-NEXT: .LBB2_3:
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: mv a2, a1
+; RV32I-NEXT: beqz a6, .LBB2_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a2, a7
+; RV32I-NEXT: .LBB2_5:
+; RV32I-NEXT: andi a6, a5, 63
+; RV32I-NEXT: bltu a6, a4, .LBB2_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl a7, a1, a6
+; RV32I-NEXT: bnez a6, .LBB2_8
+; RV32I-NEXT: j .LBB2_9
+; RV32I-NEXT: .LBB2_7:
+; RV32I-NEXT: srl a7, a0, a5
+; RV32I-NEXT: neg t0, a6
+; RV32I-NEXT: sll t0, a1, t0
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: beqz a6, .LBB2_9
+; RV32I-NEXT: .LBB2_8:
+; RV32I-NEXT: mv a0, a7
+; RV32I-NEXT: .LBB2_9:
+; RV32I-NEXT: bltu a6, a4, .LBB2_11
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB2_12
+; RV32I-NEXT: .LBB2_11:
+; RV32I-NEXT: srl a1, a1, a5
+; RV32I-NEXT: .LBB2_12:
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: sll a1, a0, a1
+; RV64I-NEXT: srl a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_64:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi a6, a2, 63
+; RV32ZBB-NEXT: li a4, 32
+; RV32ZBB-NEXT: bltu a6, a4, .LBB2_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: li a3, 0
+; RV32ZBB-NEXT: sll a7, a0, a6
+; RV32ZBB-NEXT: j .LBB2_3
+; RV32ZBB-NEXT: .LBB2_2:
+; RV32ZBB-NEXT: sll a3, a0, a2
+; RV32ZBB-NEXT: neg a5, a6
+; RV32ZBB-NEXT: srl a5, a0, a5
+; RV32ZBB-NEXT: sll a7, a1, a2
+; RV32ZBB-NEXT: or a7, a5, a7
+; RV32ZBB-NEXT: .LBB2_3:
+; RV32ZBB-NEXT: neg a5, a2
+; RV32ZBB-NEXT: mv a2, a1
+; RV32ZBB-NEXT: beqz a6, .LBB2_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: mv a2, a7
+; RV32ZBB-NEXT: .LBB2_5:
+; RV32ZBB-NEXT: andi a6, a5, 63
+; RV32ZBB-NEXT: bltu a6, a4, .LBB2_7
+; RV32ZBB-NEXT: # %bb.6:
+; RV32ZBB-NEXT: srl a7, a1, a6
+; RV32ZBB-NEXT: bnez a6, .LBB2_8
+; RV32ZBB-NEXT: j .LBB2_9
+; RV32ZBB-NEXT: .LBB2_7:
+; RV32ZBB-NEXT: srl a7, a0, a5
+; RV32ZBB-NEXT: neg t0, a6
+; RV32ZBB-NEXT: sll t0, a1, t0
+; RV32ZBB-NEXT: or a7, a7, t0
+; RV32ZBB-NEXT: beqz a6, .LBB2_9
+; RV32ZBB-NEXT: .LBB2_8:
+; RV32ZBB-NEXT: mv a0, a7
+; RV32ZBB-NEXT: .LBB2_9:
+; RV32ZBB-NEXT: bltu a6, a4, .LBB2_11
+; RV32ZBB-NEXT: # %bb.10:
+; RV32ZBB-NEXT: li a1, 0
+; RV32ZBB-NEXT: j .LBB2_12
+; RV32ZBB-NEXT: .LBB2_11:
+; RV32ZBB-NEXT: srl a1, a1, a5
+; RV32ZBB-NEXT: .LBB2_12:
+; RV32ZBB-NEXT: or a0, a3, a0
+; RV32ZBB-NEXT: or a1, a2, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_64:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rol a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_64:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a6, a2, 63
+; RV32XTHEADBB-NEXT: li a4, 32
+; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB2_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: li a3, 0
+; RV32XTHEADBB-NEXT: sll a7, a0, a6
+; RV32XTHEADBB-NEXT: j .LBB2_3
+; RV32XTHEADBB-NEXT: .LBB2_2:
+; RV32XTHEADBB-NEXT: sll a3, a0, a2
+; RV32XTHEADBB-NEXT: neg a5, a6
+; RV32XTHEADBB-NEXT: srl a5, a0, a5
+; RV32XTHEADBB-NEXT: sll a7, a1, a2
+; RV32XTHEADBB-NEXT: or a7, a5, a7
+; RV32XTHEADBB-NEXT: .LBB2_3:
+; RV32XTHEADBB-NEXT: neg a5, a2
+; RV32XTHEADBB-NEXT: mv a2, a1
+; RV32XTHEADBB-NEXT: beqz a6, .LBB2_5
+; RV32XTHEADBB-NEXT: # %bb.4:
+; RV32XTHEADBB-NEXT: mv a2, a7
+; RV32XTHEADBB-NEXT: .LBB2_5:
+; RV32XTHEADBB-NEXT: andi a6, a5, 63
+; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB2_7
+; RV32XTHEADBB-NEXT: # %bb.6:
+; RV32XTHEADBB-NEXT: srl a7, a1, a6
+; RV32XTHEADBB-NEXT: bnez a6, .LBB2_8
+; RV32XTHEADBB-NEXT: j .LBB2_9
+; RV32XTHEADBB-NEXT: .LBB2_7:
+; RV32XTHEADBB-NEXT: srl a7, a0, a5
+; RV32XTHEADBB-NEXT: neg t0, a6
+; RV32XTHEADBB-NEXT: sll t0, a1, t0
+; RV32XTHEADBB-NEXT: or a7, a7, t0
+; RV32XTHEADBB-NEXT: beqz a6, .LBB2_9
+; RV32XTHEADBB-NEXT: .LBB2_8:
+; RV32XTHEADBB-NEXT: mv a0, a7
+; RV32XTHEADBB-NEXT: .LBB2_9:
+; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB2_11
+; RV32XTHEADBB-NEXT: # %bb.10:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: j .LBB2_12
+; RV32XTHEADBB-NEXT: .LBB2_11:
+; RV32XTHEADBB-NEXT: srl a1, a1, a5
+; RV32XTHEADBB-NEXT: .LBB2_12:
+; RV32XTHEADBB-NEXT: or a0, a3, a0
+; RV32XTHEADBB-NEXT: or a1, a2, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_64:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: sll a1, a0, a1
+; RV64XTHEADBB-NEXT: srl a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i64 64, %y
+ %b = shl i64 %x, %y
+ %c = lshr i64 %x, %z
+ %d = or i64 %b, %c
+ ret i64 %d
+}
+
+define i64 @rotr_64(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotr_64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a5, a2, 63
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: bltu a5, a4, .LBB3_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a6, a1, a5
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: bnez a5, .LBB3_3
+; RV32I-NEXT: j .LBB3_4
+; RV32I-NEXT: .LBB3_2:
+; RV32I-NEXT: srl a3, a0, a2
+; RV32I-NEXT: neg a6, a5
+; RV32I-NEXT: sll a6, a1, a6
+; RV32I-NEXT: or a6, a3, a6
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: beqz a5, .LBB3_4
+; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: mv a3, a6
+; RV32I-NEXT: .LBB3_4:
+; RV32I-NEXT: neg a6, a2
+; RV32I-NEXT: bltu a5, a4, .LBB3_7
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a2, 0
+; RV32I-NEXT: andi a5, a6, 63
+; RV32I-NEXT: bgeu a5, a4, .LBB3_8
+; RV32I-NEXT: .LBB3_6:
+; RV32I-NEXT: sll a4, a0, a6
+; RV32I-NEXT: neg a7, a5
+; RV32I-NEXT: srl a0, a0, a7
+; RV32I-NEXT: sll a6, a1, a6
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: bnez a5, .LBB3_9
+; RV32I-NEXT: j .LBB3_10
+; RV32I-NEXT: .LBB3_7:
+; RV32I-NEXT: srl a2, a1, a2
+; RV32I-NEXT: andi a5, a6, 63
+; RV32I-NEXT: bltu a5, a4, .LBB3_6
+; RV32I-NEXT: .LBB3_8:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: sll a0, a0, a5
+; RV32I-NEXT: beqz a5, .LBB3_10
+; RV32I-NEXT: .LBB3_9:
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB3_10:
+; RV32I-NEXT: or a0, a3, a4
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: srl a1, a0, a1
+; RV64I-NEXT: sll a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_64:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi a5, a2, 63
+; RV32ZBB-NEXT: li a4, 32
+; RV32ZBB-NEXT: bltu a5, a4, .LBB3_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl a6, a1, a5
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: bnez a5, .LBB3_3
+; RV32ZBB-NEXT: j .LBB3_4
+; RV32ZBB-NEXT: .LBB3_2:
+; RV32ZBB-NEXT: srl a3, a0, a2
+; RV32ZBB-NEXT: neg a6, a5
+; RV32ZBB-NEXT: sll a6, a1, a6
+; RV32ZBB-NEXT: or a6, a3, a6
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: beqz a5, .LBB3_4
+; RV32ZBB-NEXT: .LBB3_3:
+; RV32ZBB-NEXT: mv a3, a6
+; RV32ZBB-NEXT: .LBB3_4:
+; RV32ZBB-NEXT: neg a6, a2
+; RV32ZBB-NEXT: bltu a5, a4, .LBB3_7
+; RV32ZBB-NEXT: # %bb.5:
+; RV32ZBB-NEXT: li a2, 0
+; RV32ZBB-NEXT: andi a5, a6, 63
+; RV32ZBB-NEXT: bgeu a5, a4, .LBB3_8
+; RV32ZBB-NEXT: .LBB3_6:
+; RV32ZBB-NEXT: sll a4, a0, a6
+; RV32ZBB-NEXT: neg a7, a5
+; RV32ZBB-NEXT: srl a0, a0, a7
+; RV32ZBB-NEXT: sll a6, a1, a6
+; RV32ZBB-NEXT: or a0, a0, a6
+; RV32ZBB-NEXT: bnez a5, .LBB3_9
+; RV32ZBB-NEXT: j .LBB3_10
+; RV32ZBB-NEXT: .LBB3_7:
+; RV32ZBB-NEXT: srl a2, a1, a2
+; RV32ZBB-NEXT: andi a5, a6, 63
+; RV32ZBB-NEXT: bltu a5, a4, .LBB3_6
+; RV32ZBB-NEXT: .LBB3_8:
+; RV32ZBB-NEXT: li a4, 0
+; RV32ZBB-NEXT: sll a0, a0, a5
+; RV32ZBB-NEXT: beqz a5, .LBB3_10
+; RV32ZBB-NEXT: .LBB3_9:
+; RV32ZBB-NEXT: mv a1, a0
+; RV32ZBB-NEXT: .LBB3_10:
+; RV32ZBB-NEXT: or a0, a3, a4
+; RV32ZBB-NEXT: or a1, a2, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_64:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: ror a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_64:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a5, a2, 63
+; RV32XTHEADBB-NEXT: li a4, 32
+; RV32XTHEADBB-NEXT: bltu a5, a4, .LBB3_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: srl a6, a1, a5
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: bnez a5, .LBB3_3
+; RV32XTHEADBB-NEXT: j .LBB3_4
+; RV32XTHEADBB-NEXT: .LBB3_2:
+; RV32XTHEADBB-NEXT: srl a3, a0, a2
+; RV32XTHEADBB-NEXT: neg a6, a5
+; RV32XTHEADBB-NEXT: sll a6, a1, a6
+; RV32XTHEADBB-NEXT: or a6, a3, a6
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: beqz a5, .LBB3_4
+; RV32XTHEADBB-NEXT: .LBB3_3:
+; RV32XTHEADBB-NEXT: mv a3, a6
+; RV32XTHEADBB-NEXT: .LBB3_4:
+; RV32XTHEADBB-NEXT: neg a6, a2
+; RV32XTHEADBB-NEXT: bltu a5, a4, .LBB3_7
+; RV32XTHEADBB-NEXT: # %bb.5:
+; RV32XTHEADBB-NEXT: li a2, 0
+; RV32XTHEADBB-NEXT: andi a5, a6, 63
+; RV32XTHEADBB-NEXT: bgeu a5, a4, .LBB3_8
+; RV32XTHEADBB-NEXT: .LBB3_6:
+; RV32XTHEADBB-NEXT: sll a4, a0, a6
+; RV32XTHEADBB-NEXT: neg a7, a5
+; RV32XTHEADBB-NEXT: srl a0, a0, a7
+; RV32XTHEADBB-NEXT: sll a6, a1, a6
+; RV32XTHEADBB-NEXT: or a0, a0, a6
+; RV32XTHEADBB-NEXT: bnez a5, .LBB3_9
+; RV32XTHEADBB-NEXT: j .LBB3_10
+; RV32XTHEADBB-NEXT: .LBB3_7:
+; RV32XTHEADBB-NEXT: srl a2, a1, a2
+; RV32XTHEADBB-NEXT: andi a5, a6, 63
+; RV32XTHEADBB-NEXT: bltu a5, a4, .LBB3_6
+; RV32XTHEADBB-NEXT: .LBB3_8:
+; RV32XTHEADBB-NEXT: li a4, 0
+; RV32XTHEADBB-NEXT: sll a0, a0, a5
+; RV32XTHEADBB-NEXT: beqz a5, .LBB3_10
+; RV32XTHEADBB-NEXT: .LBB3_9:
+; RV32XTHEADBB-NEXT: mv a1, a0
+; RV32XTHEADBB-NEXT: .LBB3_10:
+; RV32XTHEADBB-NEXT: or a0, a3, a4
+; RV32XTHEADBB-NEXT: or a1, a2, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_64:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: srl a1, a0, a1
+; RV64XTHEADBB-NEXT: sll a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i64 64, %y
+ %b = lshr i64 %x, %y
+ %c = shl i64 %x, %z
+ %d = or i64 %b, %c
+ ret i64 %d
+}
+
+define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotl_32_mask:
+; RV32I: # %bb.0:
+; RV32I-NEXT: neg a2, a1
+; RV32I-NEXT: sll a1, a0, a1
+; RV32I-NEXT: srl a0, a0, a2
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_32_mask:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: sllw a1, a0, a1
+; RV64I-NEXT: srlw a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_32_mask:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: neg a2, a1
+; RV32ZBB-NEXT: sll a1, a0, a1
+; RV32ZBB-NEXT: srl a0, a0, a2
+; RV32ZBB-NEXT: or a0, a1, a0
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_32_mask:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: sllw a1, a0, a1
+; RV64ZBB-NEXT: srlw a0, a0, a2
+; RV64ZBB-NEXT: or a0, a1, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_32_mask:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: neg a2, a1
+; RV32XTHEADBB-NEXT: sll a1, a0, a1
+; RV32XTHEADBB-NEXT: srl a0, a0, a2
+; RV32XTHEADBB-NEXT: or a0, a1, a0
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_32_mask:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: sllw a1, a0, a1
+; RV64XTHEADBB-NEXT: srlw a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i32 0, %y
+ %and = and i32 %z, 31
+ %b = shl i32 %x, %y
+ %c = lshr i32 %x, %and
+ %d = or i32 %b, %c
+ ret i32 %d
+}
+
+define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotl_32_mask_and_63_and_31:
+; RV32I: # %bb.0:
+; RV32I-NEXT: sll a2, a0, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_32_mask_and_63_and_31:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sllw a2, a0, a1
+; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: srlw a0, a0, a1
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_32_mask_and_63_and_31:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: sll a2, a0, a1
+; RV32ZBB-NEXT: neg a1, a1
+; RV32ZBB-NEXT: srl a0, a0, a1
+; RV32ZBB-NEXT: or a0, a2, a0
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_32_mask_and_63_and_31:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: sllw a2, a0, a1
+; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: srlw a0, a0, a1
+; RV64ZBB-NEXT: or a0, a2, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_32_mask_and_63_and_31:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: sll a2, a0, a1
+; RV32XTHEADBB-NEXT: neg a1, a1
+; RV32XTHEADBB-NEXT: srl a0, a0, a1
+; RV32XTHEADBB-NEXT: or a0, a2, a0
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_32_mask_and_63_and_31:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: sllw a2, a0, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: srlw a0, a0, a1
+; RV64XTHEADBB-NEXT: or a0, a2, a0
+; RV64XTHEADBB-NEXT: ret
+ %a = and i32 %y, 63
+ %b = shl i32 %x, %a
+ %c = sub i32 0, %y
+ %d = and i32 %c, 31
+ %e = lshr i32 %x, %d
+ %f = or i32 %b, %e
+ ret i32 %f
+}
+
+define i32 @rotl_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotl_32_mask_or_64_or_32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: ori a1, a1, 64
+; RV32I-NEXT: sll a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_32_mask_or_64_or_32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: ori a1, a1, 64
+; RV64I-NEXT: sllw a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_32_mask_or_64_or_32:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: ori a1, a1, 64
+; RV32ZBB-NEXT: sll a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_32_mask_or_64_or_32:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: ori a1, a1, 64
+; RV64ZBB-NEXT: sllw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_32_mask_or_64_or_32:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: ori a1, a1, 64
+; RV32XTHEADBB-NEXT: sll a0, a0, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_32_mask_or_64_or_32:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: ori a1, a1, 64
+; RV64XTHEADBB-NEXT: sllw a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %a = or i32 %y, 64
+ %b = shl i32 %x, %a
+ %c = sub i32 0, %y
+ %d = or i32 %c, 32
+ %e = lshr i32 %x, %d
+ %f = or i32 %b, %e
+ ret i32 %f
+}
+
+define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotr_32_mask:
+; RV32I: # %bb.0:
+; RV32I-NEXT: neg a2, a1
+; RV32I-NEXT: srl a1, a0, a1
+; RV32I-NEXT: sll a0, a0, a2
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_32_mask:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: srlw a1, a0, a1
+; RV64I-NEXT: sllw a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_32_mask:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: neg a2, a1
+; RV32ZBB-NEXT: srl a1, a0, a1
+; RV32ZBB-NEXT: sll a0, a0, a2
+; RV32ZBB-NEXT: or a0, a1, a0
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_32_mask:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: srlw a1, a0, a1
+; RV64ZBB-NEXT: sllw a0, a0, a2
+; RV64ZBB-NEXT: or a0, a1, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_32_mask:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: neg a2, a1
+; RV32XTHEADBB-NEXT: srl a1, a0, a1
+; RV32XTHEADBB-NEXT: sll a0, a0, a2
+; RV32XTHEADBB-NEXT: or a0, a1, a0
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_32_mask:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: srlw a1, a0, a1
+; RV64XTHEADBB-NEXT: sllw a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i32 0, %y
+ %and = and i32 %z, 31
+ %b = lshr i32 %x, %y
+ %c = shl i32 %x, %and
+ %d = or i32 %b, %c
+ ret i32 %d
+}
+
+define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotr_32_mask_and_63_and_31:
+; RV32I: # %bb.0:
+; RV32I-NEXT: srl a2, a0, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: sll a0, a0, a1
+; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_32_mask_and_63_and_31:
+; RV64I: # %bb.0:
+; RV64I-NEXT: srlw a2, a0, a1
+; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: sllw a0, a0, a1
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_32_mask_and_63_and_31:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: srl a2, a0, a1
+; RV32ZBB-NEXT: neg a1, a1
+; RV32ZBB-NEXT: sll a0, a0, a1
+; RV32ZBB-NEXT: or a0, a2, a0
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_32_mask_and_63_and_31:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: srlw a2, a0, a1
+; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: sllw a0, a0, a1
+; RV64ZBB-NEXT: or a0, a2, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_32_mask_and_63_and_31:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: srl a2, a0, a1
+; RV32XTHEADBB-NEXT: neg a1, a1
+; RV32XTHEADBB-NEXT: sll a0, a0, a1
+; RV32XTHEADBB-NEXT: or a0, a2, a0
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_32_mask_and_63_and_31:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: srlw a2, a0, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: sllw a0, a0, a1
+; RV64XTHEADBB-NEXT: or a0, a2, a0
+; RV64XTHEADBB-NEXT: ret
+ %a = and i32 %y, 63
+ %b = lshr i32 %x, %a
+ %c = sub i32 0, %y
+ %d = and i32 %c, 31
+ %e = shl i32 %x, %d
+ %f = or i32 %b, %e
+ ret i32 %f
+}
+
+define i32 @rotr_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotr_32_mask_or_64_or_32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: ori a1, a1, 64
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_32_mask_or_64_or_32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: ori a1, a1, 64
+; RV64I-NEXT: srlw a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_32_mask_or_64_or_32:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: ori a1, a1, 64
+; RV32ZBB-NEXT: srl a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_32_mask_or_64_or_32:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: ori a1, a1, 64
+; RV64ZBB-NEXT: srlw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_32_mask_or_64_or_32:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: ori a1, a1, 64
+; RV32XTHEADBB-NEXT: srl a0, a0, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_32_mask_or_64_or_32:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: ori a1, a1, 64
+; RV64XTHEADBB-NEXT: srlw a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %a = or i32 %y, 64
+ %b = lshr i32 %x, %a
+ %c = sub i32 0, %y
+ %d = or i32 %c, 32
+ %e = shl i32 %x, %d
+ %f = or i32 %b, %e
+ ret i32 %f
+}
+
+define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotl_64_mask:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a5, 32
+; RV32I-NEXT: neg a4, a2
+; RV32I-NEXT: bltu a2, a5, .LBB10_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: sll t0, a0, a2
+; RV32I-NEXT: j .LBB10_3
+; RV32I-NEXT: .LBB10_2:
+; RV32I-NEXT: sll a3, a0, a2
+; RV32I-NEXT: neg a6, a2
+; RV32I-NEXT: srl a6, a0, a6
+; RV32I-NEXT: sll a7, a1, a2
+; RV32I-NEXT: or t0, a6, a7
+; RV32I-NEXT: .LBB10_3:
+; RV32I-NEXT: andi a7, a4, 63
+; RV32I-NEXT: mv a6, a1
+; RV32I-NEXT: beqz a2, .LBB10_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a6, t0
+; RV32I-NEXT: .LBB10_5:
+; RV32I-NEXT: bltu a7, a5, .LBB10_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl a2, a1, a7
+; RV32I-NEXT: bnez a7, .LBB10_8
+; RV32I-NEXT: j .LBB10_9
+; RV32I-NEXT: .LBB10_7:
+; RV32I-NEXT: srl a2, a0, a4
+; RV32I-NEXT: neg t0, a7
+; RV32I-NEXT: sll t0, a1, t0
+; RV32I-NEXT: or a2, a2, t0
+; RV32I-NEXT: beqz a7, .LBB10_9
+; RV32I-NEXT: .LBB10_8:
+; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: .LBB10_9:
+; RV32I-NEXT: bltu a7, a5, .LBB10_11
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB10_12
+; RV32I-NEXT: .LBB10_11:
+; RV32I-NEXT: srl a1, a1, a4
+; RV32I-NEXT: .LBB10_12:
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: or a1, a6, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_64_mask:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: sll a1, a0, a1
+; RV64I-NEXT: srl a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_64_mask:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: li a5, 32
+; RV32ZBB-NEXT: neg a4, a2
+; RV32ZBB-NEXT: bltu a2, a5, .LBB10_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: li a3, 0
+; RV32ZBB-NEXT: sll t0, a0, a2
+; RV32ZBB-NEXT: j .LBB10_3
+; RV32ZBB-NEXT: .LBB10_2:
+; RV32ZBB-NEXT: sll a3, a0, a2
+; RV32ZBB-NEXT: neg a6, a2
+; RV32ZBB-NEXT: srl a6, a0, a6
+; RV32ZBB-NEXT: sll a7, a1, a2
+; RV32ZBB-NEXT: or t0, a6, a7
+; RV32ZBB-NEXT: .LBB10_3:
+; RV32ZBB-NEXT: andi a7, a4, 63
+; RV32ZBB-NEXT: mv a6, a1
+; RV32ZBB-NEXT: beqz a2, .LBB10_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: mv a6, t0
+; RV32ZBB-NEXT: .LBB10_5:
+; RV32ZBB-NEXT: bltu a7, a5, .LBB10_7
+; RV32ZBB-NEXT: # %bb.6:
+; RV32ZBB-NEXT: srl a2, a1, a7
+; RV32ZBB-NEXT: bnez a7, .LBB10_8
+; RV32ZBB-NEXT: j .LBB10_9
+; RV32ZBB-NEXT: .LBB10_7:
+; RV32ZBB-NEXT: srl a2, a0, a4
+; RV32ZBB-NEXT: neg t0, a7
+; RV32ZBB-NEXT: sll t0, a1, t0
+; RV32ZBB-NEXT: or a2, a2, t0
+; RV32ZBB-NEXT: beqz a7, .LBB10_9
+; RV32ZBB-NEXT: .LBB10_8:
+; RV32ZBB-NEXT: mv a0, a2
+; RV32ZBB-NEXT: .LBB10_9:
+; RV32ZBB-NEXT: bltu a7, a5, .LBB10_11
+; RV32ZBB-NEXT: # %bb.10:
+; RV32ZBB-NEXT: li a1, 0
+; RV32ZBB-NEXT: j .LBB10_12
+; RV32ZBB-NEXT: .LBB10_11:
+; RV32ZBB-NEXT: srl a1, a1, a4
+; RV32ZBB-NEXT: .LBB10_12:
+; RV32ZBB-NEXT: or a0, a3, a0
+; RV32ZBB-NEXT: or a1, a6, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_64_mask:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: sll a1, a0, a1
+; RV64ZBB-NEXT: srl a0, a0, a2
+; RV64ZBB-NEXT: or a0, a1, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_64_mask:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: li a5, 32
+; RV32XTHEADBB-NEXT: neg a4, a2
+; RV32XTHEADBB-NEXT: bltu a2, a5, .LBB10_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: li a3, 0
+; RV32XTHEADBB-NEXT: sll t0, a0, a2
+; RV32XTHEADBB-NEXT: j .LBB10_3
+; RV32XTHEADBB-NEXT: .LBB10_2:
+; RV32XTHEADBB-NEXT: sll a3, a0, a2
+; RV32XTHEADBB-NEXT: neg a6, a2
+; RV32XTHEADBB-NEXT: srl a6, a0, a6
+; RV32XTHEADBB-NEXT: sll a7, a1, a2
+; RV32XTHEADBB-NEXT: or t0, a6, a7
+; RV32XTHEADBB-NEXT: .LBB10_3:
+; RV32XTHEADBB-NEXT: andi a7, a4, 63
+; RV32XTHEADBB-NEXT: mv a6, a1
+; RV32XTHEADBB-NEXT: beqz a2, .LBB10_5
+; RV32XTHEADBB-NEXT: # %bb.4:
+; RV32XTHEADBB-NEXT: mv a6, t0
+; RV32XTHEADBB-NEXT: .LBB10_5:
+; RV32XTHEADBB-NEXT: bltu a7, a5, .LBB10_7
+; RV32XTHEADBB-NEXT: # %bb.6:
+; RV32XTHEADBB-NEXT: srl a2, a1, a7
+; RV32XTHEADBB-NEXT: bnez a7, .LBB10_8
+; RV32XTHEADBB-NEXT: j .LBB10_9
+; RV32XTHEADBB-NEXT: .LBB10_7:
+; RV32XTHEADBB-NEXT: srl a2, a0, a4
+; RV32XTHEADBB-NEXT: neg t0, a7
+; RV32XTHEADBB-NEXT: sll t0, a1, t0
+; RV32XTHEADBB-NEXT: or a2, a2, t0
+; RV32XTHEADBB-NEXT: beqz a7, .LBB10_9
+; RV32XTHEADBB-NEXT: .LBB10_8:
+; RV32XTHEADBB-NEXT: mv a0, a2
+; RV32XTHEADBB-NEXT: .LBB10_9:
+; RV32XTHEADBB-NEXT: bltu a7, a5, .LBB10_11
+; RV32XTHEADBB-NEXT: # %bb.10:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: j .LBB10_12
+; RV32XTHEADBB-NEXT: .LBB10_11:
+; RV32XTHEADBB-NEXT: srl a1, a1, a4
+; RV32XTHEADBB-NEXT: .LBB10_12:
+; RV32XTHEADBB-NEXT: or a0, a3, a0
+; RV32XTHEADBB-NEXT: or a1, a6, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_64_mask:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: sll a1, a0, a1
+; RV64XTHEADBB-NEXT: srl a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i64 0, %y
+ %and = and i64 %z, 63
+ %b = shl i64 %x, %y
+ %c = lshr i64 %x, %and
+ %d = or i64 %b, %c
+ ret i64 %d
+}
+
+define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotl_64_mask_and_127_and_63:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a6, a2, 127
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: bltu a6, a4, .LBB11_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: sll a7, a0, a6
+; RV32I-NEXT: mv a5, a1
+; RV32I-NEXT: bnez a6, .LBB11_3
+; RV32I-NEXT: j .LBB11_4
+; RV32I-NEXT: .LBB11_2:
+; RV32I-NEXT: sll a3, a0, a2
+; RV32I-NEXT: neg a5, a6
+; RV32I-NEXT: srl a5, a0, a5
+; RV32I-NEXT: sll a7, a1, a2
+; RV32I-NEXT: or a7, a5, a7
+; RV32I-NEXT: mv a5, a1
+; RV32I-NEXT: beqz a6, .LBB11_4
+; RV32I-NEXT: .LBB11_3:
+; RV32I-NEXT: mv a5, a7
+; RV32I-NEXT: .LBB11_4:
+; RV32I-NEXT: neg a2, a2
+; RV32I-NEXT: andi a6, a2, 63
+; RV32I-NEXT: bltu a6, a4, .LBB11_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: srl a7, a1, a6
+; RV32I-NEXT: bnez a6, .LBB11_7
+; RV32I-NEXT: j .LBB11_8
+; RV32I-NEXT: .LBB11_6:
+; RV32I-NEXT: srl a7, a0, a2
+; RV32I-NEXT: neg t0, a6
+; RV32I-NEXT: sll t0, a1, t0
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: beqz a6, .LBB11_8
+; RV32I-NEXT: .LBB11_7:
+; RV32I-NEXT: mv a0, a7
+; RV32I-NEXT: .LBB11_8:
+; RV32I-NEXT: bltu a6, a4, .LBB11_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB11_11
+; RV32I-NEXT: .LBB11_10:
+; RV32I-NEXT: srl a1, a1, a2
+; RV32I-NEXT: .LBB11_11:
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: or a1, a5, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_64_mask_and_127_and_63:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sll a2, a0, a1
+; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: srl a0, a0, a1
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_64_mask_and_127_and_63:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi a6, a2, 127
+; RV32ZBB-NEXT: li a4, 32
+; RV32ZBB-NEXT: bltu a6, a4, .LBB11_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: li a3, 0
+; RV32ZBB-NEXT: sll a7, a0, a6
+; RV32ZBB-NEXT: mv a5, a1
+; RV32ZBB-NEXT: bnez a6, .LBB11_3
+; RV32ZBB-NEXT: j .LBB11_4
+; RV32ZBB-NEXT: .LBB11_2:
+; RV32ZBB-NEXT: sll a3, a0, a2
+; RV32ZBB-NEXT: neg a5, a6
+; RV32ZBB-NEXT: srl a5, a0, a5
+; RV32ZBB-NEXT: sll a7, a1, a2
+; RV32ZBB-NEXT: or a7, a5, a7
+; RV32ZBB-NEXT: mv a5, a1
+; RV32ZBB-NEXT: beqz a6, .LBB11_4
+; RV32ZBB-NEXT: .LBB11_3:
+; RV32ZBB-NEXT: mv a5, a7
+; RV32ZBB-NEXT: .LBB11_4:
+; RV32ZBB-NEXT: neg a2, a2
+; RV32ZBB-NEXT: andi a6, a2, 63
+; RV32ZBB-NEXT: bltu a6, a4, .LBB11_6
+; RV32ZBB-NEXT: # %bb.5:
+; RV32ZBB-NEXT: srl a7, a1, a6
+; RV32ZBB-NEXT: bnez a6, .LBB11_7
+; RV32ZBB-NEXT: j .LBB11_8
+; RV32ZBB-NEXT: .LBB11_6:
+; RV32ZBB-NEXT: srl a7, a0, a2
+; RV32ZBB-NEXT: neg t0, a6
+; RV32ZBB-NEXT: sll t0, a1, t0
+; RV32ZBB-NEXT: or a7, a7, t0
+; RV32ZBB-NEXT: beqz a6, .LBB11_8
+; RV32ZBB-NEXT: .LBB11_7:
+; RV32ZBB-NEXT: mv a0, a7
+; RV32ZBB-NEXT: .LBB11_8:
+; RV32ZBB-NEXT: bltu a6, a4, .LBB11_10
+; RV32ZBB-NEXT: # %bb.9:
+; RV32ZBB-NEXT: li a1, 0
+; RV32ZBB-NEXT: j .LBB11_11
+; RV32ZBB-NEXT: .LBB11_10:
+; RV32ZBB-NEXT: srl a1, a1, a2
+; RV32ZBB-NEXT: .LBB11_11:
+; RV32ZBB-NEXT: or a0, a3, a0
+; RV32ZBB-NEXT: or a1, a5, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_64_mask_and_127_and_63:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: sll a2, a0, a1
+; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: srl a0, a0, a1
+; RV64ZBB-NEXT: or a0, a2, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_64_mask_and_127_and_63:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a6, a2, 127
+; RV32XTHEADBB-NEXT: li a4, 32
+; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB11_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: li a3, 0
+; RV32XTHEADBB-NEXT: sll a7, a0, a6
+; RV32XTHEADBB-NEXT: mv a5, a1
+; RV32XTHEADBB-NEXT: bnez a6, .LBB11_3
+; RV32XTHEADBB-NEXT: j .LBB11_4
+; RV32XTHEADBB-NEXT: .LBB11_2:
+; RV32XTHEADBB-NEXT: sll a3, a0, a2
+; RV32XTHEADBB-NEXT: neg a5, a6
+; RV32XTHEADBB-NEXT: srl a5, a0, a5
+; RV32XTHEADBB-NEXT: sll a7, a1, a2
+; RV32XTHEADBB-NEXT: or a7, a5, a7
+; RV32XTHEADBB-NEXT: mv a5, a1
+; RV32XTHEADBB-NEXT: beqz a6, .LBB11_4
+; RV32XTHEADBB-NEXT: .LBB11_3:
+; RV32XTHEADBB-NEXT: mv a5, a7
+; RV32XTHEADBB-NEXT: .LBB11_4:
+; RV32XTHEADBB-NEXT: neg a2, a2
+; RV32XTHEADBB-NEXT: andi a6, a2, 63
+; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB11_6
+; RV32XTHEADBB-NEXT: # %bb.5:
+; RV32XTHEADBB-NEXT: srl a7, a1, a6
+; RV32XTHEADBB-NEXT: bnez a6, .LBB11_7
+; RV32XTHEADBB-NEXT: j .LBB11_8
+; RV32XTHEADBB-NEXT: .LBB11_6:
+; RV32XTHEADBB-NEXT: srl a7, a0, a2
+; RV32XTHEADBB-NEXT: neg t0, a6
+; RV32XTHEADBB-NEXT: sll t0, a1, t0
+; RV32XTHEADBB-NEXT: or a7, a7, t0
+; RV32XTHEADBB-NEXT: beqz a6, .LBB11_8
+; RV32XTHEADBB-NEXT: .LBB11_7:
+; RV32XTHEADBB-NEXT: mv a0, a7
+; RV32XTHEADBB-NEXT: .LBB11_8:
+; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB11_10
+; RV32XTHEADBB-NEXT: # %bb.9:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: j .LBB11_11
+; RV32XTHEADBB-NEXT: .LBB11_10:
+; RV32XTHEADBB-NEXT: srl a1, a1, a2
+; RV32XTHEADBB-NEXT: .LBB11_11:
+; RV32XTHEADBB-NEXT: or a0, a3, a0
+; RV32XTHEADBB-NEXT: or a1, a5, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_64_mask_and_127_and_63:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: sll a2, a0, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: srl a0, a0, a1
+; RV64XTHEADBB-NEXT: or a0, a2, a0
+; RV64XTHEADBB-NEXT: ret
+ %a = and i64 %y, 127
+ %b = shl i64 %x, %a
+ %c = sub i64 0, %y
+ %d = and i64 %c, 63
+ %e = lshr i64 %x, %d
+ %f = or i64 %b, %e
+ ret i64 %f
+}
+
+define i64 @rotl_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotl_64_mask_or_128_or_64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: ori a2, a2, 128
+; RV32I-NEXT: li a0, 32
+; RV32I-NEXT: bltu a2, a0, .LBB12_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: sll a3, a3, a2
+; RV32I-NEXT: bnez a2, .LBB12_3
+; RV32I-NEXT: j .LBB12_4
+; RV32I-NEXT: .LBB12_2:
+; RV32I-NEXT: sll a0, a3, a2
+; RV32I-NEXT: neg a4, a2
+; RV32I-NEXT: srl a3, a3, a4
+; RV32I-NEXT: sll a4, a1, a2
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: beqz a2, .LBB12_4
+; RV32I-NEXT: .LBB12_3:
+; RV32I-NEXT: mv a1, a3
+; RV32I-NEXT: .LBB12_4:
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_64_mask_or_128_or_64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: ori a1, a1, 128
+; RV64I-NEXT: sll a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_64_mask_or_128_or_64:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: ori a2, a2, 128
+; RV32ZBB-NEXT: li a0, 32
+; RV32ZBB-NEXT: bltu a2, a0, .LBB12_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: li a0, 0
+; RV32ZBB-NEXT: sll a3, a3, a2
+; RV32ZBB-NEXT: bnez a2, .LBB12_3
+; RV32ZBB-NEXT: j .LBB12_4
+; RV32ZBB-NEXT: .LBB12_2:
+; RV32ZBB-NEXT: sll a0, a3, a2
+; RV32ZBB-NEXT: neg a4, a2
+; RV32ZBB-NEXT: srl a3, a3, a4
+; RV32ZBB-NEXT: sll a4, a1, a2
+; RV32ZBB-NEXT: or a3, a3, a4
+; RV32ZBB-NEXT: beqz a2, .LBB12_4
+; RV32ZBB-NEXT: .LBB12_3:
+; RV32ZBB-NEXT: mv a1, a3
+; RV32ZBB-NEXT: .LBB12_4:
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_64_mask_or_128_or_64:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: ori a1, a1, 128
+; RV64ZBB-NEXT: sll a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_64_mask_or_128_or_64:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: ori a2, a2, 128
+; RV32XTHEADBB-NEXT: li a0, 32
+; RV32XTHEADBB-NEXT: bltu a2, a0, .LBB12_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: li a0, 0
+; RV32XTHEADBB-NEXT: sll a3, a3, a2
+; RV32XTHEADBB-NEXT: bnez a2, .LBB12_3
+; RV32XTHEADBB-NEXT: j .LBB12_4
+; RV32XTHEADBB-NEXT: .LBB12_2:
+; RV32XTHEADBB-NEXT: sll a0, a3, a2
+; RV32XTHEADBB-NEXT: neg a4, a2
+; RV32XTHEADBB-NEXT: srl a3, a3, a4
+; RV32XTHEADBB-NEXT: sll a4, a1, a2
+; RV32XTHEADBB-NEXT: or a3, a3, a4
+; RV32XTHEADBB-NEXT: beqz a2, .LBB12_4
+; RV32XTHEADBB-NEXT: .LBB12_3:
+; RV32XTHEADBB-NEXT: mv a1, a3
+; RV32XTHEADBB-NEXT: .LBB12_4:
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_64_mask_or_128_or_64:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: ori a1, a1, 128
+; RV64XTHEADBB-NEXT: sll a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %a = or i64 %y, 128
+ %b = shl i64 %x, %a
+ %c = sub i64 0, %y
+ %d = or i64 %c, 64
+ %e = lshr i64 %x, %d
+ %f = or i64 %b, %e
+ ret i64 %f
+}
+
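+; Rotate right of %x by %y where only the negated (left-shift) amount is
+; masked to 63.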
+define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotr_64_mask:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: bltu a2, a4, .LBB13_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a5, a1, a2
+; RV32I-NEXT: j .LBB13_3
+; RV32I-NEXT: .LBB13_2:
+; RV32I-NEXT: srl a3, a0, a2
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: sll a5, a1, a5
+; RV32I-NEXT: or a5, a3, a5
+; RV32I-NEXT: .LBB13_3:
+; RV32I-NEXT: neg a6, a2
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: beqz a2, .LBB13_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, a5
+; RV32I-NEXT: .LBB13_5:
+; RV32I-NEXT: andi a5, a6, 63
+; RV32I-NEXT: bltu a2, a4, .LBB13_8
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li a2, 0
+; RV32I-NEXT: bgeu a5, a4, .LBB13_9
+; RV32I-NEXT: .LBB13_7:
+; RV32I-NEXT: sll a4, a0, a6
+; RV32I-NEXT: neg a7, a5
+; RV32I-NEXT: srl a0, a0, a7
+; RV32I-NEXT: sll a6, a1, a6
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: bnez a5, .LBB13_10
+; RV32I-NEXT: j .LBB13_11
+; RV32I-NEXT: .LBB13_8:
+; RV32I-NEXT: srl a2, a1, a2
+; RV32I-NEXT: bltu a5, a4, .LBB13_7
+; RV32I-NEXT: .LBB13_9:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: sll a0, a0, a5
+; RV32I-NEXT: beqz a5, .LBB13_11
+; RV32I-NEXT: .LBB13_10:
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB13_11:
+; RV32I-NEXT: or a0, a3, a4
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_64_mask:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: srl a1, a0, a1
+; RV64I-NEXT: sll a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_64_mask:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: li a4, 32
+; RV32ZBB-NEXT: bltu a2, a4, .LBB13_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl a5, a1, a2
+; RV32ZBB-NEXT: j .LBB13_3
+; RV32ZBB-NEXT: .LBB13_2:
+; RV32ZBB-NEXT: srl a3, a0, a2
+; RV32ZBB-NEXT: neg a5, a2
+; RV32ZBB-NEXT: sll a5, a1, a5
+; RV32ZBB-NEXT: or a5, a3, a5
+; RV32ZBB-NEXT: .LBB13_3:
+; RV32ZBB-NEXT: neg a6, a2
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: beqz a2, .LBB13_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: mv a3, a5
+; RV32ZBB-NEXT: .LBB13_5:
+; RV32ZBB-NEXT: andi a5, a6, 63
+; RV32ZBB-NEXT: bltu a2, a4, .LBB13_8
+; RV32ZBB-NEXT: # %bb.6:
+; RV32ZBB-NEXT: li a2, 0
+; RV32ZBB-NEXT: bgeu a5, a4, .LBB13_9
+; RV32ZBB-NEXT: .LBB13_7:
+; RV32ZBB-NEXT: sll a4, a0, a6
+; RV32ZBB-NEXT: neg a7, a5
+; RV32ZBB-NEXT: srl a0, a0, a7
+; RV32ZBB-NEXT: sll a6, a1, a6
+; RV32ZBB-NEXT: or a0, a0, a6
+; RV32ZBB-NEXT: bnez a5, .LBB13_10
+; RV32ZBB-NEXT: j .LBB13_11
+; RV32ZBB-NEXT: .LBB13_8:
+; RV32ZBB-NEXT: srl a2, a1, a2
+; RV32ZBB-NEXT: bltu a5, a4, .LBB13_7
+; RV32ZBB-NEXT: .LBB13_9:
+; RV32ZBB-NEXT: li a4, 0
+; RV32ZBB-NEXT: sll a0, a0, a5
+; RV32ZBB-NEXT: beqz a5, .LBB13_11
+; RV32ZBB-NEXT: .LBB13_10:
+; RV32ZBB-NEXT: mv a1, a0
+; RV32ZBB-NEXT: .LBB13_11:
+; RV32ZBB-NEXT: or a0, a3, a4
+; RV32ZBB-NEXT: or a1, a2, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_64_mask:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: srl a1, a0, a1
+; RV64ZBB-NEXT: sll a0, a0, a2
+; RV64ZBB-NEXT: or a0, a1, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_64_mask:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: li a4, 32
+; RV32XTHEADBB-NEXT: bltu a2, a4, .LBB13_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: srl a5, a1, a2
+; RV32XTHEADBB-NEXT: j .LBB13_3
+; RV32XTHEADBB-NEXT: .LBB13_2:
+; RV32XTHEADBB-NEXT: srl a3, a0, a2
+; RV32XTHEADBB-NEXT: neg a5, a2
+; RV32XTHEADBB-NEXT: sll a5, a1, a5
+; RV32XTHEADBB-NEXT: or a5, a3, a5
+; RV32XTHEADBB-NEXT: .LBB13_3:
+; RV32XTHEADBB-NEXT: neg a6, a2
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: beqz a2, .LBB13_5
+; RV32XTHEADBB-NEXT: # %bb.4:
+; RV32XTHEADBB-NEXT: mv a3, a5
+; RV32XTHEADBB-NEXT: .LBB13_5:
+; RV32XTHEADBB-NEXT: andi a5, a6, 63
+; RV32XTHEADBB-NEXT: bltu a2, a4, .LBB13_8
+; RV32XTHEADBB-NEXT: # %bb.6:
+; RV32XTHEADBB-NEXT: li a2, 0
+; RV32XTHEADBB-NEXT: bgeu a5, a4, .LBB13_9
+; RV32XTHEADBB-NEXT: .LBB13_7:
+; RV32XTHEADBB-NEXT: sll a4, a0, a6
+; RV32XTHEADBB-NEXT: neg a7, a5
+; RV32XTHEADBB-NEXT: srl a0, a0, a7
+; RV32XTHEADBB-NEXT: sll a6, a1, a6
+; RV32XTHEADBB-NEXT: or a0, a0, a6
+; RV32XTHEADBB-NEXT: bnez a5, .LBB13_10
+; RV32XTHEADBB-NEXT: j .LBB13_11
+; RV32XTHEADBB-NEXT: .LBB13_8:
+; RV32XTHEADBB-NEXT: srl a2, a1, a2
+; RV32XTHEADBB-NEXT: bltu a5, a4, .LBB13_7
+; RV32XTHEADBB-NEXT: .LBB13_9:
+; RV32XTHEADBB-NEXT: li a4, 0
+; RV32XTHEADBB-NEXT: sll a0, a0, a5
+; RV32XTHEADBB-NEXT: beqz a5, .LBB13_11
+; RV32XTHEADBB-NEXT: .LBB13_10:
+; RV32XTHEADBB-NEXT: mv a1, a0
+; RV32XTHEADBB-NEXT: .LBB13_11:
+; RV32XTHEADBB-NEXT: or a0, a3, a4
+; RV32XTHEADBB-NEXT: or a1, a2, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_64_mask:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: srl a1, a0, a1
+; RV64XTHEADBB-NEXT: sll a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i64 0, %y
+ %and = and i64 %z, 63
+ %b = lshr i64 %x, %y
+ %c = shl i64 %x, %and
+ %d = or i64 %b, %c
+ ret i64 %d
+}
+
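+; Rotate right where the right-shift amount is masked with 127 and the
+; left-shift amount with 63.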
+define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotr_64_mask_and_127_and_63:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a4, a2, 127
+; RV32I-NEXT: li a5, 32
+; RV32I-NEXT: bltu a4, a5, .LBB14_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a6, a1, a4
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: bnez a4, .LBB14_3
+; RV32I-NEXT: j .LBB14_4
+; RV32I-NEXT: .LBB14_2:
+; RV32I-NEXT: srl a3, a0, a2
+; RV32I-NEXT: neg a6, a4
+; RV32I-NEXT: sll a6, a1, a6
+; RV32I-NEXT: or a6, a3, a6
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: beqz a4, .LBB14_4
+; RV32I-NEXT: .LBB14_3:
+; RV32I-NEXT: mv a3, a6
+; RV32I-NEXT: .LBB14_4:
+; RV32I-NEXT: bltu a4, a5, .LBB14_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: j .LBB14_7
+; RV32I-NEXT: .LBB14_6:
+; RV32I-NEXT: srl a4, a1, a2
+; RV32I-NEXT: .LBB14_7:
+; RV32I-NEXT: neg a7, a2
+; RV32I-NEXT: andi a6, a7, 63
+; RV32I-NEXT: bltu a6, a5, .LBB14_9
+; RV32I-NEXT: # %bb.8:
+; RV32I-NEXT: li a2, 0
+; RV32I-NEXT: sll a0, a0, a6
+; RV32I-NEXT: bnez a6, .LBB14_10
+; RV32I-NEXT: j .LBB14_11
+; RV32I-NEXT: .LBB14_9:
+; RV32I-NEXT: sll a2, a0, a7
+; RV32I-NEXT: neg a5, a6
+; RV32I-NEXT: srl a0, a0, a5
+; RV32I-NEXT: sll a5, a1, a7
+; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: beqz a6, .LBB14_11
+; RV32I-NEXT: .LBB14_10:
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB14_11:
+; RV32I-NEXT: or a0, a3, a2
+; RV32I-NEXT: or a1, a4, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_64_mask_and_127_and_63:
+; RV64I: # %bb.0:
+; RV64I-NEXT: srl a2, a0, a1
+; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: sll a0, a0, a1
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_64_mask_and_127_and_63:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi a4, a2, 127
+; RV32ZBB-NEXT: li a5, 32
+; RV32ZBB-NEXT: bltu a4, a5, .LBB14_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl a6, a1, a4
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: bnez a4, .LBB14_3
+; RV32ZBB-NEXT: j .LBB14_4
+; RV32ZBB-NEXT: .LBB14_2:
+; RV32ZBB-NEXT: srl a3, a0, a2
+; RV32ZBB-NEXT: neg a6, a4
+; RV32ZBB-NEXT: sll a6, a1, a6
+; RV32ZBB-NEXT: or a6, a3, a6
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: beqz a4, .LBB14_4
+; RV32ZBB-NEXT: .LBB14_3:
+; RV32ZBB-NEXT: mv a3, a6
+; RV32ZBB-NEXT: .LBB14_4:
+; RV32ZBB-NEXT: bltu a4, a5, .LBB14_6
+; RV32ZBB-NEXT: # %bb.5:
+; RV32ZBB-NEXT: li a4, 0
+; RV32ZBB-NEXT: j .LBB14_7
+; RV32ZBB-NEXT: .LBB14_6:
+; RV32ZBB-NEXT: srl a4, a1, a2
+; RV32ZBB-NEXT: .LBB14_7:
+; RV32ZBB-NEXT: neg a7, a2
+; RV32ZBB-NEXT: andi a6, a7, 63
+; RV32ZBB-NEXT: bltu a6, a5, .LBB14_9
+; RV32ZBB-NEXT: # %bb.8:
+; RV32ZBB-NEXT: li a2, 0
+; RV32ZBB-NEXT: sll a0, a0, a6
+; RV32ZBB-NEXT: bnez a6, .LBB14_10
+; RV32ZBB-NEXT: j .LBB14_11
+; RV32ZBB-NEXT: .LBB14_9:
+; RV32ZBB-NEXT: sll a2, a0, a7
+; RV32ZBB-NEXT: neg a5, a6
+; RV32ZBB-NEXT: srl a0, a0, a5
+; RV32ZBB-NEXT: sll a5, a1, a7
+; RV32ZBB-NEXT: or a0, a0, a5
+; RV32ZBB-NEXT: beqz a6, .LBB14_11
+; RV32ZBB-NEXT: .LBB14_10:
+; RV32ZBB-NEXT: mv a1, a0
+; RV32ZBB-NEXT: .LBB14_11:
+; RV32ZBB-NEXT: or a0, a3, a2
+; RV32ZBB-NEXT: or a1, a4, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_64_mask_and_127_and_63:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: srl a2, a0, a1
+; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: sll a0, a0, a1
+; RV64ZBB-NEXT: or a0, a2, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_64_mask_and_127_and_63:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a4, a2, 127
+; RV32XTHEADBB-NEXT: li a5, 32
+; RV32XTHEADBB-NEXT: bltu a4, a5, .LBB14_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: srl a6, a1, a4
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: bnez a4, .LBB14_3
+; RV32XTHEADBB-NEXT: j .LBB14_4
+; RV32XTHEADBB-NEXT: .LBB14_2:
+; RV32XTHEADBB-NEXT: srl a3, a0, a2
+; RV32XTHEADBB-NEXT: neg a6, a4
+; RV32XTHEADBB-NEXT: sll a6, a1, a6
+; RV32XTHEADBB-NEXT: or a6, a3, a6
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: beqz a4, .LBB14_4
+; RV32XTHEADBB-NEXT: .LBB14_3:
+; RV32XTHEADBB-NEXT: mv a3, a6
+; RV32XTHEADBB-NEXT: .LBB14_4:
+; RV32XTHEADBB-NEXT: bltu a4, a5, .LBB14_6
+; RV32XTHEADBB-NEXT: # %bb.5:
+; RV32XTHEADBB-NEXT: li a4, 0
+; RV32XTHEADBB-NEXT: j .LBB14_7
+; RV32XTHEADBB-NEXT: .LBB14_6:
+; RV32XTHEADBB-NEXT: srl a4, a1, a2
+; RV32XTHEADBB-NEXT: .LBB14_7:
+; RV32XTHEADBB-NEXT: neg a7, a2
+; RV32XTHEADBB-NEXT: andi a6, a7, 63
+; RV32XTHEADBB-NEXT: bltu a6, a5, .LBB14_9
+; RV32XTHEADBB-NEXT: # %bb.8:
+; RV32XTHEADBB-NEXT: li a2, 0
+; RV32XTHEADBB-NEXT: sll a0, a0, a6
+; RV32XTHEADBB-NEXT: bnez a6, .LBB14_10
+; RV32XTHEADBB-NEXT: j .LBB14_11
+; RV32XTHEADBB-NEXT: .LBB14_9:
+; RV32XTHEADBB-NEXT: sll a2, a0, a7
+; RV32XTHEADBB-NEXT: neg a5, a6
+; RV32XTHEADBB-NEXT: srl a0, a0, a5
+; RV32XTHEADBB-NEXT: sll a5, a1, a7
+; RV32XTHEADBB-NEXT: or a0, a0, a5
+; RV32XTHEADBB-NEXT: beqz a6, .LBB14_11
+; RV32XTHEADBB-NEXT: .LBB14_10:
+; RV32XTHEADBB-NEXT: mv a1, a0
+; RV32XTHEADBB-NEXT: .LBB14_11:
+; RV32XTHEADBB-NEXT: or a0, a3, a2
+; RV32XTHEADBB-NEXT: or a1, a4, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_64_mask_and_127_and_63:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: srl a2, a0, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: sll a0, a0, a1
+; RV64XTHEADBB-NEXT: or a0, a2, a0
+; RV64XTHEADBB-NEXT: ret
+ %a = and i64 %y, 127
+ %b = lshr i64 %x, %a
+ %c = sub i64 0, %y
+ %d = and i64 %c, 63
+ %e = shl i64 %x, %d
+ %f = or i64 %b, %e
+ ret i64 %f
+}
+
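+; As above, but rotating right: the ORs force both amounts to at least 64, so
+; only the lshr of %x survives.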
+define i64 @rotr_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: rotr_64_mask_or_128_or_64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: ori a2, a2, 128
+; RV32I-NEXT: li a3, 32
+; RV32I-NEXT: bltu a2, a3, .LBB15_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a4, a1, a2
+; RV32I-NEXT: bnez a2, .LBB15_3
+; RV32I-NEXT: j .LBB15_4
+; RV32I-NEXT: .LBB15_2:
+; RV32I-NEXT: srl a4, a0, a2
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: sll a5, a1, a5
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: beqz a2, .LBB15_4
+; RV32I-NEXT: .LBB15_3:
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB15_4:
+; RV32I-NEXT: bltu a2, a3, .LBB15_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB15_6:
+; RV32I-NEXT: srl a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_64_mask_or_128_or_64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: ori a1, a1, 128
+; RV64I-NEXT: srl a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_64_mask_or_128_or_64:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: ori a2, a2, 128
+; RV32ZBB-NEXT: li a3, 32
+; RV32ZBB-NEXT: bltu a2, a3, .LBB15_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl a4, a1, a2
+; RV32ZBB-NEXT: bnez a2, .LBB15_3
+; RV32ZBB-NEXT: j .LBB15_4
+; RV32ZBB-NEXT: .LBB15_2:
+; RV32ZBB-NEXT: srl a4, a0, a2
+; RV32ZBB-NEXT: neg a5, a2
+; RV32ZBB-NEXT: sll a5, a1, a5
+; RV32ZBB-NEXT: or a4, a4, a5
+; RV32ZBB-NEXT: beqz a2, .LBB15_4
+; RV32ZBB-NEXT: .LBB15_3:
+; RV32ZBB-NEXT: mv a0, a4
+; RV32ZBB-NEXT: .LBB15_4:
+; RV32ZBB-NEXT: bltu a2, a3, .LBB15_6
+; RV32ZBB-NEXT: # %bb.5:
+; RV32ZBB-NEXT: li a1, 0
+; RV32ZBB-NEXT: ret
+; RV32ZBB-NEXT: .LBB15_6:
+; RV32ZBB-NEXT: srl a1, a1, a2
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_64_mask_or_128_or_64:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: ori a1, a1, 128
+; RV64ZBB-NEXT: srl a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_64_mask_or_128_or_64:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: ori a2, a2, 128
+; RV32XTHEADBB-NEXT: li a3, 32
+; RV32XTHEADBB-NEXT: bltu a2, a3, .LBB15_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: srl a4, a1, a2
+; RV32XTHEADBB-NEXT: bnez a2, .LBB15_3
+; RV32XTHEADBB-NEXT: j .LBB15_4
+; RV32XTHEADBB-NEXT: .LBB15_2:
+; RV32XTHEADBB-NEXT: srl a4, a0, a2
+; RV32XTHEADBB-NEXT: neg a5, a2
+; RV32XTHEADBB-NEXT: sll a5, a1, a5
+; RV32XTHEADBB-NEXT: or a4, a4, a5
+; RV32XTHEADBB-NEXT: beqz a2, .LBB15_4
+; RV32XTHEADBB-NEXT: .LBB15_3:
+; RV32XTHEADBB-NEXT: mv a0, a4
+; RV32XTHEADBB-NEXT: .LBB15_4:
+; RV32XTHEADBB-NEXT: bltu a2, a3, .LBB15_6
+; RV32XTHEADBB-NEXT: # %bb.5:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: ret
+; RV32XTHEADBB-NEXT: .LBB15_6:
+; RV32XTHEADBB-NEXT: srl a1, a1, a2
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_64_mask_or_128_or_64:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: ori a1, a1, 128
+; RV64XTHEADBB-NEXT: srl a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %a = or i64 %y, 128
+ %b = lshr i64 %x, %a
+ %c = sub i64 0, %y
+ %d = or i64 %c, 64
+ %e = shl i64 %x, %d
+ %f = or i64 %b, %e
+ ret i64 %f
+}
+
+; Test that we're able to remove a mask on the rotate amount that has more than
+; one use.
+define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind {
+; RV32I-LABEL: rotl_32_mask_shared:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a3, a2, 31
+; RV32I-NEXT: sll a4, a0, a2
+; RV32I-NEXT: neg a3, a3
+; RV32I-NEXT: srl a0, a0, a3
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: sll a1, a1, a2
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_32_mask_shared:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 31
+; RV64I-NEXT: sllw a4, a0, a2
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: srlw a0, a0, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: sllw a1, a1, a2
+; RV64I-NEXT: addw a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_32_mask_shared:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: rol a0, a0, a2
+; RV32ZBB-NEXT: sll a1, a1, a2
+; RV32ZBB-NEXT: add a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_32_mask_shared:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rolw a0, a0, a2
+; RV64ZBB-NEXT: sllw a1, a1, a2
+; RV64ZBB-NEXT: addw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_32_mask_shared:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a3, a2, 31
+; RV32XTHEADBB-NEXT: sll a4, a0, a2
+; RV32XTHEADBB-NEXT: neg a3, a3
+; RV32XTHEADBB-NEXT: srl a0, a0, a3
+; RV32XTHEADBB-NEXT: or a0, a4, a0
+; RV32XTHEADBB-NEXT: sll a1, a1, a2
+; RV32XTHEADBB-NEXT: add a0, a0, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_32_mask_shared:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 31
+; RV64XTHEADBB-NEXT: sllw a4, a0, a2
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: srlw a0, a0, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: sllw a1, a1, a2
+; RV64XTHEADBB-NEXT: addw a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i32 %amt, 31
+ %1 = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 %maskedamt)
+ %2 = shl i32 %b, %maskedamt
+ %3 = add i32 %1, %2
+ ret i32 %3
+}
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+
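+; Same as rotl_32_mask_shared, but for i64.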
+define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 signext %amt) nounwind {
+; RV32I-LABEL: rotl_64_mask_shared:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a5, a4, 63
+; RV32I-NEXT: li t0, 32
+; RV32I-NEXT: neg a7, a5
+; RV32I-NEXT: bltu a5, t0, .LBB17_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: sll t3, a0, a5
+; RV32I-NEXT: j .LBB17_3
+; RV32I-NEXT: .LBB17_2:
+; RV32I-NEXT: sll a6, a0, a4
+; RV32I-NEXT: srl t1, a0, a7
+; RV32I-NEXT: sll t2, a1, a4
+; RV32I-NEXT: or t3, t1, t2
+; RV32I-NEXT: .LBB17_3:
+; RV32I-NEXT: neg t2, a5
+; RV32I-NEXT: mv t1, a1
+; RV32I-NEXT: beqz a5, .LBB17_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv t1, t3
+; RV32I-NEXT: .LBB17_5:
+; RV32I-NEXT: andi t3, t2, 63
+; RV32I-NEXT: bltu t3, t0, .LBB17_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl t4, a1, t3
+; RV32I-NEXT: bnez t3, .LBB17_8
+; RV32I-NEXT: j .LBB17_9
+; RV32I-NEXT: .LBB17_7:
+; RV32I-NEXT: srl t4, a0, t2
+; RV32I-NEXT: neg t5, t3
+; RV32I-NEXT: sll t5, a1, t5
+; RV32I-NEXT: or t4, t4, t5
+; RV32I-NEXT: beqz t3, .LBB17_9
+; RV32I-NEXT: .LBB17_8:
+; RV32I-NEXT: mv a0, t4
+; RV32I-NEXT: .LBB17_9:
+; RV32I-NEXT: bltu t3, t0, .LBB17_12
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: bgeu a5, t0, .LBB17_13
+; RV32I-NEXT: .LBB17_11:
+; RV32I-NEXT: sll t0, a2, a4
+; RV32I-NEXT: srl a2, a2, a7
+; RV32I-NEXT: sll a4, a3, a4
+; RV32I-NEXT: or a2, a2, a4
+; RV32I-NEXT: j .LBB17_14
+; RV32I-NEXT: .LBB17_12:
+; RV32I-NEXT: srl a1, a1, t2
+; RV32I-NEXT: bltu a5, t0, .LBB17_11
+; RV32I-NEXT: .LBB17_13:
+; RV32I-NEXT: li t0, 0
+; RV32I-NEXT: sll a2, a2, a5
+; RV32I-NEXT: .LBB17_14:
+; RV32I-NEXT: or a0, a6, a0
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: beqz a5, .LBB17_16
+; RV32I-NEXT: # %bb.15:
+; RV32I-NEXT: mv a3, a2
+; RV32I-NEXT: .LBB17_16:
+; RV32I-NEXT: add a0, a0, t0
+; RV32I-NEXT: sltu a2, a0, t0
+; RV32I-NEXT: add a1, a1, a3
+; RV32I-NEXT: add a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_64_mask_shared:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 63
+; RV64I-NEXT: sll a4, a0, a2
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: srl a0, a0, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: sll a1, a1, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_64_mask_shared:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi a5, a4, 63
+; RV32ZBB-NEXT: li t0, 32
+; RV32ZBB-NEXT: neg a7, a5
+; RV32ZBB-NEXT: bltu a5, t0, .LBB17_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: li a6, 0
+; RV32ZBB-NEXT: sll t3, a0, a5
+; RV32ZBB-NEXT: j .LBB17_3
+; RV32ZBB-NEXT: .LBB17_2:
+; RV32ZBB-NEXT: sll a6, a0, a4
+; RV32ZBB-NEXT: srl t1, a0, a7
+; RV32ZBB-NEXT: sll t2, a1, a4
+; RV32ZBB-NEXT: or t3, t1, t2
+; RV32ZBB-NEXT: .LBB17_3:
+; RV32ZBB-NEXT: neg t2, a5
+; RV32ZBB-NEXT: mv t1, a1
+; RV32ZBB-NEXT: beqz a5, .LBB17_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: mv t1, t3
+; RV32ZBB-NEXT: .LBB17_5:
+; RV32ZBB-NEXT: andi t3, t2, 63
+; RV32ZBB-NEXT: bltu t3, t0, .LBB17_7
+; RV32ZBB-NEXT: # %bb.6:
+; RV32ZBB-NEXT: srl t4, a1, t3
+; RV32ZBB-NEXT: bnez t3, .LBB17_8
+; RV32ZBB-NEXT: j .LBB17_9
+; RV32ZBB-NEXT: .LBB17_7:
+; RV32ZBB-NEXT: srl t4, a0, t2
+; RV32ZBB-NEXT: neg t5, t3
+; RV32ZBB-NEXT: sll t5, a1, t5
+; RV32ZBB-NEXT: or t4, t4, t5
+; RV32ZBB-NEXT: beqz t3, .LBB17_9
+; RV32ZBB-NEXT: .LBB17_8:
+; RV32ZBB-NEXT: mv a0, t4
+; RV32ZBB-NEXT: .LBB17_9:
+; RV32ZBB-NEXT: bltu t3, t0, .LBB17_12
+; RV32ZBB-NEXT: # %bb.10:
+; RV32ZBB-NEXT: li a1, 0
+; RV32ZBB-NEXT: bgeu a5, t0, .LBB17_13
+; RV32ZBB-NEXT: .LBB17_11:
+; RV32ZBB-NEXT: sll t0, a2, a4
+; RV32ZBB-NEXT: srl a2, a2, a7
+; RV32ZBB-NEXT: sll a4, a3, a4
+; RV32ZBB-NEXT: or a2, a2, a4
+; RV32ZBB-NEXT: j .LBB17_14
+; RV32ZBB-NEXT: .LBB17_12:
+; RV32ZBB-NEXT: srl a1, a1, t2
+; RV32ZBB-NEXT: bltu a5, t0, .LBB17_11
+; RV32ZBB-NEXT: .LBB17_13:
+; RV32ZBB-NEXT: li t0, 0
+; RV32ZBB-NEXT: sll a2, a2, a5
+; RV32ZBB-NEXT: .LBB17_14:
+; RV32ZBB-NEXT: or a0, a6, a0
+; RV32ZBB-NEXT: or a1, t1, a1
+; RV32ZBB-NEXT: beqz a5, .LBB17_16
+; RV32ZBB-NEXT: # %bb.15:
+; RV32ZBB-NEXT: mv a3, a2
+; RV32ZBB-NEXT: .LBB17_16:
+; RV32ZBB-NEXT: add a0, a0, t0
+; RV32ZBB-NEXT: sltu a2, a0, t0
+; RV32ZBB-NEXT: add a1, a1, a3
+; RV32ZBB-NEXT: add a1, a1, a2
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_64_mask_shared:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rol a0, a0, a2
+; RV64ZBB-NEXT: sll a1, a1, a2
+; RV64ZBB-NEXT: add a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_64_mask_shared:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a5, a4, 63
+; RV32XTHEADBB-NEXT: li t0, 32
+; RV32XTHEADBB-NEXT: neg a7, a5
+; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB17_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: li a6, 0
+; RV32XTHEADBB-NEXT: sll t3, a0, a5
+; RV32XTHEADBB-NEXT: j .LBB17_3
+; RV32XTHEADBB-NEXT: .LBB17_2:
+; RV32XTHEADBB-NEXT: sll a6, a0, a4
+; RV32XTHEADBB-NEXT: srl t1, a0, a7
+; RV32XTHEADBB-NEXT: sll t2, a1, a4
+; RV32XTHEADBB-NEXT: or t3, t1, t2
+; RV32XTHEADBB-NEXT: .LBB17_3:
+; RV32XTHEADBB-NEXT: neg t2, a5
+; RV32XTHEADBB-NEXT: mv t1, a1
+; RV32XTHEADBB-NEXT: beqz a5, .LBB17_5
+; RV32XTHEADBB-NEXT: # %bb.4:
+; RV32XTHEADBB-NEXT: mv t1, t3
+; RV32XTHEADBB-NEXT: .LBB17_5:
+; RV32XTHEADBB-NEXT: andi t3, t2, 63
+; RV32XTHEADBB-NEXT: bltu t3, t0, .LBB17_7
+; RV32XTHEADBB-NEXT: # %bb.6:
+; RV32XTHEADBB-NEXT: srl t4, a1, t3
+; RV32XTHEADBB-NEXT: bnez t3, .LBB17_8
+; RV32XTHEADBB-NEXT: j .LBB17_9
+; RV32XTHEADBB-NEXT: .LBB17_7:
+; RV32XTHEADBB-NEXT: srl t4, a0, t2
+; RV32XTHEADBB-NEXT: neg t5, t3
+; RV32XTHEADBB-NEXT: sll t5, a1, t5
+; RV32XTHEADBB-NEXT: or t4, t4, t5
+; RV32XTHEADBB-NEXT: beqz t3, .LBB17_9
+; RV32XTHEADBB-NEXT: .LBB17_8:
+; RV32XTHEADBB-NEXT: mv a0, t4
+; RV32XTHEADBB-NEXT: .LBB17_9:
+; RV32XTHEADBB-NEXT: bltu t3, t0, .LBB17_12
+; RV32XTHEADBB-NEXT: # %bb.10:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: bgeu a5, t0, .LBB17_13
+; RV32XTHEADBB-NEXT: .LBB17_11:
+; RV32XTHEADBB-NEXT: sll t0, a2, a4
+; RV32XTHEADBB-NEXT: srl a2, a2, a7
+; RV32XTHEADBB-NEXT: sll a4, a3, a4
+; RV32XTHEADBB-NEXT: or a2, a2, a4
+; RV32XTHEADBB-NEXT: j .LBB17_14
+; RV32XTHEADBB-NEXT: .LBB17_12:
+; RV32XTHEADBB-NEXT: srl a1, a1, t2
+; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB17_11
+; RV32XTHEADBB-NEXT: .LBB17_13:
+; RV32XTHEADBB-NEXT: li t0, 0
+; RV32XTHEADBB-NEXT: sll a2, a2, a5
+; RV32XTHEADBB-NEXT: .LBB17_14:
+; RV32XTHEADBB-NEXT: or a0, a6, a0
+; RV32XTHEADBB-NEXT: or a1, t1, a1
+; RV32XTHEADBB-NEXT: beqz a5, .LBB17_16
+; RV32XTHEADBB-NEXT: # %bb.15:
+; RV32XTHEADBB-NEXT: mv a3, a2
+; RV32XTHEADBB-NEXT: .LBB17_16:
+; RV32XTHEADBB-NEXT: add a0, a0, t0
+; RV32XTHEADBB-NEXT: sltu a2, a0, t0
+; RV32XTHEADBB-NEXT: add a1, a1, a3
+; RV32XTHEADBB-NEXT: add a1, a1, a2
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_64_mask_shared:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 63
+; RV64XTHEADBB-NEXT: sll a4, a0, a2
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: srl a0, a0, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: sll a1, a1, a2
+; RV64XTHEADBB-NEXT: add a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i64 %amt, 63
+ %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %maskedamt)
+ %2 = shl i64 %b, %maskedamt
+ %3 = add i64 %1, %2
+ ret i64 %3
+}
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+
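+; Same as rotl_32_mask_shared, but rotating right.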
+define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind {
+; RV32I-LABEL: rotr_32_mask_shared:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a3, a2, 31
+; RV32I-NEXT: srl a4, a0, a2
+; RV32I-NEXT: neg a3, a3
+; RV32I-NEXT: sll a0, a0, a3
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: sll a1, a1, a2
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_32_mask_shared:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 31
+; RV64I-NEXT: srlw a4, a0, a2
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: sllw a0, a0, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: sllw a1, a1, a2
+; RV64I-NEXT: addw a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_32_mask_shared:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: ror a0, a0, a2
+; RV32ZBB-NEXT: sll a1, a1, a2
+; RV32ZBB-NEXT: add a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_32_mask_shared:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rorw a0, a0, a2
+; RV64ZBB-NEXT: sllw a1, a1, a2
+; RV64ZBB-NEXT: addw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_32_mask_shared:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a3, a2, 31
+; RV32XTHEADBB-NEXT: srl a4, a0, a2
+; RV32XTHEADBB-NEXT: neg a3, a3
+; RV32XTHEADBB-NEXT: sll a0, a0, a3
+; RV32XTHEADBB-NEXT: or a0, a4, a0
+; RV32XTHEADBB-NEXT: sll a1, a1, a2
+; RV32XTHEADBB-NEXT: add a0, a0, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_32_mask_shared:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 31
+; RV64XTHEADBB-NEXT: srlw a4, a0, a2
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: sllw a0, a0, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: sllw a1, a1, a2
+; RV64XTHEADBB-NEXT: addw a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i32 %amt, 31
+ %1 = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 %maskedamt)
+ %2 = shl i32 %b, %maskedamt
+ %3 = add i32 %1, %2
+ ret i32 %3
+}
+declare i32 @llvm.fshr.i32(i32, i32, i32)
+
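+; Same as rotr_32_mask_shared, but for i64.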
+define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 signext %amt) nounwind {
+; RV32I-LABEL: rotr_64_mask_shared:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a5, a4, 63
+; RV32I-NEXT: li t0, 32
+; RV32I-NEXT: neg a6, a5
+; RV32I-NEXT: bltu a5, t0, .LBB19_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl t1, a1, a5
+; RV32I-NEXT: mv a7, a0
+; RV32I-NEXT: bnez a5, .LBB19_3
+; RV32I-NEXT: j .LBB19_4
+; RV32I-NEXT: .LBB19_2:
+; RV32I-NEXT: srl a7, a0, a4
+; RV32I-NEXT: sll t1, a1, a6
+; RV32I-NEXT: or t1, a7, t1
+; RV32I-NEXT: mv a7, a0
+; RV32I-NEXT: beqz a5, .LBB19_4
+; RV32I-NEXT: .LBB19_3:
+; RV32I-NEXT: mv a7, t1
+; RV32I-NEXT: .LBB19_4:
+; RV32I-NEXT: neg t4, a5
+; RV32I-NEXT: bltu a5, t0, .LBB19_7
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li t1, 0
+; RV32I-NEXT: andi t3, t4, 63
+; RV32I-NEXT: bgeu t3, t0, .LBB19_8
+; RV32I-NEXT: .LBB19_6:
+; RV32I-NEXT: sll t2, a0, t4
+; RV32I-NEXT: neg t5, t3
+; RV32I-NEXT: srl a0, a0, t5
+; RV32I-NEXT: sll t4, a1, t4
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: bnez t3, .LBB19_9
+; RV32I-NEXT: j .LBB19_10
+; RV32I-NEXT: .LBB19_7:
+; RV32I-NEXT: srl t1, a1, a4
+; RV32I-NEXT: andi t3, t4, 63
+; RV32I-NEXT: bltu t3, t0, .LBB19_6
+; RV32I-NEXT: .LBB19_8:
+; RV32I-NEXT: li t2, 0
+; RV32I-NEXT: sll a0, a0, t3
+; RV32I-NEXT: beqz t3, .LBB19_10
+; RV32I-NEXT: .LBB19_9:
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB19_10:
+; RV32I-NEXT: bltu a5, t0, .LBB19_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t0, 0
+; RV32I-NEXT: sll a0, a2, a5
+; RV32I-NEXT: j .LBB19_13
+; RV32I-NEXT: .LBB19_12:
+; RV32I-NEXT: sll t0, a2, a4
+; RV32I-NEXT: srl a0, a2, a6
+; RV32I-NEXT: sll a2, a3, a4
+; RV32I-NEXT: or a0, a0, a2
+; RV32I-NEXT: .LBB19_13:
+; RV32I-NEXT: or a2, a7, t2
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: beqz a5, .LBB19_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: .LBB19_15:
+; RV32I-NEXT: add a0, a2, t0
+; RV32I-NEXT: sltu a2, a0, t0
+; RV32I-NEXT: add a1, a1, a3
+; RV32I-NEXT: add a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_64_mask_shared:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 63
+; RV64I-NEXT: srl a4, a0, a2
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: sll a0, a0, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: sll a1, a1, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_64_mask_shared:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi a5, a4, 63
+; RV32ZBB-NEXT: li t0, 32
+; RV32ZBB-NEXT: neg a6, a5
+; RV32ZBB-NEXT: bltu a5, t0, .LBB19_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl t1, a1, a5
+; RV32ZBB-NEXT: mv a7, a0
+; RV32ZBB-NEXT: bnez a5, .LBB19_3
+; RV32ZBB-NEXT: j .LBB19_4
+; RV32ZBB-NEXT: .LBB19_2:
+; RV32ZBB-NEXT: srl a7, a0, a4
+; RV32ZBB-NEXT: sll t1, a1, a6
+; RV32ZBB-NEXT: or t1, a7, t1
+; RV32ZBB-NEXT: mv a7, a0
+; RV32ZBB-NEXT: beqz a5, .LBB19_4
+; RV32ZBB-NEXT: .LBB19_3:
+; RV32ZBB-NEXT: mv a7, t1
+; RV32ZBB-NEXT: .LBB19_4:
+; RV32ZBB-NEXT: neg t4, a5
+; RV32ZBB-NEXT: bltu a5, t0, .LBB19_7
+; RV32ZBB-NEXT: # %bb.5:
+; RV32ZBB-NEXT: li t1, 0
+; RV32ZBB-NEXT: andi t3, t4, 63
+; RV32ZBB-NEXT: bgeu t3, t0, .LBB19_8
+; RV32ZBB-NEXT: .LBB19_6:
+; RV32ZBB-NEXT: sll t2, a0, t4
+; RV32ZBB-NEXT: neg t5, t3
+; RV32ZBB-NEXT: srl a0, a0, t5
+; RV32ZBB-NEXT: sll t4, a1, t4
+; RV32ZBB-NEXT: or a0, a0, t4
+; RV32ZBB-NEXT: bnez t3, .LBB19_9
+; RV32ZBB-NEXT: j .LBB19_10
+; RV32ZBB-NEXT: .LBB19_7:
+; RV32ZBB-NEXT: srl t1, a1, a4
+; RV32ZBB-NEXT: andi t3, t4, 63
+; RV32ZBB-NEXT: bltu t3, t0, .LBB19_6
+; RV32ZBB-NEXT: .LBB19_8:
+; RV32ZBB-NEXT: li t2, 0
+; RV32ZBB-NEXT: sll a0, a0, t3
+; RV32ZBB-NEXT: beqz t3, .LBB19_10
+; RV32ZBB-NEXT: .LBB19_9:
+; RV32ZBB-NEXT: mv a1, a0
+; RV32ZBB-NEXT: .LBB19_10:
+; RV32ZBB-NEXT: bltu a5, t0, .LBB19_12
+; RV32ZBB-NEXT: # %bb.11:
+; RV32ZBB-NEXT: li t0, 0
+; RV32ZBB-NEXT: sll a0, a2, a5
+; RV32ZBB-NEXT: j .LBB19_13
+; RV32ZBB-NEXT: .LBB19_12:
+; RV32ZBB-NEXT: sll t0, a2, a4
+; RV32ZBB-NEXT: srl a0, a2, a6
+; RV32ZBB-NEXT: sll a2, a3, a4
+; RV32ZBB-NEXT: or a0, a0, a2
+; RV32ZBB-NEXT: .LBB19_13:
+; RV32ZBB-NEXT: or a2, a7, t2
+; RV32ZBB-NEXT: or a1, t1, a1
+; RV32ZBB-NEXT: beqz a5, .LBB19_15
+; RV32ZBB-NEXT: # %bb.14:
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: .LBB19_15:
+; RV32ZBB-NEXT: add a0, a2, t0
+; RV32ZBB-NEXT: sltu a2, a0, t0
+; RV32ZBB-NEXT: add a1, a1, a3
+; RV32ZBB-NEXT: add a1, a1, a2
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_64_mask_shared:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: ror a0, a0, a2
+; RV64ZBB-NEXT: sll a1, a1, a2
+; RV64ZBB-NEXT: add a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_64_mask_shared:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a5, a4, 63
+; RV32XTHEADBB-NEXT: li t0, 32
+; RV32XTHEADBB-NEXT: neg a6, a5
+; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB19_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: srl t1, a1, a5
+; RV32XTHEADBB-NEXT: mv a7, a0
+; RV32XTHEADBB-NEXT: bnez a5, .LBB19_3
+; RV32XTHEADBB-NEXT: j .LBB19_4
+; RV32XTHEADBB-NEXT: .LBB19_2:
+; RV32XTHEADBB-NEXT: srl a7, a0, a4
+; RV32XTHEADBB-NEXT: sll t1, a1, a6
+; RV32XTHEADBB-NEXT: or t1, a7, t1
+; RV32XTHEADBB-NEXT: mv a7, a0
+; RV32XTHEADBB-NEXT: beqz a5, .LBB19_4
+; RV32XTHEADBB-NEXT: .LBB19_3:
+; RV32XTHEADBB-NEXT: mv a7, t1
+; RV32XTHEADBB-NEXT: .LBB19_4:
+; RV32XTHEADBB-NEXT: neg t4, a5
+; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB19_7
+; RV32XTHEADBB-NEXT: # %bb.5:
+; RV32XTHEADBB-NEXT: li t1, 0
+; RV32XTHEADBB-NEXT: andi t3, t4, 63
+; RV32XTHEADBB-NEXT: bgeu t3, t0, .LBB19_8
+; RV32XTHEADBB-NEXT: .LBB19_6:
+; RV32XTHEADBB-NEXT: sll t2, a0, t4
+; RV32XTHEADBB-NEXT: neg t5, t3
+; RV32XTHEADBB-NEXT: srl a0, a0, t5
+; RV32XTHEADBB-NEXT: sll t4, a1, t4
+; RV32XTHEADBB-NEXT: or a0, a0, t4
+; RV32XTHEADBB-NEXT: bnez t3, .LBB19_9
+; RV32XTHEADBB-NEXT: j .LBB19_10
+; RV32XTHEADBB-NEXT: .LBB19_7:
+; RV32XTHEADBB-NEXT: srl t1, a1, a4
+; RV32XTHEADBB-NEXT: andi t3, t4, 63
+; RV32XTHEADBB-NEXT: bltu t3, t0, .LBB19_6
+; RV32XTHEADBB-NEXT: .LBB19_8:
+; RV32XTHEADBB-NEXT: li t2, 0
+; RV32XTHEADBB-NEXT: sll a0, a0, t3
+; RV32XTHEADBB-NEXT: beqz t3, .LBB19_10
+; RV32XTHEADBB-NEXT: .LBB19_9:
+; RV32XTHEADBB-NEXT: mv a1, a0
+; RV32XTHEADBB-NEXT: .LBB19_10:
+; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB19_12
+; RV32XTHEADBB-NEXT: # %bb.11:
+; RV32XTHEADBB-NEXT: li t0, 0
+; RV32XTHEADBB-NEXT: sll a0, a2, a5
+; RV32XTHEADBB-NEXT: j .LBB19_13
+; RV32XTHEADBB-NEXT: .LBB19_12:
+; RV32XTHEADBB-NEXT: sll t0, a2, a4
+; RV32XTHEADBB-NEXT: srl a0, a2, a6
+; RV32XTHEADBB-NEXT: sll a2, a3, a4
+; RV32XTHEADBB-NEXT: or a0, a0, a2
+; RV32XTHEADBB-NEXT: .LBB19_13:
+; RV32XTHEADBB-NEXT: or a2, a7, t2
+; RV32XTHEADBB-NEXT: or a1, t1, a1
+; RV32XTHEADBB-NEXT: beqz a5, .LBB19_15
+; RV32XTHEADBB-NEXT: # %bb.14:
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: .LBB19_15:
+; RV32XTHEADBB-NEXT: add a0, a2, t0
+; RV32XTHEADBB-NEXT: sltu a2, a0, t0
+; RV32XTHEADBB-NEXT: add a1, a1, a3
+; RV32XTHEADBB-NEXT: add a1, a1, a2
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_64_mask_shared:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 63
+; RV64XTHEADBB-NEXT: srl a4, a0, a2
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: sll a0, a0, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: sll a1, a1, a2
+; RV64XTHEADBB-NEXT: add a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i64 %amt, 63
+ %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %maskedamt)
+ %2 = shl i64 %b, %maskedamt
+ %3 = add i64 %1, %2
+ ret i64 %3
+}
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+
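+; Test that we're able to remove a mask on the rotate amount that feeds more
+; than one rotate.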
+define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind {
+; RV32I-LABEL: rotl_32_mask_multiple:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a3, a2, 31
+; RV32I-NEXT: sll a4, a0, a2
+; RV32I-NEXT: sll a2, a1, a2
+; RV32I-NEXT: neg a3, a3
+; RV32I-NEXT: srl a0, a0, a3
+; RV32I-NEXT: srl a1, a1, a3
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_32_mask_multiple:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 31
+; RV64I-NEXT: sllw a4, a0, a2
+; RV64I-NEXT: sllw a2, a1, a2
+; RV64I-NEXT: neg a5, a3
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: srlw a0, a0, a5
+; RV64I-NEXT: srlw a1, a1, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: addw a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_32_mask_multiple:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: rol a0, a0, a2
+; RV32ZBB-NEXT: rol a1, a1, a2
+; RV32ZBB-NEXT: add a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_32_mask_multiple:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rolw a0, a0, a2
+; RV64ZBB-NEXT: rolw a1, a1, a2
+; RV64ZBB-NEXT: addw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_32_mask_multiple:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a3, a2, 31
+; RV32XTHEADBB-NEXT: sll a4, a0, a2
+; RV32XTHEADBB-NEXT: sll a2, a1, a2
+; RV32XTHEADBB-NEXT: neg a3, a3
+; RV32XTHEADBB-NEXT: srl a0, a0, a3
+; RV32XTHEADBB-NEXT: srl a1, a1, a3
+; RV32XTHEADBB-NEXT: or a0, a4, a0
+; RV32XTHEADBB-NEXT: or a1, a2, a1
+; RV32XTHEADBB-NEXT: add a0, a0, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_32_mask_multiple:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 31
+; RV64XTHEADBB-NEXT: sllw a4, a0, a2
+; RV64XTHEADBB-NEXT: sllw a2, a1, a2
+; RV64XTHEADBB-NEXT: neg a5, a3
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: srlw a0, a0, a5
+; RV64XTHEADBB-NEXT: srlw a1, a1, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: or a1, a2, a1
+; RV64XTHEADBB-NEXT: addw a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i32 %amt, 31
+ %1 = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 %maskedamt)
+ %2 = tail call i32 @llvm.fshl.i32(i32 %b, i32 %b, i32 %maskedamt)
+ %3 = add i32 %1, %2
+ ret i32 %3
+}
+
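+; Same as rotl_32_mask_multiple, but for i64.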
+define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
+; RV32I-LABEL: rotl_64_mask_multiple:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi t1, a4, 63
+; RV32I-NEXT: li a5, 32
+; RV32I-NEXT: neg t3, t1
+; RV32I-NEXT: bltu t1, a5, .LBB21_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: sll t2, a0, t1
+; RV32I-NEXT: j .LBB21_3
+; RV32I-NEXT: .LBB21_2:
+; RV32I-NEXT: sll a6, a0, a4
+; RV32I-NEXT: srl a7, a0, t3
+; RV32I-NEXT: sll t0, a1, a4
+; RV32I-NEXT: or t2, a7, t0
+; RV32I-NEXT: .LBB21_3:
+; RV32I-NEXT: neg a7, t1
+; RV32I-NEXT: mv t0, a1
+; RV32I-NEXT: beqz t1, .LBB21_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv t0, t2
+; RV32I-NEXT: .LBB21_5:
+; RV32I-NEXT: andi t2, a7, 63
+; RV32I-NEXT: neg t4, t2
+; RV32I-NEXT: bltu t2, a5, .LBB21_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl t5, a1, t2
+; RV32I-NEXT: bnez t2, .LBB21_8
+; RV32I-NEXT: j .LBB21_9
+; RV32I-NEXT: .LBB21_7:
+; RV32I-NEXT: srl t5, a0, a7
+; RV32I-NEXT: sll t6, a1, t4
+; RV32I-NEXT: or t5, t5, t6
+; RV32I-NEXT: beqz t2, .LBB21_9
+; RV32I-NEXT: .LBB21_8:
+; RV32I-NEXT: mv a0, t5
+; RV32I-NEXT: .LBB21_9:
+; RV32I-NEXT: bltu t2, a5, .LBB21_12
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: bgeu t1, a5, .LBB21_13
+; RV32I-NEXT: .LBB21_11:
+; RV32I-NEXT: sll a1, a2, a4
+; RV32I-NEXT: srl t3, a2, t3
+; RV32I-NEXT: sll a4, a3, a4
+; RV32I-NEXT: or t3, t3, a4
+; RV32I-NEXT: mv a4, a3
+; RV32I-NEXT: bnez t1, .LBB21_14
+; RV32I-NEXT: j .LBB21_15
+; RV32I-NEXT: .LBB21_12:
+; RV32I-NEXT: srl t5, a1, a7
+; RV32I-NEXT: bltu t1, a5, .LBB21_11
+; RV32I-NEXT: .LBB21_13:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: sll t3, a2, t1
+; RV32I-NEXT: mv a4, a3
+; RV32I-NEXT: beqz t1, .LBB21_15
+; RV32I-NEXT: .LBB21_14:
+; RV32I-NEXT: mv a4, t3
+; RV32I-NEXT: .LBB21_15:
+; RV32I-NEXT: bltu t2, a5, .LBB21_17
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: srl t1, a3, t2
+; RV32I-NEXT: bnez t2, .LBB21_18
+; RV32I-NEXT: j .LBB21_19
+; RV32I-NEXT: .LBB21_17:
+; RV32I-NEXT: srl t1, a2, a7
+; RV32I-NEXT: sll t3, a3, t4
+; RV32I-NEXT: or t1, t1, t3
+; RV32I-NEXT: beqz t2, .LBB21_19
+; RV32I-NEXT: .LBB21_18:
+; RV32I-NEXT: mv a2, t1
+; RV32I-NEXT: .LBB21_19:
+; RV32I-NEXT: or a0, a6, a0
+; RV32I-NEXT: or a6, t0, t5
+; RV32I-NEXT: bltu t2, a5, .LBB21_21
+; RV32I-NEXT: # %bb.20:
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: j .LBB21_22
+; RV32I-NEXT: .LBB21_21:
+; RV32I-NEXT: srl a3, a3, a7
+; RV32I-NEXT: .LBB21_22:
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: sltu a1, a0, a1
+; RV32I-NEXT: add a3, a6, a3
+; RV32I-NEXT: add a1, a3, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_64_mask_multiple:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 63
+; RV64I-NEXT: sll a4, a0, a2
+; RV64I-NEXT: sll a2, a1, a2
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: srl a0, a0, a3
+; RV64I-NEXT: srl a1, a1, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_64_mask_multiple:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi t1, a4, 63
+; RV32ZBB-NEXT: li a5, 32
+; RV32ZBB-NEXT: neg t3, t1
+; RV32ZBB-NEXT: bltu t1, a5, .LBB21_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: li a6, 0
+; RV32ZBB-NEXT: sll t2, a0, t1
+; RV32ZBB-NEXT: j .LBB21_3
+; RV32ZBB-NEXT: .LBB21_2:
+; RV32ZBB-NEXT: sll a6, a0, a4
+; RV32ZBB-NEXT: srl a7, a0, t3
+; RV32ZBB-NEXT: sll t0, a1, a4
+; RV32ZBB-NEXT: or t2, a7, t0
+; RV32ZBB-NEXT: .LBB21_3:
+; RV32ZBB-NEXT: neg a7, t1
+; RV32ZBB-NEXT: mv t0, a1
+; RV32ZBB-NEXT: beqz t1, .LBB21_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: mv t0, t2
+; RV32ZBB-NEXT: .LBB21_5:
+; RV32ZBB-NEXT: andi t2, a7, 63
+; RV32ZBB-NEXT: neg t4, t2
+; RV32ZBB-NEXT: bltu t2, a5, .LBB21_7
+; RV32ZBB-NEXT: # %bb.6:
+; RV32ZBB-NEXT: srl t5, a1, t2
+; RV32ZBB-NEXT: bnez t2, .LBB21_8
+; RV32ZBB-NEXT: j .LBB21_9
+; RV32ZBB-NEXT: .LBB21_7:
+; RV32ZBB-NEXT: srl t5, a0, a7
+; RV32ZBB-NEXT: sll t6, a1, t4
+; RV32ZBB-NEXT: or t5, t5, t6
+; RV32ZBB-NEXT: beqz t2, .LBB21_9
+; RV32ZBB-NEXT: .LBB21_8:
+; RV32ZBB-NEXT: mv a0, t5
+; RV32ZBB-NEXT: .LBB21_9:
+; RV32ZBB-NEXT: bltu t2, a5, .LBB21_12
+; RV32ZBB-NEXT: # %bb.10:
+; RV32ZBB-NEXT: li t5, 0
+; RV32ZBB-NEXT: bgeu t1, a5, .LBB21_13
+; RV32ZBB-NEXT: .LBB21_11:
+; RV32ZBB-NEXT: sll a1, a2, a4
+; RV32ZBB-NEXT: srl t3, a2, t3
+; RV32ZBB-NEXT: sll a4, a3, a4
+; RV32ZBB-NEXT: or t3, t3, a4
+; RV32ZBB-NEXT: mv a4, a3
+; RV32ZBB-NEXT: bnez t1, .LBB21_14
+; RV32ZBB-NEXT: j .LBB21_15
+; RV32ZBB-NEXT: .LBB21_12:
+; RV32ZBB-NEXT: srl t5, a1, a7
+; RV32ZBB-NEXT: bltu t1, a5, .LBB21_11
+; RV32ZBB-NEXT: .LBB21_13:
+; RV32ZBB-NEXT: li a1, 0
+; RV32ZBB-NEXT: sll t3, a2, t1
+; RV32ZBB-NEXT: mv a4, a3
+; RV32ZBB-NEXT: beqz t1, .LBB21_15
+; RV32ZBB-NEXT: .LBB21_14:
+; RV32ZBB-NEXT: mv a4, t3
+; RV32ZBB-NEXT: .LBB21_15:
+; RV32ZBB-NEXT: bltu t2, a5, .LBB21_17
+; RV32ZBB-NEXT: # %bb.16:
+; RV32ZBB-NEXT: srl t1, a3, t2
+; RV32ZBB-NEXT: bnez t2, .LBB21_18
+; RV32ZBB-NEXT: j .LBB21_19
+; RV32ZBB-NEXT: .LBB21_17:
+; RV32ZBB-NEXT: srl t1, a2, a7
+; RV32ZBB-NEXT: sll t3, a3, t4
+; RV32ZBB-NEXT: or t1, t1, t3
+; RV32ZBB-NEXT: beqz t2, .LBB21_19
+; RV32ZBB-NEXT: .LBB21_18:
+; RV32ZBB-NEXT: mv a2, t1
+; RV32ZBB-NEXT: .LBB21_19:
+; RV32ZBB-NEXT: or a0, a6, a0
+; RV32ZBB-NEXT: or a6, t0, t5
+; RV32ZBB-NEXT: bltu t2, a5, .LBB21_21
+; RV32ZBB-NEXT: # %bb.20:
+; RV32ZBB-NEXT: li a3, 0
+; RV32ZBB-NEXT: j .LBB21_22
+; RV32ZBB-NEXT: .LBB21_21:
+; RV32ZBB-NEXT: srl a3, a3, a7
+; RV32ZBB-NEXT: .LBB21_22:
+; RV32ZBB-NEXT: or a1, a1, a2
+; RV32ZBB-NEXT: or a3, a4, a3
+; RV32ZBB-NEXT: add a0, a0, a1
+; RV32ZBB-NEXT: sltu a1, a0, a1
+; RV32ZBB-NEXT: add a3, a6, a3
+; RV32ZBB-NEXT: add a1, a3, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_64_mask_multiple:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rol a0, a0, a2
+; RV64ZBB-NEXT: rol a1, a1, a2
+; RV64ZBB-NEXT: add a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_64_mask_multiple:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi t1, a4, 63
+; RV32XTHEADBB-NEXT: li a5, 32
+; RV32XTHEADBB-NEXT: neg t3, t1
+; RV32XTHEADBB-NEXT: bltu t1, a5, .LBB21_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: li a6, 0
+; RV32XTHEADBB-NEXT: sll t2, a0, t1
+; RV32XTHEADBB-NEXT: j .LBB21_3
+; RV32XTHEADBB-NEXT: .LBB21_2:
+; RV32XTHEADBB-NEXT: sll a6, a0, a4
+; RV32XTHEADBB-NEXT: srl a7, a0, t3
+; RV32XTHEADBB-NEXT: sll t0, a1, a4
+; RV32XTHEADBB-NEXT: or t2, a7, t0
+; RV32XTHEADBB-NEXT: .LBB21_3:
+; RV32XTHEADBB-NEXT: neg a7, t1
+; RV32XTHEADBB-NEXT: mv t0, a1
+; RV32XTHEADBB-NEXT: beqz t1, .LBB21_5
+; RV32XTHEADBB-NEXT: # %bb.4:
+; RV32XTHEADBB-NEXT: mv t0, t2
+; RV32XTHEADBB-NEXT: .LBB21_5:
+; RV32XTHEADBB-NEXT: andi t2, a7, 63
+; RV32XTHEADBB-NEXT: neg t4, t2
+; RV32XTHEADBB-NEXT: bltu t2, a5, .LBB21_7
+; RV32XTHEADBB-NEXT: # %bb.6:
+; RV32XTHEADBB-NEXT: srl t5, a1, t2
+; RV32XTHEADBB-NEXT: bnez t2, .LBB21_8
+; RV32XTHEADBB-NEXT: j .LBB21_9
+; RV32XTHEADBB-NEXT: .LBB21_7:
+; RV32XTHEADBB-NEXT: srl t5, a0, a7
+; RV32XTHEADBB-NEXT: sll t6, a1, t4
+; RV32XTHEADBB-NEXT: or t5, t5, t6
+; RV32XTHEADBB-NEXT: beqz t2, .LBB21_9
+; RV32XTHEADBB-NEXT: .LBB21_8:
+; RV32XTHEADBB-NEXT: mv a0, t5
+; RV32XTHEADBB-NEXT: .LBB21_9:
+; RV32XTHEADBB-NEXT: bltu t2, a5, .LBB21_12
+; RV32XTHEADBB-NEXT: # %bb.10:
+; RV32XTHEADBB-NEXT: li t5, 0
+; RV32XTHEADBB-NEXT: bgeu t1, a5, .LBB21_13
+; RV32XTHEADBB-NEXT: .LBB21_11:
+; RV32XTHEADBB-NEXT: sll a1, a2, a4
+; RV32XTHEADBB-NEXT: srl t3, a2, t3
+; RV32XTHEADBB-NEXT: sll a4, a3, a4
+; RV32XTHEADBB-NEXT: or t3, t3, a4
+; RV32XTHEADBB-NEXT: mv a4, a3
+; RV32XTHEADBB-NEXT: bnez t1, .LBB21_14
+; RV32XTHEADBB-NEXT: j .LBB21_15
+; RV32XTHEADBB-NEXT: .LBB21_12:
+; RV32XTHEADBB-NEXT: srl t5, a1, a7
+; RV32XTHEADBB-NEXT: bltu t1, a5, .LBB21_11
+; RV32XTHEADBB-NEXT: .LBB21_13:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: sll t3, a2, t1
+; RV32XTHEADBB-NEXT: mv a4, a3
+; RV32XTHEADBB-NEXT: beqz t1, .LBB21_15
+; RV32XTHEADBB-NEXT: .LBB21_14:
+; RV32XTHEADBB-NEXT: mv a4, t3
+; RV32XTHEADBB-NEXT: .LBB21_15:
+; RV32XTHEADBB-NEXT: bltu t2, a5, .LBB21_17
+; RV32XTHEADBB-NEXT: # %bb.16:
+; RV32XTHEADBB-NEXT: srl t1, a3, t2
+; RV32XTHEADBB-NEXT: bnez t2, .LBB21_18
+; RV32XTHEADBB-NEXT: j .LBB21_19
+; RV32XTHEADBB-NEXT: .LBB21_17:
+; RV32XTHEADBB-NEXT: srl t1, a2, a7
+; RV32XTHEADBB-NEXT: sll t3, a3, t4
+; RV32XTHEADBB-NEXT: or t1, t1, t3
+; RV32XTHEADBB-NEXT: beqz t2, .LBB21_19
+; RV32XTHEADBB-NEXT: .LBB21_18:
+; RV32XTHEADBB-NEXT: mv a2, t1
+; RV32XTHEADBB-NEXT: .LBB21_19:
+; RV32XTHEADBB-NEXT: or a0, a6, a0
+; RV32XTHEADBB-NEXT: or a6, t0, t5
+; RV32XTHEADBB-NEXT: bltu t2, a5, .LBB21_21
+; RV32XTHEADBB-NEXT: # %bb.20:
+; RV32XTHEADBB-NEXT: li a3, 0
+; RV32XTHEADBB-NEXT: j .LBB21_22
+; RV32XTHEADBB-NEXT: .LBB21_21:
+; RV32XTHEADBB-NEXT: srl a3, a3, a7
+; RV32XTHEADBB-NEXT: .LBB21_22:
+; RV32XTHEADBB-NEXT: or a1, a1, a2
+; RV32XTHEADBB-NEXT: or a3, a4, a3
+; RV32XTHEADBB-NEXT: add a0, a0, a1
+; RV32XTHEADBB-NEXT: sltu a1, a0, a1
+; RV32XTHEADBB-NEXT: add a3, a6, a3
+; RV32XTHEADBB-NEXT: add a1, a3, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_64_mask_multiple:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 63
+; RV64XTHEADBB-NEXT: sll a4, a0, a2
+; RV64XTHEADBB-NEXT: sll a2, a1, a2
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: srl a0, a0, a3
+; RV64XTHEADBB-NEXT: srl a1, a1, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: or a1, a2, a1
+; RV64XTHEADBB-NEXT: add a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i64 %amt, 63
+ %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %maskedamt)
+ %2 = tail call i64 @llvm.fshl.i64(i64 %b, i64 %b, i64 %maskedamt)
+ %3 = add i64 %1, %2
+ ret i64 %3
+}
+
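+; Same as rotl_32_mask_multiple, but rotating right.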
+define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind {
+; RV32I-LABEL: rotr_32_mask_multiple:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a3, a2, 31
+; RV32I-NEXT: srl a4, a0, a2
+; RV32I-NEXT: srl a2, a1, a2
+; RV32I-NEXT: neg a3, a3
+; RV32I-NEXT: sll a0, a0, a3
+; RV32I-NEXT: sll a1, a1, a3
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_32_mask_multiple:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 31
+; RV64I-NEXT: srlw a4, a0, a2
+; RV64I-NEXT: srlw a2, a1, a2
+; RV64I-NEXT: neg a5, a3
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: sllw a0, a0, a5
+; RV64I-NEXT: sllw a1, a1, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: addw a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_32_mask_multiple:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: ror a0, a0, a2
+; RV32ZBB-NEXT: ror a1, a1, a2
+; RV32ZBB-NEXT: add a0, a0, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_32_mask_multiple:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: rorw a0, a0, a2
+; RV64ZBB-NEXT: rorw a1, a1, a2
+; RV64ZBB-NEXT: addw a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_32_mask_multiple:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi a3, a2, 31
+; RV32XTHEADBB-NEXT: srl a4, a0, a2
+; RV32XTHEADBB-NEXT: srl a2, a1, a2
+; RV32XTHEADBB-NEXT: neg a3, a3
+; RV32XTHEADBB-NEXT: sll a0, a0, a3
+; RV32XTHEADBB-NEXT: sll a1, a1, a3
+; RV32XTHEADBB-NEXT: or a0, a4, a0
+; RV32XTHEADBB-NEXT: or a1, a2, a1
+; RV32XTHEADBB-NEXT: add a0, a0, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_32_mask_multiple:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 31
+; RV64XTHEADBB-NEXT: srlw a4, a0, a2
+; RV64XTHEADBB-NEXT: srlw a2, a1, a2
+; RV64XTHEADBB-NEXT: neg a5, a3
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: sllw a0, a0, a5
+; RV64XTHEADBB-NEXT: sllw a1, a1, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: or a1, a2, a1
+; RV64XTHEADBB-NEXT: addw a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i32 %amt, 31
+ %1 = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 %maskedamt)
+ %2 = tail call i32 @llvm.fshr.i32(i32 %b, i32 %b, i32 %maskedamt)
+ %3 = add i32 %1, %2
+ ret i32 %3
+}
+
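+; Same as rotr_32_mask_multiple, but for i64.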
+define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
+; RV32I-LABEL: rotr_64_mask_multiple:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi t0, a4, 63
+; RV32I-NEXT: li a6, 32
+; RV32I-NEXT: neg t4, t0
+; RV32I-NEXT: bltu t0, a6, .LBB23_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a7, a1, t0
+; RV32I-NEXT: mv a5, a0
+; RV32I-NEXT: bnez t0, .LBB23_3
+; RV32I-NEXT: j .LBB23_4
+; RV32I-NEXT: .LBB23_2:
+; RV32I-NEXT: srl a5, a0, a4
+; RV32I-NEXT: sll a7, a1, t4
+; RV32I-NEXT: or a7, a5, a7
+; RV32I-NEXT: mv a5, a0
+; RV32I-NEXT: beqz t0, .LBB23_4
+; RV32I-NEXT: .LBB23_3:
+; RV32I-NEXT: mv a5, a7
+; RV32I-NEXT: .LBB23_4:
+; RV32I-NEXT: neg t2, t0
+; RV32I-NEXT: bltu t0, a6, .LBB23_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: j .LBB23_7
+; RV32I-NEXT: .LBB23_6:
+; RV32I-NEXT: srl a7, a1, a4
+; RV32I-NEXT: .LBB23_7:
+; RV32I-NEXT: andi t1, t2, 63
+; RV32I-NEXT: neg t5, t1
+; RV32I-NEXT: bltu t1, a6, .LBB23_9
+; RV32I-NEXT: # %bb.8:
+; RV32I-NEXT: li t3, 0
+; RV32I-NEXT: sll a0, a0, t1
+; RV32I-NEXT: bnez t1, .LBB23_10
+; RV32I-NEXT: j .LBB23_11
+; RV32I-NEXT: .LBB23_9:
+; RV32I-NEXT: sll t3, a0, t2
+; RV32I-NEXT: srl a0, a0, t5
+; RV32I-NEXT: sll t6, a1, t2
+; RV32I-NEXT: or a0, a0, t6
+; RV32I-NEXT: beqz t1, .LBB23_11
+; RV32I-NEXT: .LBB23_10:
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB23_11:
+; RV32I-NEXT: bltu t0, a6, .LBB23_13
+; RV32I-NEXT: # %bb.12:
+; RV32I-NEXT: srl t4, a3, t0
+; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: bnez t0, .LBB23_14
+; RV32I-NEXT: j .LBB23_15
+; RV32I-NEXT: .LBB23_13:
+; RV32I-NEXT: srl a0, a2, a4
+; RV32I-NEXT: sll t4, a3, t4
+; RV32I-NEXT: or t4, a0, t4
+; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: beqz t0, .LBB23_15
+; RV32I-NEXT: .LBB23_14:
+; RV32I-NEXT: mv a0, t4
+; RV32I-NEXT: .LBB23_15:
+; RV32I-NEXT: bltu t0, a6, .LBB23_18
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: bgeu t1, a6, .LBB23_19
+; RV32I-NEXT: .LBB23_17:
+; RV32I-NEXT: sll a6, a2, t2
+; RV32I-NEXT: srl a2, a2, t5
+; RV32I-NEXT: sll t0, a3, t2
+; RV32I-NEXT: or a2, a2, t0
+; RV32I-NEXT: j .LBB23_20
+; RV32I-NEXT: .LBB23_18:
+; RV32I-NEXT: srl a4, a3, a4
+; RV32I-NEXT: bltu t1, a6, .LBB23_17
+; RV32I-NEXT: .LBB23_19:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: sll a2, a2, t1
+; RV32I-NEXT: .LBB23_20:
+; RV32I-NEXT: or a5, a5, t3
+; RV32I-NEXT: or a1, a7, a1
+; RV32I-NEXT: beqz t1, .LBB23_22
+; RV32I-NEXT: # %bb.21:
+; RV32I-NEXT: mv a3, a2
+; RV32I-NEXT: .LBB23_22:
+; RV32I-NEXT: or a2, a0, a6
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: add a0, a5, a2
+; RV32I-NEXT: sltu a2, a0, a2
+; RV32I-NEXT: add a1, a1, a3
+; RV32I-NEXT: add a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_64_mask_multiple:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a3, a2, 63
+; RV64I-NEXT: srl a4, a0, a2
+; RV64I-NEXT: srl a2, a1, a2
+; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: sll a0, a0, a3
+; RV64I-NEXT: sll a1, a1, a3
+; RV64I-NEXT: or a0, a4, a0
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_64_mask_multiple:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: andi t0, a4, 63
+; RV32ZBB-NEXT: li a6, 32
+; RV32ZBB-NEXT: neg t4, t0
+; RV32ZBB-NEXT: bltu t0, a6, .LBB23_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl a7, a1, t0
+; RV32ZBB-NEXT: mv a5, a0
+; RV32ZBB-NEXT: bnez t0, .LBB23_3
+; RV32ZBB-NEXT: j .LBB23_4
+; RV32ZBB-NEXT: .LBB23_2:
+; RV32ZBB-NEXT: srl a5, a0, a4
+; RV32ZBB-NEXT: sll a7, a1, t4
+; RV32ZBB-NEXT: or a7, a5, a7
+; RV32ZBB-NEXT: mv a5, a0
+; RV32ZBB-NEXT: beqz t0, .LBB23_4
+; RV32ZBB-NEXT: .LBB23_3:
+; RV32ZBB-NEXT: mv a5, a7
+; RV32ZBB-NEXT: .LBB23_4:
+; RV32ZBB-NEXT: neg t2, t0
+; RV32ZBB-NEXT: bltu t0, a6, .LBB23_6
+; RV32ZBB-NEXT: # %bb.5:
+; RV32ZBB-NEXT: li a7, 0
+; RV32ZBB-NEXT: j .LBB23_7
+; RV32ZBB-NEXT: .LBB23_6:
+; RV32ZBB-NEXT: srl a7, a1, a4
+; RV32ZBB-NEXT: .LBB23_7:
+; RV32ZBB-NEXT: andi t1, t2, 63
+; RV32ZBB-NEXT: neg t5, t1
+; RV32ZBB-NEXT: bltu t1, a6, .LBB23_9
+; RV32ZBB-NEXT: # %bb.8:
+; RV32ZBB-NEXT: li t3, 0
+; RV32ZBB-NEXT: sll a0, a0, t1
+; RV32ZBB-NEXT: bnez t1, .LBB23_10
+; RV32ZBB-NEXT: j .LBB23_11
+; RV32ZBB-NEXT: .LBB23_9:
+; RV32ZBB-NEXT: sll t3, a0, t2
+; RV32ZBB-NEXT: srl a0, a0, t5
+; RV32ZBB-NEXT: sll t6, a1, t2
+; RV32ZBB-NEXT: or a0, a0, t6
+; RV32ZBB-NEXT: beqz t1, .LBB23_11
+; RV32ZBB-NEXT: .LBB23_10:
+; RV32ZBB-NEXT: mv a1, a0
+; RV32ZBB-NEXT: .LBB23_11:
+; RV32ZBB-NEXT: bltu t0, a6, .LBB23_13
+; RV32ZBB-NEXT: # %bb.12:
+; RV32ZBB-NEXT: srl t4, a3, t0
+; RV32ZBB-NEXT: mv a0, a2
+; RV32ZBB-NEXT: bnez t0, .LBB23_14
+; RV32ZBB-NEXT: j .LBB23_15
+; RV32ZBB-NEXT: .LBB23_13:
+; RV32ZBB-NEXT: srl a0, a2, a4
+; RV32ZBB-NEXT: sll t4, a3, t4
+; RV32ZBB-NEXT: or t4, a0, t4
+; RV32ZBB-NEXT: mv a0, a2
+; RV32ZBB-NEXT: beqz t0, .LBB23_15
+; RV32ZBB-NEXT: .LBB23_14:
+; RV32ZBB-NEXT: mv a0, t4
+; RV32ZBB-NEXT: .LBB23_15:
+; RV32ZBB-NEXT: bltu t0, a6, .LBB23_18
+; RV32ZBB-NEXT: # %bb.16:
+; RV32ZBB-NEXT: li a4, 0
+; RV32ZBB-NEXT: bgeu t1, a6, .LBB23_19
+; RV32ZBB-NEXT: .LBB23_17:
+; RV32ZBB-NEXT: sll a6, a2, t2
+; RV32ZBB-NEXT: srl a2, a2, t5
+; RV32ZBB-NEXT: sll t0, a3, t2
+; RV32ZBB-NEXT: or a2, a2, t0
+; RV32ZBB-NEXT: j .LBB23_20
+; RV32ZBB-NEXT: .LBB23_18:
+; RV32ZBB-NEXT: srl a4, a3, a4
+; RV32ZBB-NEXT: bltu t1, a6, .LBB23_17
+; RV32ZBB-NEXT: .LBB23_19:
+; RV32ZBB-NEXT: li a6, 0
+; RV32ZBB-NEXT: sll a2, a2, t1
+; RV32ZBB-NEXT: .LBB23_20:
+; RV32ZBB-NEXT: or a5, a5, t3
+; RV32ZBB-NEXT: or a1, a7, a1
+; RV32ZBB-NEXT: beqz t1, .LBB23_22
+; RV32ZBB-NEXT: # %bb.21:
+; RV32ZBB-NEXT: mv a3, a2
+; RV32ZBB-NEXT: .LBB23_22:
+; RV32ZBB-NEXT: or a2, a0, a6
+; RV32ZBB-NEXT: or a3, a4, a3
+; RV32ZBB-NEXT: add a0, a5, a2
+; RV32ZBB-NEXT: sltu a2, a0, a2
+; RV32ZBB-NEXT: add a1, a1, a3
+; RV32ZBB-NEXT: add a1, a1, a2
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_64_mask_multiple:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: ror a0, a0, a2
+; RV64ZBB-NEXT: ror a1, a1, a2
+; RV64ZBB-NEXT: add a0, a0, a1
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_64_mask_multiple:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: andi t0, a4, 63
+; RV32XTHEADBB-NEXT: li a6, 32
+; RV32XTHEADBB-NEXT: neg t4, t0
+; RV32XTHEADBB-NEXT: bltu t0, a6, .LBB23_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: srl a7, a1, t0
+; RV32XTHEADBB-NEXT: mv a5, a0
+; RV32XTHEADBB-NEXT: bnez t0, .LBB23_3
+; RV32XTHEADBB-NEXT: j .LBB23_4
+; RV32XTHEADBB-NEXT: .LBB23_2:
+; RV32XTHEADBB-NEXT: srl a5, a0, a4
+; RV32XTHEADBB-NEXT: sll a7, a1, t4
+; RV32XTHEADBB-NEXT: or a7, a5, a7
+; RV32XTHEADBB-NEXT: mv a5, a0
+; RV32XTHEADBB-NEXT: beqz t0, .LBB23_4
+; RV32XTHEADBB-NEXT: .LBB23_3:
+; RV32XTHEADBB-NEXT: mv a5, a7
+; RV32XTHEADBB-NEXT: .LBB23_4:
+; RV32XTHEADBB-NEXT: neg t2, t0
+; RV32XTHEADBB-NEXT: bltu t0, a6, .LBB23_6
+; RV32XTHEADBB-NEXT: # %bb.5:
+; RV32XTHEADBB-NEXT: li a7, 0
+; RV32XTHEADBB-NEXT: j .LBB23_7
+; RV32XTHEADBB-NEXT: .LBB23_6:
+; RV32XTHEADBB-NEXT: srl a7, a1, a4
+; RV32XTHEADBB-NEXT: .LBB23_7:
+; RV32XTHEADBB-NEXT: andi t1, t2, 63
+; RV32XTHEADBB-NEXT: neg t5, t1
+; RV32XTHEADBB-NEXT: bltu t1, a6, .LBB23_9
+; RV32XTHEADBB-NEXT: # %bb.8:
+; RV32XTHEADBB-NEXT: li t3, 0
+; RV32XTHEADBB-NEXT: sll a0, a0, t1
+; RV32XTHEADBB-NEXT: bnez t1, .LBB23_10
+; RV32XTHEADBB-NEXT: j .LBB23_11
+; RV32XTHEADBB-NEXT: .LBB23_9:
+; RV32XTHEADBB-NEXT: sll t3, a0, t2
+; RV32XTHEADBB-NEXT: srl a0, a0, t5
+; RV32XTHEADBB-NEXT: sll t6, a1, t2
+; RV32XTHEADBB-NEXT: or a0, a0, t6
+; RV32XTHEADBB-NEXT: beqz t1, .LBB23_11
+; RV32XTHEADBB-NEXT: .LBB23_10:
+; RV32XTHEADBB-NEXT: mv a1, a0
+; RV32XTHEADBB-NEXT: .LBB23_11:
+; RV32XTHEADBB-NEXT: bltu t0, a6, .LBB23_13
+; RV32XTHEADBB-NEXT: # %bb.12:
+; RV32XTHEADBB-NEXT: srl t4, a3, t0
+; RV32XTHEADBB-NEXT: mv a0, a2
+; RV32XTHEADBB-NEXT: bnez t0, .LBB23_14
+; RV32XTHEADBB-NEXT: j .LBB23_15
+; RV32XTHEADBB-NEXT: .LBB23_13:
+; RV32XTHEADBB-NEXT: srl a0, a2, a4
+; RV32XTHEADBB-NEXT: sll t4, a3, t4
+; RV32XTHEADBB-NEXT: or t4, a0, t4
+; RV32XTHEADBB-NEXT: mv a0, a2
+; RV32XTHEADBB-NEXT: beqz t0, .LBB23_15
+; RV32XTHEADBB-NEXT: .LBB23_14:
+; RV32XTHEADBB-NEXT: mv a0, t4
+; RV32XTHEADBB-NEXT: .LBB23_15:
+; RV32XTHEADBB-NEXT: bltu t0, a6, .LBB23_18
+; RV32XTHEADBB-NEXT: # %bb.16:
+; RV32XTHEADBB-NEXT: li a4, 0
+; RV32XTHEADBB-NEXT: bgeu t1, a6, .LBB23_19
+; RV32XTHEADBB-NEXT: .LBB23_17:
+; RV32XTHEADBB-NEXT: sll a6, a2, t2
+; RV32XTHEADBB-NEXT: srl a2, a2, t5
+; RV32XTHEADBB-NEXT: sll t0, a3, t2
+; RV32XTHEADBB-NEXT: or a2, a2, t0
+; RV32XTHEADBB-NEXT: j .LBB23_20
+; RV32XTHEADBB-NEXT: .LBB23_18:
+; RV32XTHEADBB-NEXT: srl a4, a3, a4
+; RV32XTHEADBB-NEXT: bltu t1, a6, .LBB23_17
+; RV32XTHEADBB-NEXT: .LBB23_19:
+; RV32XTHEADBB-NEXT: li a6, 0
+; RV32XTHEADBB-NEXT: sll a2, a2, t1
+; RV32XTHEADBB-NEXT: .LBB23_20:
+; RV32XTHEADBB-NEXT: or a5, a5, t3
+; RV32XTHEADBB-NEXT: or a1, a7, a1
+; RV32XTHEADBB-NEXT: beqz t1, .LBB23_22
+; RV32XTHEADBB-NEXT: # %bb.21:
+; RV32XTHEADBB-NEXT: mv a3, a2
+; RV32XTHEADBB-NEXT: .LBB23_22:
+; RV32XTHEADBB-NEXT: or a2, a0, a6
+; RV32XTHEADBB-NEXT: or a3, a4, a3
+; RV32XTHEADBB-NEXT: add a0, a5, a2
+; RV32XTHEADBB-NEXT: sltu a2, a0, a2
+; RV32XTHEADBB-NEXT: add a1, a1, a3
+; RV32XTHEADBB-NEXT: add a1, a1, a2
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_64_mask_multiple:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: andi a3, a2, 63
+; RV64XTHEADBB-NEXT: srl a4, a0, a2
+; RV64XTHEADBB-NEXT: srl a2, a1, a2
+; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: sll a0, a0, a3
+; RV64XTHEADBB-NEXT: sll a1, a1, a3
+; RV64XTHEADBB-NEXT: or a0, a4, a0
+; RV64XTHEADBB-NEXT: or a1, a2, a1
+; RV64XTHEADBB-NEXT: add a0, a0, a1
+; RV64XTHEADBB-NEXT: ret
+ %maskedamt = and i64 %amt, 63
+ %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %maskedamt)
+ %2 = tail call i64 @llvm.fshr.i64(i64 %b, i64 %b, i64 %maskedamt)
+ %3 = add i64 %1, %2
+ ret i64 %3
+}
+
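+; The next two tests rotate an i64 by an amount computed in i32 and
+; zero-extended, so they do not match the llvm.fshr pattern used above. As the
+; generated checks suggest, RV64 still reduces each to a short
+; sub/sll/srl/or (or sub/srl/sll/or) sequence, while RV32 needs the branchy
+; two-word expansion.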
+define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotl_64_zext:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a6, 64
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: srl a7, a0, a5
+; RV32I-NEXT: bltu a2, a4, .LBB24_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: sll t1, a0, a2
+; RV32I-NEXT: j .LBB24_3
+; RV32I-NEXT: .LBB24_2:
+; RV32I-NEXT: sll a3, a0, a2
+; RV32I-NEXT: sll t0, a1, a2
+; RV32I-NEXT: or t1, a7, t0
+; RV32I-NEXT: .LBB24_3:
+; RV32I-NEXT: sub t0, a6, a2
+; RV32I-NEXT: mv a6, a1
+; RV32I-NEXT: beqz a2, .LBB24_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: .LBB24_5:
+; RV32I-NEXT: bltu t0, a4, .LBB24_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl a2, a1, t0
+; RV32I-NEXT: bnez t0, .LBB24_8
+; RV32I-NEXT: j .LBB24_9
+; RV32I-NEXT: .LBB24_7:
+; RV32I-NEXT: neg a2, t0
+; RV32I-NEXT: sll a2, a1, a2
+; RV32I-NEXT: or a2, a7, a2
+; RV32I-NEXT: beqz t0, .LBB24_9
+; RV32I-NEXT: .LBB24_8:
+; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: .LBB24_9:
+; RV32I-NEXT: bltu t0, a4, .LBB24_11
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB24_12
+; RV32I-NEXT: .LBB24_11:
+; RV32I-NEXT: srl a1, a1, a5
+; RV32I-NEXT: .LBB24_12:
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: or a1, a6, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotl_64_zext:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 64
+; RV64I-NEXT: sub a2, a2, a1
+; RV64I-NEXT: sll a1, a0, a1
+; RV64I-NEXT: srl a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotl_64_zext:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: li a6, 64
+; RV32ZBB-NEXT: li a4, 32
+; RV32ZBB-NEXT: neg a5, a2
+; RV32ZBB-NEXT: srl a7, a0, a5
+; RV32ZBB-NEXT: bltu a2, a4, .LBB24_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: li a3, 0
+; RV32ZBB-NEXT: sll t1, a0, a2
+; RV32ZBB-NEXT: j .LBB24_3
+; RV32ZBB-NEXT: .LBB24_2:
+; RV32ZBB-NEXT: sll a3, a0, a2
+; RV32ZBB-NEXT: sll t0, a1, a2
+; RV32ZBB-NEXT: or t1, a7, t0
+; RV32ZBB-NEXT: .LBB24_3:
+; RV32ZBB-NEXT: sub t0, a6, a2
+; RV32ZBB-NEXT: mv a6, a1
+; RV32ZBB-NEXT: beqz a2, .LBB24_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: mv a6, t1
+; RV32ZBB-NEXT: .LBB24_5:
+; RV32ZBB-NEXT: bltu t0, a4, .LBB24_7
+; RV32ZBB-NEXT: # %bb.6:
+; RV32ZBB-NEXT: srl a2, a1, t0
+; RV32ZBB-NEXT: bnez t0, .LBB24_8
+; RV32ZBB-NEXT: j .LBB24_9
+; RV32ZBB-NEXT: .LBB24_7:
+; RV32ZBB-NEXT: neg a2, t0
+; RV32ZBB-NEXT: sll a2, a1, a2
+; RV32ZBB-NEXT: or a2, a7, a2
+; RV32ZBB-NEXT: beqz t0, .LBB24_9
+; RV32ZBB-NEXT: .LBB24_8:
+; RV32ZBB-NEXT: mv a0, a2
+; RV32ZBB-NEXT: .LBB24_9:
+; RV32ZBB-NEXT: bltu t0, a4, .LBB24_11
+; RV32ZBB-NEXT: # %bb.10:
+; RV32ZBB-NEXT: li a1, 0
+; RV32ZBB-NEXT: j .LBB24_12
+; RV32ZBB-NEXT: .LBB24_11:
+; RV32ZBB-NEXT: srl a1, a1, a5
+; RV32ZBB-NEXT: .LBB24_12:
+; RV32ZBB-NEXT: or a0, a3, a0
+; RV32ZBB-NEXT: or a1, a6, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotl_64_zext:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: li a2, 64
+; RV64ZBB-NEXT: sub a2, a2, a1
+; RV64ZBB-NEXT: sll a1, a0, a1
+; RV64ZBB-NEXT: srl a0, a0, a2
+; RV64ZBB-NEXT: or a0, a1, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotl_64_zext:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: li a6, 64
+; RV32XTHEADBB-NEXT: li a4, 32
+; RV32XTHEADBB-NEXT: neg a5, a2
+; RV32XTHEADBB-NEXT: srl a7, a0, a5
+; RV32XTHEADBB-NEXT: bltu a2, a4, .LBB24_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: li a3, 0
+; RV32XTHEADBB-NEXT: sll t1, a0, a2
+; RV32XTHEADBB-NEXT: j .LBB24_3
+; RV32XTHEADBB-NEXT: .LBB24_2:
+; RV32XTHEADBB-NEXT: sll a3, a0, a2
+; RV32XTHEADBB-NEXT: sll t0, a1, a2
+; RV32XTHEADBB-NEXT: or t1, a7, t0
+; RV32XTHEADBB-NEXT: .LBB24_3:
+; RV32XTHEADBB-NEXT: sub t0, a6, a2
+; RV32XTHEADBB-NEXT: mv a6, a1
+; RV32XTHEADBB-NEXT: beqz a2, .LBB24_5
+; RV32XTHEADBB-NEXT: # %bb.4:
+; RV32XTHEADBB-NEXT: mv a6, t1
+; RV32XTHEADBB-NEXT: .LBB24_5:
+; RV32XTHEADBB-NEXT: bltu t0, a4, .LBB24_7
+; RV32XTHEADBB-NEXT: # %bb.6:
+; RV32XTHEADBB-NEXT: srl a2, a1, t0
+; RV32XTHEADBB-NEXT: bnez t0, .LBB24_8
+; RV32XTHEADBB-NEXT: j .LBB24_9
+; RV32XTHEADBB-NEXT: .LBB24_7:
+; RV32XTHEADBB-NEXT: neg a2, t0
+; RV32XTHEADBB-NEXT: sll a2, a1, a2
+; RV32XTHEADBB-NEXT: or a2, a7, a2
+; RV32XTHEADBB-NEXT: beqz t0, .LBB24_9
+; RV32XTHEADBB-NEXT: .LBB24_8:
+; RV32XTHEADBB-NEXT: mv a0, a2
+; RV32XTHEADBB-NEXT: .LBB24_9:
+; RV32XTHEADBB-NEXT: bltu t0, a4, .LBB24_11
+; RV32XTHEADBB-NEXT: # %bb.10:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: j .LBB24_12
+; RV32XTHEADBB-NEXT: .LBB24_11:
+; RV32XTHEADBB-NEXT: srl a1, a1, a5
+; RV32XTHEADBB-NEXT: .LBB24_12:
+; RV32XTHEADBB-NEXT: or a0, a3, a0
+; RV32XTHEADBB-NEXT: or a1, a6, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotl_64_zext:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: li a2, 64
+; RV64XTHEADBB-NEXT: sub a2, a2, a1
+; RV64XTHEADBB-NEXT: sll a1, a0, a1
+; RV64XTHEADBB-NEXT: srl a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i32 64, %y
+ %zext = zext i32 %z to i64
+ %zexty = zext i32 %y to i64
+ %b = shl i64 %x, %zexty
+ %c = lshr i64 %x, %zext
+ %d = or i64 %b, %c
+ ret i64 %d
+}
+
+define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
+; RV32I-LABEL: rotr_64_zext:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a5, 32
+; RV32I-NEXT: neg a6, a2
+; RV32I-NEXT: sll a4, a1, a6
+; RV32I-NEXT: bltu a2, a5, .LBB25_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a7, a1, a2
+; RV32I-NEXT: j .LBB25_3
+; RV32I-NEXT: .LBB25_2:
+; RV32I-NEXT: srl a3, a0, a2
+; RV32I-NEXT: or a7, a3, a4
+; RV32I-NEXT: .LBB25_3:
+; RV32I-NEXT: li t0, 64
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: beqz a2, .LBB25_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, a7
+; RV32I-NEXT: .LBB25_5:
+; RV32I-NEXT: sub a7, t0, a2
+; RV32I-NEXT: bltu a2, a5, .LBB25_8
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li a2, 0
+; RV32I-NEXT: bgeu a7, a5, .LBB25_9
+; RV32I-NEXT: .LBB25_7:
+; RV32I-NEXT: sll a5, a0, a6
+; RV32I-NEXT: neg a6, a7
+; RV32I-NEXT: srl a0, a0, a6
+; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: bnez a7, .LBB25_10
+; RV32I-NEXT: j .LBB25_11
+; RV32I-NEXT: .LBB25_8:
+; RV32I-NEXT: srl a2, a1, a2
+; RV32I-NEXT: bltu a7, a5, .LBB25_7
+; RV32I-NEXT: .LBB25_9:
+; RV32I-NEXT: li a5, 0
+; RV32I-NEXT: sll a0, a0, a7
+; RV32I-NEXT: beqz a7, .LBB25_11
+; RV32I-NEXT: .LBB25_10:
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB25_11:
+; RV32I-NEXT: or a0, a3, a5
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: rotr_64_zext:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 64
+; RV64I-NEXT: sub a2, a2, a1
+; RV64I-NEXT: srl a1, a0, a1
+; RV64I-NEXT: sll a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: rotr_64_zext:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: li a5, 32
+; RV32ZBB-NEXT: neg a6, a2
+; RV32ZBB-NEXT: sll a4, a1, a6
+; RV32ZBB-NEXT: bltu a2, a5, .LBB25_2
+; RV32ZBB-NEXT: # %bb.1:
+; RV32ZBB-NEXT: srl a7, a1, a2
+; RV32ZBB-NEXT: j .LBB25_3
+; RV32ZBB-NEXT: .LBB25_2:
+; RV32ZBB-NEXT: srl a3, a0, a2
+; RV32ZBB-NEXT: or a7, a3, a4
+; RV32ZBB-NEXT: .LBB25_3:
+; RV32ZBB-NEXT: li t0, 64
+; RV32ZBB-NEXT: mv a3, a0
+; RV32ZBB-NEXT: beqz a2, .LBB25_5
+; RV32ZBB-NEXT: # %bb.4:
+; RV32ZBB-NEXT: mv a3, a7
+; RV32ZBB-NEXT: .LBB25_5:
+; RV32ZBB-NEXT: sub a7, t0, a2
+; RV32ZBB-NEXT: bltu a2, a5, .LBB25_8
+; RV32ZBB-NEXT: # %bb.6:
+; RV32ZBB-NEXT: li a2, 0
+; RV32ZBB-NEXT: bgeu a7, a5, .LBB25_9
+; RV32ZBB-NEXT: .LBB25_7:
+; RV32ZBB-NEXT: sll a5, a0, a6
+; RV32ZBB-NEXT: neg a6, a7
+; RV32ZBB-NEXT: srl a0, a0, a6
+; RV32ZBB-NEXT: or a0, a0, a4
+; RV32ZBB-NEXT: bnez a7, .LBB25_10
+; RV32ZBB-NEXT: j .LBB25_11
+; RV32ZBB-NEXT: .LBB25_8:
+; RV32ZBB-NEXT: srl a2, a1, a2
+; RV32ZBB-NEXT: bltu a7, a5, .LBB25_7
+; RV32ZBB-NEXT: .LBB25_9:
+; RV32ZBB-NEXT: li a5, 0
+; RV32ZBB-NEXT: sll a0, a0, a7
+; RV32ZBB-NEXT: beqz a7, .LBB25_11
+; RV32ZBB-NEXT: .LBB25_10:
+; RV32ZBB-NEXT: mv a1, a0
+; RV32ZBB-NEXT: .LBB25_11:
+; RV32ZBB-NEXT: or a0, a3, a5
+; RV32ZBB-NEXT: or a1, a2, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: rotr_64_zext:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: li a2, 64
+; RV64ZBB-NEXT: sub a2, a2, a1
+; RV64ZBB-NEXT: srl a1, a0, a1
+; RV64ZBB-NEXT: sll a0, a0, a2
+; RV64ZBB-NEXT: or a0, a1, a0
+; RV64ZBB-NEXT: ret
+;
+; RV32XTHEADBB-LABEL: rotr_64_zext:
+; RV32XTHEADBB: # %bb.0:
+; RV32XTHEADBB-NEXT: li a5, 32
+; RV32XTHEADBB-NEXT: neg a6, a2
+; RV32XTHEADBB-NEXT: sll a4, a1, a6
+; RV32XTHEADBB-NEXT: bltu a2, a5, .LBB25_2
+; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: srl a7, a1, a2
+; RV32XTHEADBB-NEXT: j .LBB25_3
+; RV32XTHEADBB-NEXT: .LBB25_2:
+; RV32XTHEADBB-NEXT: srl a3, a0, a2
+; RV32XTHEADBB-NEXT: or a7, a3, a4
+; RV32XTHEADBB-NEXT: .LBB25_3:
+; RV32XTHEADBB-NEXT: li t0, 64
+; RV32XTHEADBB-NEXT: mv a3, a0
+; RV32XTHEADBB-NEXT: beqz a2, .LBB25_5
+; RV32XTHEADBB-NEXT: # %bb.4:
+; RV32XTHEADBB-NEXT: mv a3, a7
+; RV32XTHEADBB-NEXT: .LBB25_5:
+; RV32XTHEADBB-NEXT: sub a7, t0, a2
+; RV32XTHEADBB-NEXT: bltu a2, a5, .LBB25_8
+; RV32XTHEADBB-NEXT: # %bb.6:
+; RV32XTHEADBB-NEXT: li a2, 0
+; RV32XTHEADBB-NEXT: bgeu a7, a5, .LBB25_9
+; RV32XTHEADBB-NEXT: .LBB25_7:
+; RV32XTHEADBB-NEXT: sll a5, a0, a6
+; RV32XTHEADBB-NEXT: neg a6, a7
+; RV32XTHEADBB-NEXT: srl a0, a0, a6
+; RV32XTHEADBB-NEXT: or a0, a0, a4
+; RV32XTHEADBB-NEXT: bnez a7, .LBB25_10
+; RV32XTHEADBB-NEXT: j .LBB25_11
+; RV32XTHEADBB-NEXT: .LBB25_8:
+; RV32XTHEADBB-NEXT: srl a2, a1, a2
+; RV32XTHEADBB-NEXT: bltu a7, a5, .LBB25_7
+; RV32XTHEADBB-NEXT: .LBB25_9:
+; RV32XTHEADBB-NEXT: li a5, 0
+; RV32XTHEADBB-NEXT: sll a0, a0, a7
+; RV32XTHEADBB-NEXT: beqz a7, .LBB25_11
+; RV32XTHEADBB-NEXT: .LBB25_10:
+; RV32XTHEADBB-NEXT: mv a1, a0
+; RV32XTHEADBB-NEXT: .LBB25_11:
+; RV32XTHEADBB-NEXT: or a0, a3, a5
+; RV32XTHEADBB-NEXT: or a1, a2, a1
+; RV32XTHEADBB-NEXT: ret
+;
+; RV64XTHEADBB-LABEL: rotr_64_zext:
+; RV64XTHEADBB: # %bb.0:
+; RV64XTHEADBB-NEXT: li a2, 64
+; RV64XTHEADBB-NEXT: sub a2, a2, a1
+; RV64XTHEADBB-NEXT: srl a1, a0, a1
+; RV64XTHEADBB-NEXT: sll a0, a0, a2
+; RV64XTHEADBB-NEXT: or a0, a1, a0
+; RV64XTHEADBB-NEXT: ret
+ %z = sub i32 64, %y
+ %zext = zext i32 %z to i64
+ %zexty = zext i32 %y to i64
+ %b = lshr i64 %x, %zexty
+ %c = shl i64 %x, %zext
+ %d = or i64 %b, %c
+ ret i64 %d
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
new file mode 100644
index 00000000000000..71a5ecc77a1b08
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
@@ -0,0 +1,962 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV32I
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck %s -check-prefix=RV64I
+
+; Basic shift support is tested as part of ALU.ll. This file ensures that
+; shifts which may not be supported natively are lowered properly.
+
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+declare i128 @llvm.fshr.i128(i128, i128, i128)
+
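+; As the RV32I checks below illustrate, a 64-bit shift is expanded into a
+; branch on whether the shift amount reaches 32: below 32 the result combines
+; both words with a cross-word sll/srl pair; at or above 32 a single shift of
+; the other word suffices, with the high word zeroed (lshr) or sign-filled
+; (ashr).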
+define i64 @lshr64(i64 %a, i64 %b) nounwind {
+; RV32I-LABEL: lshr64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a3, 32
+; RV32I-NEXT: bltu a2, a3, .LBB0_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a4, a1, a2
+; RV32I-NEXT: bnez a2, .LBB0_3
+; RV32I-NEXT: j .LBB0_4
+; RV32I-NEXT: .LBB0_2:
+; RV32I-NEXT: srl a4, a0, a2
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: sll a5, a1, a5
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: beqz a2, .LBB0_4
+; RV32I-NEXT: .LBB0_3:
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB0_4:
+; RV32I-NEXT: bltu a2, a3, .LBB0_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB0_6:
+; RV32I-NEXT: srl a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: lshr64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: srl a0, a0, a1
+; RV64I-NEXT: ret
+ %1 = lshr i64 %a, %b
+ ret i64 %1
+}
+
+define i64 @lshr64_minsize(i64 %a, i64 %b) minsize nounwind {
+; RV32I-LABEL: lshr64_minsize:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a3, 32
+; RV32I-NEXT: bltu a2, a3, .LBB1_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a4, a1, a2
+; RV32I-NEXT: bnez a2, .LBB1_3
+; RV32I-NEXT: j .LBB1_4
+; RV32I-NEXT: .LBB1_2:
+; RV32I-NEXT: srl a4, a0, a2
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: sll a5, a1, a5
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: beqz a2, .LBB1_4
+; RV32I-NEXT: .LBB1_3:
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB1_4:
+; RV32I-NEXT: bltu a2, a3, .LBB1_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB1_6:
+; RV32I-NEXT: srl a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: lshr64_minsize:
+; RV64I: # %bb.0:
+; RV64I-NEXT: srl a0, a0, a1
+; RV64I-NEXT: ret
+ %1 = lshr i64 %a, %b
+ ret i64 %1
+}
+
+define i64 @ashr64(i64 %a, i64 %b) nounwind {
+; RV32I-LABEL: ashr64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a3, 32
+; RV32I-NEXT: bltu a2, a3, .LBB2_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra a4, a1, a2
+; RV32I-NEXT: bnez a2, .LBB2_3
+; RV32I-NEXT: j .LBB2_4
+; RV32I-NEXT: .LBB2_2:
+; RV32I-NEXT: srl a4, a0, a2
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: sll a5, a1, a5
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: beqz a2, .LBB2_4
+; RV32I-NEXT: .LBB2_3:
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB2_4:
+; RV32I-NEXT: bltu a2, a3, .LBB2_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: srai a1, a1, 31
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB2_6:
+; RV32I-NEXT: sra a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ashr64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sra a0, a0, a1
+; RV64I-NEXT: ret
+ %1 = ashr i64 %a, %b
+ ret i64 %1
+}
+
+define i64 @ashr64_minsize(i64 %a, i64 %b) minsize nounwind {
+; RV32I-LABEL: ashr64_minsize:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a3, 32
+; RV32I-NEXT: bltu a2, a3, .LBB3_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra a4, a1, a2
+; RV32I-NEXT: bnez a2, .LBB3_3
+; RV32I-NEXT: j .LBB3_4
+; RV32I-NEXT: .LBB3_2:
+; RV32I-NEXT: srl a4, a0, a2
+; RV32I-NEXT: neg a5, a2
+; RV32I-NEXT: sll a5, a1, a5
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: beqz a2, .LBB3_4
+; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB3_4:
+; RV32I-NEXT: bltu a2, a3, .LBB3_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: srai a1, a1, 31
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB3_6:
+; RV32I-NEXT: sra a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ashr64_minsize:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sra a0, a0, a1
+; RV64I-NEXT: ret
+ %1 = ashr i64 %a, %b
+ ret i64 %1
+}
+
+define i64 @shl64(i64 %a, i64 %b) nounwind {
+; RV32I-LABEL: shl64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: li a0, 32
+; RV32I-NEXT: bltu a2, a0, .LBB4_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: sll a3, a3, a2
+; RV32I-NEXT: bnez a2, .LBB4_3
+; RV32I-NEXT: j .LBB4_4
+; RV32I-NEXT: .LBB4_2:
+; RV32I-NEXT: sll a0, a3, a2
+; RV32I-NEXT: neg a4, a2
+; RV32I-NEXT: srl a3, a3, a4
+; RV32I-NEXT: sll a4, a1, a2
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: beqz a2, .LBB4_4
+; RV32I-NEXT: .LBB4_3:
+; RV32I-NEXT: mv a1, a3
+; RV32I-NEXT: .LBB4_4:
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: shl64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sll a0, a0, a1
+; RV64I-NEXT: ret
+ %1 = shl i64 %a, %b
+ ret i64 %1
+}
+
+define i64 @shl64_minsize(i64 %a, i64 %b) minsize nounwind {
+; RV32I-LABEL: shl64_minsize:
+; RV32I: # %bb.0:
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: li a0, 32
+; RV32I-NEXT: bltu a2, a0, .LBB5_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: sll a3, a3, a2
+; RV32I-NEXT: bnez a2, .LBB5_3
+; RV32I-NEXT: j .LBB5_4
+; RV32I-NEXT: .LBB5_2:
+; RV32I-NEXT: sll a0, a3, a2
+; RV32I-NEXT: neg a4, a2
+; RV32I-NEXT: srl a3, a3, a4
+; RV32I-NEXT: sll a4, a1, a2
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: beqz a2, .LBB5_4
+; RV32I-NEXT: .LBB5_3:
+; RV32I-NEXT: mv a1, a3
+; RV32I-NEXT: .LBB5_4:
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: shl64_minsize:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sll a0, a0, a1
+; RV64I-NEXT: ret
+ %1 = shl i64 %a, %b
+ ret i64 %1
+}
+
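+; The i128 tests double this expansion: on RV32 the lowering branches on both
+; the 32- and 64-bit thresholds and needs enough temporaries that the checks
+; below spill and reload s0-s2.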
+define i128 @lshr128(i128 %a, i128 %b) nounwind {
+; RV32I-LABEL: lshr128:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lw a2, 0(a2)
+; RV32I-NEXT: lw a3, 8(a1)
+; RV32I-NEXT: lw a7, 12(a1)
+; RV32I-NEXT: li t0, 32
+; RV32I-NEXT: srl t2, a3, a2
+; RV32I-NEXT: neg t6, a2
+; RV32I-NEXT: sll t5, a7, t6
+; RV32I-NEXT: bltu a2, t0, .LBB6_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a5, a7, a2
+; RV32I-NEXT: mv a4, a3
+; RV32I-NEXT: bnez a2, .LBB6_3
+; RV32I-NEXT: j .LBB6_4
+; RV32I-NEXT: .LBB6_2:
+; RV32I-NEXT: or a5, t2, t5
+; RV32I-NEXT: mv a4, a3
+; RV32I-NEXT: beqz a2, .LBB6_4
+; RV32I-NEXT: .LBB6_3:
+; RV32I-NEXT: mv a4, a5
+; RV32I-NEXT: .LBB6_4:
+; RV32I-NEXT: lw a5, 0(a1)
+; RV32I-NEXT: lw a1, 4(a1)
+; RV32I-NEXT: bltu a2, t0, .LBB6_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: srl t4, a1, a2
+; RV32I-NEXT: j .LBB6_7
+; RV32I-NEXT: .LBB6_6:
+; RV32I-NEXT: srl a6, a7, a2
+; RV32I-NEXT: srl t1, a5, a2
+; RV32I-NEXT: sll t3, a1, t6
+; RV32I-NEXT: or t4, t1, t3
+; RV32I-NEXT: .LBB6_7:
+; RV32I-NEXT: li t1, 64
+; RV32I-NEXT: mv t3, a5
+; RV32I-NEXT: beqz a2, .LBB6_9
+; RV32I-NEXT: # %bb.8:
+; RV32I-NEXT: mv t3, t4
+; RV32I-NEXT: .LBB6_9:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s0, t1, a2
+; RV32I-NEXT: bltu a2, t0, .LBB6_12
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: bgeu s0, t0, .LBB6_13
+; RV32I-NEXT: .LBB6_11:
+; RV32I-NEXT: sll t6, a3, t6
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl s1, a3, s1
+; RV32I-NEXT: or s2, s1, t5
+; RV32I-NEXT: j .LBB6_14
+; RV32I-NEXT: .LBB6_12:
+; RV32I-NEXT: srl t4, a1, a2
+; RV32I-NEXT: bltu s0, t0, .LBB6_11
+; RV32I-NEXT: .LBB6_13:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s2, a3, s0
+; RV32I-NEXT: .LBB6_14:
+; RV32I-NEXT: addi s1, a2, -64
+; RV32I-NEXT: mv t5, a7
+; RV32I-NEXT: beqz s0, .LBB6_16
+; RV32I-NEXT: # %bb.15:
+; RV32I-NEXT: mv t5, s2
+; RV32I-NEXT: .LBB6_16:
+; RV32I-NEXT: bltu s1, t0, .LBB6_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: srl t2, a7, s1
+; RV32I-NEXT: bnez s1, .LBB6_19
+; RV32I-NEXT: j .LBB6_20
+; RV32I-NEXT: .LBB6_18:
+; RV32I-NEXT: neg s0, s1
+; RV32I-NEXT: sll s0, a7, s0
+; RV32I-NEXT: or t2, t2, s0
+; RV32I-NEXT: beqz s1, .LBB6_20
+; RV32I-NEXT: .LBB6_19:
+; RV32I-NEXT: mv a3, t2
+; RV32I-NEXT: .LBB6_20:
+; RV32I-NEXT: bltu s1, t0, .LBB6_22
+; RV32I-NEXT: # %bb.21:
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: bltu a2, t1, .LBB6_23
+; RV32I-NEXT: j .LBB6_24
+; RV32I-NEXT: .LBB6_22:
+; RV32I-NEXT: srl a7, a7, a2
+; RV32I-NEXT: bgeu a2, t1, .LBB6_24
+; RV32I-NEXT: .LBB6_23:
+; RV32I-NEXT: or a3, t3, t6
+; RV32I-NEXT: or a7, t4, t5
+; RV32I-NEXT: .LBB6_24:
+; RV32I-NEXT: bnez a2, .LBB6_28
+; RV32I-NEXT: # %bb.25:
+; RV32I-NEXT: bltu a2, t1, .LBB6_27
+; RV32I-NEXT: .LBB6_26:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: .LBB6_27:
+; RV32I-NEXT: sw a5, 0(a0)
+; RV32I-NEXT: sw a1, 4(a0)
+; RV32I-NEXT: sw a4, 8(a0)
+; RV32I-NEXT: sw a6, 12(a0)
+; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB6_28:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: mv a1, a7
+; RV32I-NEXT: bgeu a2, t1, .LBB6_26
+; RV32I-NEXT: j .LBB6_27
+;
+; RV64I-LABEL: lshr128:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a3, 64
+; RV64I-NEXT: bltu a2, a3, .LBB6_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sub a4, a2, a3
+; RV64I-NEXT: srl a4, a1, a4
+; RV64I-NEXT: bnez a2, .LBB6_3
+; RV64I-NEXT: j .LBB6_4
+; RV64I-NEXT: .LBB6_2:
+; RV64I-NEXT: srl a4, a0, a2
+; RV64I-NEXT: negw a5, a2
+; RV64I-NEXT: sll a5, a1, a5
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: beqz a2, .LBB6_4
+; RV64I-NEXT: .LBB6_3:
+; RV64I-NEXT: mv a0, a4
+; RV64I-NEXT: .LBB6_4:
+; RV64I-NEXT: bltu a2, a3, .LBB6_6
+; RV64I-NEXT: # %bb.5:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB6_6:
+; RV64I-NEXT: srl a1, a1, a2
+; RV64I-NEXT: ret
+ %1 = lshr i128 %a, %b
+ ret i128 %1
+}
+
+define i128 @ashr128(i128 %a, i128 %b) nounwind {
+; RV32I-LABEL: ashr128:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lw a2, 0(a2)
+; RV32I-NEXT: lw a4, 8(a1)
+; RV32I-NEXT: lw a3, 12(a1)
+; RV32I-NEXT: li t0, 32
+; RV32I-NEXT: srl t2, a4, a2
+; RV32I-NEXT: neg t6, a2
+; RV32I-NEXT: sll t5, a3, t6
+; RV32I-NEXT: bltu a2, t0, .LBB7_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra a6, a3, a2
+; RV32I-NEXT: mv a5, a4
+; RV32I-NEXT: bnez a2, .LBB7_3
+; RV32I-NEXT: j .LBB7_4
+; RV32I-NEXT: .LBB7_2:
+; RV32I-NEXT: or a6, t2, t5
+; RV32I-NEXT: mv a5, a4
+; RV32I-NEXT: beqz a2, .LBB7_4
+; RV32I-NEXT: .LBB7_3:
+; RV32I-NEXT: mv a5, a6
+; RV32I-NEXT: .LBB7_4:
+; RV32I-NEXT: lw a6, 0(a1)
+; RV32I-NEXT: lw a1, 4(a1)
+; RV32I-NEXT: bltu a2, t0, .LBB7_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: srai a7, a3, 31
+; RV32I-NEXT: srl t4, a1, a2
+; RV32I-NEXT: j .LBB7_7
+; RV32I-NEXT: .LBB7_6:
+; RV32I-NEXT: sra a7, a3, a2
+; RV32I-NEXT: srl t1, a6, a2
+; RV32I-NEXT: sll t3, a1, t6
+; RV32I-NEXT: or t4, t1, t3
+; RV32I-NEXT: .LBB7_7:
+; RV32I-NEXT: li t1, 64
+; RV32I-NEXT: mv t3, a6
+; RV32I-NEXT: beqz a2, .LBB7_9
+; RV32I-NEXT: # %bb.8:
+; RV32I-NEXT: mv t3, t4
+; RV32I-NEXT: .LBB7_9:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s0, t1, a2
+; RV32I-NEXT: bltu a2, t0, .LBB7_12
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: bgeu s0, t0, .LBB7_13
+; RV32I-NEXT: .LBB7_11:
+; RV32I-NEXT: sll t6, a4, t6
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl s1, a4, s1
+; RV32I-NEXT: or s2, s1, t5
+; RV32I-NEXT: j .LBB7_14
+; RV32I-NEXT: .LBB7_12:
+; RV32I-NEXT: srl t4, a1, a2
+; RV32I-NEXT: bltu s0, t0, .LBB7_11
+; RV32I-NEXT: .LBB7_13:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s2, a4, s0
+; RV32I-NEXT: .LBB7_14:
+; RV32I-NEXT: addi s1, a2, -64
+; RV32I-NEXT: mv t5, a3
+; RV32I-NEXT: beqz s0, .LBB7_16
+; RV32I-NEXT: # %bb.15:
+; RV32I-NEXT: mv t5, s2
+; RV32I-NEXT: .LBB7_16:
+; RV32I-NEXT: bltu s1, t0, .LBB7_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: sra t2, a3, s1
+; RV32I-NEXT: bnez s1, .LBB7_19
+; RV32I-NEXT: j .LBB7_20
+; RV32I-NEXT: .LBB7_18:
+; RV32I-NEXT: neg s0, s1
+; RV32I-NEXT: sll s0, a3, s0
+; RV32I-NEXT: or t2, t2, s0
+; RV32I-NEXT: beqz s1, .LBB7_20
+; RV32I-NEXT: .LBB7_19:
+; RV32I-NEXT: mv a4, t2
+; RV32I-NEXT: .LBB7_20:
+; RV32I-NEXT: bltu s1, t0, .LBB7_22
+; RV32I-NEXT: # %bb.21:
+; RV32I-NEXT: srai t0, a3, 31
+; RV32I-NEXT: bltu a2, t1, .LBB7_23
+; RV32I-NEXT: j .LBB7_24
+; RV32I-NEXT: .LBB7_22:
+; RV32I-NEXT: sra t0, a3, a2
+; RV32I-NEXT: bgeu a2, t1, .LBB7_24
+; RV32I-NEXT: .LBB7_23:
+; RV32I-NEXT: or a4, t3, t6
+; RV32I-NEXT: or t0, t4, t5
+; RV32I-NEXT: .LBB7_24:
+; RV32I-NEXT: bnez a2, .LBB7_28
+; RV32I-NEXT: # %bb.25:
+; RV32I-NEXT: bltu a2, t1, .LBB7_27
+; RV32I-NEXT: .LBB7_26:
+; RV32I-NEXT: srai a5, a3, 31
+; RV32I-NEXT: mv a7, a5
+; RV32I-NEXT: .LBB7_27:
+; RV32I-NEXT: sw a6, 0(a0)
+; RV32I-NEXT: sw a1, 4(a0)
+; RV32I-NEXT: sw a5, 8(a0)
+; RV32I-NEXT: sw a7, 12(a0)
+; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB7_28:
+; RV32I-NEXT: mv a6, a4
+; RV32I-NEXT: mv a1, t0
+; RV32I-NEXT: bgeu a2, t1, .LBB7_26
+; RV32I-NEXT: j .LBB7_27
+;
+; RV64I-LABEL: ashr128:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a3, 64
+; RV64I-NEXT: bltu a2, a3, .LBB7_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sub a4, a2, a3
+; RV64I-NEXT: sra a4, a1, a4
+; RV64I-NEXT: bnez a2, .LBB7_3
+; RV64I-NEXT: j .LBB7_4
+; RV64I-NEXT: .LBB7_2:
+; RV64I-NEXT: srl a4, a0, a2
+; RV64I-NEXT: negw a5, a2
+; RV64I-NEXT: sll a5, a1, a5
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: beqz a2, .LBB7_4
+; RV64I-NEXT: .LBB7_3:
+; RV64I-NEXT: mv a0, a4
+; RV64I-NEXT: .LBB7_4:
+; RV64I-NEXT: bltu a2, a3, .LBB7_6
+; RV64I-NEXT: # %bb.5:
+; RV64I-NEXT: srai a1, a1, 63
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB7_6:
+; RV64I-NEXT: sra a1, a1, a2
+; RV64I-NEXT: ret
+ %1 = ashr i128 %a, %b
+ ret i128 %1
+}
+
+define i128 @shl128(i128 %a, i128 %b) nounwind {
+; RV32I-LABEL: shl128:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lw a2, 0(a2)
+; RV32I-NEXT: lw a7, 0(a1)
+; RV32I-NEXT: lw a3, 4(a1)
+; RV32I-NEXT: li a6, 64
+; RV32I-NEXT: li t1, 32
+; RV32I-NEXT: neg t5, a2
+; RV32I-NEXT: srl t2, a7, t5
+; RV32I-NEXT: sll t0, a3, a2
+; RV32I-NEXT: bltu a2, t1, .LBB8_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: sll t3, a7, a2
+; RV32I-NEXT: j .LBB8_3
+; RV32I-NEXT: .LBB8_2:
+; RV32I-NEXT: sll a4, a7, a2
+; RV32I-NEXT: or t3, t2, t0
+; RV32I-NEXT: .LBB8_3:
+; RV32I-NEXT: sub t4, a6, a2
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: beqz a2, .LBB8_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a5, t3
+; RV32I-NEXT: .LBB8_5:
+; RV32I-NEXT: bltu t4, t1, .LBB8_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl t2, a3, t4
+; RV32I-NEXT: mv t3, a7
+; RV32I-NEXT: bnez t4, .LBB8_8
+; RV32I-NEXT: j .LBB8_9
+; RV32I-NEXT: .LBB8_7:
+; RV32I-NEXT: neg t3, t4
+; RV32I-NEXT: sll t3, a3, t3
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: mv t3, a7
+; RV32I-NEXT: beqz t4, .LBB8_9
+; RV32I-NEXT: .LBB8_8:
+; RV32I-NEXT: mv t3, t2
+; RV32I-NEXT: .LBB8_9:
+; RV32I-NEXT: bltu t4, t1, .LBB8_11
+; RV32I-NEXT: # %bb.10:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: j .LBB8_12
+; RV32I-NEXT: .LBB8_11:
+; RV32I-NEXT: srl t4, a3, t5
+; RV32I-NEXT: .LBB8_12:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lw t2, 8(a1)
+; RV32I-NEXT: lw a1, 12(a1)
+; RV32I-NEXT: bltu a2, t1, .LBB8_14
+; RV32I-NEXT: # %bb.13:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s1, t2, a2
+; RV32I-NEXT: j .LBB8_15
+; RV32I-NEXT: .LBB8_14:
+; RV32I-NEXT: sll t6, t2, a2
+; RV32I-NEXT: srl t5, t2, t5
+; RV32I-NEXT: sll s0, a1, a2
+; RV32I-NEXT: or s1, t5, s0
+; RV32I-NEXT: .LBB8_15:
+; RV32I-NEXT: addi s0, a2, -64
+; RV32I-NEXT: mv t5, a1
+; RV32I-NEXT: beqz a2, .LBB8_17
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: mv t5, s1
+; RV32I-NEXT: .LBB8_17:
+; RV32I-NEXT: bltu s0, t1, .LBB8_19
+; RV32I-NEXT: # %bb.18:
+; RV32I-NEXT: li t1, 0
+; RV32I-NEXT: sll a7, a7, s0
+; RV32I-NEXT: bnez s0, .LBB8_20
+; RV32I-NEXT: j .LBB8_21
+; RV32I-NEXT: .LBB8_19:
+; RV32I-NEXT: sll t1, a7, a2
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl a7, a7, s1
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: beqz s0, .LBB8_21
+; RV32I-NEXT: .LBB8_20:
+; RV32I-NEXT: mv a3, a7
+; RV32I-NEXT: .LBB8_21:
+; RV32I-NEXT: bltu a2, a6, .LBB8_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: li a5, 0
+; RV32I-NEXT: bnez a2, .LBB8_24
+; RV32I-NEXT: j .LBB8_25
+; RV32I-NEXT: .LBB8_23:
+; RV32I-NEXT: or t1, t3, t6
+; RV32I-NEXT: or a3, t4, t5
+; RV32I-NEXT: beqz a2, .LBB8_25
+; RV32I-NEXT: .LBB8_24:
+; RV32I-NEXT: mv t2, t1
+; RV32I-NEXT: mv a1, a3
+; RV32I-NEXT: .LBB8_25:
+; RV32I-NEXT: sw a4, 0(a0)
+; RV32I-NEXT: sw a5, 4(a0)
+; RV32I-NEXT: sw t2, 8(a0)
+; RV32I-NEXT: sw a1, 12(a0)
+; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: shl128:
+; RV64I: # %bb.0:
+; RV64I-NEXT: mv a3, a0
+; RV64I-NEXT: li a4, 64
+; RV64I-NEXT: bltu a2, a4, .LBB8_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: li a0, 0
+; RV64I-NEXT: sub a4, a2, a4
+; RV64I-NEXT: sll a3, a3, a4
+; RV64I-NEXT: bnez a2, .LBB8_3
+; RV64I-NEXT: j .LBB8_4
+; RV64I-NEXT: .LBB8_2:
+; RV64I-NEXT: sll a0, a3, a2
+; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: srl a3, a3, a4
+; RV64I-NEXT: sll a4, a1, a2
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: beqz a2, .LBB8_4
+; RV64I-NEXT: .LBB8_3:
+; RV64I-NEXT: mv a1, a3
+; RV64I-NEXT: .LBB8_4:
+; RV64I-NEXT: ret
+ %1 = shl i128 %a, %b
+ ret i128 %1
+}
+
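+; Even at minsize the funnel shifts appear to be expanded inline: RV64 folds
+; the i64 rotate into a neg/srl/sll/or sequence, and the i128 case below
+; spills s0-s4 for the multi-word expansion.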
+define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind {
+; RV32I-LABEL: fshr64_minsize:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a5, a2, 63
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: bltu a5, a4, .LBB9_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a6, a1, a5
+; RV32I-NEXT: j .LBB9_3
+; RV32I-NEXT: .LBB9_2:
+; RV32I-NEXT: srl a3, a0, a2
+; RV32I-NEXT: neg a6, a5
+; RV32I-NEXT: sll a6, a1, a6
+; RV32I-NEXT: or a6, a3, a6
+; RV32I-NEXT: .LBB9_3:
+; RV32I-NEXT: mv a3, a0
+; RV32I-NEXT: beqz a5, .LBB9_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, a6
+; RV32I-NEXT: .LBB9_5:
+; RV32I-NEXT: neg a6, a2
+; RV32I-NEXT: bltu a5, a4, .LBB9_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li a2, 0
+; RV32I-NEXT: j .LBB9_8
+; RV32I-NEXT: .LBB9_7:
+; RV32I-NEXT: srl a2, a1, a2
+; RV32I-NEXT: .LBB9_8:
+; RV32I-NEXT: andi a5, a6, 63
+; RV32I-NEXT: bltu a5, a4, .LBB9_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: sll a0, a0, a5
+; RV32I-NEXT: bnez a5, .LBB9_11
+; RV32I-NEXT: j .LBB9_12
+; RV32I-NEXT: .LBB9_10:
+; RV32I-NEXT: sll a4, a0, a6
+; RV32I-NEXT: neg a7, a5
+; RV32I-NEXT: srl a0, a0, a7
+; RV32I-NEXT: sll a6, a1, a6
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: beqz a5, .LBB9_12
+; RV32I-NEXT: .LBB9_11:
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB9_12:
+; RV32I-NEXT: or a0, a3, a4
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: fshr64_minsize:
+; RV64I: # %bb.0:
+; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: srl a1, a0, a1
+; RV64I-NEXT: sll a0, a0, a2
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+ %res = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b)
+ ret i64 %res
+}
+
+define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
+; RV32I-LABEL: fshr128_minsize:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lw t3, 0(a2)
+; RV32I-NEXT: lw a2, 8(a1)
+; RV32I-NEXT: lw a3, 12(a1)
+; RV32I-NEXT: andi t4, t3, 127
+; RV32I-NEXT: li a6, 32
+; RV32I-NEXT: neg t6, t4
+; RV32I-NEXT: sll t5, a3, t6
+; RV32I-NEXT: bltu t4, a6, .LBB10_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a5, a3, t4
+; RV32I-NEXT: j .LBB10_3
+; RV32I-NEXT: .LBB10_2:
+; RV32I-NEXT: srl a4, a2, t3
+; RV32I-NEXT: or a5, a4, t5
+; RV32I-NEXT: .LBB10_3:
+; RV32I-NEXT: mv a4, a2
+; RV32I-NEXT: beqz t4, .LBB10_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a4, a5
+; RV32I-NEXT: .LBB10_5:
+; RV32I-NEXT: lw a7, 0(a1)
+; RV32I-NEXT: lw a5, 4(a1)
+; RV32I-NEXT: bltu t4, a6, .LBB10_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: srl t2, a5, t4
+; RV32I-NEXT: j .LBB10_8
+; RV32I-NEXT: .LBB10_7:
+; RV32I-NEXT: srl a1, a3, t3
+; RV32I-NEXT: srl t0, a7, t3
+; RV32I-NEXT: sll t1, a5, t6
+; RV32I-NEXT: or t2, t0, t1
+; RV32I-NEXT: .LBB10_8:
+; RV32I-NEXT: li t0, 64
+; RV32I-NEXT: mv t1, a7
+; RV32I-NEXT: beqz t4, .LBB10_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t1, t2
+; RV32I-NEXT: .LBB10_10:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: sw s0, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s0, t0, t4
+; RV32I-NEXT: bltu t4, a6, .LBB10_13
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t2, 0
+; RV32I-NEXT: bgeu s0, a6, .LBB10_14
+; RV32I-NEXT: .LBB10_12:
+; RV32I-NEXT: sll t6, a2, t6
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl s1, a2, s1
+; RV32I-NEXT: or s2, s1, t5
+; RV32I-NEXT: j .LBB10_15
+; RV32I-NEXT: .LBB10_13:
+; RV32I-NEXT: srl t2, a5, t3
+; RV32I-NEXT: bltu s0, a6, .LBB10_12
+; RV32I-NEXT: .LBB10_14:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s2, a2, s0
+; RV32I-NEXT: .LBB10_15:
+; RV32I-NEXT: addi s1, t4, -64
+; RV32I-NEXT: mv t5, a3
+; RV32I-NEXT: beqz s0, .LBB10_17
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: mv t5, s2
+; RV32I-NEXT: .LBB10_17:
+; RV32I-NEXT: bltu s1, a6, .LBB10_19
+; RV32I-NEXT: # %bb.18:
+; RV32I-NEXT: srl s2, a3, s1
+; RV32I-NEXT: j .LBB10_20
+; RV32I-NEXT: .LBB10_19:
+; RV32I-NEXT: srl s0, a2, t4
+; RV32I-NEXT: neg s2, s1
+; RV32I-NEXT: sll s2, a3, s2
+; RV32I-NEXT: or s2, s0, s2
+; RV32I-NEXT: .LBB10_20:
+; RV32I-NEXT: mv s0, a2
+; RV32I-NEXT: beqz s1, .LBB10_22
+; RV32I-NEXT: # %bb.21:
+; RV32I-NEXT: mv s0, s2
+; RV32I-NEXT: .LBB10_22:
+; RV32I-NEXT: bltu s1, a6, .LBB10_24
+; RV32I-NEXT: # %bb.23:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bltu t4, t0, .LBB10_25
+; RV32I-NEXT: j .LBB10_26
+; RV32I-NEXT: .LBB10_24:
+; RV32I-NEXT: srl s1, a3, t4
+; RV32I-NEXT: bgeu t4, t0, .LBB10_26
+; RV32I-NEXT: .LBB10_25:
+; RV32I-NEXT: or s0, t1, t6
+; RV32I-NEXT: or s1, t2, t5
+; RV32I-NEXT: .LBB10_26:
+; RV32I-NEXT: mv t1, a7
+; RV32I-NEXT: mv t2, a5
+; RV32I-NEXT: beqz t4, .LBB10_28
+; RV32I-NEXT: # %bb.27:
+; RV32I-NEXT: mv t1, s0
+; RV32I-NEXT: mv t2, s1
+; RV32I-NEXT: .LBB10_28:
+; RV32I-NEXT: neg t6, t3
+; RV32I-NEXT: bltu t4, t0, .LBB10_30
+; RV32I-NEXT: # %bb.29:
+; RV32I-NEXT: li a4, 0
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: .LBB10_30:
+; RV32I-NEXT: andi t3, t6, 127
+; RV32I-NEXT: neg s2, t3
+; RV32I-NEXT: srl s0, a7, s2
+; RV32I-NEXT: bltu t3, a6, .LBB10_32
+; RV32I-NEXT: # %bb.31:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: sll s3, a7, t3
+; RV32I-NEXT: j .LBB10_33
+; RV32I-NEXT: .LBB10_32:
+; RV32I-NEXT: sll t4, a7, t6
+; RV32I-NEXT: sll t5, a5, t6
+; RV32I-NEXT: or s3, s0, t5
+; RV32I-NEXT: .LBB10_33:
+; RV32I-NEXT: sub s1, t0, t3
+; RV32I-NEXT: mv t5, a5
+; RV32I-NEXT: beqz t3, .LBB10_35
+; RV32I-NEXT: # %bb.34:
+; RV32I-NEXT: mv t5, s3
+; RV32I-NEXT: .LBB10_35:
+; RV32I-NEXT: bltu s1, a6, .LBB10_37
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: srl s3, a5, s1
+; RV32I-NEXT: j .LBB10_38
+; RV32I-NEXT: .LBB10_37:
+; RV32I-NEXT: neg s3, s1
+; RV32I-NEXT: sll s3, a5, s3
+; RV32I-NEXT: or s3, s0, s3
+; RV32I-NEXT: .LBB10_38:
+; RV32I-NEXT: mv s0, a7
+; RV32I-NEXT: beqz s1, .LBB10_40
+; RV32I-NEXT: # %bb.39:
+; RV32I-NEXT: mv s0, s3
+; RV32I-NEXT: .LBB10_40:
+; RV32I-NEXT: bltu s1, a6, .LBB10_43
+; RV32I-NEXT: # %bb.41:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bgeu t3, a6, .LBB10_44
+; RV32I-NEXT: .LBB10_42:
+; RV32I-NEXT: sll s3, a2, t6
+; RV32I-NEXT: srl s2, a2, s2
+; RV32I-NEXT: sll t6, a3, t6
+; RV32I-NEXT: or s4, s2, t6
+; RV32I-NEXT: j .LBB10_45
+; RV32I-NEXT: .LBB10_43:
+; RV32I-NEXT: srl s1, a5, s2
+; RV32I-NEXT: bltu t3, a6, .LBB10_42
+; RV32I-NEXT: .LBB10_44:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll s4, a2, t3
+; RV32I-NEXT: .LBB10_45:
+; RV32I-NEXT: addi s2, t3, -64
+; RV32I-NEXT: mv t6, a3
+; RV32I-NEXT: beqz t3, .LBB10_47
+; RV32I-NEXT: # %bb.46:
+; RV32I-NEXT: mv t6, s4
+; RV32I-NEXT: .LBB10_47:
+; RV32I-NEXT: bltu s2, a6, .LBB10_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: sll a7, a7, s2
+; RV32I-NEXT: bnez s2, .LBB10_50
+; RV32I-NEXT: j .LBB10_51
+; RV32I-NEXT: .LBB10_49:
+; RV32I-NEXT: sll a6, a7, t3
+; RV32I-NEXT: neg s4, s2
+; RV32I-NEXT: srl a7, a7, s4
+; RV32I-NEXT: sll s4, a5, t3
+; RV32I-NEXT: or a7, a7, s4
+; RV32I-NEXT: beqz s2, .LBB10_51
+; RV32I-NEXT: .LBB10_50:
+; RV32I-NEXT: mv a5, a7
+; RV32I-NEXT: .LBB10_51:
+; RV32I-NEXT: bltu t3, t0, .LBB10_53
+; RV32I-NEXT: # %bb.52:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: bnez t3, .LBB10_54
+; RV32I-NEXT: j .LBB10_55
+; RV32I-NEXT: .LBB10_53:
+; RV32I-NEXT: or a6, s0, s3
+; RV32I-NEXT: or a5, s1, t6
+; RV32I-NEXT: beqz t3, .LBB10_55
+; RV32I-NEXT: .LBB10_54:
+; RV32I-NEXT: mv a2, a6
+; RV32I-NEXT: mv a3, a5
+; RV32I-NEXT: .LBB10_55:
+; RV32I-NEXT: or a5, t1, t4
+; RV32I-NEXT: or a6, t2, t5
+; RV32I-NEXT: or a2, a4, a2
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: sw a5, 0(a0)
+; RV32I-NEXT: sw a6, 4(a0)
+; RV32I-NEXT: sw a2, 8(a0)
+; RV32I-NEXT: sw a1, 12(a0)
+; RV32I-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: fshr128_minsize:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a5, a2, 127
+; RV64I-NEXT: li a4, 64
+; RV64I-NEXT: bltu a5, a4, .LBB10_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sub a3, a5, a4
+; RV64I-NEXT: srl a6, a1, a3
+; RV64I-NEXT: j .LBB10_3
+; RV64I-NEXT: .LBB10_2:
+; RV64I-NEXT: srl a3, a0, a2
+; RV64I-NEXT: negw a6, a5
+; RV64I-NEXT: sll a6, a1, a6
+; RV64I-NEXT: or a6, a3, a6
+; RV64I-NEXT: .LBB10_3:
+; RV64I-NEXT: mv a3, a0
+; RV64I-NEXT: beqz a5, .LBB10_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a3, a6
+; RV64I-NEXT: .LBB10_5:
+; RV64I-NEXT: neg a7, a2
+; RV64I-NEXT: bltu a5, a4, .LBB10_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: li a2, 0
+; RV64I-NEXT: j .LBB10_8
+; RV64I-NEXT: .LBB10_7:
+; RV64I-NEXT: srl a2, a1, a2
+; RV64I-NEXT: .LBB10_8:
+; RV64I-NEXT: andi a6, a7, 127
+; RV64I-NEXT: bltu a6, a4, .LBB10_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: li a5, 0
+; RV64I-NEXT: sub a4, a6, a4
+; RV64I-NEXT: sll a0, a0, a4
+; RV64I-NEXT: bnez a6, .LBB10_11
+; RV64I-NEXT: j .LBB10_12
+; RV64I-NEXT: .LBB10_10:
+; RV64I-NEXT: sll a5, a0, a7
+; RV64I-NEXT: negw a4, a6
+; RV64I-NEXT: srl a0, a0, a4
+; RV64I-NEXT: sll a4, a1, a7
+; RV64I-NEXT: or a0, a0, a4
+; RV64I-NEXT: beqz a6, .LBB10_12
+; RV64I-NEXT: .LBB10_11:
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: .LBB10_12:
+; RV64I-NEXT: or a0, a3, a5
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: ret
+ %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 %b)
+ ret i128 %res
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
new file mode 100644
index 00000000000000..4ede6932428989
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -0,0 +1,10982 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs -global-isel < %s | FileCheck %s -check-prefixes=RV64I
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -global-isel < %s | FileCheck %s -check-prefixes=RV32I
+
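+; Each test loads the shifted value and the byte offset with byte-sized
+; accesses (the IR loads and stores are align 1), converts the byte offset to
+; a bit offset with a shl by 3, performs the shift, and stores the result back
+; one byte at a time.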
+define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_4bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a0, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: lbu a4, 0(a1)
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
+; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a4, a6, a4
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a1, a1, a4
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: srlw a0, a0, a1
+; RV64I-NEXT: srliw a1, a0, 16
+; RV64I-NEXT: slli a3, a0, 48
+; RV64I-NEXT: srliw a4, a0, 24
+; RV64I-NEXT: srli a3, a3, 48
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb a3, 1(a2)
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: sb a4, 3(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_4bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: lbu a4, 0(a1)
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a4, a6, a4
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: or a1, a1, a4
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: slli a3, a0, 16
+; RV32I-NEXT: srli a4, a0, 24
+; RV32I-NEXT: srli a3, a3, 16
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a0, 0(a2)
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: sb a1, 2(a2)
+; RV32I-NEXT: sb a4, 3(a2)
+; RV32I-NEXT: ret
+ %src = load i32, ptr %src.ptr, align 1
+ %byteOff = load i32, ptr %byteOff.ptr, align 1
+ %bitOff = shl i32 %byteOff, 3
+ %res = lshr i32 %src, %bitOff
+ store i32 %res, ptr %dst, align 1
+ ret void
+}
+define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_4bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a0, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: lbu a4, 0(a1)
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
+; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a4, a6, a4
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a1, a1, a4
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: sllw a0, a0, a1
+; RV64I-NEXT: srliw a1, a0, 16
+; RV64I-NEXT: slli a3, a0, 48
+; RV64I-NEXT: srliw a4, a0, 24
+; RV64I-NEXT: srli a3, a3, 48
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb a3, 1(a2)
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: sb a4, 3(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_4bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: lbu a4, 0(a1)
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a4, a6, a4
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: or a1, a1, a4
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: sll a0, a0, a1
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: slli a3, a0, 16
+; RV32I-NEXT: srli a4, a0, 24
+; RV32I-NEXT: srli a3, a3, 16
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a0, 0(a2)
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: sb a1, 2(a2)
+; RV32I-NEXT: sb a4, 3(a2)
+; RV32I-NEXT: ret
+ %src = load i32, ptr %src.ptr, align 1
+ %byteOff = load i32, ptr %byteOff.ptr, align 1
+ %bitOff = shl i32 %byteOff, 3
+ %res = shl i32 %src, %bitOff
+ store i32 %res, ptr %dst, align 1
+ ret void
+}
+define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_4bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a0, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: lbu a4, 0(a1)
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
+; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a4, a6, a4
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a1, a1, a4
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: sraw a0, a0, a1
+; RV64I-NEXT: srliw a1, a0, 16
+; RV64I-NEXT: slli a3, a0, 48
+; RV64I-NEXT: srliw a4, a0, 24
+; RV64I-NEXT: srli a3, a3, 48
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb a3, 1(a2)
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: sb a4, 3(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: ashr_4bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: lbu a4, 0(a1)
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a4, a6, a4
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: or a1, a1, a4
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: sra a0, a0, a1
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: slli a3, a0, 16
+; RV32I-NEXT: srli a4, a0, 24
+; RV32I-NEXT: srli a3, a3, 16
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a0, 0(a2)
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: sb a1, 2(a2)
+; RV32I-NEXT: sb a4, 3(a2)
+; RV32I-NEXT: ret
+ %src = load i32, ptr %src.ptr, align 1
+ %byteOff = load i32, ptr %byteOff.ptr, align 1
+ %bitOff = shl i32 %byteOff, 3
+ %res = ashr i32 %src, %bitOff
+ store i32 %res, ptr %dst, align 1
+ ret void
+}
+
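+; From 8 bytes up, RV32 has to split the value: as the checks below suggest,
+; the generated code branches on whether the bit offset reaches 32 and
+; combines the two halves with a cross-word shift, mirroring the i64
+; expansion in shifts.ll.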
+define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_8bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 0(a1)
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu t2, 2(a1)
+; RV64I-NEXT: lbu t3, 3(a1)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 4(a1)
+; RV64I-NEXT: lbu t0, 5(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: or t2, t3, t2
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a6, t0, a6
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lui a4, 16
+; RV64I-NEXT: addi a4, a4, -1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: or a5, t2, a5
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: srl a0, a0, a1
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: srliw a3, a0, 16
+; RV64I-NEXT: and a5, a0, a4
+; RV64I-NEXT: srliw a6, a0, 24
+; RV64I-NEXT: srli a7, a0, 48
+; RV64I-NEXT: srli t0, a0, 56
+; RV64I-NEXT: srli a5, a5, 8
+; RV64I-NEXT: and a4, a1, a4
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb a5, 1(a2)
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_8bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu a0, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or t1, a0, t1
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: slli a0, a4, 16
+; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: slli a3, t1, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: bltu a1, a4, .LBB3_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a5, a3, a1
+; RV32I-NEXT: bnez a1, .LBB3_3
+; RV32I-NEXT: j .LBB3_4
+; RV32I-NEXT: .LBB3_2:
+; RV32I-NEXT: srl a5, a0, a1
+; RV32I-NEXT: neg a6, a1
+; RV32I-NEXT: sll a6, a3, a6
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: beqz a1, .LBB3_4
+; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: mv a0, a5
+; RV32I-NEXT: .LBB3_4:
+; RV32I-NEXT: bltu a1, a4, .LBB3_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB3_7
+; RV32I-NEXT: .LBB3_6:
+; RV32I-NEXT: srl a1, a3, a1
+; RV32I-NEXT: .LBB3_7:
+; RV32I-NEXT: srli a3, a0, 16
+; RV32I-NEXT: lui a4, 16
+; RV32I-NEXT: srli a5, a0, 24
+; RV32I-NEXT: srli a6, a1, 16
+; RV32I-NEXT: srli a7, a1, 24
+; RV32I-NEXT: addi a4, a4, -1
+; RV32I-NEXT: and t0, a0, a4
+; RV32I-NEXT: and a4, a1, a4
+; RV32I-NEXT: srli t0, t0, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a0, 0(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: sb a5, 3(a2)
+; RV32I-NEXT: sb a1, 4(a2)
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: sb a6, 6(a2)
+; RV32I-NEXT: sb a7, 7(a2)
+; RV32I-NEXT: ret
+ %src = load i64, ptr %src.ptr, align 1
+ %byteOff = load i64, ptr %byteOff.ptr, align 1
+ %bitOff = shl i64 %byteOff, 3
+ %res = lshr i64 %src, %bitOff
+ store i64 %res, ptr %dst, align 1
+ ret void
+}
+define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_8bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 0(a1)
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu t2, 2(a1)
+; RV64I-NEXT: lbu t3, 3(a1)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 4(a1)
+; RV64I-NEXT: lbu t0, 5(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: or t2, t3, t2
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a6, t0, a6
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lui a4, 16
+; RV64I-NEXT: addi a4, a4, -1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: or a5, t2, a5
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: sll a0, a0, a1
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: srliw a3, a0, 16
+; RV64I-NEXT: and a5, a0, a4
+; RV64I-NEXT: srliw a6, a0, 24
+; RV64I-NEXT: srli a7, a0, 48
+; RV64I-NEXT: srli t0, a0, 56
+; RV64I-NEXT: srli a5, a5, 8
+; RV64I-NEXT: and a4, a1, a4
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb a5, 1(a2)
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_8bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu a0, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or a0, a0, t1
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: slli a4, a4, 16
+; RV32I-NEXT: or a4, a4, a3
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a3, a1, a6
+; RV32I-NEXT: slli a3, a3, 3
+; RV32I-NEXT: li a1, 32
+; RV32I-NEXT: or a0, a0, a5
+; RV32I-NEXT: bltu a3, a1, .LBB4_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: sll a4, a4, a3
+; RV32I-NEXT: bnez a3, .LBB4_3
+; RV32I-NEXT: j .LBB4_4
+; RV32I-NEXT: .LBB4_2:
+; RV32I-NEXT: sll a1, a4, a3
+; RV32I-NEXT: neg a5, a3
+; RV32I-NEXT: srl a4, a4, a5
+; RV32I-NEXT: sll a5, a0, a3
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: beqz a3, .LBB4_4
+; RV32I-NEXT: .LBB4_3:
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB4_4:
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: lui a4, 16
+; RV32I-NEXT: srli a5, a1, 24
+; RV32I-NEXT: srli a6, a0, 16
+; RV32I-NEXT: srli a7, a0, 24
+; RV32I-NEXT: addi a4, a4, -1
+; RV32I-NEXT: and t0, a1, a4
+; RV32I-NEXT: and a4, a0, a4
+; RV32I-NEXT: srli t0, t0, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: sb a5, 3(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: sb a6, 6(a2)
+; RV32I-NEXT: sb a7, 7(a2)
+; RV32I-NEXT: ret
+ %src = load i64, ptr %src.ptr, align 1
+ %byteOff = load i64, ptr %byteOff.ptr, align 1
+ %bitOff = shl i64 %byteOff, 3
+ %res = shl i64 %src, %bitOff
+ store i64 %res, ptr %dst, align 1
+ ret void
+}
+define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_8bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 0(a1)
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu t2, 2(a1)
+; RV64I-NEXT: lbu t3, 3(a1)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 4(a1)
+; RV64I-NEXT: lbu t0, 5(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: or t2, t3, t2
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a6, t0, a6
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lui a4, 16
+; RV64I-NEXT: addi a4, a4, -1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: or a5, t2, a5
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: sra a0, a0, a1
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: srliw a3, a0, 16
+; RV64I-NEXT: and a5, a0, a4
+; RV64I-NEXT: srliw a6, a0, 24
+; RV64I-NEXT: srli a7, a0, 48
+; RV64I-NEXT: srli t0, a0, 56
+; RV64I-NEXT: srli a5, a5, 8
+; RV64I-NEXT: and a4, a1, a4
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb a5, 1(a2)
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: ashr_8bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu a0, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or t1, a0, t1
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: slli a0, a4, 16
+; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: slli a3, t1, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: li a4, 32
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: bltu a1, a4, .LBB5_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra a5, a3, a1
+; RV32I-NEXT: bnez a1, .LBB5_3
+; RV32I-NEXT: j .LBB5_4
+; RV32I-NEXT: .LBB5_2:
+; RV32I-NEXT: srl a5, a0, a1
+; RV32I-NEXT: neg a6, a1
+; RV32I-NEXT: sll a6, a3, a6
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: beqz a1, .LBB5_4
+; RV32I-NEXT: .LBB5_3:
+; RV32I-NEXT: mv a0, a5
+; RV32I-NEXT: .LBB5_4:
+; RV32I-NEXT: bltu a1, a4, .LBB5_6
+; RV32I-NEXT: # %bb.5:
+; RV32I-NEXT: srai a1, a3, 31
+; RV32I-NEXT: j .LBB5_7
+; RV32I-NEXT: .LBB5_6:
+; RV32I-NEXT: sra a1, a3, a1
+; RV32I-NEXT: .LBB5_7:
+; RV32I-NEXT: srli a3, a0, 16
+; RV32I-NEXT: lui a4, 16
+; RV32I-NEXT: srli a5, a0, 24
+; RV32I-NEXT: srli a6, a1, 16
+; RV32I-NEXT: srli a7, a1, 24
+; RV32I-NEXT: addi a4, a4, -1
+; RV32I-NEXT: and t0, a0, a4
+; RV32I-NEXT: and a4, a1, a4
+; RV32I-NEXT: srli t0, t0, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a0, 0(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: sb a5, 3(a2)
+; RV32I-NEXT: sb a1, 4(a2)
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: sb a6, 6(a2)
+; RV32I-NEXT: sb a7, 7(a2)
+; RV32I-NEXT: ret
+ %src = load i64, ptr %src.ptr, align 1
+ %byteOff = load i64, ptr %byteOff.ptr, align 1
+ %bitOff = shl i64 %byteOff, 3
+ %res = ashr i64 %src, %bitOff
+ store i64 %res, ptr %dst, align 1
+ ret void
+}
+
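+; Test an i128 logical right shift where the byte offset is loaded from memory;
+; on RV32 this expands to a branchy multi-word shift sequence.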
+define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_16bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu s0, 14(a0)
+; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or t1, t4, t3
+; RV64I-NEXT: or t2, t6, t5
+; RV64I-NEXT: lbu t3, 0(a1)
+; RV64I-NEXT: lbu t4, 1(a1)
+; RV64I-NEXT: lbu t5, 2(a1)
+; RV64I-NEXT: lbu t6, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a0, a0, s0
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t3, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu s0, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, s0
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or a5, t5, a6
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t3
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a6, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a4, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: li a4, 64
+; RV64I-NEXT: or a3, a6, a7
+; RV64I-NEXT: bltu a1, a4, .LBB6_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: srl a5, a3, a5
+; RV64I-NEXT: bnez a1, .LBB6_3
+; RV64I-NEXT: j .LBB6_4
+; RV64I-NEXT: .LBB6_2:
+; RV64I-NEXT: srl a5, a0, a1
+; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: sll a6, a3, a6
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: beqz a1, .LBB6_4
+; RV64I-NEXT: .LBB6_3:
+; RV64I-NEXT: mv a0, a5
+; RV64I-NEXT: .LBB6_4:
+; RV64I-NEXT: bltu a1, a4, .LBB6_6
+; RV64I-NEXT: # %bb.5:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: j .LBB6_7
+; RV64I-NEXT: .LBB6_6:
+; RV64I-NEXT: srl a1, a3, a1
+; RV64I-NEXT: .LBB6_7:
+; RV64I-NEXT: srli a3, a0, 32
+; RV64I-NEXT: srliw a4, a0, 16
+; RV64I-NEXT: lui a5, 16
+; RV64I-NEXT: srliw a6, a0, 24
+; RV64I-NEXT: srli a7, a0, 48
+; RV64I-NEXT: srli t0, a0, 56
+; RV64I-NEXT: srli t1, a1, 32
+; RV64I-NEXT: srliw t2, a1, 16
+; RV64I-NEXT: srliw t3, a1, 24
+; RV64I-NEXT: srli t4, a1, 48
+; RV64I-NEXT: srli t5, a1, 56
+; RV64I-NEXT: addi a5, a5, -1
+; RV64I-NEXT: and t6, a0, a5
+; RV64I-NEXT: srli t6, t6, 8
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb t6, 1(a2)
+; RV64I-NEXT: sb a4, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: and a0, a3, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: sb a0, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: and a0, a1, a5
+; RV64I-NEXT: and a3, t1, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: sb t2, 10(a2)
+; RV64I-NEXT: sb t3, 11(a2)
+; RV64I-NEXT: sb t1, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t4, 14(a2)
+; RV64I-NEXT: sb t5, 15(a2)
+; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_16bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a5, 1(a0)
+; RV32I-NEXT: lbu a6, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu a4, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t4, 8(a0)
+; RV32I-NEXT: lbu t5, 9(a0)
+; RV32I-NEXT: lbu t6, 10(a0)
+; RV32I-NEXT: lbu s0, 11(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t3, a7, a6
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu a6, 12(a0)
+; RV32I-NEXT: lbu a7, 13(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or t6, a7, a6
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu s0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or t2, a0, t2
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or s1, a7, a6
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or s0, a1, s0
+; RV32I-NEXT: li a7, 32
+; RV32I-NEXT: slli a1, a5, 8
+; RV32I-NEXT: slli a0, t0, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli s0, s0, 16
+; RV32I-NEXT: or a6, t5, t4
+; RV32I-NEXT: or t0, t2, t6
+; RV32I-NEXT: or a5, s0, s1
+; RV32I-NEXT: slli a5, a5, 3
+; RV32I-NEXT: srl t2, a6, a5
+; RV32I-NEXT: neg t5, a5
+; RV32I-NEXT: sll t4, t0, t5
+; RV32I-NEXT: bltu a5, a7, .LBB6_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl t6, t0, a5
+; RV32I-NEXT: j .LBB6_3
+; RV32I-NEXT: .LBB6_2:
+; RV32I-NEXT: or t6, t2, t4
+; RV32I-NEXT: .LBB6_3:
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: or a3, a0, a4
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: mv a0, a6
+; RV32I-NEXT: beqz a5, .LBB6_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a0, t6
+; RV32I-NEXT: .LBB6_5:
+; RV32I-NEXT: or a4, t3, a1
+; RV32I-NEXT: or a3, t1, a3
+; RV32I-NEXT: bltu a5, a7, .LBB6_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: srl t6, a3, a5
+; RV32I-NEXT: j .LBB6_8
+; RV32I-NEXT: .LBB6_7:
+; RV32I-NEXT: srl a1, t0, a5
+; RV32I-NEXT: srl t1, a4, a5
+; RV32I-NEXT: sll t3, a3, t5
+; RV32I-NEXT: or t6, t1, t3
+; RV32I-NEXT: .LBB6_8:
+; RV32I-NEXT: li t1, 64
+; RV32I-NEXT: mv t3, a4
+; RV32I-NEXT: beqz a5, .LBB6_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t3, t6
+; RV32I-NEXT: .LBB6_10:
+; RV32I-NEXT: sub s0, t1, a5
+; RV32I-NEXT: bltu a5, a7, .LBB6_13
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: bgeu s0, a7, .LBB6_14
+; RV32I-NEXT: .LBB6_12:
+; RV32I-NEXT: sll t5, a6, t5
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl s1, a6, s1
+; RV32I-NEXT: or s2, s1, t4
+; RV32I-NEXT: j .LBB6_15
+; RV32I-NEXT: .LBB6_13:
+; RV32I-NEXT: srl t6, a3, a5
+; RV32I-NEXT: bltu s0, a7, .LBB6_12
+; RV32I-NEXT: .LBB6_14:
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: sll s2, a6, s0
+; RV32I-NEXT: .LBB6_15:
+; RV32I-NEXT: addi s1, a5, -64
+; RV32I-NEXT: mv t4, t0
+; RV32I-NEXT: beqz s0, .LBB6_17
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: mv t4, s2
+; RV32I-NEXT: .LBB6_17:
+; RV32I-NEXT: bltu s1, a7, .LBB6_19
+; RV32I-NEXT: # %bb.18:
+; RV32I-NEXT: srl t2, t0, s1
+; RV32I-NEXT: bnez s1, .LBB6_20
+; RV32I-NEXT: j .LBB6_21
+; RV32I-NEXT: .LBB6_19:
+; RV32I-NEXT: neg s0, s1
+; RV32I-NEXT: sll s0, t0, s0
+; RV32I-NEXT: or t2, t2, s0
+; RV32I-NEXT: beqz s1, .LBB6_21
+; RV32I-NEXT: .LBB6_20:
+; RV32I-NEXT: mv a6, t2
+; RV32I-NEXT: .LBB6_21:
+; RV32I-NEXT: bltu s1, a7, .LBB6_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: bltu a5, t1, .LBB6_24
+; RV32I-NEXT: j .LBB6_25
+; RV32I-NEXT: .LBB6_23:
+; RV32I-NEXT: srl a7, t0, a5
+; RV32I-NEXT: bgeu a5, t1, .LBB6_25
+; RV32I-NEXT: .LBB6_24:
+; RV32I-NEXT: or a6, t3, t5
+; RV32I-NEXT: or a7, t6, t4
+; RV32I-NEXT: .LBB6_25:
+; RV32I-NEXT: bnez a5, .LBB6_29
+; RV32I-NEXT: # %bb.26:
+; RV32I-NEXT: bltu a5, t1, .LBB6_28
+; RV32I-NEXT: .LBB6_27:
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: .LBB6_28:
+; RV32I-NEXT: srli a5, a4, 16
+; RV32I-NEXT: lui a6, 16
+; RV32I-NEXT: srli a7, a4, 24
+; RV32I-NEXT: srli t0, a3, 16
+; RV32I-NEXT: srli t1, a3, 24
+; RV32I-NEXT: srli t2, a0, 16
+; RV32I-NEXT: srli t3, a0, 24
+; RV32I-NEXT: srli t4, a1, 16
+; RV32I-NEXT: srli t5, a1, 24
+; RV32I-NEXT: addi a6, a6, -1
+; RV32I-NEXT: and t6, a4, a6
+; RV32I-NEXT: srli t6, t6, 8
+; RV32I-NEXT: sb a4, 0(a2)
+; RV32I-NEXT: sb t6, 1(a2)
+; RV32I-NEXT: sb a5, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, a3, a6
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: sb t0, 6(a2)
+; RV32I-NEXT: sb t1, 7(a2)
+; RV32I-NEXT: and a3, a0, a6
+; RV32I-NEXT: and a4, a1, a6
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a0, 8(a2)
+; RV32I-NEXT: sb a3, 9(a2)
+; RV32I-NEXT: sb t2, 10(a2)
+; RV32I-NEXT: sb t3, 11(a2)
+; RV32I-NEXT: sb a1, 12(a2)
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: sb t4, 14(a2)
+; RV32I-NEXT: sb t5, 15(a2)
+; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB6_29:
+; RV32I-NEXT: mv a4, a6
+; RV32I-NEXT: mv a3, a7
+; RV32I-NEXT: bgeu a5, t1, .LBB6_27
+; RV32I-NEXT: j .LBB6_28
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = lshr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
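+; Test the same i128 logical right shift with the offset given in 32-bit words.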
+define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_16bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu s0, 14(a0)
+; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or t1, t4, t3
+; RV64I-NEXT: or t2, t6, t5
+; RV64I-NEXT: lbu t3, 0(a1)
+; RV64I-NEXT: lbu t4, 1(a1)
+; RV64I-NEXT: lbu t5, 2(a1)
+; RV64I-NEXT: lbu t6, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a0, a0, s0
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t3, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu s0, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, s0
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or a5, t5, a6
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t3
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a6, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a4, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 5
+; RV64I-NEXT: li a4, 64
+; RV64I-NEXT: or a3, a6, a7
+; RV64I-NEXT: bltu a1, a4, .LBB7_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: srl a5, a3, a5
+; RV64I-NEXT: bnez a1, .LBB7_3
+; RV64I-NEXT: j .LBB7_4
+; RV64I-NEXT: .LBB7_2:
+; RV64I-NEXT: srl a5, a0, a1
+; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: sll a6, a3, a6
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: beqz a1, .LBB7_4
+; RV64I-NEXT: .LBB7_3:
+; RV64I-NEXT: mv a0, a5
+; RV64I-NEXT: .LBB7_4:
+; RV64I-NEXT: bltu a1, a4, .LBB7_6
+; RV64I-NEXT: # %bb.5:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: j .LBB7_7
+; RV64I-NEXT: .LBB7_6:
+; RV64I-NEXT: srl a1, a3, a1
+; RV64I-NEXT: .LBB7_7:
+; RV64I-NEXT: srli a3, a0, 32
+; RV64I-NEXT: srliw a4, a0, 16
+; RV64I-NEXT: lui a5, 16
+; RV64I-NEXT: srliw a6, a0, 24
+; RV64I-NEXT: srli a7, a0, 48
+; RV64I-NEXT: srli t0, a0, 56
+; RV64I-NEXT: srli t1, a1, 32
+; RV64I-NEXT: srliw t2, a1, 16
+; RV64I-NEXT: srliw t3, a1, 24
+; RV64I-NEXT: srli t4, a1, 48
+; RV64I-NEXT: srli t5, a1, 56
+; RV64I-NEXT: addi a5, a5, -1
+; RV64I-NEXT: and t6, a0, a5
+; RV64I-NEXT: srli t6, t6, 8
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb t6, 1(a2)
+; RV64I-NEXT: sb a4, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: and a0, a3, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: sb a0, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: and a0, a1, a5
+; RV64I-NEXT: and a3, t1, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: sb t2, 10(a2)
+; RV64I-NEXT: sb t3, 11(a2)
+; RV64I-NEXT: sb t1, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t4, 14(a2)
+; RV64I-NEXT: sb t5, 15(a2)
+; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_16bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a5, 1(a0)
+; RV32I-NEXT: lbu a6, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu a4, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t4, 8(a0)
+; RV32I-NEXT: lbu t5, 9(a0)
+; RV32I-NEXT: lbu t6, 10(a0)
+; RV32I-NEXT: lbu s0, 11(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t3, a7, a6
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu a6, 12(a0)
+; RV32I-NEXT: lbu a7, 13(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or t6, a7, a6
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu s0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or t2, a0, t2
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or s1, a7, a6
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or s0, a1, s0
+; RV32I-NEXT: li a7, 32
+; RV32I-NEXT: slli a1, a5, 8
+; RV32I-NEXT: slli a0, t0, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli s0, s0, 16
+; RV32I-NEXT: or a6, t5, t4
+; RV32I-NEXT: or t0, t2, t6
+; RV32I-NEXT: or a5, s0, s1
+; RV32I-NEXT: slli a5, a5, 5
+; RV32I-NEXT: srl t2, a6, a5
+; RV32I-NEXT: neg t5, a5
+; RV32I-NEXT: sll t4, t0, t5
+; RV32I-NEXT: bltu a5, a7, .LBB7_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl t6, t0, a5
+; RV32I-NEXT: j .LBB7_3
+; RV32I-NEXT: .LBB7_2:
+; RV32I-NEXT: or t6, t2, t4
+; RV32I-NEXT: .LBB7_3:
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: or a3, a0, a4
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: mv a0, a6
+; RV32I-NEXT: beqz a5, .LBB7_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a0, t6
+; RV32I-NEXT: .LBB7_5:
+; RV32I-NEXT: or a4, t3, a1
+; RV32I-NEXT: or a3, t1, a3
+; RV32I-NEXT: bltu a5, a7, .LBB7_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: srl t6, a3, a5
+; RV32I-NEXT: j .LBB7_8
+; RV32I-NEXT: .LBB7_7:
+; RV32I-NEXT: srl a1, t0, a5
+; RV32I-NEXT: srl t1, a4, a5
+; RV32I-NEXT: sll t3, a3, t5
+; RV32I-NEXT: or t6, t1, t3
+; RV32I-NEXT: .LBB7_8:
+; RV32I-NEXT: li t1, 64
+; RV32I-NEXT: mv t3, a4
+; RV32I-NEXT: beqz a5, .LBB7_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t3, t6
+; RV32I-NEXT: .LBB7_10:
+; RV32I-NEXT: sub s0, t1, a5
+; RV32I-NEXT: bltu a5, a7, .LBB7_13
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: bgeu s0, a7, .LBB7_14
+; RV32I-NEXT: .LBB7_12:
+; RV32I-NEXT: sll t5, a6, t5
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl s1, a6, s1
+; RV32I-NEXT: or s2, s1, t4
+; RV32I-NEXT: j .LBB7_15
+; RV32I-NEXT: .LBB7_13:
+; RV32I-NEXT: srl t6, a3, a5
+; RV32I-NEXT: bltu s0, a7, .LBB7_12
+; RV32I-NEXT: .LBB7_14:
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: sll s2, a6, s0
+; RV32I-NEXT: .LBB7_15:
+; RV32I-NEXT: addi s1, a5, -64
+; RV32I-NEXT: mv t4, t0
+; RV32I-NEXT: beqz s0, .LBB7_17
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: mv t4, s2
+; RV32I-NEXT: .LBB7_17:
+; RV32I-NEXT: bltu s1, a7, .LBB7_19
+; RV32I-NEXT: # %bb.18:
+; RV32I-NEXT: srl t2, t0, s1
+; RV32I-NEXT: bnez s1, .LBB7_20
+; RV32I-NEXT: j .LBB7_21
+; RV32I-NEXT: .LBB7_19:
+; RV32I-NEXT: neg s0, s1
+; RV32I-NEXT: sll s0, t0, s0
+; RV32I-NEXT: or t2, t2, s0
+; RV32I-NEXT: beqz s1, .LBB7_21
+; RV32I-NEXT: .LBB7_20:
+; RV32I-NEXT: mv a6, t2
+; RV32I-NEXT: .LBB7_21:
+; RV32I-NEXT: bltu s1, a7, .LBB7_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: bltu a5, t1, .LBB7_24
+; RV32I-NEXT: j .LBB7_25
+; RV32I-NEXT: .LBB7_23:
+; RV32I-NEXT: srl a7, t0, a5
+; RV32I-NEXT: bgeu a5, t1, .LBB7_25
+; RV32I-NEXT: .LBB7_24:
+; RV32I-NEXT: or a6, t3, t5
+; RV32I-NEXT: or a7, t6, t4
+; RV32I-NEXT: .LBB7_25:
+; RV32I-NEXT: bnez a5, .LBB7_29
+; RV32I-NEXT: # %bb.26:
+; RV32I-NEXT: bltu a5, t1, .LBB7_28
+; RV32I-NEXT: .LBB7_27:
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: .LBB7_28:
+; RV32I-NEXT: srli a5, a4, 16
+; RV32I-NEXT: lui a6, 16
+; RV32I-NEXT: srli a7, a4, 24
+; RV32I-NEXT: srli t0, a3, 16
+; RV32I-NEXT: srli t1, a3, 24
+; RV32I-NEXT: srli t2, a0, 16
+; RV32I-NEXT: srli t3, a0, 24
+; RV32I-NEXT: srli t4, a1, 16
+; RV32I-NEXT: srli t5, a1, 24
+; RV32I-NEXT: addi a6, a6, -1
+; RV32I-NEXT: and t6, a4, a6
+; RV32I-NEXT: srli t6, t6, 8
+; RV32I-NEXT: sb a4, 0(a2)
+; RV32I-NEXT: sb t6, 1(a2)
+; RV32I-NEXT: sb a5, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, a3, a6
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: sb t0, 6(a2)
+; RV32I-NEXT: sb t1, 7(a2)
+; RV32I-NEXT: and a3, a0, a6
+; RV32I-NEXT: and a4, a1, a6
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a0, 8(a2)
+; RV32I-NEXT: sb a3, 9(a2)
+; RV32I-NEXT: sb t2, 10(a2)
+; RV32I-NEXT: sb t3, 11(a2)
+; RV32I-NEXT: sb a1, 12(a2)
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: sb t4, 14(a2)
+; RV32I-NEXT: sb t5, 15(a2)
+; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB7_29:
+; RV32I-NEXT: mv a4, a6
+; RV32I-NEXT: mv a3, a7
+; RV32I-NEXT: bgeu a5, t1, .LBB7_27
+; RV32I-NEXT: j .LBB7_28
+ %src = load i128, ptr %src.ptr, align 1
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
+ %res = lshr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
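+; Test an i128 shift left where the byte offset is loaded from memory.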
+define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_16bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu s0, 14(a0)
+; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or t1, t4, t3
+; RV64I-NEXT: or t2, t6, t5
+; RV64I-NEXT: lbu t3, 0(a1)
+; RV64I-NEXT: lbu t4, 1(a1)
+; RV64I-NEXT: lbu t5, 2(a1)
+; RV64I-NEXT: lbu t6, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a0, a0, s0
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t3, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu s0, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, s0
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or a5, t5, a6
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t3
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a4, a4, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a3, a1, 3
+; RV64I-NEXT: li a5, 64
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: bltu a3, a5, .LBB8_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: sub a5, a3, a5
+; RV64I-NEXT: sll a4, a4, a5
+; RV64I-NEXT: bnez a3, .LBB8_3
+; RV64I-NEXT: j .LBB8_4
+; RV64I-NEXT: .LBB8_2:
+; RV64I-NEXT: sll a1, a4, a3
+; RV64I-NEXT: negw a5, a3
+; RV64I-NEXT: srl a4, a4, a5
+; RV64I-NEXT: sll a5, a0, a3
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: beqz a3, .LBB8_4
+; RV64I-NEXT: .LBB8_3:
+; RV64I-NEXT: mv a0, a4
+; RV64I-NEXT: .LBB8_4:
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: srliw a4, a1, 16
+; RV64I-NEXT: lui a5, 16
+; RV64I-NEXT: srliw a6, a1, 24
+; RV64I-NEXT: srli a7, a1, 48
+; RV64I-NEXT: srli t0, a1, 56
+; RV64I-NEXT: srli t1, a0, 32
+; RV64I-NEXT: srliw t2, a0, 16
+; RV64I-NEXT: srliw t3, a0, 24
+; RV64I-NEXT: srli t4, a0, 48
+; RV64I-NEXT: srli t5, a0, 56
+; RV64I-NEXT: addi a5, a5, -1
+; RV64I-NEXT: and t6, a1, a5
+; RV64I-NEXT: srli t6, t6, 8
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: sb t6, 1(a2)
+; RV64I-NEXT: sb a4, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: and a1, a3, a5
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: and a1, a0, a5
+; RV64I-NEXT: and a3, t1, a5
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: sb t2, 10(a2)
+; RV64I-NEXT: sb t3, 11(a2)
+; RV64I-NEXT: sb t1, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t4, 14(a2)
+; RV64I-NEXT: sb t5, 15(a2)
+; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_16bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: lbu t1, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, a6
+; RV32I-NEXT: li a6, 64
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: li t1, 32
+; RV32I-NEXT: slli a4, a4, 16
+; RV32I-NEXT: slli t2, t0, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or t0, a4, a3
+; RV32I-NEXT: or a4, t2, a5
+; RV32I-NEXT: or a5, a1, a7
+; RV32I-NEXT: slli a5, a5, 3
+; RV32I-NEXT: neg t3, a5
+; RV32I-NEXT: srl t4, t0, t3
+; RV32I-NEXT: sll t2, a4, a5
+; RV32I-NEXT: bltu a5, t1, .LBB8_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: sll t5, t0, a5
+; RV32I-NEXT: j .LBB8_3
+; RV32I-NEXT: .LBB8_2:
+; RV32I-NEXT: sll a1, t0, a5
+; RV32I-NEXT: or t5, t4, t2
+; RV32I-NEXT: .LBB8_3:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: sw s0, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu s3, 11(a0)
+; RV32I-NEXT: lbu s1, 15(a0)
+; RV32I-NEXT: sub a7, a6, a5
+; RV32I-NEXT: mv a3, a4
+; RV32I-NEXT: beqz a5, .LBB8_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, t5
+; RV32I-NEXT: .LBB8_5:
+; RV32I-NEXT: lbu s2, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu s0, 13(a0)
+; RV32I-NEXT: lbu t6, 14(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: bltu a7, t1, .LBB8_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl s4, a4, a7
+; RV32I-NEXT: j .LBB8_8
+; RV32I-NEXT: .LBB8_7:
+; RV32I-NEXT: neg s4, a7
+; RV32I-NEXT: sll s4, a4, s4
+; RV32I-NEXT: or s4, t4, s4
+; RV32I-NEXT: .LBB8_8:
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: lbu s6, 8(a0)
+; RV32I-NEXT: lbu s5, 12(a0)
+; RV32I-NEXT: or s3, s3, t5
+; RV32I-NEXT: slli t5, s0, 8
+; RV32I-NEXT: or s1, s1, t6
+; RV32I-NEXT: mv t4, t0
+; RV32I-NEXT: beqz a7, .LBB8_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t4, s4
+; RV32I-NEXT: .LBB8_10:
+; RV32I-NEXT: or a0, s2, s6
+; RV32I-NEXT: slli s0, s3, 16
+; RV32I-NEXT: or t6, t5, s5
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: bltu a7, t1, .LBB8_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: j .LBB8_13
+; RV32I-NEXT: .LBB8_12:
+; RV32I-NEXT: srl t5, a4, t3
+; RV32I-NEXT: .LBB8_13:
+; RV32I-NEXT: or a7, s0, a0
+; RV32I-NEXT: or a0, s1, t6
+; RV32I-NEXT: bltu a5, t1, .LBB8_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s1, a7, a5
+; RV32I-NEXT: j .LBB8_16
+; RV32I-NEXT: .LBB8_15:
+; RV32I-NEXT: sll t6, a7, a5
+; RV32I-NEXT: srl t3, a7, t3
+; RV32I-NEXT: sll s0, a0, a5
+; RV32I-NEXT: or s1, t3, s0
+; RV32I-NEXT: .LBB8_16:
+; RV32I-NEXT: addi s0, a5, -64
+; RV32I-NEXT: mv t3, a0
+; RV32I-NEXT: beqz a5, .LBB8_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t3, s1
+; RV32I-NEXT: .LBB8_18:
+; RV32I-NEXT: bltu s0, t1, .LBB8_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: li t1, 0
+; RV32I-NEXT: sll t0, t0, s0
+; RV32I-NEXT: bnez s0, .LBB8_21
+; RV32I-NEXT: j .LBB8_22
+; RV32I-NEXT: .LBB8_20:
+; RV32I-NEXT: sll t1, t0, a5
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl t0, t0, s1
+; RV32I-NEXT: or t0, t0, t2
+; RV32I-NEXT: beqz s0, .LBB8_22
+; RV32I-NEXT: .LBB8_21:
+; RV32I-NEXT: mv a4, t0
+; RV32I-NEXT: .LBB8_22:
+; RV32I-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: bltu a5, a6, .LBB8_24
+; RV32I-NEXT: # %bb.23:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: bnez a5, .LBB8_25
+; RV32I-NEXT: j .LBB8_26
+; RV32I-NEXT: .LBB8_24:
+; RV32I-NEXT: or t1, t4, t6
+; RV32I-NEXT: or a4, t5, t3
+; RV32I-NEXT: beqz a5, .LBB8_26
+; RV32I-NEXT: .LBB8_25:
+; RV32I-NEXT: mv a7, t1
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB8_26:
+; RV32I-NEXT: srli a4, a1, 16
+; RV32I-NEXT: lui a5, 16
+; RV32I-NEXT: srli a6, a1, 24
+; RV32I-NEXT: srli t0, a3, 16
+; RV32I-NEXT: srli t1, a3, 24
+; RV32I-NEXT: srli t2, a7, 16
+; RV32I-NEXT: srli t3, a7, 24
+; RV32I-NEXT: srli t4, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: addi a5, a5, -1
+; RV32I-NEXT: and t6, a1, a5
+; RV32I-NEXT: srli t6, t6, 8
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb t6, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb a6, 3(a2)
+; RV32I-NEXT: and a1, a3, a5
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb a1, 5(a2)
+; RV32I-NEXT: sb t0, 6(a2)
+; RV32I-NEXT: sb t1, 7(a2)
+; RV32I-NEXT: and a1, a7, a5
+; RV32I-NEXT: and a5, a0, a5
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a7, 8(a2)
+; RV32I-NEXT: sb a1, 9(a2)
+; RV32I-NEXT: sb t2, 10(a2)
+; RV32I-NEXT: sb t3, 11(a2)
+; RV32I-NEXT: sb a0, 12(a2)
+; RV32I-NEXT: sb a5, 13(a2)
+; RV32I-NEXT: sb t4, 14(a2)
+; RV32I-NEXT: sb t5, 15(a2)
+; RV32I-NEXT: ret
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = shl i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
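+; Test the same i128 shift left with the offset given in 32-bit words.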
+define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_16bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu s0, 14(a0)
+; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or t1, t4, t3
+; RV64I-NEXT: or t2, t6, t5
+; RV64I-NEXT: lbu t3, 0(a1)
+; RV64I-NEXT: lbu t4, 1(a1)
+; RV64I-NEXT: lbu t5, 2(a1)
+; RV64I-NEXT: lbu t6, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a0, a0, s0
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t3, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu s0, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, s0
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or a5, t5, a6
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t3
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a4, a4, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a3, a1, 5
+; RV64I-NEXT: li a5, 64
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: bltu a3, a5, .LBB9_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: sub a5, a3, a5
+; RV64I-NEXT: sll a4, a4, a5
+; RV64I-NEXT: bnez a3, .LBB9_3
+; RV64I-NEXT: j .LBB9_4
+; RV64I-NEXT: .LBB9_2:
+; RV64I-NEXT: sll a1, a4, a3
+; RV64I-NEXT: negw a5, a3
+; RV64I-NEXT: srl a4, a4, a5
+; RV64I-NEXT: sll a5, a0, a3
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: beqz a3, .LBB9_4
+; RV64I-NEXT: .LBB9_3:
+; RV64I-NEXT: mv a0, a4
+; RV64I-NEXT: .LBB9_4:
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: srliw a4, a1, 16
+; RV64I-NEXT: lui a5, 16
+; RV64I-NEXT: srliw a6, a1, 24
+; RV64I-NEXT: srli a7, a1, 48
+; RV64I-NEXT: srli t0, a1, 56
+; RV64I-NEXT: srli t1, a0, 32
+; RV64I-NEXT: srliw t2, a0, 16
+; RV64I-NEXT: srliw t3, a0, 24
+; RV64I-NEXT: srli t4, a0, 48
+; RV64I-NEXT: srli t5, a0, 56
+; RV64I-NEXT: addi a5, a5, -1
+; RV64I-NEXT: and t6, a1, a5
+; RV64I-NEXT: srli t6, t6, 8
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: sb t6, 1(a2)
+; RV64I-NEXT: sb a4, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: and a1, a3, a5
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: and a1, a0, a5
+; RV64I-NEXT: and a3, t1, a5
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: sb t2, 10(a2)
+; RV64I-NEXT: sb t3, 11(a2)
+; RV64I-NEXT: sb t1, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t4, 14(a2)
+; RV64I-NEXT: sb t5, 15(a2)
+; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_16bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: lbu a6, 0(a1)
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: lbu t1, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, a6
+; RV32I-NEXT: li a6, 64
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: li t1, 32
+; RV32I-NEXT: slli a4, a4, 16
+; RV32I-NEXT: slli t2, t0, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or t0, a4, a3
+; RV32I-NEXT: or a4, t2, a5
+; RV32I-NEXT: or a5, a1, a7
+; RV32I-NEXT: slli a5, a5, 5
+; RV32I-NEXT: neg t3, a5
+; RV32I-NEXT: srl t4, t0, t3
+; RV32I-NEXT: sll t2, a4, a5
+; RV32I-NEXT: bltu a5, t1, .LBB9_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: sll t5, t0, a5
+; RV32I-NEXT: j .LBB9_3
+; RV32I-NEXT: .LBB9_2:
+; RV32I-NEXT: sll a1, t0, a5
+; RV32I-NEXT: or t5, t4, t2
+; RV32I-NEXT: .LBB9_3:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: sw s0, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu s3, 11(a0)
+; RV32I-NEXT: lbu s1, 15(a0)
+; RV32I-NEXT: sub a7, a6, a5
+; RV32I-NEXT: mv a3, a4
+; RV32I-NEXT: beqz a5, .LBB9_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, t5
+; RV32I-NEXT: .LBB9_5:
+; RV32I-NEXT: lbu s2, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu s0, 13(a0)
+; RV32I-NEXT: lbu t6, 14(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: bltu a7, t1, .LBB9_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl s4, a4, a7
+; RV32I-NEXT: j .LBB9_8
+; RV32I-NEXT: .LBB9_7:
+; RV32I-NEXT: neg s4, a7
+; RV32I-NEXT: sll s4, a4, s4
+; RV32I-NEXT: or s4, t4, s4
+; RV32I-NEXT: .LBB9_8:
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: lbu s6, 8(a0)
+; RV32I-NEXT: lbu s5, 12(a0)
+; RV32I-NEXT: or s3, s3, t5
+; RV32I-NEXT: slli t5, s0, 8
+; RV32I-NEXT: or s1, s1, t6
+; RV32I-NEXT: mv t4, t0
+; RV32I-NEXT: beqz a7, .LBB9_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t4, s4
+; RV32I-NEXT: .LBB9_10:
+; RV32I-NEXT: or a0, s2, s6
+; RV32I-NEXT: slli s0, s3, 16
+; RV32I-NEXT: or t6, t5, s5
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: bltu a7, t1, .LBB9_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: j .LBB9_13
+; RV32I-NEXT: .LBB9_12:
+; RV32I-NEXT: srl t5, a4, t3
+; RV32I-NEXT: .LBB9_13:
+; RV32I-NEXT: or a7, s0, a0
+; RV32I-NEXT: or a0, s1, t6
+; RV32I-NEXT: bltu a5, t1, .LBB9_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s1, a7, a5
+; RV32I-NEXT: j .LBB9_16
+; RV32I-NEXT: .LBB9_15:
+; RV32I-NEXT: sll t6, a7, a5
+; RV32I-NEXT: srl t3, a7, t3
+; RV32I-NEXT: sll s0, a0, a5
+; RV32I-NEXT: or s1, t3, s0
+; RV32I-NEXT: .LBB9_16:
+; RV32I-NEXT: addi s0, a5, -64
+; RV32I-NEXT: mv t3, a0
+; RV32I-NEXT: beqz a5, .LBB9_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t3, s1
+; RV32I-NEXT: .LBB9_18:
+; RV32I-NEXT: bltu s0, t1, .LBB9_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: li t1, 0
+; RV32I-NEXT: sll t0, t0, s0
+; RV32I-NEXT: bnez s0, .LBB9_21
+; RV32I-NEXT: j .LBB9_22
+; RV32I-NEXT: .LBB9_20:
+; RV32I-NEXT: sll t1, t0, a5
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl t0, t0, s1
+; RV32I-NEXT: or t0, t0, t2
+; RV32I-NEXT: beqz s0, .LBB9_22
+; RV32I-NEXT: .LBB9_21:
+; RV32I-NEXT: mv a4, t0
+; RV32I-NEXT: .LBB9_22:
+; RV32I-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: bltu a5, a6, .LBB9_24
+; RV32I-NEXT: # %bb.23:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: bnez a5, .LBB9_25
+; RV32I-NEXT: j .LBB9_26
+; RV32I-NEXT: .LBB9_24:
+; RV32I-NEXT: or t1, t4, t6
+; RV32I-NEXT: or a4, t5, t3
+; RV32I-NEXT: beqz a5, .LBB9_26
+; RV32I-NEXT: .LBB9_25:
+; RV32I-NEXT: mv a7, t1
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: .LBB9_26:
+; RV32I-NEXT: srli a4, a1, 16
+; RV32I-NEXT: lui a5, 16
+; RV32I-NEXT: srli a6, a1, 24
+; RV32I-NEXT: srli t0, a3, 16
+; RV32I-NEXT: srli t1, a3, 24
+; RV32I-NEXT: srli t2, a7, 16
+; RV32I-NEXT: srli t3, a7, 24
+; RV32I-NEXT: srli t4, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: addi a5, a5, -1
+; RV32I-NEXT: and t6, a1, a5
+; RV32I-NEXT: srli t6, t6, 8
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb t6, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb a6, 3(a2)
+; RV32I-NEXT: and a1, a3, a5
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb a1, 5(a2)
+; RV32I-NEXT: sb t0, 6(a2)
+; RV32I-NEXT: sb t1, 7(a2)
+; RV32I-NEXT: and a1, a7, a5
+; RV32I-NEXT: and a5, a0, a5
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a7, 8(a2)
+; RV32I-NEXT: sb a1, 9(a2)
+; RV32I-NEXT: sb t2, 10(a2)
+; RV32I-NEXT: sb t3, 11(a2)
+; RV32I-NEXT: sb a0, 12(a2)
+; RV32I-NEXT: sb a5, 13(a2)
+; RV32I-NEXT: sb t4, 14(a2)
+; RV32I-NEXT: sb t5, 15(a2)
+; RV32I-NEXT: ret
+ %src = load i128, ptr %src.ptr, align 1
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
+ %res = shl i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
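+; Test an i128 arithmetic right shift where the byte offset is loaded from memory;
+; the high half is filled with the sign bit once the shift reaches 64 bits.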
+define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_16bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu s0, 14(a0)
+; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or t1, t4, t3
+; RV64I-NEXT: or t2, t6, t5
+; RV64I-NEXT: lbu t3, 0(a1)
+; RV64I-NEXT: lbu t4, 1(a1)
+; RV64I-NEXT: lbu t5, 2(a1)
+; RV64I-NEXT: lbu t6, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a0, a0, s0
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t3, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu s0, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, s0
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or a5, t5, a6
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t3
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a6, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a4, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: li a4, 64
+; RV64I-NEXT: or a3, a6, a7
+; RV64I-NEXT: bltu a1, a4, .LBB10_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: sra a5, a3, a5
+; RV64I-NEXT: bnez a1, .LBB10_3
+; RV64I-NEXT: j .LBB10_4
+; RV64I-NEXT: .LBB10_2:
+; RV64I-NEXT: srl a5, a0, a1
+; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: sll a6, a3, a6
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: beqz a1, .LBB10_4
+; RV64I-NEXT: .LBB10_3:
+; RV64I-NEXT: mv a0, a5
+; RV64I-NEXT: .LBB10_4:
+; RV64I-NEXT: bltu a1, a4, .LBB10_6
+; RV64I-NEXT: # %bb.5:
+; RV64I-NEXT: srai a1, a3, 63
+; RV64I-NEXT: j .LBB10_7
+; RV64I-NEXT: .LBB10_6:
+; RV64I-NEXT: sra a1, a3, a1
+; RV64I-NEXT: .LBB10_7:
+; RV64I-NEXT: srli a3, a0, 32
+; RV64I-NEXT: srliw a4, a0, 16
+; RV64I-NEXT: lui a5, 16
+; RV64I-NEXT: srliw a6, a0, 24
+; RV64I-NEXT: srli a7, a0, 48
+; RV64I-NEXT: srli t0, a0, 56
+; RV64I-NEXT: srli t1, a1, 32
+; RV64I-NEXT: srliw t2, a1, 16
+; RV64I-NEXT: srliw t3, a1, 24
+; RV64I-NEXT: srli t4, a1, 48
+; RV64I-NEXT: srli t5, a1, 56
+; RV64I-NEXT: addi a5, a5, -1
+; RV64I-NEXT: and t6, a0, a5
+; RV64I-NEXT: srli t6, t6, 8
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb t6, 1(a2)
+; RV64I-NEXT: sb a4, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: and a0, a3, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: sb a0, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: and a0, a1, a5
+; RV64I-NEXT: and a3, t1, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: sb t2, 10(a2)
+; RV64I-NEXT: sb t3, 11(a2)
+; RV64I-NEXT: sb t1, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t4, 14(a2)
+; RV64I-NEXT: sb t5, 15(a2)
+; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: ashr_16bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a5, 1(a0)
+; RV32I-NEXT: lbu a6, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu a4, 4(a0)
+; RV32I-NEXT: lbu t2, 5(a0)
+; RV32I-NEXT: lbu t0, 6(a0)
+; RV32I-NEXT: lbu t1, 7(a0)
+; RV32I-NEXT: lbu t4, 8(a0)
+; RV32I-NEXT: lbu t5, 9(a0)
+; RV32I-NEXT: lbu t6, 10(a0)
+; RV32I-NEXT: lbu s0, 11(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t3, a7, a6
+; RV32I-NEXT: or t1, t1, t0
+; RV32I-NEXT: lbu a6, 12(a0)
+; RV32I-NEXT: lbu a7, 13(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t6, 1(a1)
+; RV32I-NEXT: lbu s0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or s1, a0, t0
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: or t6, t6, a7
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or s0, a1, s0
+; RV32I-NEXT: li t0, 32
+; RV32I-NEXT: slli a1, a5, 8
+; RV32I-NEXT: slli a0, t2, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s0, s0, 16
+; RV32I-NEXT: or a7, t5, t4
+; RV32I-NEXT: or a5, s1, a6
+; RV32I-NEXT: or a6, s0, t6
+; RV32I-NEXT: slli a6, a6, 3
+; RV32I-NEXT: srl t2, a7, a6
+; RV32I-NEXT: neg t6, a6
+; RV32I-NEXT: sll t4, a5, t6
+; RV32I-NEXT: bltu a6, t0, .LBB10_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra t5, a5, a6
+; RV32I-NEXT: j .LBB10_3
+; RV32I-NEXT: .LBB10_2:
+; RV32I-NEXT: or t5, t2, t4
+; RV32I-NEXT: .LBB10_3:
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: or a3, a0, a4
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: mv a0, a7
+; RV32I-NEXT: beqz a6, .LBB10_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a0, t5
+; RV32I-NEXT: .LBB10_5:
+; RV32I-NEXT: or a4, t3, a1
+; RV32I-NEXT: or a3, t1, a3
+; RV32I-NEXT: bltu a6, t0, .LBB10_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srai a1, a5, 31
+; RV32I-NEXT: srl t5, a3, a6
+; RV32I-NEXT: j .LBB10_8
+; RV32I-NEXT: .LBB10_7:
+; RV32I-NEXT: sra a1, a5, a6
+; RV32I-NEXT: srl t1, a4, a6
+; RV32I-NEXT: sll t3, a3, t6
+; RV32I-NEXT: or t5, t1, t3
+; RV32I-NEXT: .LBB10_8:
+; RV32I-NEXT: li t1, 64
+; RV32I-NEXT: mv t3, a4
+; RV32I-NEXT: beqz a6, .LBB10_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t3, t5
+; RV32I-NEXT: .LBB10_10:
+; RV32I-NEXT: sub s0, t1, a6
+; RV32I-NEXT: bltu a6, t0, .LBB10_13
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: bgeu s0, t0, .LBB10_14
+; RV32I-NEXT: .LBB10_12:
+; RV32I-NEXT: sll t6, a7, t6
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl s1, a7, s1
+; RV32I-NEXT: or s2, s1, t4
+; RV32I-NEXT: j .LBB10_15
+; RV32I-NEXT: .LBB10_13:
+; RV32I-NEXT: srl t5, a3, a6
+; RV32I-NEXT: bltu s0, t0, .LBB10_12
+; RV32I-NEXT: .LBB10_14:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s2, a7, s0
+; RV32I-NEXT: .LBB10_15:
+; RV32I-NEXT: addi s1, a6, -64
+; RV32I-NEXT: mv t4, a5
+; RV32I-NEXT: beqz s0, .LBB10_17
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: mv t4, s2
+; RV32I-NEXT: .LBB10_17:
+; RV32I-NEXT: bltu s1, t0, .LBB10_19
+; RV32I-NEXT: # %bb.18:
+; RV32I-NEXT: sra t2, a5, s1
+; RV32I-NEXT: bnez s1, .LBB10_20
+; RV32I-NEXT: j .LBB10_21
+; RV32I-NEXT: .LBB10_19:
+; RV32I-NEXT: neg s0, s1
+; RV32I-NEXT: sll s0, a5, s0
+; RV32I-NEXT: or t2, t2, s0
+; RV32I-NEXT: beqz s1, .LBB10_21
+; RV32I-NEXT: .LBB10_20:
+; RV32I-NEXT: mv a7, t2
+; RV32I-NEXT: .LBB10_21:
+; RV32I-NEXT: bltu s1, t0, .LBB10_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: srai t0, a5, 31
+; RV32I-NEXT: bltu a6, t1, .LBB10_24
+; RV32I-NEXT: j .LBB10_25
+; RV32I-NEXT: .LBB10_23:
+; RV32I-NEXT: sra t0, a5, a6
+; RV32I-NEXT: bgeu a6, t1, .LBB10_25
+; RV32I-NEXT: .LBB10_24:
+; RV32I-NEXT: or a7, t3, t6
+; RV32I-NEXT: or t0, t5, t4
+; RV32I-NEXT: .LBB10_25:
+; RV32I-NEXT: bnez a6, .LBB10_29
+; RV32I-NEXT: # %bb.26:
+; RV32I-NEXT: bltu a6, t1, .LBB10_28
+; RV32I-NEXT: .LBB10_27:
+; RV32I-NEXT: srai a0, a5, 31
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB10_28:
+; RV32I-NEXT: srli a5, a4, 16
+; RV32I-NEXT: lui a6, 16
+; RV32I-NEXT: srli a7, a4, 24
+; RV32I-NEXT: srli t0, a3, 16
+; RV32I-NEXT: srli t1, a3, 24
+; RV32I-NEXT: srli t2, a0, 16
+; RV32I-NEXT: srli t3, a0, 24
+; RV32I-NEXT: srli t4, a1, 16
+; RV32I-NEXT: srli t5, a1, 24
+; RV32I-NEXT: addi a6, a6, -1
+; RV32I-NEXT: and t6, a4, a6
+; RV32I-NEXT: srli t6, t6, 8
+; RV32I-NEXT: sb a4, 0(a2)
+; RV32I-NEXT: sb t6, 1(a2)
+; RV32I-NEXT: sb a5, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, a3, a6
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: sb t0, 6(a2)
+; RV32I-NEXT: sb t1, 7(a2)
+; RV32I-NEXT: and a3, a0, a6
+; RV32I-NEXT: and a4, a1, a6
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a0, 8(a2)
+; RV32I-NEXT: sb a3, 9(a2)
+; RV32I-NEXT: sb t2, 10(a2)
+; RV32I-NEXT: sb t3, 11(a2)
+; RV32I-NEXT: sb a1, 12(a2)
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: sb t4, 14(a2)
+; RV32I-NEXT: sb t5, 15(a2)
+; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB10_29:
+; RV32I-NEXT: mv a4, a7
+; RV32I-NEXT: mv a3, t0
+; RV32I-NEXT: bgeu a6, t1, .LBB10_27
+; RV32I-NEXT: j .LBB10_28
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = ashr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
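+; Test the same i128 arithmetic right shift with the offset given in 32-bit words.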
+define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_16bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu s0, 14(a0)
+; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or t1, t4, t3
+; RV64I-NEXT: or t2, t6, t5
+; RV64I-NEXT: lbu t3, 0(a1)
+; RV64I-NEXT: lbu t4, 1(a1)
+; RV64I-NEXT: lbu t5, 2(a1)
+; RV64I-NEXT: lbu t6, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a0, a0, s0
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t3, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu s0, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or a1, a1, s0
+; RV64I-NEXT: slli a4, a4, 16
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: slli a0, a0, 16
+; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: or a5, t5, a6
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t3
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a6, a0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a0, a4, a3
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 5
+; RV64I-NEXT: li a4, 64
+; RV64I-NEXT: or a3, a6, a7
+; RV64I-NEXT: bltu a1, a4, .LBB11_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: sra a5, a3, a5
+; RV64I-NEXT: bnez a1, .LBB11_3
+; RV64I-NEXT: j .LBB11_4
+; RV64I-NEXT: .LBB11_2:
+; RV64I-NEXT: srl a5, a0, a1
+; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: sll a6, a3, a6
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: beqz a1, .LBB11_4
+; RV64I-NEXT: .LBB11_3:
+; RV64I-NEXT: mv a0, a5
+; RV64I-NEXT: .LBB11_4:
+; RV64I-NEXT: bltu a1, a4, .LBB11_6
+; RV64I-NEXT: # %bb.5:
+; RV64I-NEXT: srai a1, a3, 63
+; RV64I-NEXT: j .LBB11_7
+; RV64I-NEXT: .LBB11_6:
+; RV64I-NEXT: sra a1, a3, a1
+; RV64I-NEXT: .LBB11_7:
+; RV64I-NEXT: srli a3, a0, 32
+; RV64I-NEXT: srliw a4, a0, 16
+; RV64I-NEXT: lui a5, 16
+; RV64I-NEXT: srliw a6, a0, 24
+; RV64I-NEXT: srli a7, a0, 48
+; RV64I-NEXT: srli t0, a0, 56
+; RV64I-NEXT: srli t1, a1, 32
+; RV64I-NEXT: srliw t2, a1, 16
+; RV64I-NEXT: srliw t3, a1, 24
+; RV64I-NEXT: srli t4, a1, 48
+; RV64I-NEXT: srli t5, a1, 56
+; RV64I-NEXT: addi a5, a5, -1
+; RV64I-NEXT: and t6, a0, a5
+; RV64I-NEXT: srli t6, t6, 8
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: sb t6, 1(a2)
+; RV64I-NEXT: sb a4, 2(a2)
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: and a0, a3, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: sb a0, 5(a2)
+; RV64I-NEXT: sb a7, 6(a2)
+; RV64I-NEXT: sb t0, 7(a2)
+; RV64I-NEXT: and a0, a1, a5
+; RV64I-NEXT: and a3, t1, a5
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: sb t2, 10(a2)
+; RV64I-NEXT: sb t3, 11(a2)
+; RV64I-NEXT: sb t1, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t4, 14(a2)
+; RV64I-NEXT: sb t5, 15(a2)
+; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: ashr_16bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a5, 1(a0)
+; RV32I-NEXT: lbu a6, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu a4, 4(a0)
+; RV32I-NEXT: lbu t2, 5(a0)
+; RV32I-NEXT: lbu t0, 6(a0)
+; RV32I-NEXT: lbu t1, 7(a0)
+; RV32I-NEXT: lbu t4, 8(a0)
+; RV32I-NEXT: lbu t5, 9(a0)
+; RV32I-NEXT: lbu t6, 10(a0)
+; RV32I-NEXT: lbu s0, 11(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t3, a7, a6
+; RV32I-NEXT: or t1, t1, t0
+; RV32I-NEXT: lbu a6, 12(a0)
+; RV32I-NEXT: lbu a7, 13(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t6, 1(a1)
+; RV32I-NEXT: lbu s0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: or s1, a0, t0
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: or t6, t6, a7
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or s0, a1, s0
+; RV32I-NEXT: li t0, 32
+; RV32I-NEXT: slli a1, a5, 8
+; RV32I-NEXT: slli a0, t2, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s0, s0, 16
+; RV32I-NEXT: or a7, t5, t4
+; RV32I-NEXT: or a5, s1, a6
+; RV32I-NEXT: or a6, s0, t6
+; RV32I-NEXT: slli a6, a6, 5
+; RV32I-NEXT: srl t2, a7, a6
+; RV32I-NEXT: neg t6, a6
+; RV32I-NEXT: sll t4, a5, t6
+; RV32I-NEXT: bltu a6, t0, .LBB11_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra t5, a5, a6
+; RV32I-NEXT: j .LBB11_3
+; RV32I-NEXT: .LBB11_2:
+; RV32I-NEXT: or t5, t2, t4
+; RV32I-NEXT: .LBB11_3:
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: or a3, a0, a4
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: mv a0, a7
+; RV32I-NEXT: beqz a6, .LBB11_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a0, t5
+; RV32I-NEXT: .LBB11_5:
+; RV32I-NEXT: or a4, t3, a1
+; RV32I-NEXT: or a3, t1, a3
+; RV32I-NEXT: bltu a6, t0, .LBB11_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srai a1, a5, 31
+; RV32I-NEXT: srl t5, a3, a6
+; RV32I-NEXT: j .LBB11_8
+; RV32I-NEXT: .LBB11_7:
+; RV32I-NEXT: sra a1, a5, a6
+; RV32I-NEXT: srl t1, a4, a6
+; RV32I-NEXT: sll t3, a3, t6
+; RV32I-NEXT: or t5, t1, t3
+; RV32I-NEXT: .LBB11_8:
+; RV32I-NEXT: li t1, 64
+; RV32I-NEXT: mv t3, a4
+; RV32I-NEXT: beqz a6, .LBB11_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t3, t5
+; RV32I-NEXT: .LBB11_10:
+; RV32I-NEXT: sub s0, t1, a6
+; RV32I-NEXT: bltu a6, t0, .LBB11_13
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: bgeu s0, t0, .LBB11_14
+; RV32I-NEXT: .LBB11_12:
+; RV32I-NEXT: sll t6, a7, t6
+; RV32I-NEXT: neg s1, s0
+; RV32I-NEXT: srl s1, a7, s1
+; RV32I-NEXT: or s2, s1, t4
+; RV32I-NEXT: j .LBB11_15
+; RV32I-NEXT: .LBB11_13:
+; RV32I-NEXT: srl t5, a3, a6
+; RV32I-NEXT: bltu s0, t0, .LBB11_12
+; RV32I-NEXT: .LBB11_14:
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: sll s2, a7, s0
+; RV32I-NEXT: .LBB11_15:
+; RV32I-NEXT: addi s1, a6, -64
+; RV32I-NEXT: mv t4, a5
+; RV32I-NEXT: beqz s0, .LBB11_17
+; RV32I-NEXT: # %bb.16:
+; RV32I-NEXT: mv t4, s2
+; RV32I-NEXT: .LBB11_17:
+; RV32I-NEXT: bltu s1, t0, .LBB11_19
+; RV32I-NEXT: # %bb.18:
+; RV32I-NEXT: sra t2, a5, s1
+; RV32I-NEXT: bnez s1, .LBB11_20
+; RV32I-NEXT: j .LBB11_21
+; RV32I-NEXT: .LBB11_19:
+; RV32I-NEXT: neg s0, s1
+; RV32I-NEXT: sll s0, a5, s0
+; RV32I-NEXT: or t2, t2, s0
+; RV32I-NEXT: beqz s1, .LBB11_21
+; RV32I-NEXT: .LBB11_20:
+; RV32I-NEXT: mv a7, t2
+; RV32I-NEXT: .LBB11_21:
+; RV32I-NEXT: bltu s1, t0, .LBB11_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: srai t0, a5, 31
+; RV32I-NEXT: bltu a6, t1, .LBB11_24
+; RV32I-NEXT: j .LBB11_25
+; RV32I-NEXT: .LBB11_23:
+; RV32I-NEXT: sra t0, a5, a6
+; RV32I-NEXT: bgeu a6, t1, .LBB11_25
+; RV32I-NEXT: .LBB11_24:
+; RV32I-NEXT: or a7, t3, t6
+; RV32I-NEXT: or t0, t5, t4
+; RV32I-NEXT: .LBB11_25:
+; RV32I-NEXT: bnez a6, .LBB11_29
+; RV32I-NEXT: # %bb.26:
+; RV32I-NEXT: bltu a6, t1, .LBB11_28
+; RV32I-NEXT: .LBB11_27:
+; RV32I-NEXT: srai a0, a5, 31
+; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: .LBB11_28:
+; RV32I-NEXT: srli a5, a4, 16
+; RV32I-NEXT: lui a6, 16
+; RV32I-NEXT: srli a7, a4, 24
+; RV32I-NEXT: srli t0, a3, 16
+; RV32I-NEXT: srli t1, a3, 24
+; RV32I-NEXT: srli t2, a0, 16
+; RV32I-NEXT: srli t3, a0, 24
+; RV32I-NEXT: srli t4, a1, 16
+; RV32I-NEXT: srli t5, a1, 24
+; RV32I-NEXT: addi a6, a6, -1
+; RV32I-NEXT: and t6, a4, a6
+; RV32I-NEXT: srli t6, t6, 8
+; RV32I-NEXT: sb a4, 0(a2)
+; RV32I-NEXT: sb t6, 1(a2)
+; RV32I-NEXT: sb a5, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, a3, a6
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: sb t0, 6(a2)
+; RV32I-NEXT: sb t1, 7(a2)
+; RV32I-NEXT: and a3, a0, a6
+; RV32I-NEXT: and a4, a1, a6
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a0, 8(a2)
+; RV32I-NEXT: sb a3, 9(a2)
+; RV32I-NEXT: sb t2, 10(a2)
+; RV32I-NEXT: sb t3, 11(a2)
+; RV32I-NEXT: sb a1, 12(a2)
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: sb t4, 14(a2)
+; RV32I-NEXT: sb t5, 15(a2)
+; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB11_29:
+; RV32I-NEXT: mv a4, a7
+; RV32I-NEXT: mv a3, t0
+; RV32I-NEXT: bgeu a6, t1, .LBB11_27
+; RV32I-NEXT: j .LBB11_28
+ %src = load i128, ptr %src.ptr, align 1
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
+ %res = ashr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
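+; Editorial note: the test above, like the others in this file, differs only
+; in operand width, shift kind, and offset granularity. A minimal sketch of
+; the shared IR shape (iN, K, and the shift opcode vary per test; the names
+; here are illustrative, not generated):
+;
+;   %src    = load iN, ptr %src.ptr, align 1    ; unaligned wide load
+;   %bitOff = shl iN %off, K                    ; K = 3 (byteOff) or 5 (wordOff)
+;   %res    = ashr iN %src, %bitOff             ; or lshr/shl per test
+;   store iN %res, ptr %dst, align 1            ; unaligned wide store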
+
+define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -96
+; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: or a4, t4, t3
+; RV64I-NEXT: or a6, t6, t5
+; RV64I-NEXT: or t0, s1, s0
+; RV64I-NEXT: lbu t5, 24(a0)
+; RV64I-NEXT: lbu t6, 25(a0)
+; RV64I-NEXT: lbu s0, 26(a0)
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or t4, s3, s2
+; RV64I-NEXT: or t2, s5, s4
+; RV64I-NEXT: or t3, s7, s6
+; RV64I-NEXT: lbu s2, 28(a0)
+; RV64I-NEXT: lbu s3, 29(a0)
+; RV64I-NEXT: lbu s4, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s5, s9, s8
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu t6, 0(a1)
+; RV64I-NEXT: lbu s1, 1(a1)
+; RV64I-NEXT: lbu s7, 2(a1)
+; RV64I-NEXT: lbu s8, 3(a1)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, a0, s4
+; RV64I-NEXT: or t6, s1, t6
+; RV64I-NEXT: lbu a0, 4(a1)
+; RV64I-NEXT: lbu s1, 5(a1)
+; RV64I-NEXT: lbu s4, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s1, s1, a0
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or s4, a1, s4
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or a1, t1, a7
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or a0, t4, t0
+; RV64I-NEXT: slli t3, t3, 16
+; RV64I-NEXT: or t0, t3, t2
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: or t1, s6, s5
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t3, s0, t5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or t5, s3, s2
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or t6, s7, t6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s0, s4, s1
+; RV64I-NEXT: li a7, 64
+; RV64I-NEXT: slli t4, a5, 16
+; RV64I-NEXT: slli t2, a6, 16
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: or a6, t1, t0
+; RV64I-NEXT: or t0, t5, t3
+; RV64I-NEXT: or a5, s0, t6
+; RV64I-NEXT: slli a5, a5, 3
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sll t3, t0, t5
+; RV64I-NEXT: bltu a5, a7, .LBB12_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: srl t6, t0, t1
+; RV64I-NEXT: j .LBB12_3
+; RV64I-NEXT: .LBB12_2:
+; RV64I-NEXT: srl t6, a6, a5
+; RV64I-NEXT: or t6, t6, t3
+; RV64I-NEXT: .LBB12_3:
+; RV64I-NEXT: or a3, t4, a3
+; RV64I-NEXT: slli t4, a1, 32
+; RV64I-NEXT: or t2, t2, a4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mv a1, a6
+; RV64I-NEXT: beqz a5, .LBB12_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a1, t6
+; RV64I-NEXT: .LBB12_5:
+; RV64I-NEXT: or a4, t4, a3
+; RV64I-NEXT: or a3, a0, t2
+; RV64I-NEXT: bltu a5, a7, .LBB12_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: li a0, 0
+; RV64I-NEXT: srl t4, a3, t1
+; RV64I-NEXT: j .LBB12_8
+; RV64I-NEXT: .LBB12_7:
+; RV64I-NEXT: srl a0, t0, a5
+; RV64I-NEXT: srl t1, a4, a5
+; RV64I-NEXT: sll t2, a3, t5
+; RV64I-NEXT: or t4, t1, t2
+; RV64I-NEXT: .LBB12_8:
+; RV64I-NEXT: li t1, 128
+; RV64I-NEXT: mv t2, a4
+; RV64I-NEXT: beqz a5, .LBB12_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t2, t4
+; RV64I-NEXT: .LBB12_10:
+; RV64I-NEXT: sub t6, t1, a5
+; RV64I-NEXT: bltu a5, a7, .LBB12_13
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t4, 0
+; RV64I-NEXT: bgeu t6, a7, .LBB12_14
+; RV64I-NEXT: .LBB12_12:
+; RV64I-NEXT: sll t5, a6, t5
+; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: srl s0, a6, s0
+; RV64I-NEXT: or s1, s0, t3
+; RV64I-NEXT: j .LBB12_15
+; RV64I-NEXT: .LBB12_13:
+; RV64I-NEXT: srl t4, a3, a5
+; RV64I-NEXT: bltu t6, a7, .LBB12_12
+; RV64I-NEXT: .LBB12_14:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: sub t3, t6, a7
+; RV64I-NEXT: sll s1, a6, t3
+; RV64I-NEXT: .LBB12_15:
+; RV64I-NEXT: sub s0, a5, t1
+; RV64I-NEXT: mv t3, t0
+; RV64I-NEXT: beqz t6, .LBB12_17
+; RV64I-NEXT: # %bb.16:
+; RV64I-NEXT: mv t3, s1
+; RV64I-NEXT: .LBB12_17:
+; RV64I-NEXT: bltu s0, a7, .LBB12_19
+; RV64I-NEXT: # %bb.18:
+; RV64I-NEXT: sub t6, s0, a7
+; RV64I-NEXT: srl t6, t0, t6
+; RV64I-NEXT: bnez s0, .LBB12_20
+; RV64I-NEXT: j .LBB12_21
+; RV64I-NEXT: .LBB12_19:
+; RV64I-NEXT: srl t6, a6, s0
+; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: sll s1, t0, s1
+; RV64I-NEXT: or t6, t6, s1
+; RV64I-NEXT: beqz s0, .LBB12_21
+; RV64I-NEXT: .LBB12_20:
+; RV64I-NEXT: mv a6, t6
+; RV64I-NEXT: .LBB12_21:
+; RV64I-NEXT: bltu s0, a7, .LBB12_23
+; RV64I-NEXT: # %bb.22:
+; RV64I-NEXT: li a7, 0
+; RV64I-NEXT: bltu a5, t1, .LBB12_24
+; RV64I-NEXT: j .LBB12_25
+; RV64I-NEXT: .LBB12_23:
+; RV64I-NEXT: srl a7, t0, s0
+; RV64I-NEXT: bgeu a5, t1, .LBB12_25
+; RV64I-NEXT: .LBB12_24:
+; RV64I-NEXT: or a6, t2, t5
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: .LBB12_25:
+; RV64I-NEXT: bnez a5, .LBB12_29
+; RV64I-NEXT: # %bb.26:
+; RV64I-NEXT: bltu a5, t1, .LBB12_28
+; RV64I-NEXT: .LBB12_27:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: li a0, 0
+; RV64I-NEXT: .LBB12_28:
+; RV64I-NEXT: srli a5, a4, 32
+; RV64I-NEXT: srliw a6, a4, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a4, 24
+; RV64I-NEXT: srli t0, a4, 48
+; RV64I-NEXT: srli t5, a4, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a1, 32
+; RV64I-NEXT: srliw s2, a1, 16
+; RV64I-NEXT: srliw s6, a1, 24
+; RV64I-NEXT: srli s4, a1, 48
+; RV64I-NEXT: srli s7, a1, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a4, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a4, a5, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a4, a3, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a4, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a3, a7, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a3, a1, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 96
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB12_29:
+; RV64I-NEXT: mv a4, a6
+; RV64I-NEXT: mv a3, a7
+; RV64I-NEXT: bgeu a5, t1, .LBB12_27
+; RV64I-NEXT: j .LBB12_28
+;
+; RV32I-LABEL: lshr_32bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 16(a0)
+; RV32I-NEXT: lbu a4, 17(a0)
+; RV32I-NEXT: lbu a5, 18(a0)
+; RV32I-NEXT: lbu a6, 19(a0)
+; RV32I-NEXT: lbu a7, 20(a0)
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 22(a0)
+; RV32I-NEXT: lbu t2, 23(a0)
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t4, 25(a0)
+; RV32I-NEXT: lbu t5, 26(a0)
+; RV32I-NEXT: lbu t6, 27(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a6, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a4, t2, t1
+; RV32I-NEXT: lbu a7, 28(a0)
+; RV32I-NEXT: lbu t0, 29(a0)
+; RV32I-NEXT: lbu t1, 30(a0)
+; RV32I-NEXT: lbu t2, 31(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t4, t4, t3
+; RV32I-NEXT: or t5, t6, t5
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t3, 1(a1)
+; RV32I-NEXT: lbu t6, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or t0, t3, t0
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or t2, a1, t6
+; RV32I-NEXT: li t3, 32
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a1, a4, 16
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a4, t1, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: or t1, t5, t4
+; RV32I-NEXT: or t5, a4, a7
+; RV32I-NEXT: or a4, t2, t0
+; RV32I-NEXT: slli a4, a4, 3
+; RV32I-NEXT: srl s0, t1, a4
+; RV32I-NEXT: neg s6, a4
+; RV32I-NEXT: sll t4, t5, s6
+; RV32I-NEXT: bltu a4, t3, .LBB12_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a7, t5, a4
+; RV32I-NEXT: j .LBB12_3
+; RV32I-NEXT: .LBB12_2:
+; RV32I-NEXT: or a7, s0, t4
+; RV32I-NEXT: .LBB12_3:
+; RV32I-NEXT: or t0, a6, a3
+; RV32I-NEXT: or a6, a1, a5
+; RV32I-NEXT: mv a1, t1
+; RV32I-NEXT: beqz a4, .LBB12_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a1, a7
+; RV32I-NEXT: .LBB12_5:
+; RV32I-NEXT: srl a3, t0, a4
+; RV32I-NEXT: sll a5, a6, s6
+; RV32I-NEXT: sw a3, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t3, .LBB12_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: srl a3, a6, a4
+; RV32I-NEXT: j .LBB12_8
+; RV32I-NEXT: .LBB12_7:
+; RV32I-NEXT: srl ra, t5, a4
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: .LBB12_8:
+; RV32I-NEXT: li t6, 64
+; RV32I-NEXT: mv a7, t0
+; RV32I-NEXT: beqz a4, .LBB12_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv a7, a3
+; RV32I-NEXT: .LBB12_10:
+; RV32I-NEXT: sw a5, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s7, t6, a4
+; RV32I-NEXT: bltu a4, t3, .LBB12_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a5, 0
+; RV32I-NEXT: j .LBB12_13
+; RV32I-NEXT: .LBB12_12:
+; RV32I-NEXT: srl a5, a6, a4
+; RV32I-NEXT: .LBB12_13:
+; RV32I-NEXT: neg s10, s7
+; RV32I-NEXT: sw t4, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s7, t3, .LBB12_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li t2, 0
+; RV32I-NEXT: sll a3, t1, s7
+; RV32I-NEXT: j .LBB12_16
+; RV32I-NEXT: .LBB12_15:
+; RV32I-NEXT: sll t2, t1, s6
+; RV32I-NEXT: srl a3, t1, s10
+; RV32I-NEXT: or a3, a3, t4
+; RV32I-NEXT: .LBB12_16:
+; RV32I-NEXT: addi s9, a4, -64
+; RV32I-NEXT: mv t4, t5
+; RV32I-NEXT: beqz s7, .LBB12_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t4, a3
+; RV32I-NEXT: .LBB12_18:
+; RV32I-NEXT: neg s11, s9
+; RV32I-NEXT: sw s0, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t3, .LBB12_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: srl s2, t5, s9
+; RV32I-NEXT: j .LBB12_21
+; RV32I-NEXT: .LBB12_20:
+; RV32I-NEXT: sll a3, t5, s11
+; RV32I-NEXT: or s2, s0, a3
+; RV32I-NEXT: .LBB12_21:
+; RV32I-NEXT: lbu s1, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: mv s0, t1
+; RV32I-NEXT: beqz s9, .LBB12_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: mv s0, s2
+; RV32I-NEXT: .LBB12_23:
+; RV32I-NEXT: lbu s4, 9(a0)
+; RV32I-NEXT: lbu s2, 10(a0)
+; RV32I-NEXT: lbu s5, 13(a0)
+; RV32I-NEXT: lbu s8, 14(a0)
+; RV32I-NEXT: slli s3, s1, 8
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: sw ra, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t3, .LBB12_25
+; RV32I-NEXT: # %bb.24:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: j .LBB12_26
+; RV32I-NEXT: .LBB12_25:
+; RV32I-NEXT: srl s1, t5, a4
+; RV32I-NEXT: .LBB12_26:
+; RV32I-NEXT: or s2, s3, s2
+; RV32I-NEXT: lbu ra, 8(a0)
+; RV32I-NEXT: lbu s3, 12(a0)
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: or s8, a3, s8
+; RV32I-NEXT: bgeu a4, t6, .LBB12_28
+; RV32I-NEXT: # %bb.27:
+; RV32I-NEXT: or s0, a7, t2
+; RV32I-NEXT: or s1, a5, t4
+; RV32I-NEXT: .LBB12_28:
+; RV32I-NEXT: lbu a3, 3(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: or a5, s4, ra
+; RV32I-NEXT: slli t4, s2, 16
+; RV32I-NEXT: or s2, s5, s3
+; RV32I-NEXT: slli s3, s8, 16
+; RV32I-NEXT: mv s4, t0
+; RV32I-NEXT: mv a7, a6
+; RV32I-NEXT: beqz a4, .LBB12_30
+; RV32I-NEXT: # %bb.29:
+; RV32I-NEXT: mv s4, s0
+; RV32I-NEXT: mv a7, s1
+; RV32I-NEXT: .LBB12_30:
+; RV32I-NEXT: slli s5, a3, 8
+; RV32I-NEXT: lbu ra, 1(a0)
+; RV32I-NEXT: lbu a3, 2(a0)
+; RV32I-NEXT: lbu s1, 5(a0)
+; RV32I-NEXT: lbu s0, 6(a0)
+; RV32I-NEXT: slli s8, t2, 8
+; RV32I-NEXT: or t4, t4, a5
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: bltu a4, t6, .LBB12_32
+; RV32I-NEXT: # %bb.31:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB12_32:
+; RV32I-NEXT: slli s3, ra, 8
+; RV32I-NEXT: or a5, s5, a3
+; RV32I-NEXT: lbu s5, 0(a0)
+; RV32I-NEXT: lbu a0, 4(a0)
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or a3, s8, s0
+; RV32I-NEXT: srl s2, t4, a4
+; RV32I-NEXT: sll ra, t2, s6
+; RV32I-NEXT: bltu a4, t3, .LBB12_34
+; RV32I-NEXT: # %bb.33:
+; RV32I-NEXT: srl s0, t2, a4
+; RV32I-NEXT: j .LBB12_35
+; RV32I-NEXT: .LBB12_34:
+; RV32I-NEXT: or s0, s2, ra
+; RV32I-NEXT: .LBB12_35:
+; RV32I-NEXT: or s3, s3, s5
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: slli a3, a3, 16
+; RV32I-NEXT: mv s5, t4
+; RV32I-NEXT: beqz a4, .LBB12_37
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: mv s5, s0
+; RV32I-NEXT: .LBB12_37:
+; RV32I-NEXT: or s0, a5, s3
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: bltu a4, t3, .LBB12_39
+; RV32I-NEXT: # %bb.38:
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: srl a3, a0, a4
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: bnez a4, .LBB12_40
+; RV32I-NEXT: j .LBB12_41
+; RV32I-NEXT: .LBB12_39:
+; RV32I-NEXT: srl s8, t2, a4
+; RV32I-NEXT: srl a3, s0, a4
+; RV32I-NEXT: sll a5, a0, s6
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: beqz a4, .LBB12_41
+; RV32I-NEXT: .LBB12_40:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB12_41:
+; RV32I-NEXT: bltu a4, t3, .LBB12_44
+; RV32I-NEXT: # %bb.42:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bgeu s7, t3, .LBB12_45
+; RV32I-NEXT: .LBB12_43:
+; RV32I-NEXT: sll s3, t4, s6
+; RV32I-NEXT: srl a3, t4, s10
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: mv s10, t2
+; RV32I-NEXT: bnez s7, .LBB12_46
+; RV32I-NEXT: j .LBB12_47
+; RV32I-NEXT: .LBB12_44:
+; RV32I-NEXT: srl s1, a0, a4
+; RV32I-NEXT: bltu s7, t3, .LBB12_43
+; RV32I-NEXT: .LBB12_45:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t4, s7
+; RV32I-NEXT: mv s10, t2
+; RV32I-NEXT: beqz s7, .LBB12_47
+; RV32I-NEXT: .LBB12_46:
+; RV32I-NEXT: mv s10, a3
+; RV32I-NEXT: .LBB12_47:
+; RV32I-NEXT: bltu s9, t3, .LBB12_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: srl a3, t2, s9
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: bnez s9, .LBB12_50
+; RV32I-NEXT: j .LBB12_51
+; RV32I-NEXT: .LBB12_49:
+; RV32I-NEXT: sll a3, t2, s11
+; RV32I-NEXT: or a3, s2, a3
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: beqz s9, .LBB12_51
+; RV32I-NEXT: .LBB12_50:
+; RV32I-NEXT: mv s2, a3
+; RV32I-NEXT: .LBB12_51:
+; RV32I-NEXT: bltu s9, t3, .LBB12_53
+; RV32I-NEXT: # %bb.52:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bltu a4, t6, .LBB12_54
+; RV32I-NEXT: j .LBB12_55
+; RV32I-NEXT: .LBB12_53:
+; RV32I-NEXT: srl s7, t2, a4
+; RV32I-NEXT: bgeu a4, t6, .LBB12_55
+; RV32I-NEXT: .LBB12_54:
+; RV32I-NEXT: or s2, a5, s3
+; RV32I-NEXT: or s7, s1, s10
+; RV32I-NEXT: .LBB12_55:
+; RV32I-NEXT: li a3, 128
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: beqz a4, .LBB12_57
+; RV32I-NEXT: # %bb.56:
+; RV32I-NEXT: mv a5, s2
+; RV32I-NEXT: mv s1, s7
+; RV32I-NEXT: .LBB12_57:
+; RV32I-NEXT: sw a5, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s2, a3, a4
+; RV32I-NEXT: bltu a4, t6, .LBB12_59
+; RV32I-NEXT: # %bb.58:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: .LBB12_59:
+; RV32I-NEXT: neg s3, s2
+; RV32I-NEXT: srl a5, t0, s3
+; RV32I-NEXT: sw s8, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t3, .LBB12_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: li s10, 0
+; RV32I-NEXT: sll a3, t0, s2
+; RV32I-NEXT: j .LBB12_62
+; RV32I-NEXT: .LBB12_61:
+; RV32I-NEXT: sll s10, t0, s6
+; RV32I-NEXT: lw a3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: .LBB12_62:
+; RV32I-NEXT: sub s1, t6, s2
+; RV32I-NEXT: mv s8, a6
+; RV32I-NEXT: beqz s2, .LBB12_64
+; RV32I-NEXT: # %bb.63:
+; RV32I-NEXT: mv s8, a3
+; RV32I-NEXT: .LBB12_64:
+; RV32I-NEXT: bltu s1, t3, .LBB12_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: srl a3, a6, s1
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: bnez s1, .LBB12_67
+; RV32I-NEXT: j .LBB12_68
+; RV32I-NEXT: .LBB12_66:
+; RV32I-NEXT: neg a3, s1
+; RV32I-NEXT: sll a3, a6, a3
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: beqz s1, .LBB12_68
+; RV32I-NEXT: .LBB12_67:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB12_68:
+; RV32I-NEXT: bltu s1, t3, .LBB12_71
+; RV32I-NEXT: # %bb.69:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s2, t3, .LBB12_72
+; RV32I-NEXT: .LBB12_70:
+; RV32I-NEXT: sll s6, t1, s6
+; RV32I-NEXT: srl a3, t1, s3
+; RV32I-NEXT: lw s3, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB12_73
+; RV32I-NEXT: .LBB12_71:
+; RV32I-NEXT: srl s1, a6, s3
+; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t3, .LBB12_70
+; RV32I-NEXT: .LBB12_72:
+; RV32I-NEXT: li s6, 0
+; RV32I-NEXT: sll a3, t1, s2
+; RV32I-NEXT: .LBB12_73:
+; RV32I-NEXT: addi s9, s2, -64
+; RV32I-NEXT: mv s5, t5
+; RV32I-NEXT: beqz s2, .LBB12_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: mv s5, a3
+; RV32I-NEXT: .LBB12_75:
+; RV32I-NEXT: bltu s9, t3, .LBB12_77
+; RV32I-NEXT: # %bb.76:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t0, s9
+; RV32I-NEXT: mv s7, a6
+; RV32I-NEXT: bnez s9, .LBB12_78
+; RV32I-NEXT: j .LBB12_79
+; RV32I-NEXT: .LBB12_77:
+; RV32I-NEXT: sll s3, t0, s2
+; RV32I-NEXT: neg a3, s9
+; RV32I-NEXT: srl a3, t0, a3
+; RV32I-NEXT: sll s7, a6, s2
+; RV32I-NEXT: or a3, a3, s7
+; RV32I-NEXT: mv s7, a6
+; RV32I-NEXT: beqz s9, .LBB12_79
+; RV32I-NEXT: .LBB12_78:
+; RV32I-NEXT: mv s7, a3
+; RV32I-NEXT: .LBB12_79:
+; RV32I-NEXT: bltu s2, t6, .LBB12_81
+; RV32I-NEXT: # %bb.80:
+; RV32I-NEXT: sw zero, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: j .LBB12_82
+; RV32I-NEXT: .LBB12_81:
+; RV32I-NEXT: sw s10, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s3, a5, s6
+; RV32I-NEXT: or s7, s1, s5
+; RV32I-NEXT: .LBB12_82:
+; RV32I-NEXT: addi ra, a4, -128
+; RV32I-NEXT: mv s5, t1
+; RV32I-NEXT: mv s6, t5
+; RV32I-NEXT: beqz s2, .LBB12_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: mv s5, s3
+; RV32I-NEXT: mv s6, s7
+; RV32I-NEXT: .LBB12_84:
+; RV32I-NEXT: neg s9, ra
+; RV32I-NEXT: sll s3, t5, s9
+; RV32I-NEXT: bltu ra, t3, .LBB12_86
+; RV32I-NEXT: # %bb.85:
+; RV32I-NEXT: srl a3, t5, ra
+; RV32I-NEXT: mv s1, t1
+; RV32I-NEXT: bnez ra, .LBB12_87
+; RV32I-NEXT: j .LBB12_88
+; RV32I-NEXT: .LBB12_86:
+; RV32I-NEXT: lw a3, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: mv s1, t1
+; RV32I-NEXT: beqz ra, .LBB12_88
+; RV32I-NEXT: .LBB12_87:
+; RV32I-NEXT: mv s1, a3
+; RV32I-NEXT: .LBB12_88:
+; RV32I-NEXT: bltu ra, t3, .LBB12_90
+; RV32I-NEXT: # %bb.89:
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: srl a3, a6, ra
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: bnez ra, .LBB12_91
+; RV32I-NEXT: j .LBB12_92
+; RV32I-NEXT: .LBB12_90:
+; RV32I-NEXT: srl s2, t5, a4
+; RV32I-NEXT: sll a3, a6, s9
+; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: beqz ra, .LBB12_92
+; RV32I-NEXT: .LBB12_91:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB12_92:
+; RV32I-NEXT: sub s10, t6, ra
+; RV32I-NEXT: bltu ra, t3, .LBB12_95
+; RV32I-NEXT: # %bb.93:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bgeu s10, t3, .LBB12_96
+; RV32I-NEXT: .LBB12_94:
+; RV32I-NEXT: sll s9, t1, s9
+; RV32I-NEXT: neg a3, s10
+; RV32I-NEXT: srl a3, t1, a3
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB12_97
+; RV32I-NEXT: .LBB12_95:
+; RV32I-NEXT: srl s7, a6, a4
+; RV32I-NEXT: bltu s10, t3, .LBB12_94
+; RV32I-NEXT: .LBB12_96:
+; RV32I-NEXT: li s9, 0
+; RV32I-NEXT: sll a3, t1, s10
+; RV32I-NEXT: .LBB12_97:
+; RV32I-NEXT: addi s11, ra, -64
+; RV32I-NEXT: mv s3, t5
+; RV32I-NEXT: beqz s10, .LBB12_99
+; RV32I-NEXT: # %bb.98:
+; RV32I-NEXT: mv s3, a3
+; RV32I-NEXT: .LBB12_99:
+; RV32I-NEXT: bltu s11, t3, .LBB12_101
+; RV32I-NEXT: # %bb.100:
+; RV32I-NEXT: srl a3, t5, s11
+; RV32I-NEXT: bnez s11, .LBB12_102
+; RV32I-NEXT: j .LBB12_103
+; RV32I-NEXT: .LBB12_101:
+; RV32I-NEXT: srl a3, t1, ra
+; RV32I-NEXT: neg s10, s11
+; RV32I-NEXT: sll s10, t5, s10
+; RV32I-NEXT: or a3, a3, s10
+; RV32I-NEXT: beqz s11, .LBB12_103
+; RV32I-NEXT: .LBB12_102:
+; RV32I-NEXT: mv t1, a3
+; RV32I-NEXT: .LBB12_103:
+; RV32I-NEXT: bltu s11, t3, .LBB12_105
+; RV32I-NEXT: # %bb.104:
+; RV32I-NEXT: li t3, 0
+; RV32I-NEXT: bltu ra, t6, .LBB12_106
+; RV32I-NEXT: j .LBB12_107
+; RV32I-NEXT: .LBB12_105:
+; RV32I-NEXT: srl t3, t5, ra
+; RV32I-NEXT: bgeu ra, t6, .LBB12_107
+; RV32I-NEXT: .LBB12_106:
+; RV32I-NEXT: or t1, a5, s9
+; RV32I-NEXT: or t3, s7, s3
+; RV32I-NEXT: .LBB12_107:
+; RV32I-NEXT: li a5, 128
+; RV32I-NEXT: bnez ra, .LBB12_114
+; RV32I-NEXT: # %bb.108:
+; RV32I-NEXT: bgeu ra, t6, .LBB12_115
+; RV32I-NEXT: .LBB12_109:
+; RV32I-NEXT: bltu a4, a5, .LBB12_116
+; RV32I-NEXT: .LBB12_110:
+; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bnez a4, .LBB12_117
+; RV32I-NEXT: .LBB12_111:
+; RV32I-NEXT: bltu a4, a5, .LBB12_113
+; RV32I-NEXT: .LBB12_112:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: .LBB12_113:
+; RV32I-NEXT: srli a4, s0, 16
+; RV32I-NEXT: lui t1, 16
+; RV32I-NEXT: srli t0, s0, 24
+; RV32I-NEXT: srli a5, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: srli a6, t4, 16
+; RV32I-NEXT: srli s2, t4, 24
+; RV32I-NEXT: srli t3, t2, 16
+; RV32I-NEXT: srli s3, t2, 24
+; RV32I-NEXT: srli s1, s4, 16
+; RV32I-NEXT: srli a3, s4, 24
+; RV32I-NEXT: srli t6, a7, 16
+; RV32I-NEXT: srli s6, a7, 24
+; RV32I-NEXT: srli s5, a1, 16
+; RV32I-NEXT: srli s7, a1, 24
+; RV32I-NEXT: srli s8, ra, 16
+; RV32I-NEXT: srli s9, ra, 24
+; RV32I-NEXT: addi t1, t1, -1
+; RV32I-NEXT: and s10, s0, t1
+; RV32I-NEXT: and s11, a0, t1
+; RV32I-NEXT: srli s10, s10, 8
+; RV32I-NEXT: sb s0, 0(a2)
+; RV32I-NEXT: sb s10, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: and a4, t4, t1
+; RV32I-NEXT: srli t0, s11, 8
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb t0, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t5, 7(a2)
+; RV32I-NEXT: and a0, t2, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, s4, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t2, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, a7, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s4, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a3, a1, t1
+; RV32I-NEXT: and a4, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a7, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb a1, 24(a2)
+; RV32I-NEXT: sb a3, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s7, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb s8, 30(a2)
+; RV32I-NEXT: sb s9, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB12_114:
+; RV32I-NEXT: mv t0, t1
+; RV32I-NEXT: mv a6, t3
+; RV32I-NEXT: bltu ra, t6, .LBB12_109
+; RV32I-NEXT: .LBB12_115:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: bgeu a4, a5, .LBB12_110
+; RV32I-NEXT: .LBB12_116:
+; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw a6, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t0, a3, a6
+; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a6, a3, s8
+; RV32I-NEXT: lw a3, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s1, a3, s5
+; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s2, a3, s6
+; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: beqz a4, .LBB12_111
+; RV32I-NEXT: .LBB12_117:
+; RV32I-NEXT: mv s0, t0
+; RV32I-NEXT: mv a0, a6
+; RV32I-NEXT: mv t4, s1
+; RV32I-NEXT: mv t2, s2
+; RV32I-NEXT: bgeu a4, a5, .LBB12_112
+; RV32I-NEXT: j .LBB12_113
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
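+; Editorial note: the checks above encode the usual branchy multi-word shift
+; expansion: the shift amount is compared against the word size (64 on RV64I,
+; 32 on RV32I) and, one level up, against half the total width, selecting
+; between a single cross-word funnel step and a whole-word move. A hedged,
+; standalone sketch of the inner two-word step (illustrative only, not part
+; of the generated checks; %amt is assumed in [0, 127], and the generated
+; code guards %amt == 0 separately with beqz before using the 64 - %amt
+; complement shift):
+;
+;   define i64 @lshr128_lo(i64 %lo, i64 %hi, i64 %amt) {
+;   entry:
+;     %big = icmp uge i64 %amt, 64
+;     br i1 %big, label %wide, label %narrow
+;   wide:                              ; amt >= 64: low result word comes
+;     %a = sub i64 %amt, 64            ; entirely from the high input word
+;     %r1 = lshr i64 %hi, %a
+;     ret i64 %r1
+;   narrow:                            ; 0 < amt < 64: funnel both words
+;     %r2 = lshr i64 %lo, %amt
+;     %n = sub i64 64, %amt
+;     %c = shl i64 %hi, %n
+;     %r3 = or i64 %r2, %c
+;     ret i64 %r3
+;   }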
+
+define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -96
+; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: or a4, t4, t3
+; RV64I-NEXT: or a6, t6, t5
+; RV64I-NEXT: or t0, s1, s0
+; RV64I-NEXT: lbu t5, 24(a0)
+; RV64I-NEXT: lbu t6, 25(a0)
+; RV64I-NEXT: lbu s0, 26(a0)
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or t4, s3, s2
+; RV64I-NEXT: or t2, s5, s4
+; RV64I-NEXT: or t3, s7, s6
+; RV64I-NEXT: lbu s2, 28(a0)
+; RV64I-NEXT: lbu s3, 29(a0)
+; RV64I-NEXT: lbu s4, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s5, s9, s8
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu t6, 0(a1)
+; RV64I-NEXT: lbu s1, 1(a1)
+; RV64I-NEXT: lbu s7, 2(a1)
+; RV64I-NEXT: lbu s8, 3(a1)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, a0, s4
+; RV64I-NEXT: or t6, s1, t6
+; RV64I-NEXT: lbu a0, 4(a1)
+; RV64I-NEXT: lbu s1, 5(a1)
+; RV64I-NEXT: lbu s4, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s1, s1, a0
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or s4, a1, s4
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or a1, t1, a7
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or a0, t4, t0
+; RV64I-NEXT: slli t3, t3, 16
+; RV64I-NEXT: or t0, t3, t2
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: or t1, s6, s5
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t3, s0, t5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or t5, s3, s2
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or t6, s7, t6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s0, s4, s1
+; RV64I-NEXT: li a7, 64
+; RV64I-NEXT: slli t4, a5, 16
+; RV64I-NEXT: slli t2, a6, 16
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: or a6, t1, t0
+; RV64I-NEXT: or t0, t5, t3
+; RV64I-NEXT: or a5, s0, t6
+; RV64I-NEXT: slli a5, a5, 5
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sll t3, t0, t5
+; RV64I-NEXT: bltu a5, a7, .LBB13_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: srl t6, t0, t1
+; RV64I-NEXT: j .LBB13_3
+; RV64I-NEXT: .LBB13_2:
+; RV64I-NEXT: srl t6, a6, a5
+; RV64I-NEXT: or t6, t6, t3
+; RV64I-NEXT: .LBB13_3:
+; RV64I-NEXT: or a3, t4, a3
+; RV64I-NEXT: slli t4, a1, 32
+; RV64I-NEXT: or t2, t2, a4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mv a1, a6
+; RV64I-NEXT: beqz a5, .LBB13_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a1, t6
+; RV64I-NEXT: .LBB13_5:
+; RV64I-NEXT: or a4, t4, a3
+; RV64I-NEXT: or a3, a0, t2
+; RV64I-NEXT: bltu a5, a7, .LBB13_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: li a0, 0
+; RV64I-NEXT: srl t4, a3, t1
+; RV64I-NEXT: j .LBB13_8
+; RV64I-NEXT: .LBB13_7:
+; RV64I-NEXT: srl a0, t0, a5
+; RV64I-NEXT: srl t1, a4, a5
+; RV64I-NEXT: sll t2, a3, t5
+; RV64I-NEXT: or t4, t1, t2
+; RV64I-NEXT: .LBB13_8:
+; RV64I-NEXT: li t1, 128
+; RV64I-NEXT: mv t2, a4
+; RV64I-NEXT: beqz a5, .LBB13_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t2, t4
+; RV64I-NEXT: .LBB13_10:
+; RV64I-NEXT: sub t6, t1, a5
+; RV64I-NEXT: bltu a5, a7, .LBB13_13
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t4, 0
+; RV64I-NEXT: bgeu t6, a7, .LBB13_14
+; RV64I-NEXT: .LBB13_12:
+; RV64I-NEXT: sll t5, a6, t5
+; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: srl s0, a6, s0
+; RV64I-NEXT: or s1, s0, t3
+; RV64I-NEXT: j .LBB13_15
+; RV64I-NEXT: .LBB13_13:
+; RV64I-NEXT: srl t4, a3, a5
+; RV64I-NEXT: bltu t6, a7, .LBB13_12
+; RV64I-NEXT: .LBB13_14:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: sub t3, t6, a7
+; RV64I-NEXT: sll s1, a6, t3
+; RV64I-NEXT: .LBB13_15:
+; RV64I-NEXT: sub s0, a5, t1
+; RV64I-NEXT: mv t3, t0
+; RV64I-NEXT: beqz t6, .LBB13_17
+; RV64I-NEXT: # %bb.16:
+; RV64I-NEXT: mv t3, s1
+; RV64I-NEXT: .LBB13_17:
+; RV64I-NEXT: bltu s0, a7, .LBB13_19
+; RV64I-NEXT: # %bb.18:
+; RV64I-NEXT: sub t6, s0, a7
+; RV64I-NEXT: srl t6, t0, t6
+; RV64I-NEXT: bnez s0, .LBB13_20
+; RV64I-NEXT: j .LBB13_21
+; RV64I-NEXT: .LBB13_19:
+; RV64I-NEXT: srl t6, a6, s0
+; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: sll s1, t0, s1
+; RV64I-NEXT: or t6, t6, s1
+; RV64I-NEXT: beqz s0, .LBB13_21
+; RV64I-NEXT: .LBB13_20:
+; RV64I-NEXT: mv a6, t6
+; RV64I-NEXT: .LBB13_21:
+; RV64I-NEXT: bltu s0, a7, .LBB13_23
+; RV64I-NEXT: # %bb.22:
+; RV64I-NEXT: li a7, 0
+; RV64I-NEXT: bltu a5, t1, .LBB13_24
+; RV64I-NEXT: j .LBB13_25
+; RV64I-NEXT: .LBB13_23:
+; RV64I-NEXT: srl a7, t0, s0
+; RV64I-NEXT: bgeu a5, t1, .LBB13_25
+; RV64I-NEXT: .LBB13_24:
+; RV64I-NEXT: or a6, t2, t5
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: .LBB13_25:
+; RV64I-NEXT: bnez a5, .LBB13_29
+; RV64I-NEXT: # %bb.26:
+; RV64I-NEXT: bltu a5, t1, .LBB13_28
+; RV64I-NEXT: .LBB13_27:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: li a0, 0
+; RV64I-NEXT: .LBB13_28:
+; RV64I-NEXT: srli a5, a4, 32
+; RV64I-NEXT: srliw a6, a4, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a4, 24
+; RV64I-NEXT: srli t0, a4, 48
+; RV64I-NEXT: srli t5, a4, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a1, 32
+; RV64I-NEXT: srliw s2, a1, 16
+; RV64I-NEXT: srliw s6, a1, 24
+; RV64I-NEXT: srli s4, a1, 48
+; RV64I-NEXT: srli s7, a1, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a4, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a4, a5, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a4, a3, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a4, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a3, a7, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a3, a1, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 96
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB13_29:
+; RV64I-NEXT: mv a4, a6
+; RV64I-NEXT: mv a3, a7
+; RV64I-NEXT: bgeu a5, t1, .LBB13_27
+; RV64I-NEXT: j .LBB13_28
+;
+; RV32I-LABEL: lshr_32bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 16(a0)
+; RV32I-NEXT: lbu a4, 17(a0)
+; RV32I-NEXT: lbu a5, 18(a0)
+; RV32I-NEXT: lbu a6, 19(a0)
+; RV32I-NEXT: lbu a7, 20(a0)
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 22(a0)
+; RV32I-NEXT: lbu t2, 23(a0)
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t4, 25(a0)
+; RV32I-NEXT: lbu t5, 26(a0)
+; RV32I-NEXT: lbu t6, 27(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a6, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a4, t2, t1
+; RV32I-NEXT: lbu a7, 28(a0)
+; RV32I-NEXT: lbu t0, 29(a0)
+; RV32I-NEXT: lbu t1, 30(a0)
+; RV32I-NEXT: lbu t2, 31(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t4, t4, t3
+; RV32I-NEXT: or t5, t6, t5
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t3, 1(a1)
+; RV32I-NEXT: lbu t6, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or t0, t3, t0
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or t2, a1, t6
+; RV32I-NEXT: li t3, 32
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a1, a4, 16
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a4, t1, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: or t1, t5, t4
+; RV32I-NEXT: or t5, a4, a7
+; RV32I-NEXT: or a4, t2, t0
+; RV32I-NEXT: slli a4, a4, 5
+; RV32I-NEXT: srl s0, t1, a4
+; RV32I-NEXT: neg s6, a4
+; RV32I-NEXT: sll t4, t5, s6
+; RV32I-NEXT: bltu a4, t3, .LBB13_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a7, t5, a4
+; RV32I-NEXT: j .LBB13_3
+; RV32I-NEXT: .LBB13_2:
+; RV32I-NEXT: or a7, s0, t4
+; RV32I-NEXT: .LBB13_3:
+; RV32I-NEXT: or t0, a6, a3
+; RV32I-NEXT: or a6, a1, a5
+; RV32I-NEXT: mv a1, t1
+; RV32I-NEXT: beqz a4, .LBB13_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a1, a7
+; RV32I-NEXT: .LBB13_5:
+; RV32I-NEXT: srl a3, t0, a4
+; RV32I-NEXT: sll a5, a6, s6
+; RV32I-NEXT: sw a3, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t3, .LBB13_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: srl a3, a6, a4
+; RV32I-NEXT: j .LBB13_8
+; RV32I-NEXT: .LBB13_7:
+; RV32I-NEXT: srl ra, t5, a4
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: .LBB13_8:
+; RV32I-NEXT: li t6, 64
+; RV32I-NEXT: mv a7, t0
+; RV32I-NEXT: beqz a4, .LBB13_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv a7, a3
+; RV32I-NEXT: .LBB13_10:
+; RV32I-NEXT: sw a5, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s7, t6, a4
+; RV32I-NEXT: bltu a4, t3, .LBB13_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a5, 0
+; RV32I-NEXT: j .LBB13_13
+; RV32I-NEXT: .LBB13_12:
+; RV32I-NEXT: srl a5, a6, a4
+; RV32I-NEXT: .LBB13_13:
+; RV32I-NEXT: neg s10, s7
+; RV32I-NEXT: sw t4, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s7, t3, .LBB13_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li t2, 0
+; RV32I-NEXT: sll a3, t1, s7
+; RV32I-NEXT: j .LBB13_16
+; RV32I-NEXT: .LBB13_15:
+; RV32I-NEXT: sll t2, t1, s6
+; RV32I-NEXT: srl a3, t1, s10
+; RV32I-NEXT: or a3, a3, t4
+; RV32I-NEXT: .LBB13_16:
+; RV32I-NEXT: addi s9, a4, -64
+; RV32I-NEXT: mv t4, t5
+; RV32I-NEXT: beqz s7, .LBB13_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t4, a3
+; RV32I-NEXT: .LBB13_18:
+; RV32I-NEXT: neg s11, s9
+; RV32I-NEXT: sw s0, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t3, .LBB13_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: srl s2, t5, s9
+; RV32I-NEXT: j .LBB13_21
+; RV32I-NEXT: .LBB13_20:
+; RV32I-NEXT: sll a3, t5, s11
+; RV32I-NEXT: or s2, s0, a3
+; RV32I-NEXT: .LBB13_21:
+; RV32I-NEXT: lbu s1, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: mv s0, t1
+; RV32I-NEXT: beqz s9, .LBB13_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: mv s0, s2
+; RV32I-NEXT: .LBB13_23:
+; RV32I-NEXT: lbu s4, 9(a0)
+; RV32I-NEXT: lbu s2, 10(a0)
+; RV32I-NEXT: lbu s5, 13(a0)
+; RV32I-NEXT: lbu s8, 14(a0)
+; RV32I-NEXT: slli s3, s1, 8
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: sw ra, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t3, .LBB13_25
+; RV32I-NEXT: # %bb.24:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: j .LBB13_26
+; RV32I-NEXT: .LBB13_25:
+; RV32I-NEXT: srl s1, t5, a4
+; RV32I-NEXT: .LBB13_26:
+; RV32I-NEXT: or s2, s3, s2
+; RV32I-NEXT: lbu ra, 8(a0)
+; RV32I-NEXT: lbu s3, 12(a0)
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: or s8, a3, s8
+; RV32I-NEXT: bgeu a4, t6, .LBB13_28
+; RV32I-NEXT: # %bb.27:
+; RV32I-NEXT: or s0, a7, t2
+; RV32I-NEXT: or s1, a5, t4
+; RV32I-NEXT: .LBB13_28:
+; RV32I-NEXT: lbu a3, 3(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: or a5, s4, ra
+; RV32I-NEXT: slli t4, s2, 16
+; RV32I-NEXT: or s2, s5, s3
+; RV32I-NEXT: slli s3, s8, 16
+; RV32I-NEXT: mv s4, t0
+; RV32I-NEXT: mv a7, a6
+; RV32I-NEXT: beqz a4, .LBB13_30
+; RV32I-NEXT: # %bb.29:
+; RV32I-NEXT: mv s4, s0
+; RV32I-NEXT: mv a7, s1
+; RV32I-NEXT: .LBB13_30:
+; RV32I-NEXT: slli s5, a3, 8
+; RV32I-NEXT: lbu ra, 1(a0)
+; RV32I-NEXT: lbu a3, 2(a0)
+; RV32I-NEXT: lbu s1, 5(a0)
+; RV32I-NEXT: lbu s0, 6(a0)
+; RV32I-NEXT: slli s8, t2, 8
+; RV32I-NEXT: or t4, t4, a5
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: bltu a4, t6, .LBB13_32
+; RV32I-NEXT: # %bb.31:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB13_32:
+; RV32I-NEXT: slli s3, ra, 8
+; RV32I-NEXT: or a5, s5, a3
+; RV32I-NEXT: lbu s5, 0(a0)
+; RV32I-NEXT: lbu a0, 4(a0)
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or a3, s8, s0
+; RV32I-NEXT: srl s2, t4, a4
+; RV32I-NEXT: sll ra, t2, s6
+; RV32I-NEXT: bltu a4, t3, .LBB13_34
+; RV32I-NEXT: # %bb.33:
+; RV32I-NEXT: srl s0, t2, a4
+; RV32I-NEXT: j .LBB13_35
+; RV32I-NEXT: .LBB13_34:
+; RV32I-NEXT: or s0, s2, ra
+; RV32I-NEXT: .LBB13_35:
+; RV32I-NEXT: or s3, s3, s5
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: slli a3, a3, 16
+; RV32I-NEXT: mv s5, t4
+; RV32I-NEXT: beqz a4, .LBB13_37
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: mv s5, s0
+; RV32I-NEXT: .LBB13_37:
+; RV32I-NEXT: or s0, a5, s3
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: bltu a4, t3, .LBB13_39
+; RV32I-NEXT: # %bb.38:
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: srl a3, a0, a4
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: bnez a4, .LBB13_40
+; RV32I-NEXT: j .LBB13_41
+; RV32I-NEXT: .LBB13_39:
+; RV32I-NEXT: srl s8, t2, a4
+; RV32I-NEXT: srl a3, s0, a4
+; RV32I-NEXT: sll a5, a0, s6
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: beqz a4, .LBB13_41
+; RV32I-NEXT: .LBB13_40:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB13_41:
+; RV32I-NEXT: bltu a4, t3, .LBB13_44
+; RV32I-NEXT: # %bb.42:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bgeu s7, t3, .LBB13_45
+; RV32I-NEXT: .LBB13_43:
+; RV32I-NEXT: sll s3, t4, s6
+; RV32I-NEXT: srl a3, t4, s10
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: mv s10, t2
+; RV32I-NEXT: bnez s7, .LBB13_46
+; RV32I-NEXT: j .LBB13_47
+; RV32I-NEXT: .LBB13_44:
+; RV32I-NEXT: srl s1, a0, a4
+; RV32I-NEXT: bltu s7, t3, .LBB13_43
+; RV32I-NEXT: .LBB13_45:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t4, s7
+; RV32I-NEXT: mv s10, t2
+; RV32I-NEXT: beqz s7, .LBB13_47
+; RV32I-NEXT: .LBB13_46:
+; RV32I-NEXT: mv s10, a3
+; RV32I-NEXT: .LBB13_47:
+; RV32I-NEXT: bltu s9, t3, .LBB13_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: srl a3, t2, s9
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: bnez s9, .LBB13_50
+; RV32I-NEXT: j .LBB13_51
+; RV32I-NEXT: .LBB13_49:
+; RV32I-NEXT: sll a3, t2, s11
+; RV32I-NEXT: or a3, s2, a3
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: beqz s9, .LBB13_51
+; RV32I-NEXT: .LBB13_50:
+; RV32I-NEXT: mv s2, a3
+; RV32I-NEXT: .LBB13_51:
+; RV32I-NEXT: bltu s9, t3, .LBB13_53
+; RV32I-NEXT: # %bb.52:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bltu a4, t6, .LBB13_54
+; RV32I-NEXT: j .LBB13_55
+; RV32I-NEXT: .LBB13_53:
+; RV32I-NEXT: srl s7, t2, a4
+; RV32I-NEXT: bgeu a4, t6, .LBB13_55
+; RV32I-NEXT: .LBB13_54:
+; RV32I-NEXT: or s2, a5, s3
+; RV32I-NEXT: or s7, s1, s10
+; RV32I-NEXT: .LBB13_55:
+; RV32I-NEXT: li a3, 128
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: beqz a4, .LBB13_57
+; RV32I-NEXT: # %bb.56:
+; RV32I-NEXT: mv a5, s2
+; RV32I-NEXT: mv s1, s7
+; RV32I-NEXT: .LBB13_57:
+; RV32I-NEXT: sw a5, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s2, a3, a4
+; RV32I-NEXT: bltu a4, t6, .LBB13_59
+; RV32I-NEXT: # %bb.58:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: .LBB13_59:
+; RV32I-NEXT: neg s3, s2
+; RV32I-NEXT: srl a5, t0, s3
+; RV32I-NEXT: sw s8, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t3, .LBB13_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: li s10, 0
+; RV32I-NEXT: sll a3, t0, s2
+; RV32I-NEXT: j .LBB13_62
+; RV32I-NEXT: .LBB13_61:
+; RV32I-NEXT: sll s10, t0, s6
+; RV32I-NEXT: lw a3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: .LBB13_62:
+; RV32I-NEXT: sub s1, t6, s2
+; RV32I-NEXT: mv s8, a6
+; RV32I-NEXT: beqz s2, .LBB13_64
+; RV32I-NEXT: # %bb.63:
+; RV32I-NEXT: mv s8, a3
+; RV32I-NEXT: .LBB13_64:
+; RV32I-NEXT: bltu s1, t3, .LBB13_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: srl a3, a6, s1
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: bnez s1, .LBB13_67
+; RV32I-NEXT: j .LBB13_68
+; RV32I-NEXT: .LBB13_66:
+; RV32I-NEXT: neg a3, s1
+; RV32I-NEXT: sll a3, a6, a3
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: beqz s1, .LBB13_68
+; RV32I-NEXT: .LBB13_67:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB13_68:
+; RV32I-NEXT: bltu s1, t3, .LBB13_71
+; RV32I-NEXT: # %bb.69:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s2, t3, .LBB13_72
+; RV32I-NEXT: .LBB13_70:
+; RV32I-NEXT: sll s6, t1, s6
+; RV32I-NEXT: srl a3, t1, s3
+; RV32I-NEXT: lw s3, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB13_73
+; RV32I-NEXT: .LBB13_71:
+; RV32I-NEXT: srl s1, a6, s3
+; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t3, .LBB13_70
+; RV32I-NEXT: .LBB13_72:
+; RV32I-NEXT: li s6, 0
+; RV32I-NEXT: sll a3, t1, s2
+; RV32I-NEXT: .LBB13_73:
+; RV32I-NEXT: addi s9, s2, -64
+; RV32I-NEXT: mv s5, t5
+; RV32I-NEXT: beqz s2, .LBB13_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: mv s5, a3
+; RV32I-NEXT: .LBB13_75:
+; RV32I-NEXT: bltu s9, t3, .LBB13_77
+; RV32I-NEXT: # %bb.76:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t0, s9
+; RV32I-NEXT: mv s7, a6
+; RV32I-NEXT: bnez s9, .LBB13_78
+; RV32I-NEXT: j .LBB13_79
+; RV32I-NEXT: .LBB13_77:
+; RV32I-NEXT: sll s3, t0, s2
+; RV32I-NEXT: neg a3, s9
+; RV32I-NEXT: srl a3, t0, a3
+; RV32I-NEXT: sll s7, a6, s2
+; RV32I-NEXT: or a3, a3, s7
+; RV32I-NEXT: mv s7, a6
+; RV32I-NEXT: beqz s9, .LBB13_79
+; RV32I-NEXT: .LBB13_78:
+; RV32I-NEXT: mv s7, a3
+; RV32I-NEXT: .LBB13_79:
+; RV32I-NEXT: bltu s2, t6, .LBB13_81
+; RV32I-NEXT: # %bb.80:
+; RV32I-NEXT: sw zero, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: j .LBB13_82
+; RV32I-NEXT: .LBB13_81:
+; RV32I-NEXT: sw s10, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s3, a5, s6
+; RV32I-NEXT: or s7, s1, s5
+; RV32I-NEXT: .LBB13_82:
+; RV32I-NEXT: addi ra, a4, -128
+; RV32I-NEXT: mv s5, t1
+; RV32I-NEXT: mv s6, t5
+; RV32I-NEXT: beqz s2, .LBB13_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: mv s5, s3
+; RV32I-NEXT: mv s6, s7
+; RV32I-NEXT: .LBB13_84:
+; RV32I-NEXT: neg s9, ra
+; RV32I-NEXT: sll s3, t5, s9
+; RV32I-NEXT: bltu ra, t3, .LBB13_86
+; RV32I-NEXT: # %bb.85:
+; RV32I-NEXT: srl a3, t5, ra
+; RV32I-NEXT: mv s1, t1
+; RV32I-NEXT: bnez ra, .LBB13_87
+; RV32I-NEXT: j .LBB13_88
+; RV32I-NEXT: .LBB13_86:
+; RV32I-NEXT: lw a3, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: mv s1, t1
+; RV32I-NEXT: beqz ra, .LBB13_88
+; RV32I-NEXT: .LBB13_87:
+; RV32I-NEXT: mv s1, a3
+; RV32I-NEXT: .LBB13_88:
+; RV32I-NEXT: bltu ra, t3, .LBB13_90
+; RV32I-NEXT: # %bb.89:
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: srl a3, a6, ra
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: bnez ra, .LBB13_91
+; RV32I-NEXT: j .LBB13_92
+; RV32I-NEXT: .LBB13_90:
+; RV32I-NEXT: srl s2, t5, a4
+; RV32I-NEXT: sll a3, a6, s9
+; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: beqz ra, .LBB13_92
+; RV32I-NEXT: .LBB13_91:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB13_92:
+; RV32I-NEXT: sub s10, t6, ra
+; RV32I-NEXT: bltu ra, t3, .LBB13_95
+; RV32I-NEXT: # %bb.93:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bgeu s10, t3, .LBB13_96
+; RV32I-NEXT: .LBB13_94:
+; RV32I-NEXT: sll s9, t1, s9
+; RV32I-NEXT: neg a3, s10
+; RV32I-NEXT: srl a3, t1, a3
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB13_97
+; RV32I-NEXT: .LBB13_95:
+; RV32I-NEXT: srl s7, a6, a4
+; RV32I-NEXT: bltu s10, t3, .LBB13_94
+; RV32I-NEXT: .LBB13_96:
+; RV32I-NEXT: li s9, 0
+; RV32I-NEXT: sll a3, t1, s10
+; RV32I-NEXT: .LBB13_97:
+; RV32I-NEXT: addi s11, ra, -64
+; RV32I-NEXT: mv s3, t5
+; RV32I-NEXT: beqz s10, .LBB13_99
+; RV32I-NEXT: # %bb.98:
+; RV32I-NEXT: mv s3, a3
+; RV32I-NEXT: .LBB13_99:
+; RV32I-NEXT: bltu s11, t3, .LBB13_101
+; RV32I-NEXT: # %bb.100:
+; RV32I-NEXT: srl a3, t5, s11
+; RV32I-NEXT: bnez s11, .LBB13_102
+; RV32I-NEXT: j .LBB13_103
+; RV32I-NEXT: .LBB13_101:
+; RV32I-NEXT: srl a3, t1, ra
+; RV32I-NEXT: neg s10, s11
+; RV32I-NEXT: sll s10, t5, s10
+; RV32I-NEXT: or a3, a3, s10
+; RV32I-NEXT: beqz s11, .LBB13_103
+; RV32I-NEXT: .LBB13_102:
+; RV32I-NEXT: mv t1, a3
+; RV32I-NEXT: .LBB13_103:
+; RV32I-NEXT: bltu s11, t3, .LBB13_105
+; RV32I-NEXT: # %bb.104:
+; RV32I-NEXT: li t3, 0
+; RV32I-NEXT: bltu ra, t6, .LBB13_106
+; RV32I-NEXT: j .LBB13_107
+; RV32I-NEXT: .LBB13_105:
+; RV32I-NEXT: srl t3, t5, ra
+; RV32I-NEXT: bgeu ra, t6, .LBB13_107
+; RV32I-NEXT: .LBB13_106:
+; RV32I-NEXT: or t1, a5, s9
+; RV32I-NEXT: or t3, s7, s3
+; RV32I-NEXT: .LBB13_107:
+; RV32I-NEXT: li a5, 128
+; RV32I-NEXT: bnez ra, .LBB13_114
+; RV32I-NEXT: # %bb.108:
+; RV32I-NEXT: bgeu ra, t6, .LBB13_115
+; RV32I-NEXT: .LBB13_109:
+; RV32I-NEXT: bltu a4, a5, .LBB13_116
+; RV32I-NEXT: .LBB13_110:
+; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bnez a4, .LBB13_117
+; RV32I-NEXT: .LBB13_111:
+; RV32I-NEXT: bltu a4, a5, .LBB13_113
+; RV32I-NEXT: .LBB13_112:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: .LBB13_113:
+; RV32I-NEXT: srli a4, s0, 16
+; RV32I-NEXT: lui t1, 16
+; RV32I-NEXT: srli t0, s0, 24
+; RV32I-NEXT: srli a5, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: srli a6, t4, 16
+; RV32I-NEXT: srli s2, t4, 24
+; RV32I-NEXT: srli t3, t2, 16
+; RV32I-NEXT: srli s3, t2, 24
+; RV32I-NEXT: srli s1, s4, 16
+; RV32I-NEXT: srli a3, s4, 24
+; RV32I-NEXT: srli t6, a7, 16
+; RV32I-NEXT: srli s6, a7, 24
+; RV32I-NEXT: srli s5, a1, 16
+; RV32I-NEXT: srli s7, a1, 24
+; RV32I-NEXT: srli s8, ra, 16
+; RV32I-NEXT: srli s9, ra, 24
+; RV32I-NEXT: addi t1, t1, -1
+; RV32I-NEXT: and s10, s0, t1
+; RV32I-NEXT: and s11, a0, t1
+; RV32I-NEXT: srli s10, s10, 8
+; RV32I-NEXT: sb s0, 0(a2)
+; RV32I-NEXT: sb s10, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: and a4, t4, t1
+; RV32I-NEXT: srli t0, s11, 8
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb t0, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t5, 7(a2)
+; RV32I-NEXT: and a0, t2, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, s4, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t2, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, a7, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s4, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a3, a1, t1
+; RV32I-NEXT: and a4, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a7, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb a1, 24(a2)
+; RV32I-NEXT: sb a3, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s7, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb s8, 30(a2)
+; RV32I-NEXT: sb s9, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB13_114:
+; RV32I-NEXT: mv t0, t1
+; RV32I-NEXT: mv a6, t3
+; RV32I-NEXT: bltu ra, t6, .LBB13_109
+; RV32I-NEXT: .LBB13_115:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: bgeu a4, a5, .LBB13_110
+; RV32I-NEXT: .LBB13_116:
+; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw a6, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t0, a3, a6
+; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a6, a3, s8
+; RV32I-NEXT: lw a3, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s1, a3, s5
+; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s2, a3, s6
+; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: beqz a4, .LBB13_111
+; RV32I-NEXT: .LBB13_117:
+; RV32I-NEXT: mv s0, t0
+; RV32I-NEXT: mv a0, a6
+; RV32I-NEXT: mv t4, s1
+; RV32I-NEXT: mv t2, s2
+; RV32I-NEXT: bgeu a4, a5, .LBB13_112
+; RV32I-NEXT: j .LBB13_113
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
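+; Same pattern as the wordOff test above, but the shift amount is loaded as a
+; dword offset and scaled by 64 (shl i256 %dwordOff, 6), so the lshr amount is
+; always a multiple of 64 bits.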
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes_dwordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -96
+; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: or a4, t4, t3
+; RV64I-NEXT: or a6, t6, t5
+; RV64I-NEXT: or t0, s1, s0
+; RV64I-NEXT: lbu t5, 24(a0)
+; RV64I-NEXT: lbu t6, 25(a0)
+; RV64I-NEXT: lbu s0, 26(a0)
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or t4, s3, s2
+; RV64I-NEXT: or t2, s5, s4
+; RV64I-NEXT: or t3, s7, s6
+; RV64I-NEXT: lbu s2, 28(a0)
+; RV64I-NEXT: lbu s3, 29(a0)
+; RV64I-NEXT: lbu s4, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s5, s9, s8
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu t6, 0(a1)
+; RV64I-NEXT: lbu s1, 1(a1)
+; RV64I-NEXT: lbu s7, 2(a1)
+; RV64I-NEXT: lbu s8, 3(a1)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, a0, s4
+; RV64I-NEXT: or t6, s1, t6
+; RV64I-NEXT: lbu a0, 4(a1)
+; RV64I-NEXT: lbu s1, 5(a1)
+; RV64I-NEXT: lbu s4, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s1, s1, a0
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or s4, a1, s4
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or a1, t1, a7
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or a0, t4, t0
+; RV64I-NEXT: slli t3, t3, 16
+; RV64I-NEXT: or t0, t3, t2
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: or t1, s6, s5
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t3, s0, t5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or t5, s3, s2
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or t6, s7, t6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s0, s4, s1
+; RV64I-NEXT: li a7, 64
+; RV64I-NEXT: slli t4, a5, 16
+; RV64I-NEXT: slli t2, a6, 16
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: or a6, t1, t0
+; RV64I-NEXT: or t0, t5, t3
+; RV64I-NEXT: or a5, s0, t6
+; RV64I-NEXT: slli a5, a5, 6
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sll t3, t0, t5
+; RV64I-NEXT: bltu a5, a7, .LBB14_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: srl t6, t0, t1
+; RV64I-NEXT: j .LBB14_3
+; RV64I-NEXT: .LBB14_2:
+; RV64I-NEXT: srl t6, a6, a5
+; RV64I-NEXT: or t6, t6, t3
+; RV64I-NEXT: .LBB14_3:
+; RV64I-NEXT: or a3, t4, a3
+; RV64I-NEXT: slli t4, a1, 32
+; RV64I-NEXT: or t2, t2, a4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mv a1, a6
+; RV64I-NEXT: beqz a5, .LBB14_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a1, t6
+; RV64I-NEXT: .LBB14_5:
+; RV64I-NEXT: or a4, t4, a3
+; RV64I-NEXT: or a3, a0, t2
+; RV64I-NEXT: bltu a5, a7, .LBB14_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: li a0, 0
+; RV64I-NEXT: srl t4, a3, t1
+; RV64I-NEXT: j .LBB14_8
+; RV64I-NEXT: .LBB14_7:
+; RV64I-NEXT: srl a0, t0, a5
+; RV64I-NEXT: srl t1, a4, a5
+; RV64I-NEXT: sll t2, a3, t5
+; RV64I-NEXT: or t4, t1, t2
+; RV64I-NEXT: .LBB14_8:
+; RV64I-NEXT: li t1, 128
+; RV64I-NEXT: mv t2, a4
+; RV64I-NEXT: beqz a5, .LBB14_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t2, t4
+; RV64I-NEXT: .LBB14_10:
+; RV64I-NEXT: sub t6, t1, a5
+; RV64I-NEXT: bltu a5, a7, .LBB14_13
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t4, 0
+; RV64I-NEXT: bgeu t6, a7, .LBB14_14
+; RV64I-NEXT: .LBB14_12:
+; RV64I-NEXT: sll t5, a6, t5
+; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: srl s0, a6, s0
+; RV64I-NEXT: or s1, s0, t3
+; RV64I-NEXT: j .LBB14_15
+; RV64I-NEXT: .LBB14_13:
+; RV64I-NEXT: srl t4, a3, a5
+; RV64I-NEXT: bltu t6, a7, .LBB14_12
+; RV64I-NEXT: .LBB14_14:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: sub t3, t6, a7
+; RV64I-NEXT: sll s1, a6, t3
+; RV64I-NEXT: .LBB14_15:
+; RV64I-NEXT: sub s0, a5, t1
+; RV64I-NEXT: mv t3, t0
+; RV64I-NEXT: beqz t6, .LBB14_17
+; RV64I-NEXT: # %bb.16:
+; RV64I-NEXT: mv t3, s1
+; RV64I-NEXT: .LBB14_17:
+; RV64I-NEXT: bltu s0, a7, .LBB14_19
+; RV64I-NEXT: # %bb.18:
+; RV64I-NEXT: sub t6, s0, a7
+; RV64I-NEXT: srl t6, t0, t6
+; RV64I-NEXT: bnez s0, .LBB14_20
+; RV64I-NEXT: j .LBB14_21
+; RV64I-NEXT: .LBB14_19:
+; RV64I-NEXT: srl t6, a6, s0
+; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: sll s1, t0, s1
+; RV64I-NEXT: or t6, t6, s1
+; RV64I-NEXT: beqz s0, .LBB14_21
+; RV64I-NEXT: .LBB14_20:
+; RV64I-NEXT: mv a6, t6
+; RV64I-NEXT: .LBB14_21:
+; RV64I-NEXT: bltu s0, a7, .LBB14_23
+; RV64I-NEXT: # %bb.22:
+; RV64I-NEXT: li a7, 0
+; RV64I-NEXT: bltu a5, t1, .LBB14_24
+; RV64I-NEXT: j .LBB14_25
+; RV64I-NEXT: .LBB14_23:
+; RV64I-NEXT: srl a7, t0, s0
+; RV64I-NEXT: bgeu a5, t1, .LBB14_25
+; RV64I-NEXT: .LBB14_24:
+; RV64I-NEXT: or a6, t2, t5
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: .LBB14_25:
+; RV64I-NEXT: bnez a5, .LBB14_29
+; RV64I-NEXT: # %bb.26:
+; RV64I-NEXT: bltu a5, t1, .LBB14_28
+; RV64I-NEXT: .LBB14_27:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: li a0, 0
+; RV64I-NEXT: .LBB14_28:
+; RV64I-NEXT: srli a5, a4, 32
+; RV64I-NEXT: srliw a6, a4, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a4, 24
+; RV64I-NEXT: srli t0, a4, 48
+; RV64I-NEXT: srli t5, a4, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a1, 32
+; RV64I-NEXT: srliw s2, a1, 16
+; RV64I-NEXT: srliw s6, a1, 24
+; RV64I-NEXT: srli s4, a1, 48
+; RV64I-NEXT: srli s7, a1, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a4, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a4, a5, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a4, a3, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a4, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a3, a7, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a3, a1, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 96
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB14_29:
+; RV64I-NEXT: mv a4, a6
+; RV64I-NEXT: mv a3, a7
+; RV64I-NEXT: bgeu a5, t1, .LBB14_27
+; RV64I-NEXT: j .LBB14_28
+;
+; RV32I-LABEL: lshr_32bytes_dwordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 16(a0)
+; RV32I-NEXT: lbu a4, 17(a0)
+; RV32I-NEXT: lbu a5, 18(a0)
+; RV32I-NEXT: lbu a6, 19(a0)
+; RV32I-NEXT: lbu a7, 20(a0)
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 22(a0)
+; RV32I-NEXT: lbu t2, 23(a0)
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t4, 25(a0)
+; RV32I-NEXT: lbu t5, 26(a0)
+; RV32I-NEXT: lbu t6, 27(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a6, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a4, t2, t1
+; RV32I-NEXT: lbu a7, 28(a0)
+; RV32I-NEXT: lbu t0, 29(a0)
+; RV32I-NEXT: lbu t1, 30(a0)
+; RV32I-NEXT: lbu t2, 31(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t4, t4, t3
+; RV32I-NEXT: or t5, t6, t5
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t3, 1(a1)
+; RV32I-NEXT: lbu t6, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or t0, t3, t0
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or t2, a1, t6
+; RV32I-NEXT: li t3, 32
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a1, a4, 16
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a4, t1, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: or t1, t5, t4
+; RV32I-NEXT: or t5, a4, a7
+; RV32I-NEXT: or a4, t2, t0
+; RV32I-NEXT: slli a4, a4, 6
+; RV32I-NEXT: srl s0, t1, a4
+; RV32I-NEXT: neg s6, a4
+; RV32I-NEXT: sll t4, t5, s6
+; RV32I-NEXT: bltu a4, t3, .LBB14_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srl a7, t5, a4
+; RV32I-NEXT: j .LBB14_3
+; RV32I-NEXT: .LBB14_2:
+; RV32I-NEXT: or a7, s0, t4
+; RV32I-NEXT: .LBB14_3:
+; RV32I-NEXT: or t0, a6, a3
+; RV32I-NEXT: or a6, a1, a5
+; RV32I-NEXT: mv a1, t1
+; RV32I-NEXT: beqz a4, .LBB14_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a1, a7
+; RV32I-NEXT: .LBB14_5:
+; RV32I-NEXT: srl a3, t0, a4
+; RV32I-NEXT: sll a5, a6, s6
+; RV32I-NEXT: sw a3, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t3, .LBB14_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: srl a3, a6, a4
+; RV32I-NEXT: j .LBB14_8
+; RV32I-NEXT: .LBB14_7:
+; RV32I-NEXT: srl ra, t5, a4
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: .LBB14_8:
+; RV32I-NEXT: li t6, 64
+; RV32I-NEXT: mv a7, t0
+; RV32I-NEXT: beqz a4, .LBB14_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv a7, a3
+; RV32I-NEXT: .LBB14_10:
+; RV32I-NEXT: sw a5, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s7, t6, a4
+; RV32I-NEXT: bltu a4, t3, .LBB14_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a5, 0
+; RV32I-NEXT: j .LBB14_13
+; RV32I-NEXT: .LBB14_12:
+; RV32I-NEXT: srl a5, a6, a4
+; RV32I-NEXT: .LBB14_13:
+; RV32I-NEXT: neg s10, s7
+; RV32I-NEXT: sw t4, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s7, t3, .LBB14_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li t2, 0
+; RV32I-NEXT: sll a3, t1, s7
+; RV32I-NEXT: j .LBB14_16
+; RV32I-NEXT: .LBB14_15:
+; RV32I-NEXT: sll t2, t1, s6
+; RV32I-NEXT: srl a3, t1, s10
+; RV32I-NEXT: or a3, a3, t4
+; RV32I-NEXT: .LBB14_16:
+; RV32I-NEXT: addi s9, a4, -64
+; RV32I-NEXT: mv t4, t5
+; RV32I-NEXT: beqz s7, .LBB14_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t4, a3
+; RV32I-NEXT: .LBB14_18:
+; RV32I-NEXT: neg s11, s9
+; RV32I-NEXT: sw s0, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t3, .LBB14_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: srl s2, t5, s9
+; RV32I-NEXT: j .LBB14_21
+; RV32I-NEXT: .LBB14_20:
+; RV32I-NEXT: sll a3, t5, s11
+; RV32I-NEXT: or s2, s0, a3
+; RV32I-NEXT: .LBB14_21:
+; RV32I-NEXT: lbu s1, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: mv s0, t1
+; RV32I-NEXT: beqz s9, .LBB14_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: mv s0, s2
+; RV32I-NEXT: .LBB14_23:
+; RV32I-NEXT: lbu s4, 9(a0)
+; RV32I-NEXT: lbu s2, 10(a0)
+; RV32I-NEXT: lbu s5, 13(a0)
+; RV32I-NEXT: lbu s8, 14(a0)
+; RV32I-NEXT: slli s3, s1, 8
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: sw ra, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t3, .LBB14_25
+; RV32I-NEXT: # %bb.24:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: j .LBB14_26
+; RV32I-NEXT: .LBB14_25:
+; RV32I-NEXT: srl s1, t5, a4
+; RV32I-NEXT: .LBB14_26:
+; RV32I-NEXT: or s2, s3, s2
+; RV32I-NEXT: lbu ra, 8(a0)
+; RV32I-NEXT: lbu s3, 12(a0)
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: or s8, a3, s8
+; RV32I-NEXT: bgeu a4, t6, .LBB14_28
+; RV32I-NEXT: # %bb.27:
+; RV32I-NEXT: or s0, a7, t2
+; RV32I-NEXT: or s1, a5, t4
+; RV32I-NEXT: .LBB14_28:
+; RV32I-NEXT: lbu a3, 3(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: or a5, s4, ra
+; RV32I-NEXT: slli t4, s2, 16
+; RV32I-NEXT: or s2, s5, s3
+; RV32I-NEXT: slli s3, s8, 16
+; RV32I-NEXT: mv s4, t0
+; RV32I-NEXT: mv a7, a6
+; RV32I-NEXT: beqz a4, .LBB14_30
+; RV32I-NEXT: # %bb.29:
+; RV32I-NEXT: mv s4, s0
+; RV32I-NEXT: mv a7, s1
+; RV32I-NEXT: .LBB14_30:
+; RV32I-NEXT: slli s5, a3, 8
+; RV32I-NEXT: lbu ra, 1(a0)
+; RV32I-NEXT: lbu a3, 2(a0)
+; RV32I-NEXT: lbu s1, 5(a0)
+; RV32I-NEXT: lbu s0, 6(a0)
+; RV32I-NEXT: slli s8, t2, 8
+; RV32I-NEXT: or t4, t4, a5
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: bltu a4, t6, .LBB14_32
+; RV32I-NEXT: # %bb.31:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB14_32:
+; RV32I-NEXT: slli s3, ra, 8
+; RV32I-NEXT: or a5, s5, a3
+; RV32I-NEXT: lbu s5, 0(a0)
+; RV32I-NEXT: lbu a0, 4(a0)
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: or a3, s8, s0
+; RV32I-NEXT: srl s2, t4, a4
+; RV32I-NEXT: sll ra, t2, s6
+; RV32I-NEXT: bltu a4, t3, .LBB14_34
+; RV32I-NEXT: # %bb.33:
+; RV32I-NEXT: srl s0, t2, a4
+; RV32I-NEXT: j .LBB14_35
+; RV32I-NEXT: .LBB14_34:
+; RV32I-NEXT: or s0, s2, ra
+; RV32I-NEXT: .LBB14_35:
+; RV32I-NEXT: or s3, s3, s5
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: slli a3, a3, 16
+; RV32I-NEXT: mv s5, t4
+; RV32I-NEXT: beqz a4, .LBB14_37
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: mv s5, s0
+; RV32I-NEXT: .LBB14_37:
+; RV32I-NEXT: or s0, a5, s3
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: bltu a4, t3, .LBB14_39
+; RV32I-NEXT: # %bb.38:
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: srl a3, a0, a4
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: bnez a4, .LBB14_40
+; RV32I-NEXT: j .LBB14_41
+; RV32I-NEXT: .LBB14_39:
+; RV32I-NEXT: srl s8, t2, a4
+; RV32I-NEXT: srl a3, s0, a4
+; RV32I-NEXT: sll a5, a0, s6
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: beqz a4, .LBB14_41
+; RV32I-NEXT: .LBB14_40:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB14_41:
+; RV32I-NEXT: bltu a4, t3, .LBB14_44
+; RV32I-NEXT: # %bb.42:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bgeu s7, t3, .LBB14_45
+; RV32I-NEXT: .LBB14_43:
+; RV32I-NEXT: sll s3, t4, s6
+; RV32I-NEXT: srl a3, t4, s10
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: mv s10, t2
+; RV32I-NEXT: bnez s7, .LBB14_46
+; RV32I-NEXT: j .LBB14_47
+; RV32I-NEXT: .LBB14_44:
+; RV32I-NEXT: srl s1, a0, a4
+; RV32I-NEXT: bltu s7, t3, .LBB14_43
+; RV32I-NEXT: .LBB14_45:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t4, s7
+; RV32I-NEXT: mv s10, t2
+; RV32I-NEXT: beqz s7, .LBB14_47
+; RV32I-NEXT: .LBB14_46:
+; RV32I-NEXT: mv s10, a3
+; RV32I-NEXT: .LBB14_47:
+; RV32I-NEXT: bltu s9, t3, .LBB14_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: srl a3, t2, s9
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: bnez s9, .LBB14_50
+; RV32I-NEXT: j .LBB14_51
+; RV32I-NEXT: .LBB14_49:
+; RV32I-NEXT: sll a3, t2, s11
+; RV32I-NEXT: or a3, s2, a3
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: beqz s9, .LBB14_51
+; RV32I-NEXT: .LBB14_50:
+; RV32I-NEXT: mv s2, a3
+; RV32I-NEXT: .LBB14_51:
+; RV32I-NEXT: bltu s9, t3, .LBB14_53
+; RV32I-NEXT: # %bb.52:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bltu a4, t6, .LBB14_54
+; RV32I-NEXT: j .LBB14_55
+; RV32I-NEXT: .LBB14_53:
+; RV32I-NEXT: srl s7, t2, a4
+; RV32I-NEXT: bgeu a4, t6, .LBB14_55
+; RV32I-NEXT: .LBB14_54:
+; RV32I-NEXT: or s2, a5, s3
+; RV32I-NEXT: or s7, s1, s10
+; RV32I-NEXT: .LBB14_55:
+; RV32I-NEXT: li a3, 128
+; RV32I-NEXT: mv a5, s0
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: beqz a4, .LBB14_57
+; RV32I-NEXT: # %bb.56:
+; RV32I-NEXT: mv a5, s2
+; RV32I-NEXT: mv s1, s7
+; RV32I-NEXT: .LBB14_57:
+; RV32I-NEXT: sw a5, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s2, a3, a4
+; RV32I-NEXT: bltu a4, t6, .LBB14_59
+; RV32I-NEXT: # %bb.58:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: .LBB14_59:
+; RV32I-NEXT: neg s3, s2
+; RV32I-NEXT: srl a5, t0, s3
+; RV32I-NEXT: sw s8, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t3, .LBB14_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: li s10, 0
+; RV32I-NEXT: sll a3, t0, s2
+; RV32I-NEXT: j .LBB14_62
+; RV32I-NEXT: .LBB14_61:
+; RV32I-NEXT: sll s10, t0, s6
+; RV32I-NEXT: lw a3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: .LBB14_62:
+; RV32I-NEXT: sub s1, t6, s2
+; RV32I-NEXT: mv s8, a6
+; RV32I-NEXT: beqz s2, .LBB14_64
+; RV32I-NEXT: # %bb.63:
+; RV32I-NEXT: mv s8, a3
+; RV32I-NEXT: .LBB14_64:
+; RV32I-NEXT: bltu s1, t3, .LBB14_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: srl a3, a6, s1
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: bnez s1, .LBB14_67
+; RV32I-NEXT: j .LBB14_68
+; RV32I-NEXT: .LBB14_66:
+; RV32I-NEXT: neg a3, s1
+; RV32I-NEXT: sll a3, a6, a3
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: beqz s1, .LBB14_68
+; RV32I-NEXT: .LBB14_67:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB14_68:
+; RV32I-NEXT: bltu s1, t3, .LBB14_71
+; RV32I-NEXT: # %bb.69:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s2, t3, .LBB14_72
+; RV32I-NEXT: .LBB14_70:
+; RV32I-NEXT: sll s6, t1, s6
+; RV32I-NEXT: srl a3, t1, s3
+; RV32I-NEXT: lw s3, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB14_73
+; RV32I-NEXT: .LBB14_71:
+; RV32I-NEXT: srl s1, a6, s3
+; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t3, .LBB14_70
+; RV32I-NEXT: .LBB14_72:
+; RV32I-NEXT: li s6, 0
+; RV32I-NEXT: sll a3, t1, s2
+; RV32I-NEXT: .LBB14_73:
+; RV32I-NEXT: addi s9, s2, -64
+; RV32I-NEXT: mv s5, t5
+; RV32I-NEXT: beqz s2, .LBB14_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: mv s5, a3
+; RV32I-NEXT: .LBB14_75:
+; RV32I-NEXT: bltu s9, t3, .LBB14_77
+; RV32I-NEXT: # %bb.76:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t0, s9
+; RV32I-NEXT: mv s7, a6
+; RV32I-NEXT: bnez s9, .LBB14_78
+; RV32I-NEXT: j .LBB14_79
+; RV32I-NEXT: .LBB14_77:
+; RV32I-NEXT: sll s3, t0, s2
+; RV32I-NEXT: neg a3, s9
+; RV32I-NEXT: srl a3, t0, a3
+; RV32I-NEXT: sll s7, a6, s2
+; RV32I-NEXT: or a3, a3, s7
+; RV32I-NEXT: mv s7, a6
+; RV32I-NEXT: beqz s9, .LBB14_79
+; RV32I-NEXT: .LBB14_78:
+; RV32I-NEXT: mv s7, a3
+; RV32I-NEXT: .LBB14_79:
+; RV32I-NEXT: bltu s2, t6, .LBB14_81
+; RV32I-NEXT: # %bb.80:
+; RV32I-NEXT: sw zero, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: j .LBB14_82
+; RV32I-NEXT: .LBB14_81:
+; RV32I-NEXT: sw s10, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s3, a5, s6
+; RV32I-NEXT: or s7, s1, s5
+; RV32I-NEXT: .LBB14_82:
+; RV32I-NEXT: addi ra, a4, -128
+; RV32I-NEXT: mv s5, t1
+; RV32I-NEXT: mv s6, t5
+; RV32I-NEXT: beqz s2, .LBB14_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: mv s5, s3
+; RV32I-NEXT: mv s6, s7
+; RV32I-NEXT: .LBB14_84:
+; RV32I-NEXT: neg s9, ra
+; RV32I-NEXT: sll s3, t5, s9
+; RV32I-NEXT: bltu ra, t3, .LBB14_86
+; RV32I-NEXT: # %bb.85:
+; RV32I-NEXT: srl a3, t5, ra
+; RV32I-NEXT: mv s1, t1
+; RV32I-NEXT: bnez ra, .LBB14_87
+; RV32I-NEXT: j .LBB14_88
+; RV32I-NEXT: .LBB14_86:
+; RV32I-NEXT: lw a3, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: mv s1, t1
+; RV32I-NEXT: beqz ra, .LBB14_88
+; RV32I-NEXT: .LBB14_87:
+; RV32I-NEXT: mv s1, a3
+; RV32I-NEXT: .LBB14_88:
+; RV32I-NEXT: bltu ra, t3, .LBB14_90
+; RV32I-NEXT: # %bb.89:
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: srl a3, a6, ra
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: bnez ra, .LBB14_91
+; RV32I-NEXT: j .LBB14_92
+; RV32I-NEXT: .LBB14_90:
+; RV32I-NEXT: srl s2, t5, a4
+; RV32I-NEXT: sll a3, a6, s9
+; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, t0
+; RV32I-NEXT: beqz ra, .LBB14_92
+; RV32I-NEXT: .LBB14_91:
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: .LBB14_92:
+; RV32I-NEXT: sub s10, t6, ra
+; RV32I-NEXT: bltu ra, t3, .LBB14_95
+; RV32I-NEXT: # %bb.93:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bgeu s10, t3, .LBB14_96
+; RV32I-NEXT: .LBB14_94:
+; RV32I-NEXT: sll s9, t1, s9
+; RV32I-NEXT: neg a3, s10
+; RV32I-NEXT: srl a3, t1, a3
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB14_97
+; RV32I-NEXT: .LBB14_95:
+; RV32I-NEXT: srl s7, a6, a4
+; RV32I-NEXT: bltu s10, t3, .LBB14_94
+; RV32I-NEXT: .LBB14_96:
+; RV32I-NEXT: li s9, 0
+; RV32I-NEXT: sll a3, t1, s10
+; RV32I-NEXT: .LBB14_97:
+; RV32I-NEXT: addi s11, ra, -64
+; RV32I-NEXT: mv s3, t5
+; RV32I-NEXT: beqz s10, .LBB14_99
+; RV32I-NEXT: # %bb.98:
+; RV32I-NEXT: mv s3, a3
+; RV32I-NEXT: .LBB14_99:
+; RV32I-NEXT: bltu s11, t3, .LBB14_101
+; RV32I-NEXT: # %bb.100:
+; RV32I-NEXT: srl a3, t5, s11
+; RV32I-NEXT: bnez s11, .LBB14_102
+; RV32I-NEXT: j .LBB14_103
+; RV32I-NEXT: .LBB14_101:
+; RV32I-NEXT: srl a3, t1, ra
+; RV32I-NEXT: neg s10, s11
+; RV32I-NEXT: sll s10, t5, s10
+; RV32I-NEXT: or a3, a3, s10
+; RV32I-NEXT: beqz s11, .LBB14_103
+; RV32I-NEXT: .LBB14_102:
+; RV32I-NEXT: mv t1, a3
+; RV32I-NEXT: .LBB14_103:
+; RV32I-NEXT: bltu s11, t3, .LBB14_105
+; RV32I-NEXT: # %bb.104:
+; RV32I-NEXT: li t3, 0
+; RV32I-NEXT: bltu ra, t6, .LBB14_106
+; RV32I-NEXT: j .LBB14_107
+; RV32I-NEXT: .LBB14_105:
+; RV32I-NEXT: srl t3, t5, ra
+; RV32I-NEXT: bgeu ra, t6, .LBB14_107
+; RV32I-NEXT: .LBB14_106:
+; RV32I-NEXT: or t1, a5, s9
+; RV32I-NEXT: or t3, s7, s3
+; RV32I-NEXT: .LBB14_107:
+; RV32I-NEXT: li a5, 128
+; RV32I-NEXT: bnez ra, .LBB14_114
+; RV32I-NEXT: # %bb.108:
+; RV32I-NEXT: bgeu ra, t6, .LBB14_115
+; RV32I-NEXT: .LBB14_109:
+; RV32I-NEXT: bltu a4, a5, .LBB14_116
+; RV32I-NEXT: .LBB14_110:
+; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bnez a4, .LBB14_117
+; RV32I-NEXT: .LBB14_111:
+; RV32I-NEXT: bltu a4, a5, .LBB14_113
+; RV32I-NEXT: .LBB14_112:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: .LBB14_113:
+; RV32I-NEXT: srli a4, s0, 16
+; RV32I-NEXT: lui t1, 16
+; RV32I-NEXT: srli t0, s0, 24
+; RV32I-NEXT: srli a5, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: srli a6, t4, 16
+; RV32I-NEXT: srli s2, t4, 24
+; RV32I-NEXT: srli t3, t2, 16
+; RV32I-NEXT: srli s3, t2, 24
+; RV32I-NEXT: srli s1, s4, 16
+; RV32I-NEXT: srli a3, s4, 24
+; RV32I-NEXT: srli t6, a7, 16
+; RV32I-NEXT: srli s6, a7, 24
+; RV32I-NEXT: srli s5, a1, 16
+; RV32I-NEXT: srli s7, a1, 24
+; RV32I-NEXT: srli s8, ra, 16
+; RV32I-NEXT: srli s9, ra, 24
+; RV32I-NEXT: addi t1, t1, -1
+; RV32I-NEXT: and s10, s0, t1
+; RV32I-NEXT: and s11, a0, t1
+; RV32I-NEXT: srli s10, s10, 8
+; RV32I-NEXT: sb s0, 0(a2)
+; RV32I-NEXT: sb s10, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: and a4, t4, t1
+; RV32I-NEXT: srli t0, s11, 8
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb t0, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t5, 7(a2)
+; RV32I-NEXT: and a0, t2, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, s4, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t2, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, a7, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s4, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a3, a1, t1
+; RV32I-NEXT: and a4, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a7, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb a1, 24(a2)
+; RV32I-NEXT: sb a3, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s7, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb s8, 30(a2)
+; RV32I-NEXT: sb s9, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB14_114:
+; RV32I-NEXT: mv t0, t1
+; RV32I-NEXT: mv a6, t3
+; RV32I-NEXT: bltu ra, t6, .LBB14_109
+; RV32I-NEXT: .LBB14_115:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: bgeu a4, a5, .LBB14_110
+; RV32I-NEXT: .LBB14_116:
+; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw a6, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t0, a3, a6
+; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a6, a3, s8
+; RV32I-NEXT: lw a3, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s1, a3, s5
+; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s2, a3, s6
+; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: beqz a4, .LBB14_111
+; RV32I-NEXT: .LBB14_117:
+; RV32I-NEXT: mv s0, t0
+; RV32I-NEXT: mv a0, a6
+; RV32I-NEXT: mv t4, s1
+; RV32I-NEXT: mv t2, s2
+; RV32I-NEXT: bgeu a4, a5, .LBB14_112
+; RV32I-NEXT: j .LBB14_113
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
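+; shl counterpart of the lshr tests above: the byte offset is scaled by 8
+; (shl i256 %byteOff, 3) to form the bit offset for the i256 shift.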
+define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -112
+; RV64I-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 12(a0)
+; RV64I-NEXT: lbu s0, 13(a0)
+; RV64I-NEXT: lbu s1, 14(a0)
+; RV64I-NEXT: lbu s2, 15(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or s3, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 1(a1)
+; RV64I-NEXT: lbu t2, 2(a1)
+; RV64I-NEXT: lbu t3, 3(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: or t6, t6, t5
+; RV64I-NEXT: or s0, s0, a4
+; RV64I-NEXT: or s1, s2, s1
+; RV64I-NEXT: lbu a4, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu t5, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: or t1, t3, t2
+; RV64I-NEXT: or t2, t4, a4
+; RV64I-NEXT: or a1, a1, t5
+; RV64I-NEXT: lbu t5, 19(a0)
+; RV64I-NEXT: lbu t4, 21(a0)
+; RV64I-NEXT: lbu a4, 22(a0)
+; RV64I-NEXT: lbu t3, 23(a0)
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t6, t6, 16
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: or s4, s3, a3
+; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a6, t6, a6
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: lbu t6, 29(a0)
+; RV64I-NEXT: lbu a3, 30(a0)
+; RV64I-NEXT: lbu s2, 31(a0)
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or s5, t1, t0
+; RV64I-NEXT: li a7, 128
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t2
+; RV64I-NEXT: li t0, 64
+; RV64I-NEXT: slli s3, t3, 8
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or t1, a5, s4
+; RV64I-NEXT: or a5, s0, a6
+; RV64I-NEXT: or a6, a1, s5
+; RV64I-NEXT: slli a6, a6, 3
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: srl s0, t1, t3
+; RV64I-NEXT: bltu a6, t0, .LBB15_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: sll s4, t1, t2
+; RV64I-NEXT: j .LBB15_3
+; RV64I-NEXT: .LBB15_2:
+; RV64I-NEXT: sll a1, t1, a6
+; RV64I-NEXT: sll s4, a5, a6
+; RV64I-NEXT: or s4, s0, s4
+; RV64I-NEXT: .LBB15_3:
+; RV64I-NEXT: slli t5, t5, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or s3, s3, a4
+; RV64I-NEXT: lbu ra, 17(a0)
+; RV64I-NEXT: lbu s11, 18(a0)
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s5, 25(a0)
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: lbu s7, 26(a0)
+; RV64I-NEXT: lbu s6, 28(a0)
+; RV64I-NEXT: slli s10, t6, 8
+; RV64I-NEXT: or s9, s2, a3
+; RV64I-NEXT: sub a4, a7, a6
+; RV64I-NEXT: mv a3, a5
+; RV64I-NEXT: beqz a6, .LBB15_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a3, s4
+; RV64I-NEXT: .LBB15_5:
+; RV64I-NEXT: slli t6, ra, 8
+; RV64I-NEXT: or t5, t5, s11
+; RV64I-NEXT: or t4, t4, s8
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: lbu s8, 16(a0)
+; RV64I-NEXT: lbu a0, 24(a0)
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: or s2, s1, s7
+; RV64I-NEXT: or s1, s10, s6
+; RV64I-NEXT: slli s4, s9, 16
+; RV64I-NEXT: bltu a4, t0, .LBB15_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: sub s0, a4, t0
+; RV64I-NEXT: srl s0, a5, s0
+; RV64I-NEXT: j .LBB15_8
+; RV64I-NEXT: .LBB15_7:
+; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: sll s6, a5, s6
+; RV64I-NEXT: or s0, s0, s6
+; RV64I-NEXT: .LBB15_8:
+; RV64I-NEXT: or t6, t6, s8
+; RV64I-NEXT: slli s6, t5, 16
+; RV64I-NEXT: or s3, s3, t4
+; RV64I-NEXT: or t5, s5, a0
+; RV64I-NEXT: slli s2, s2, 16
+; RV64I-NEXT: or s1, s4, s1
+; RV64I-NEXT: mv t4, t1
+; RV64I-NEXT: beqz a4, .LBB15_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t4, s0
+; RV64I-NEXT: .LBB15_10:
+; RV64I-NEXT: or a0, s6, t6
+; RV64I-NEXT: slli s0, s3, 32
+; RV64I-NEXT: or t6, s2, t5
+; RV64I-NEXT: slli s1, s1, 32
+; RV64I-NEXT: bltu a4, t0, .LBB15_12
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: j .LBB15_13
+; RV64I-NEXT: .LBB15_12:
+; RV64I-NEXT: srl t5, a5, t3
+; RV64I-NEXT: .LBB15_13:
+; RV64I-NEXT: or a4, s0, a0
+; RV64I-NEXT: or a0, s1, t6
+; RV64I-NEXT: bltu a6, t0, .LBB15_15
+; RV64I-NEXT: # %bb.14:
+; RV64I-NEXT: li t6, 0
+; RV64I-NEXT: sll t2, a4, t2
+; RV64I-NEXT: j .LBB15_16
+; RV64I-NEXT: .LBB15_15:
+; RV64I-NEXT: sll t6, a4, a6
+; RV64I-NEXT: srl t2, a4, t3
+; RV64I-NEXT: sll t3, a0, a6
+; RV64I-NEXT: or t2, t2, t3
+; RV64I-NEXT: .LBB15_16:
+; RV64I-NEXT: sub s0, a6, a7
+; RV64I-NEXT: mv t3, a0
+; RV64I-NEXT: beqz a6, .LBB15_18
+; RV64I-NEXT: # %bb.17:
+; RV64I-NEXT: mv t3, t2
+; RV64I-NEXT: .LBB15_18:
+; RV64I-NEXT: bltu s0, t0, .LBB15_20
+; RV64I-NEXT: # %bb.19:
+; RV64I-NEXT: li t2, 0
+; RV64I-NEXT: sub t0, s0, t0
+; RV64I-NEXT: sll t0, t1, t0
+; RV64I-NEXT: bnez s0, .LBB15_21
+; RV64I-NEXT: j .LBB15_22
+; RV64I-NEXT: .LBB15_20:
+; RV64I-NEXT: sll t2, t1, s0
+; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: srl t0, t1, t0
+; RV64I-NEXT: sll t1, a5, s0
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: beqz s0, .LBB15_22
+; RV64I-NEXT: .LBB15_21:
+; RV64I-NEXT: mv a5, t0
+; RV64I-NEXT: .LBB15_22:
+; RV64I-NEXT: bltu a6, a7, .LBB15_24
+; RV64I-NEXT: # %bb.23:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: li a3, 0
+; RV64I-NEXT: bnez a6, .LBB15_25
+; RV64I-NEXT: j .LBB15_26
+; RV64I-NEXT: .LBB15_24:
+; RV64I-NEXT: or t2, t4, t6
+; RV64I-NEXT: or a5, t5, t3
+; RV64I-NEXT: beqz a6, .LBB15_26
+; RV64I-NEXT: .LBB15_25:
+; RV64I-NEXT: mv a4, t2
+; RV64I-NEXT: mv a0, a5
+; RV64I-NEXT: .LBB15_26:
+; RV64I-NEXT: srli a5, a1, 32
+; RV64I-NEXT: srliw a6, a1, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a1, 24
+; RV64I-NEXT: srli t0, a1, 48
+; RV64I-NEXT: srli t5, a1, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a4, 32
+; RV64I-NEXT: srliw s2, a4, 16
+; RV64I-NEXT: srliw s6, a4, 24
+; RV64I-NEXT: srli s4, a4, 48
+; RV64I-NEXT: srli s7, a4, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a1, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a1, a5, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a1, a3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a1, a7, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a1, a4, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 112
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_32bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu t0, 4(a0)
+; RV32I-NEXT: lbu t1, 5(a0)
+; RV32I-NEXT: lbu t2, 6(a0)
+; RV32I-NEXT: lbu t3, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a3
+; RV32I-NEXT: lbu a6, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or a5, a7, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: lbu t3, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: li s9, 64
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t3
+; RV32I-NEXT: li t4, 32
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or t3, a5, a4
+; RV32I-NEXT: or a5, t2, a7
+; RV32I-NEXT: or a4, a1, t0
+; RV32I-NEXT: slli a4, a4, 3
+; RV32I-NEXT: neg s10, a4
+; RV32I-NEXT: srl t5, t3, s10
+; RV32I-NEXT: sll s5, a5, a4
+; RV32I-NEXT: bltu a4, t4, .LBB15_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: sll a7, t3, a4
+; RV32I-NEXT: j .LBB15_3
+; RV32I-NEXT: .LBB15_2:
+; RV32I-NEXT: sll s8, t3, a4
+; RV32I-NEXT: or a7, t5, s5
+; RV32I-NEXT: .LBB15_3:
+; RV32I-NEXT: lbu t2, 9(a0)
+; RV32I-NEXT: lbu a1, 10(a0)
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t6, a3, 8
+; RV32I-NEXT: sub s6, s9, a4
+; RV32I-NEXT: mv a3, a5
+; RV32I-NEXT: beqz a4, .LBB15_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, a7
+; RV32I-NEXT: .LBB15_5:
+; RV32I-NEXT: slli a7, t2, 8
+; RV32I-NEXT: or a6, a6, a1
+; RV32I-NEXT: lbu t2, 8(a0)
+; RV32I-NEXT: lbu a1, 12(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t6, t0
+; RV32I-NEXT: neg t6, s6
+; RV32I-NEXT: sw t6, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s6, t4, .LBB15_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl t6, a5, s6
+; RV32I-NEXT: j .LBB15_8
+; RV32I-NEXT: .LBB15_7:
+; RV32I-NEXT: sll t6, a5, t6
+; RV32I-NEXT: or t6, t5, t6
+; RV32I-NEXT: .LBB15_8:
+; RV32I-NEXT: or a7, a7, t2
+; RV32I-NEXT: slli t2, a6, 16
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: mv a6, t3
+; RV32I-NEXT: beqz s6, .LBB15_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv a6, t6
+; RV32I-NEXT: .LBB15_10:
+; RV32I-NEXT: or t1, t2, a7
+; RV32I-NEXT: or t2, t0, a1
+; RV32I-NEXT: bltu s6, t4, .LBB15_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: j .LBB15_13
+; RV32I-NEXT: .LBB15_12:
+; RV32I-NEXT: srl a7, a5, s10
+; RV32I-NEXT: .LBB15_13:
+; RV32I-NEXT: srl s0, t1, s10
+; RV32I-NEXT: sll a1, t2, a4
+; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t4, .LBB15_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sll a1, t1, a4
+; RV32I-NEXT: j .LBB15_16
+; RV32I-NEXT: .LBB15_15:
+; RV32I-NEXT: sll s1, t1, a4
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: .LBB15_16:
+; RV32I-NEXT: addi s7, a4, -64
+; RV32I-NEXT: mv s3, t2
+; RV32I-NEXT: beqz a4, .LBB15_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: .LBB15_18:
+; RV32I-NEXT: neg a1, s7
+; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s7, t4, .LBB15_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: sll a1, t3, s7
+; RV32I-NEXT: mv s4, a5
+; RV32I-NEXT: bnez s7, .LBB15_21
+; RV32I-NEXT: j .LBB15_22
+; RV32I-NEXT: .LBB15_20:
+; RV32I-NEXT: sll s2, t3, a4
+; RV32I-NEXT: srl a1, t3, a1
+; RV32I-NEXT: or a1, a1, s5
+; RV32I-NEXT: mv s4, a5
+; RV32I-NEXT: beqz s7, .LBB15_22
+; RV32I-NEXT: .LBB15_21:
+; RV32I-NEXT: mv s4, a1
+; RV32I-NEXT: .LBB15_22:
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: bltu a4, s9, .LBB15_24
+; RV32I-NEXT: # %bb.23:
+; RV32I-NEXT: sw zero, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: j .LBB15_25
+; RV32I-NEXT: .LBB15_24:
+; RV32I-NEXT: sw s8, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s2, a6, s1
+; RV32I-NEXT: or s4, a7, s3
+; RV32I-NEXT: .LBB15_25:
+; RV32I-NEXT: sub ra, a1, a4
+; RV32I-NEXT: mv a7, t1
+; RV32I-NEXT: mv a6, t2
+; RV32I-NEXT: beqz a4, .LBB15_27
+; RV32I-NEXT: # %bb.26:
+; RV32I-NEXT: mv a7, s2
+; RV32I-NEXT: mv a6, s4
+; RV32I-NEXT: .LBB15_27:
+; RV32I-NEXT: neg s1, ra
+; RV32I-NEXT: sll s2, t2, s1
+; RV32I-NEXT: bltu ra, t4, .LBB15_29
+; RV32I-NEXT: # %bb.28:
+; RV32I-NEXT: srl a1, t2, ra
+; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bnez ra, .LBB15_30
+; RV32I-NEXT: j .LBB15_31
+; RV32I-NEXT: .LBB15_29:
+; RV32I-NEXT: or a1, s0, s2
+; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: beqz ra, .LBB15_31
+; RV32I-NEXT: .LBB15_30:
+; RV32I-NEXT: sw a1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB15_31:
+; RV32I-NEXT: bltu ra, t4, .LBB15_33
+; RV32I-NEXT: # %bb.32:
+; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: srl a1, a5, ra
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: bnez ra, .LBB15_34
+; RV32I-NEXT: j .LBB15_35
+; RV32I-NEXT: .LBB15_33:
+; RV32I-NEXT: srl a1, t2, s10
+; RV32I-NEXT: sw a1, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sll a1, a5, s1
+; RV32I-NEXT: or a1, t5, a1
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: beqz ra, .LBB15_35
+; RV32I-NEXT: .LBB15_34:
+; RV32I-NEXT: mv t5, a1
+; RV32I-NEXT: .LBB15_35:
+; RV32I-NEXT: sub s3, s9, ra
+; RV32I-NEXT: bltu ra, t4, .LBB15_38
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s3, t4, .LBB15_39
+; RV32I-NEXT: .LBB15_37:
+; RV32I-NEXT: sll s1, t1, s1
+; RV32I-NEXT: neg a1, s3
+; RV32I-NEXT: srl a1, t1, a1
+; RV32I-NEXT: or a1, a1, s2
+; RV32I-NEXT: j .LBB15_40
+; RV32I-NEXT: .LBB15_38:
+; RV32I-NEXT: srl a1, a5, s10
+; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s3, t4, .LBB15_37
+; RV32I-NEXT: .LBB15_39:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sll a1, t1, s3
+; RV32I-NEXT: .LBB15_40:
+; RV32I-NEXT: addi s4, ra, -64
+; RV32I-NEXT: mv s2, t2
+; RV32I-NEXT: beqz s3, .LBB15_42
+; RV32I-NEXT: # %bb.41:
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: .LBB15_42:
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv s5, a7
+; RV32I-NEXT: bltu s4, t4, .LBB15_44
+; RV32I-NEXT: # %bb.43:
+; RV32I-NEXT: srl t0, t2, s4
+; RV32I-NEXT: j .LBB15_45
+; RV32I-NEXT: .LBB15_44:
+; RV32I-NEXT: srl a1, t1, ra
+; RV32I-NEXT: neg t0, s4
+; RV32I-NEXT: sll t0, t2, t0
+; RV32I-NEXT: or t0, a1, t0
+; RV32I-NEXT: .LBB15_45:
+; RV32I-NEXT: mv s0, s10
+; RV32I-NEXT: mv a7, a6
+; RV32I-NEXT: lbu s8, 19(a0)
+; RV32I-NEXT: lbu a1, 23(a0)
+; RV32I-NEXT: mv s3, t1
+; RV32I-NEXT: beqz s4, .LBB15_47
+; RV32I-NEXT: # %bb.46:
+; RV32I-NEXT: mv s3, t0
+; RV32I-NEXT: .LBB15_47:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: lbu s10, 17(a0)
+; RV32I-NEXT: lbu t0, 18(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu t6, 22(a0)
+; RV32I-NEXT: slli s8, s8, 8
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: li a3, 64
+; RV32I-NEXT: bltu s4, t4, .LBB15_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: j .LBB15_50
+; RV32I-NEXT: .LBB15_49:
+; RV32I-NEXT: srl s4, t2, ra
+; RV32I-NEXT: .LBB15_50:
+; RV32I-NEXT: or s11, s8, t0
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: slli s10, s10, 8
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t6, a1, t6
+; RV32I-NEXT: bgeu ra, a3, .LBB15_52
+; RV32I-NEXT: # %bb.51:
+; RV32I-NEXT: or s3, t5, s1
+; RV32I-NEXT: lw a1, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s4, a1, s2
+; RV32I-NEXT: .LBB15_52:
+; RV32I-NEXT: or a1, s10, t0
+; RV32I-NEXT: slli s11, s11, 16
+; RV32I-NEXT: or t0, s9, s8
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: mv s1, a5
+; RV32I-NEXT: mv a3, a6
+; RV32I-NEXT: beqz ra, .LBB15_54
+; RV32I-NEXT: # %bb.53:
+; RV32I-NEXT: mv t5, s3
+; RV32I-NEXT: mv s1, s4
+; RV32I-NEXT: .LBB15_54:
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s2, s11, a1
+; RV32I-NEXT: or s1, t6, t0
+; RV32I-NEXT: li a1, 64
+; RV32I-NEXT: mv a6, a7
+; RV32I-NEXT: mv a7, s0
+; RV32I-NEXT: bltu ra, a1, .LBB15_56
+; RV32I-NEXT: # %bb.55:
+; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB15_56:
+; RV32I-NEXT: srl s3, s2, a7
+; RV32I-NEXT: sll ra, s1, a4
+; RV32I-NEXT: mv a7, s5
+; RV32I-NEXT: sw t5, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t4, .LBB15_58
+; RV32I-NEXT: # %bb.57:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sll a1, s2, a4
+; RV32I-NEXT: j .LBB15_59
+; RV32I-NEXT: .LBB15_58:
+; RV32I-NEXT: sll a1, s2, a4
+; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or a1, s3, ra
+; RV32I-NEXT: .LBB15_59:
+; RV32I-NEXT: lbu s9, 27(a0)
+; RV32I-NEXT: lbu t6, 31(a0)
+; RV32I-NEXT: mv t5, s1
+; RV32I-NEXT: beqz a4, .LBB15_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: mv t5, a1
+; RV32I-NEXT: .LBB15_61:
+; RV32I-NEXT: lbu s8, 25(a0)
+; RV32I-NEXT: lbu s4, 26(a0)
+; RV32I-NEXT: lbu s11, 29(a0)
+; RV32I-NEXT: lbu s10, 30(a0)
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: bltu s6, t4, .LBB15_63
+; RV32I-NEXT: # %bb.62:
+; RV32I-NEXT: srl t0, s1, s6
+; RV32I-NEXT: j .LBB15_64
+; RV32I-NEXT: .LBB15_63:
+; RV32I-NEXT: lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: sll a1, s1, a1
+; RV32I-NEXT: or t0, s3, a1
+; RV32I-NEXT: .LBB15_64:
+; RV32I-NEXT: slli s8, s8, 8
+; RV32I-NEXT: lbu s3, 24(a0)
+; RV32I-NEXT: lbu a1, 28(a0)
+; RV32I-NEXT: or s4, s9, s4
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or t6, t6, s10
+; RV32I-NEXT: mv s9, s2
+; RV32I-NEXT: beqz s6, .LBB15_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: mv s9, t0
+; RV32I-NEXT: .LBB15_66:
+; RV32I-NEXT: or a0, s8, s3
+; RV32I-NEXT: slli t0, s4, 16
+; RV32I-NEXT: or a1, s11, a1
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: bltu s6, t4, .LBB15_68
+; RV32I-NEXT: # %bb.67:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: j .LBB15_69
+; RV32I-NEXT: .LBB15_68:
+; RV32I-NEXT: srl s4, s1, s0
+; RV32I-NEXT: .LBB15_69:
+; RV32I-NEXT: li s11, 64
+; RV32I-NEXT: or s6, t0, a0
+; RV32I-NEXT: or a0, t6, a1
+; RV32I-NEXT: bltu a4, t4, .LBB15_71
+; RV32I-NEXT: # %bb.70:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a1, s6, a4
+; RV32I-NEXT: mv s10, a0
+; RV32I-NEXT: bnez a4, .LBB15_72
+; RV32I-NEXT: j .LBB15_73
+; RV32I-NEXT: .LBB15_71:
+; RV32I-NEXT: sll s3, s6, a4
+; RV32I-NEXT: srl a1, s6, s0
+; RV32I-NEXT: sll t0, a0, a4
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: mv s10, a0
+; RV32I-NEXT: beqz a4, .LBB15_73
+; RV32I-NEXT: .LBB15_72:
+; RV32I-NEXT: mv s10, a1
+; RV32I-NEXT: .LBB15_73:
+; RV32I-NEXT: bltu s7, t4, .LBB15_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: sll a1, s2, s7
+; RV32I-NEXT: mv s0, s1
+; RV32I-NEXT: bnez s7, .LBB15_76
+; RV32I-NEXT: j .LBB15_77
+; RV32I-NEXT: .LBB15_75:
+; RV32I-NEXT: sll s5, s2, a4
+; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: srl a1, s2, a1
+; RV32I-NEXT: or a1, a1, ra
+; RV32I-NEXT: mv s0, s1
+; RV32I-NEXT: beqz s7, .LBB15_77
+; RV32I-NEXT: .LBB15_76:
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: .LBB15_77:
+; RV32I-NEXT: bltu a4, s11, .LBB15_79
+; RV32I-NEXT: # %bb.78:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: j .LBB15_80
+; RV32I-NEXT: .LBB15_79:
+; RV32I-NEXT: or s5, s9, s3
+; RV32I-NEXT: or s0, s4, s10
+; RV32I-NEXT: .LBB15_80:
+; RV32I-NEXT: addi s9, a4, -128
+; RV32I-NEXT: mv s7, s6
+; RV32I-NEXT: mv s8, a0
+; RV32I-NEXT: beqz a4, .LBB15_82
+; RV32I-NEXT: # %bb.81:
+; RV32I-NEXT: mv s7, s5
+; RV32I-NEXT: mv s8, s0
+; RV32I-NEXT: .LBB15_82:
+; RV32I-NEXT: neg s3, s9
+; RV32I-NEXT: srl s0, t3, s3
+; RV32I-NEXT: bltu s9, t4, .LBB15_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: sll a1, t3, s9
+; RV32I-NEXT: j .LBB15_85
+; RV32I-NEXT: .LBB15_84:
+; RV32I-NEXT: sll s5, t3, a4
+; RV32I-NEXT: lw a1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: .LBB15_85:
+; RV32I-NEXT: sub s4, s11, s9
+; RV32I-NEXT: mv t6, a5
+; RV32I-NEXT: beqz s9, .LBB15_87
+; RV32I-NEXT: # %bb.86:
+; RV32I-NEXT: mv t6, a1
+; RV32I-NEXT: .LBB15_87:
+; RV32I-NEXT: bltu s4, t4, .LBB15_89
+; RV32I-NEXT: # %bb.88:
+; RV32I-NEXT: srl a1, a5, s4
+; RV32I-NEXT: mv s0, t3
+; RV32I-NEXT: bnez s4, .LBB15_90
+; RV32I-NEXT: j .LBB15_91
+; RV32I-NEXT: .LBB15_89:
+; RV32I-NEXT: neg a1, s4
+; RV32I-NEXT: sll a1, a5, a1
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: mv s0, t3
+; RV32I-NEXT: beqz s4, .LBB15_91
+; RV32I-NEXT: .LBB15_90:
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: .LBB15_91:
+; RV32I-NEXT: bltu s4, t4, .LBB15_94
+; RV32I-NEXT: # %bb.92:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: li ra, 64
+; RV32I-NEXT: bgeu s9, t4, .LBB15_95
+; RV32I-NEXT: .LBB15_93:
+; RV32I-NEXT: sll s10, t1, a4
+; RV32I-NEXT: srl a1, t1, s3
+; RV32I-NEXT: lw t0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: j .LBB15_96
+; RV32I-NEXT: .LBB15_94:
+; RV32I-NEXT: srl s4, a5, s3
+; RV32I-NEXT: li ra, 64
+; RV32I-NEXT: bltu s9, t4, .LBB15_93
+; RV32I-NEXT: .LBB15_95:
+; RV32I-NEXT: li s10, 0
+; RV32I-NEXT: sll a1, t1, s9
+; RV32I-NEXT: .LBB15_96:
+; RV32I-NEXT: addi s11, s9, -64
+; RV32I-NEXT: mv s3, t2
+; RV32I-NEXT: beqz s9, .LBB15_98
+; RV32I-NEXT: # %bb.97:
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: .LBB15_98:
+; RV32I-NEXT: bltu s11, t4, .LBB15_100
+; RV32I-NEXT: # %bb.99:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: sll a1, t3, s11
+; RV32I-NEXT: bnez s11, .LBB15_101
+; RV32I-NEXT: j .LBB15_102
+; RV32I-NEXT: .LBB15_100:
+; RV32I-NEXT: sll t4, t3, s9
+; RV32I-NEXT: neg a1, s11
+; RV32I-NEXT: srl a1, t3, a1
+; RV32I-NEXT: sll t0, a5, s9
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: beqz s11, .LBB15_102
+; RV32I-NEXT: .LBB15_101:
+; RV32I-NEXT: mv a5, a1
+; RV32I-NEXT: .LBB15_102:
+; RV32I-NEXT: bltu s9, ra, .LBB15_104
+; RV32I-NEXT: # %bb.103:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: bnez s9, .LBB15_105
+; RV32I-NEXT: j .LBB15_106
+; RV32I-NEXT: .LBB15_104:
+; RV32I-NEXT: or t4, s0, s10
+; RV32I-NEXT: or a5, s4, s3
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: beqz s9, .LBB15_106
+; RV32I-NEXT: .LBB15_105:
+; RV32I-NEXT: mv t1, t4
+; RV32I-NEXT: mv t2, a5
+; RV32I-NEXT: .LBB15_106:
+; RV32I-NEXT: bltu a4, a1, .LBB15_108
+; RV32I-NEXT: # %bb.107:
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: bnez a4, .LBB15_109
+; RV32I-NEXT: j .LBB15_110
+; RV32I-NEXT: .LBB15_108:
+; RV32I-NEXT: lw a1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s5, a1, a5
+; RV32I-NEXT: lw a1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t6, a1, t5
+; RV32I-NEXT: lw a1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t1, a1, s7
+; RV32I-NEXT: lw a1, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t2, a1, s8
+; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: beqz a4, .LBB15_110
+; RV32I-NEXT: .LBB15_109:
+; RV32I-NEXT: mv s2, s5
+; RV32I-NEXT: mv s1, t6
+; RV32I-NEXT: mv s6, t1
+; RV32I-NEXT: mv a0, t2
+; RV32I-NEXT: .LBB15_110:
+; RV32I-NEXT: srli a4, ra, 16
+; RV32I-NEXT: lui t2, 16
+; RV32I-NEXT: srli t1, ra, 24
+; RV32I-NEXT: srli a5, a3, 16
+; RV32I-NEXT: srli t4, a3, 24
+; RV32I-NEXT: srli t0, a7, 16
+; RV32I-NEXT: srli s0, a7, 24
+; RV32I-NEXT: srli t3, a6, 16
+; RV32I-NEXT: srli s3, a6, 24
+; RV32I-NEXT: srli t6, s2, 16
+; RV32I-NEXT: srli a1, s2, 24
+; RV32I-NEXT: srli t5, s1, 16
+; RV32I-NEXT: srli s5, s1, 24
+; RV32I-NEXT: srli s4, s6, 16
+; RV32I-NEXT: srli s7, s6, 24
+; RV32I-NEXT: srli s8, a0, 16
+; RV32I-NEXT: srli s9, a0, 24
+; RV32I-NEXT: addi t2, t2, -1
+; RV32I-NEXT: and s10, ra, t2
+; RV32I-NEXT: and s11, a3, t2
+; RV32I-NEXT: srli s10, s10, 8
+; RV32I-NEXT: sb ra, 0(a2)
+; RV32I-NEXT: sb s10, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb t1, 3(a2)
+; RV32I-NEXT: and a4, a7, t2
+; RV32I-NEXT: srli t1, s11, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb t1, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t4, 7(a2)
+; RV32I-NEXT: and a3, a6, t2
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a7, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb t0, 10(a2)
+; RV32I-NEXT: sb s0, 11(a2)
+; RV32I-NEXT: and a4, s2, t2
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a6, 12(a2)
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a3, s1, t2
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s2, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb t6, 18(a2)
+; RV32I-NEXT: sb a1, 19(a2)
+; RV32I-NEXT: and a1, s6, t2
+; RV32I-NEXT: and a4, a0, t2
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s1, 20(a2)
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: sb t5, 22(a2)
+; RV32I-NEXT: sb s5, 23(a2)
+; RV32I-NEXT: sb s6, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s4, 26(a2)
+; RV32I-NEXT: sb s7, 27(a2)
+; RV32I-NEXT: sb a0, 28(a2)
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb s8, 30(a2)
+; RV32I-NEXT: sb s9, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
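+; Same operation as shl_32bytes, but the offset operand is loaded in units of
+; 32-bit words: bitOff = wordOff * 32 (shl by 5 rather than 3).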
+define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -112
+; RV64I-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 12(a0)
+; RV64I-NEXT: lbu s0, 13(a0)
+; RV64I-NEXT: lbu s1, 14(a0)
+; RV64I-NEXT: lbu s2, 15(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or s3, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 1(a1)
+; RV64I-NEXT: lbu t2, 2(a1)
+; RV64I-NEXT: lbu t3, 3(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: or t6, t6, t5
+; RV64I-NEXT: or s0, s0, a4
+; RV64I-NEXT: or s1, s2, s1
+; RV64I-NEXT: lbu a4, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu t5, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: or t1, t3, t2
+; RV64I-NEXT: or t2, t4, a4
+; RV64I-NEXT: or a1, a1, t5
+; RV64I-NEXT: lbu t5, 19(a0)
+; RV64I-NEXT: lbu t4, 21(a0)
+; RV64I-NEXT: lbu a4, 22(a0)
+; RV64I-NEXT: lbu t3, 23(a0)
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t6, t6, 16
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: or s4, s3, a3
+; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a6, t6, a6
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: lbu t6, 29(a0)
+; RV64I-NEXT: lbu a3, 30(a0)
+; RV64I-NEXT: lbu s2, 31(a0)
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or s5, t1, t0
+; RV64I-NEXT: li a7, 128
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t2
+; RV64I-NEXT: li t0, 64
+; RV64I-NEXT: slli s3, t3, 8
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or t1, a5, s4
+; RV64I-NEXT: or a5, s0, a6
+; RV64I-NEXT: or a6, a1, s5
+; RV64I-NEXT: slli a6, a6, 5
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: srl s0, t1, t3
+; RV64I-NEXT: bltu a6, t0, .LBB16_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: sll s4, t1, t2
+; RV64I-NEXT: j .LBB16_3
+; RV64I-NEXT: .LBB16_2:
+; RV64I-NEXT: sll a1, t1, a6
+; RV64I-NEXT: sll s4, a5, a6
+; RV64I-NEXT: or s4, s0, s4
+; RV64I-NEXT: .LBB16_3:
+; RV64I-NEXT: slli t5, t5, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or s3, s3, a4
+; RV64I-NEXT: lbu ra, 17(a0)
+; RV64I-NEXT: lbu s11, 18(a0)
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s5, 25(a0)
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: lbu s7, 26(a0)
+; RV64I-NEXT: lbu s6, 28(a0)
+; RV64I-NEXT: slli s10, t6, 8
+; RV64I-NEXT: or s9, s2, a3
+; RV64I-NEXT: sub a4, a7, a6
+; RV64I-NEXT: mv a3, a5
+; RV64I-NEXT: beqz a6, .LBB16_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a3, s4
+; RV64I-NEXT: .LBB16_5:
+; RV64I-NEXT: slli t6, ra, 8
+; RV64I-NEXT: or t5, t5, s11
+; RV64I-NEXT: or t4, t4, s8
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: lbu s8, 16(a0)
+; RV64I-NEXT: lbu a0, 24(a0)
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: or s2, s1, s7
+; RV64I-NEXT: or s1, s10, s6
+; RV64I-NEXT: slli s4, s9, 16
+; RV64I-NEXT: bltu a4, t0, .LBB16_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: sub s0, a4, t0
+; RV64I-NEXT: srl s0, a5, s0
+; RV64I-NEXT: j .LBB16_8
+; RV64I-NEXT: .LBB16_7:
+; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: sll s6, a5, s6
+; RV64I-NEXT: or s0, s0, s6
+; RV64I-NEXT: .LBB16_8:
+; RV64I-NEXT: or t6, t6, s8
+; RV64I-NEXT: slli s6, t5, 16
+; RV64I-NEXT: or s3, s3, t4
+; RV64I-NEXT: or t5, s5, a0
+; RV64I-NEXT: slli s2, s2, 16
+; RV64I-NEXT: or s1, s4, s1
+; RV64I-NEXT: mv t4, t1
+; RV64I-NEXT: beqz a4, .LBB16_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t4, s0
+; RV64I-NEXT: .LBB16_10:
+; RV64I-NEXT: or a0, s6, t6
+; RV64I-NEXT: slli s0, s3, 32
+; RV64I-NEXT: or t6, s2, t5
+; RV64I-NEXT: slli s1, s1, 32
+; RV64I-NEXT: bltu a4, t0, .LBB16_12
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: j .LBB16_13
+; RV64I-NEXT: .LBB16_12:
+; RV64I-NEXT: srl t5, a5, t3
+; RV64I-NEXT: .LBB16_13:
+; RV64I-NEXT: or a4, s0, a0
+; RV64I-NEXT: or a0, s1, t6
+; RV64I-NEXT: bltu a6, t0, .LBB16_15
+; RV64I-NEXT: # %bb.14:
+; RV64I-NEXT: li t6, 0
+; RV64I-NEXT: sll t2, a4, t2
+; RV64I-NEXT: j .LBB16_16
+; RV64I-NEXT: .LBB16_15:
+; RV64I-NEXT: sll t6, a4, a6
+; RV64I-NEXT: srl t2, a4, t3
+; RV64I-NEXT: sll t3, a0, a6
+; RV64I-NEXT: or t2, t2, t3
+; RV64I-NEXT: .LBB16_16:
+; RV64I-NEXT: sub s0, a6, a7
+; RV64I-NEXT: mv t3, a0
+; RV64I-NEXT: beqz a6, .LBB16_18
+; RV64I-NEXT: # %bb.17:
+; RV64I-NEXT: mv t3, t2
+; RV64I-NEXT: .LBB16_18:
+; RV64I-NEXT: bltu s0, t0, .LBB16_20
+; RV64I-NEXT: # %bb.19:
+; RV64I-NEXT: li t2, 0
+; RV64I-NEXT: sub t0, s0, t0
+; RV64I-NEXT: sll t0, t1, t0
+; RV64I-NEXT: bnez s0, .LBB16_21
+; RV64I-NEXT: j .LBB16_22
+; RV64I-NEXT: .LBB16_20:
+; RV64I-NEXT: sll t2, t1, s0
+; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: srl t0, t1, t0
+; RV64I-NEXT: sll t1, a5, s0
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: beqz s0, .LBB16_22
+; RV64I-NEXT: .LBB16_21:
+; RV64I-NEXT: mv a5, t0
+; RV64I-NEXT: .LBB16_22:
+; RV64I-NEXT: bltu a6, a7, .LBB16_24
+; RV64I-NEXT: # %bb.23:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: li a3, 0
+; RV64I-NEXT: bnez a6, .LBB16_25
+; RV64I-NEXT: j .LBB16_26
+; RV64I-NEXT: .LBB16_24:
+; RV64I-NEXT: or t2, t4, t6
+; RV64I-NEXT: or a5, t5, t3
+; RV64I-NEXT: beqz a6, .LBB16_26
+; RV64I-NEXT: .LBB16_25:
+; RV64I-NEXT: mv a4, t2
+; RV64I-NEXT: mv a0, a5
+; RV64I-NEXT: .LBB16_26:
+; RV64I-NEXT: srli a5, a1, 32
+; RV64I-NEXT: srliw a6, a1, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a1, 24
+; RV64I-NEXT: srli t0, a1, 48
+; RV64I-NEXT: srli t5, a1, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a4, 32
+; RV64I-NEXT: srliw s2, a4, 16
+; RV64I-NEXT: srliw s6, a4, 24
+; RV64I-NEXT: srli s4, a4, 48
+; RV64I-NEXT: srli s7, a4, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a1, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a1, a5, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a1, a3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a1, a7, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a1, a4, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 112
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_32bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu t0, 4(a0)
+; RV32I-NEXT: lbu t1, 5(a0)
+; RV32I-NEXT: lbu t2, 6(a0)
+; RV32I-NEXT: lbu t3, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a3
+; RV32I-NEXT: lbu a6, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or a5, a7, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: lbu t3, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: li s9, 64
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t3
+; RV32I-NEXT: li t4, 32
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or t3, a5, a4
+; RV32I-NEXT: or a5, t2, a7
+; RV32I-NEXT: or a4, a1, t0
+; RV32I-NEXT: slli a4, a4, 5
+; RV32I-NEXT: neg s10, a4
+; RV32I-NEXT: srl t5, t3, s10
+; RV32I-NEXT: sll s5, a5, a4
+; RV32I-NEXT: bltu a4, t4, .LBB16_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: sll a7, t3, a4
+; RV32I-NEXT: j .LBB16_3
+; RV32I-NEXT: .LBB16_2:
+; RV32I-NEXT: sll s8, t3, a4
+; RV32I-NEXT: or a7, t5, s5
+; RV32I-NEXT: .LBB16_3:
+; RV32I-NEXT: lbu t2, 9(a0)
+; RV32I-NEXT: lbu a1, 10(a0)
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t6, a3, 8
+; RV32I-NEXT: sub s6, s9, a4
+; RV32I-NEXT: mv a3, a5
+; RV32I-NEXT: beqz a4, .LBB16_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, a7
+; RV32I-NEXT: .LBB16_5:
+; RV32I-NEXT: slli a7, t2, 8
+; RV32I-NEXT: or a6, a6, a1
+; RV32I-NEXT: lbu t2, 8(a0)
+; RV32I-NEXT: lbu a1, 12(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t6, t0
+; RV32I-NEXT: neg t6, s6
+; RV32I-NEXT: sw t6, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s6, t4, .LBB16_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl t6, a5, s6
+; RV32I-NEXT: j .LBB16_8
+; RV32I-NEXT: .LBB16_7:
+; RV32I-NEXT: sll t6, a5, t6
+; RV32I-NEXT: or t6, t5, t6
+; RV32I-NEXT: .LBB16_8:
+; RV32I-NEXT: or a7, a7, t2
+; RV32I-NEXT: slli t2, a6, 16
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: mv a6, t3
+; RV32I-NEXT: beqz s6, .LBB16_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv a6, t6
+; RV32I-NEXT: .LBB16_10:
+; RV32I-NEXT: or t1, t2, a7
+; RV32I-NEXT: or t2, t0, a1
+; RV32I-NEXT: bltu s6, t4, .LBB16_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: j .LBB16_13
+; RV32I-NEXT: .LBB16_12:
+; RV32I-NEXT: srl a7, a5, s10
+; RV32I-NEXT: .LBB16_13:
+; RV32I-NEXT: srl s0, t1, s10
+; RV32I-NEXT: sll a1, t2, a4
+; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t4, .LBB16_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sll a1, t1, a4
+; RV32I-NEXT: j .LBB16_16
+; RV32I-NEXT: .LBB16_15:
+; RV32I-NEXT: sll s1, t1, a4
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: .LBB16_16:
+; RV32I-NEXT: addi s7, a4, -64
+; RV32I-NEXT: mv s3, t2
+; RV32I-NEXT: beqz a4, .LBB16_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: .LBB16_18:
+; RV32I-NEXT: neg a1, s7
+; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s7, t4, .LBB16_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: sll a1, t3, s7
+; RV32I-NEXT: mv s4, a5
+; RV32I-NEXT: bnez s7, .LBB16_21
+; RV32I-NEXT: j .LBB16_22
+; RV32I-NEXT: .LBB16_20:
+; RV32I-NEXT: sll s2, t3, a4
+; RV32I-NEXT: srl a1, t3, a1
+; RV32I-NEXT: or a1, a1, s5
+; RV32I-NEXT: mv s4, a5
+; RV32I-NEXT: beqz s7, .LBB16_22
+; RV32I-NEXT: .LBB16_21:
+; RV32I-NEXT: mv s4, a1
+; RV32I-NEXT: .LBB16_22:
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: bltu a4, s9, .LBB16_24
+; RV32I-NEXT: # %bb.23:
+; RV32I-NEXT: sw zero, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: j .LBB16_25
+; RV32I-NEXT: .LBB16_24:
+; RV32I-NEXT: sw s8, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s2, a6, s1
+; RV32I-NEXT: or s4, a7, s3
+; RV32I-NEXT: .LBB16_25:
+; RV32I-NEXT: sub ra, a1, a4
+; RV32I-NEXT: mv a7, t1
+; RV32I-NEXT: mv a6, t2
+; RV32I-NEXT: beqz a4, .LBB16_27
+; RV32I-NEXT: # %bb.26:
+; RV32I-NEXT: mv a7, s2
+; RV32I-NEXT: mv a6, s4
+; RV32I-NEXT: .LBB16_27:
+; RV32I-NEXT: neg s1, ra
+; RV32I-NEXT: sll s2, t2, s1
+; RV32I-NEXT: bltu ra, t4, .LBB16_29
+; RV32I-NEXT: # %bb.28:
+; RV32I-NEXT: srl a1, t2, ra
+; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bnez ra, .LBB16_30
+; RV32I-NEXT: j .LBB16_31
+; RV32I-NEXT: .LBB16_29:
+; RV32I-NEXT: or a1, s0, s2
+; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: beqz ra, .LBB16_31
+; RV32I-NEXT: .LBB16_30:
+; RV32I-NEXT: sw a1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB16_31:
+; RV32I-NEXT: bltu ra, t4, .LBB16_33
+; RV32I-NEXT: # %bb.32:
+; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: srl a1, a5, ra
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: bnez ra, .LBB16_34
+; RV32I-NEXT: j .LBB16_35
+; RV32I-NEXT: .LBB16_33:
+; RV32I-NEXT: srl a1, t2, s10
+; RV32I-NEXT: sw a1, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sll a1, a5, s1
+; RV32I-NEXT: or a1, t5, a1
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: beqz ra, .LBB16_35
+; RV32I-NEXT: .LBB16_34:
+; RV32I-NEXT: mv t5, a1
+; RV32I-NEXT: .LBB16_35:
+; RV32I-NEXT: sub s3, s9, ra
+; RV32I-NEXT: bltu ra, t4, .LBB16_38
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s3, t4, .LBB16_39
+; RV32I-NEXT: .LBB16_37:
+; RV32I-NEXT: sll s1, t1, s1
+; RV32I-NEXT: neg a1, s3
+; RV32I-NEXT: srl a1, t1, a1
+; RV32I-NEXT: or a1, a1, s2
+; RV32I-NEXT: j .LBB16_40
+; RV32I-NEXT: .LBB16_38:
+; RV32I-NEXT: srl a1, a5, s10
+; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s3, t4, .LBB16_37
+; RV32I-NEXT: .LBB16_39:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sll a1, t1, s3
+; RV32I-NEXT: .LBB16_40:
+; RV32I-NEXT: addi s4, ra, -64
+; RV32I-NEXT: mv s2, t2
+; RV32I-NEXT: beqz s3, .LBB16_42
+; RV32I-NEXT: # %bb.41:
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: .LBB16_42:
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv s5, a7
+; RV32I-NEXT: bltu s4, t4, .LBB16_44
+; RV32I-NEXT: # %bb.43:
+; RV32I-NEXT: srl t0, t2, s4
+; RV32I-NEXT: j .LBB16_45
+; RV32I-NEXT: .LBB16_44:
+; RV32I-NEXT: srl a1, t1, ra
+; RV32I-NEXT: neg t0, s4
+; RV32I-NEXT: sll t0, t2, t0
+; RV32I-NEXT: or t0, a1, t0
+; RV32I-NEXT: .LBB16_45:
+; RV32I-NEXT: mv s0, s10
+; RV32I-NEXT: mv a7, a6
+; RV32I-NEXT: lbu s8, 19(a0)
+; RV32I-NEXT: lbu a1, 23(a0)
+; RV32I-NEXT: mv s3, t1
+; RV32I-NEXT: beqz s4, .LBB16_47
+; RV32I-NEXT: # %bb.46:
+; RV32I-NEXT: mv s3, t0
+; RV32I-NEXT: .LBB16_47:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: lbu s10, 17(a0)
+; RV32I-NEXT: lbu t0, 18(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu t6, 22(a0)
+; RV32I-NEXT: slli s8, s8, 8
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: li a3, 64
+; RV32I-NEXT: bltu s4, t4, .LBB16_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: j .LBB16_50
+; RV32I-NEXT: .LBB16_49:
+; RV32I-NEXT: srl s4, t2, ra
+; RV32I-NEXT: .LBB16_50:
+; RV32I-NEXT: or s11, s8, t0
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: slli s10, s10, 8
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t6, a1, t6
+; RV32I-NEXT: bgeu ra, a3, .LBB16_52
+; RV32I-NEXT: # %bb.51:
+; RV32I-NEXT: or s3, t5, s1
+; RV32I-NEXT: lw a1, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s4, a1, s2
+; RV32I-NEXT: .LBB16_52:
+; RV32I-NEXT: or a1, s10, t0
+; RV32I-NEXT: slli s11, s11, 16
+; RV32I-NEXT: or t0, s9, s8
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: mv s1, a5
+; RV32I-NEXT: mv a3, a6
+; RV32I-NEXT: beqz ra, .LBB16_54
+; RV32I-NEXT: # %bb.53:
+; RV32I-NEXT: mv t5, s3
+; RV32I-NEXT: mv s1, s4
+; RV32I-NEXT: .LBB16_54:
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s2, s11, a1
+; RV32I-NEXT: or s1, t6, t0
+; RV32I-NEXT: li a1, 64
+; RV32I-NEXT: mv a6, a7
+; RV32I-NEXT: mv a7, s0
+; RV32I-NEXT: bltu ra, a1, .LBB16_56
+; RV32I-NEXT: # %bb.55:
+; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB16_56:
+; RV32I-NEXT: srl s3, s2, a7
+; RV32I-NEXT: sll ra, s1, a4
+; RV32I-NEXT: mv a7, s5
+; RV32I-NEXT: sw t5, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t4, .LBB16_58
+; RV32I-NEXT: # %bb.57:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sll a1, s2, a4
+; RV32I-NEXT: j .LBB16_59
+; RV32I-NEXT: .LBB16_58:
+; RV32I-NEXT: sll a1, s2, a4
+; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or a1, s3, ra
+; RV32I-NEXT: .LBB16_59:
+; RV32I-NEXT: lbu s9, 27(a0)
+; RV32I-NEXT: lbu t6, 31(a0)
+; RV32I-NEXT: mv t5, s1
+; RV32I-NEXT: beqz a4, .LBB16_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: mv t5, a1
+; RV32I-NEXT: .LBB16_61:
+; RV32I-NEXT: lbu s8, 25(a0)
+; RV32I-NEXT: lbu s4, 26(a0)
+; RV32I-NEXT: lbu s11, 29(a0)
+; RV32I-NEXT: lbu s10, 30(a0)
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: bltu s6, t4, .LBB16_63
+; RV32I-NEXT: # %bb.62:
+; RV32I-NEXT: srl t0, s1, s6
+; RV32I-NEXT: j .LBB16_64
+; RV32I-NEXT: .LBB16_63:
+; RV32I-NEXT: lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: sll a1, s1, a1
+; RV32I-NEXT: or t0, s3, a1
+; RV32I-NEXT: .LBB16_64:
+; RV32I-NEXT: slli s8, s8, 8
+; RV32I-NEXT: lbu s3, 24(a0)
+; RV32I-NEXT: lbu a1, 28(a0)
+; RV32I-NEXT: or s4, s9, s4
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or t6, t6, s10
+; RV32I-NEXT: mv s9, s2
+; RV32I-NEXT: beqz s6, .LBB16_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: mv s9, t0
+; RV32I-NEXT: .LBB16_66:
+; RV32I-NEXT: or a0, s8, s3
+; RV32I-NEXT: slli t0, s4, 16
+; RV32I-NEXT: or a1, s11, a1
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: bltu s6, t4, .LBB16_68
+; RV32I-NEXT: # %bb.67:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: j .LBB16_69
+; RV32I-NEXT: .LBB16_68:
+; RV32I-NEXT: srl s4, s1, s0
+; RV32I-NEXT: .LBB16_69:
+; RV32I-NEXT: li s11, 64
+; RV32I-NEXT: or s6, t0, a0
+; RV32I-NEXT: or a0, t6, a1
+; RV32I-NEXT: bltu a4, t4, .LBB16_71
+; RV32I-NEXT: # %bb.70:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a1, s6, a4
+; RV32I-NEXT: mv s10, a0
+; RV32I-NEXT: bnez a4, .LBB16_72
+; RV32I-NEXT: j .LBB16_73
+; RV32I-NEXT: .LBB16_71:
+; RV32I-NEXT: sll s3, s6, a4
+; RV32I-NEXT: srl a1, s6, s0
+; RV32I-NEXT: sll t0, a0, a4
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: mv s10, a0
+; RV32I-NEXT: beqz a4, .LBB16_73
+; RV32I-NEXT: .LBB16_72:
+; RV32I-NEXT: mv s10, a1
+; RV32I-NEXT: .LBB16_73:
+; RV32I-NEXT: bltu s7, t4, .LBB16_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: sll a1, s2, s7
+; RV32I-NEXT: mv s0, s1
+; RV32I-NEXT: bnez s7, .LBB16_76
+; RV32I-NEXT: j .LBB16_77
+; RV32I-NEXT: .LBB16_75:
+; RV32I-NEXT: sll s5, s2, a4
+; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: srl a1, s2, a1
+; RV32I-NEXT: or a1, a1, ra
+; RV32I-NEXT: mv s0, s1
+; RV32I-NEXT: beqz s7, .LBB16_77
+; RV32I-NEXT: .LBB16_76:
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: .LBB16_77:
+; RV32I-NEXT: bltu a4, s11, .LBB16_79
+; RV32I-NEXT: # %bb.78:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: j .LBB16_80
+; RV32I-NEXT: .LBB16_79:
+; RV32I-NEXT: or s5, s9, s3
+; RV32I-NEXT: or s0, s4, s10
+; RV32I-NEXT: .LBB16_80:
+; RV32I-NEXT: addi s9, a4, -128
+; RV32I-NEXT: mv s7, s6
+; RV32I-NEXT: mv s8, a0
+; RV32I-NEXT: beqz a4, .LBB16_82
+; RV32I-NEXT: # %bb.81:
+; RV32I-NEXT: mv s7, s5
+; RV32I-NEXT: mv s8, s0
+; RV32I-NEXT: .LBB16_82:
+; RV32I-NEXT: neg s3, s9
+; RV32I-NEXT: srl s0, t3, s3
+; RV32I-NEXT: bltu s9, t4, .LBB16_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: sll a1, t3, s9
+; RV32I-NEXT: j .LBB16_85
+; RV32I-NEXT: .LBB16_84:
+; RV32I-NEXT: sll s5, t3, a4
+; RV32I-NEXT: lw a1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: .LBB16_85:
+; RV32I-NEXT: sub s4, s11, s9
+; RV32I-NEXT: mv t6, a5
+; RV32I-NEXT: beqz s9, .LBB16_87
+; RV32I-NEXT: # %bb.86:
+; RV32I-NEXT: mv t6, a1
+; RV32I-NEXT: .LBB16_87:
+; RV32I-NEXT: bltu s4, t4, .LBB16_89
+; RV32I-NEXT: # %bb.88:
+; RV32I-NEXT: srl a1, a5, s4
+; RV32I-NEXT: mv s0, t3
+; RV32I-NEXT: bnez s4, .LBB16_90
+; RV32I-NEXT: j .LBB16_91
+; RV32I-NEXT: .LBB16_89:
+; RV32I-NEXT: neg a1, s4
+; RV32I-NEXT: sll a1, a5, a1
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: mv s0, t3
+; RV32I-NEXT: beqz s4, .LBB16_91
+; RV32I-NEXT: .LBB16_90:
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: .LBB16_91:
+; RV32I-NEXT: bltu s4, t4, .LBB16_94
+; RV32I-NEXT: # %bb.92:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: li ra, 64
+; RV32I-NEXT: bgeu s9, t4, .LBB16_95
+; RV32I-NEXT: .LBB16_93:
+; RV32I-NEXT: sll s10, t1, a4
+; RV32I-NEXT: srl a1, t1, s3
+; RV32I-NEXT: lw t0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: j .LBB16_96
+; RV32I-NEXT: .LBB16_94:
+; RV32I-NEXT: srl s4, a5, s3
+; RV32I-NEXT: li ra, 64
+; RV32I-NEXT: bltu s9, t4, .LBB16_93
+; RV32I-NEXT: .LBB16_95:
+; RV32I-NEXT: li s10, 0
+; RV32I-NEXT: sll a1, t1, s9
+; RV32I-NEXT: .LBB16_96:
+; RV32I-NEXT: addi s11, s9, -64
+; RV32I-NEXT: mv s3, t2
+; RV32I-NEXT: beqz s9, .LBB16_98
+; RV32I-NEXT: # %bb.97:
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: .LBB16_98:
+; RV32I-NEXT: bltu s11, t4, .LBB16_100
+; RV32I-NEXT: # %bb.99:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: sll a1, t3, s11
+; RV32I-NEXT: bnez s11, .LBB16_101
+; RV32I-NEXT: j .LBB16_102
+; RV32I-NEXT: .LBB16_100:
+; RV32I-NEXT: sll t4, t3, s9
+; RV32I-NEXT: neg a1, s11
+; RV32I-NEXT: srl a1, t3, a1
+; RV32I-NEXT: sll t0, a5, s9
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: beqz s11, .LBB16_102
+; RV32I-NEXT: .LBB16_101:
+; RV32I-NEXT: mv a5, a1
+; RV32I-NEXT: .LBB16_102:
+; RV32I-NEXT: bltu s9, ra, .LBB16_104
+; RV32I-NEXT: # %bb.103:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: bnez s9, .LBB16_105
+; RV32I-NEXT: j .LBB16_106
+; RV32I-NEXT: .LBB16_104:
+; RV32I-NEXT: or t4, s0, s10
+; RV32I-NEXT: or a5, s4, s3
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: beqz s9, .LBB16_106
+; RV32I-NEXT: .LBB16_105:
+; RV32I-NEXT: mv t1, t4
+; RV32I-NEXT: mv t2, a5
+; RV32I-NEXT: .LBB16_106:
+; RV32I-NEXT: bltu a4, a1, .LBB16_108
+; RV32I-NEXT: # %bb.107:
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: bnez a4, .LBB16_109
+; RV32I-NEXT: j .LBB16_110
+; RV32I-NEXT: .LBB16_108:
+; RV32I-NEXT: lw a1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s5, a1, a5
+; RV32I-NEXT: lw a1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t6, a1, t5
+; RV32I-NEXT: lw a1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t1, a1, s7
+; RV32I-NEXT: lw a1, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t2, a1, s8
+; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: beqz a4, .LBB16_110
+; RV32I-NEXT: .LBB16_109:
+; RV32I-NEXT: mv s2, s5
+; RV32I-NEXT: mv s1, t6
+; RV32I-NEXT: mv s6, t1
+; RV32I-NEXT: mv a0, t2
+; RV32I-NEXT: .LBB16_110:
+; RV32I-NEXT: srli a4, ra, 16
+; RV32I-NEXT: lui t2, 16
+; RV32I-NEXT: srli t1, ra, 24
+; RV32I-NEXT: srli a5, a3, 16
+; RV32I-NEXT: srli t4, a3, 24
+; RV32I-NEXT: srli t0, a7, 16
+; RV32I-NEXT: srli s0, a7, 24
+; RV32I-NEXT: srli t3, a6, 16
+; RV32I-NEXT: srli s3, a6, 24
+; RV32I-NEXT: srli t6, s2, 16
+; RV32I-NEXT: srli a1, s2, 24
+; RV32I-NEXT: srli t5, s1, 16
+; RV32I-NEXT: srli s5, s1, 24
+; RV32I-NEXT: srli s4, s6, 16
+; RV32I-NEXT: srli s7, s6, 24
+; RV32I-NEXT: srli s8, a0, 16
+; RV32I-NEXT: srli s9, a0, 24
+; RV32I-NEXT: addi t2, t2, -1
+; RV32I-NEXT: and s10, ra, t2
+; RV32I-NEXT: and s11, a3, t2
+; RV32I-NEXT: srli s10, s10, 8
+; RV32I-NEXT: sb ra, 0(a2)
+; RV32I-NEXT: sb s10, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb t1, 3(a2)
+; RV32I-NEXT: and a4, a7, t2
+; RV32I-NEXT: srli t1, s11, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb t1, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t4, 7(a2)
+; RV32I-NEXT: and a3, a6, t2
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a7, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb t0, 10(a2)
+; RV32I-NEXT: sb s0, 11(a2)
+; RV32I-NEXT: and a4, s2, t2
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a6, 12(a2)
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a3, s1, t2
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s2, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb t6, 18(a2)
+; RV32I-NEXT: sb a1, 19(a2)
+; RV32I-NEXT: and a1, s6, t2
+; RV32I-NEXT: and a4, a0, t2
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s1, 20(a2)
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: sb t5, 22(a2)
+; RV32I-NEXT: sb s5, 23(a2)
+; RV32I-NEXT: sb s6, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s4, 26(a2)
+; RV32I-NEXT: sb s7, 27(a2)
+; RV32I-NEXT: sb a0, 28(a2)
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb s8, 30(a2)
+; RV32I-NEXT: sb s9, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
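+; Same operation again with the offset in units of 64-bit dwords:
+; bitOff = dwordOff * 64 (shl by 6).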
+define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes_dwordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -112
+; RV64I-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 12(a0)
+; RV64I-NEXT: lbu s0, 13(a0)
+; RV64I-NEXT: lbu s1, 14(a0)
+; RV64I-NEXT: lbu s2, 15(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or s3, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 1(a1)
+; RV64I-NEXT: lbu t2, 2(a1)
+; RV64I-NEXT: lbu t3, 3(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: or t6, t6, t5
+; RV64I-NEXT: or s0, s0, a4
+; RV64I-NEXT: or s1, s2, s1
+; RV64I-NEXT: lbu a4, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu t5, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: or t1, t3, t2
+; RV64I-NEXT: or t2, t4, a4
+; RV64I-NEXT: or a1, a1, t5
+; RV64I-NEXT: lbu t5, 19(a0)
+; RV64I-NEXT: lbu t4, 21(a0)
+; RV64I-NEXT: lbu a4, 22(a0)
+; RV64I-NEXT: lbu t3, 23(a0)
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t6, t6, 16
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: or s4, s3, a3
+; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a6, t6, a6
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: lbu t6, 29(a0)
+; RV64I-NEXT: lbu a3, 30(a0)
+; RV64I-NEXT: lbu s2, 31(a0)
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or s5, t1, t0
+; RV64I-NEXT: li a7, 128
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t2
+; RV64I-NEXT: li t0, 64
+; RV64I-NEXT: slli s3, t3, 8
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or t1, a5, s4
+; RV64I-NEXT: or a5, s0, a6
+; RV64I-NEXT: or a6, a1, s5
+; RV64I-NEXT: slli a6, a6, 6
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: srl s0, t1, t3
+; RV64I-NEXT: bltu a6, t0, .LBB17_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: sll s4, t1, t2
+; RV64I-NEXT: j .LBB17_3
+; RV64I-NEXT: .LBB17_2:
+; RV64I-NEXT: sll a1, t1, a6
+; RV64I-NEXT: sll s4, a5, a6
+; RV64I-NEXT: or s4, s0, s4
+; RV64I-NEXT: .LBB17_3:
+; RV64I-NEXT: slli t5, t5, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or s3, s3, a4
+; RV64I-NEXT: lbu ra, 17(a0)
+; RV64I-NEXT: lbu s11, 18(a0)
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s5, 25(a0)
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: lbu s7, 26(a0)
+; RV64I-NEXT: lbu s6, 28(a0)
+; RV64I-NEXT: slli s10, t6, 8
+; RV64I-NEXT: or s9, s2, a3
+; RV64I-NEXT: sub a4, a7, a6
+; RV64I-NEXT: mv a3, a5
+; RV64I-NEXT: beqz a6, .LBB17_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a3, s4
+; RV64I-NEXT: .LBB17_5:
+; RV64I-NEXT: slli t6, ra, 8
+; RV64I-NEXT: or t5, t5, s11
+; RV64I-NEXT: or t4, t4, s8
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: lbu s8, 16(a0)
+; RV64I-NEXT: lbu a0, 24(a0)
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: or s2, s1, s7
+; RV64I-NEXT: or s1, s10, s6
+; RV64I-NEXT: slli s4, s9, 16
+; RV64I-NEXT: bltu a4, t0, .LBB17_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: sub s0, a4, t0
+; RV64I-NEXT: srl s0, a5, s0
+; RV64I-NEXT: j .LBB17_8
+; RV64I-NEXT: .LBB17_7:
+; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: sll s6, a5, s6
+; RV64I-NEXT: or s0, s0, s6
+; RV64I-NEXT: .LBB17_8:
+; RV64I-NEXT: or t6, t6, s8
+; RV64I-NEXT: slli s6, t5, 16
+; RV64I-NEXT: or s3, s3, t4
+; RV64I-NEXT: or t5, s5, a0
+; RV64I-NEXT: slli s2, s2, 16
+; RV64I-NEXT: or s1, s4, s1
+; RV64I-NEXT: mv t4, t1
+; RV64I-NEXT: beqz a4, .LBB17_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t4, s0
+; RV64I-NEXT: .LBB17_10:
+; RV64I-NEXT: or a0, s6, t6
+; RV64I-NEXT: slli s0, s3, 32
+; RV64I-NEXT: or t6, s2, t5
+; RV64I-NEXT: slli s1, s1, 32
+; RV64I-NEXT: bltu a4, t0, .LBB17_12
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: j .LBB17_13
+; RV64I-NEXT: .LBB17_12:
+; RV64I-NEXT: srl t5, a5, t3
+; RV64I-NEXT: .LBB17_13:
+; RV64I-NEXT: or a4, s0, a0
+; RV64I-NEXT: or a0, s1, t6
+; RV64I-NEXT: bltu a6, t0, .LBB17_15
+; RV64I-NEXT: # %bb.14:
+; RV64I-NEXT: li t6, 0
+; RV64I-NEXT: sll t2, a4, t2
+; RV64I-NEXT: j .LBB17_16
+; RV64I-NEXT: .LBB17_15:
+; RV64I-NEXT: sll t6, a4, a6
+; RV64I-NEXT: srl t2, a4, t3
+; RV64I-NEXT: sll t3, a0, a6
+; RV64I-NEXT: or t2, t2, t3
+; RV64I-NEXT: .LBB17_16:
+; RV64I-NEXT: sub s0, a6, a7
+; RV64I-NEXT: mv t3, a0
+; RV64I-NEXT: beqz a6, .LBB17_18
+; RV64I-NEXT: # %bb.17:
+; RV64I-NEXT: mv t3, t2
+; RV64I-NEXT: .LBB17_18:
+; RV64I-NEXT: bltu s0, t0, .LBB17_20
+; RV64I-NEXT: # %bb.19:
+; RV64I-NEXT: li t2, 0
+; RV64I-NEXT: sub t0, s0, t0
+; RV64I-NEXT: sll t0, t1, t0
+; RV64I-NEXT: bnez s0, .LBB17_21
+; RV64I-NEXT: j .LBB17_22
+; RV64I-NEXT: .LBB17_20:
+; RV64I-NEXT: sll t2, t1, s0
+; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: srl t0, t1, t0
+; RV64I-NEXT: sll t1, a5, s0
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: beqz s0, .LBB17_22
+; RV64I-NEXT: .LBB17_21:
+; RV64I-NEXT: mv a5, t0
+; RV64I-NEXT: .LBB17_22:
+; RV64I-NEXT: bltu a6, a7, .LBB17_24
+; RV64I-NEXT: # %bb.23:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: li a3, 0
+; RV64I-NEXT: bnez a6, .LBB17_25
+; RV64I-NEXT: j .LBB17_26
+; RV64I-NEXT: .LBB17_24:
+; RV64I-NEXT: or t2, t4, t6
+; RV64I-NEXT: or a5, t5, t3
+; RV64I-NEXT: beqz a6, .LBB17_26
+; RV64I-NEXT: .LBB17_25:
+; RV64I-NEXT: mv a4, t2
+; RV64I-NEXT: mv a0, a5
+; RV64I-NEXT: .LBB17_26:
+; RV64I-NEXT: srli a5, a1, 32
+; RV64I-NEXT: srliw a6, a1, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a1, 24
+; RV64I-NEXT: srli t0, a1, 48
+; RV64I-NEXT: srli t5, a1, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a4, 32
+; RV64I-NEXT: srliw s2, a4, 16
+; RV64I-NEXT: srliw s6, a4, 24
+; RV64I-NEXT: srli s4, a4, 48
+; RV64I-NEXT: srli s7, a4, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a1, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a1, a5, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a1, a3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a1, a7, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a1, a4, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 112
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_32bytes_dwordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu t0, 4(a0)
+; RV32I-NEXT: lbu t1, 5(a0)
+; RV32I-NEXT: lbu t2, 6(a0)
+; RV32I-NEXT: lbu t3, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a3
+; RV32I-NEXT: lbu a6, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or a5, a7, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: lbu t3, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: li s9, 64
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or a1, a1, t3
+; RV32I-NEXT: li t4, 32
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or t3, a5, a4
+; RV32I-NEXT: or a5, t2, a7
+; RV32I-NEXT: or a4, a1, t0
+; RV32I-NEXT: slli a4, a4, 6
+; RV32I-NEXT: neg s10, a4
+; RV32I-NEXT: srl t5, t3, s10
+; RV32I-NEXT: sll s5, a5, a4
+; RV32I-NEXT: bltu a4, t4, .LBB17_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: sll a7, t3, a4
+; RV32I-NEXT: j .LBB17_3
+; RV32I-NEXT: .LBB17_2:
+; RV32I-NEXT: sll s8, t3, a4
+; RV32I-NEXT: or a7, t5, s5
+; RV32I-NEXT: .LBB17_3:
+; RV32I-NEXT: lbu t2, 9(a0)
+; RV32I-NEXT: lbu a1, 10(a0)
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t6, a3, 8
+; RV32I-NEXT: sub s6, s9, a4
+; RV32I-NEXT: mv a3, a5
+; RV32I-NEXT: beqz a4, .LBB17_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv a3, a7
+; RV32I-NEXT: .LBB17_5:
+; RV32I-NEXT: slli a7, t2, 8
+; RV32I-NEXT: or a6, a6, a1
+; RV32I-NEXT: lbu t2, 8(a0)
+; RV32I-NEXT: lbu a1, 12(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t0, t6, t0
+; RV32I-NEXT: neg t6, s6
+; RV32I-NEXT: sw t6, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s6, t4, .LBB17_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srl t6, a5, s6
+; RV32I-NEXT: j .LBB17_8
+; RV32I-NEXT: .LBB17_7:
+; RV32I-NEXT: sll t6, a5, t6
+; RV32I-NEXT: or t6, t5, t6
+; RV32I-NEXT: .LBB17_8:
+; RV32I-NEXT: or a7, a7, t2
+; RV32I-NEXT: slli t2, a6, 16
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: mv a6, t3
+; RV32I-NEXT: beqz s6, .LBB17_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv a6, t6
+; RV32I-NEXT: .LBB17_10:
+; RV32I-NEXT: or t1, t2, a7
+; RV32I-NEXT: or t2, t0, a1
+; RV32I-NEXT: bltu s6, t4, .LBB17_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: j .LBB17_13
+; RV32I-NEXT: .LBB17_12:
+; RV32I-NEXT: srl a7, a5, s10
+; RV32I-NEXT: .LBB17_13:
+; RV32I-NEXT: srl s0, t1, s10
+; RV32I-NEXT: sll a1, t2, a4
+; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t4, .LBB17_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sll a1, t1, a4
+; RV32I-NEXT: j .LBB17_16
+; RV32I-NEXT: .LBB17_15:
+; RV32I-NEXT: sll s1, t1, a4
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: .LBB17_16:
+; RV32I-NEXT: addi s7, a4, -64
+; RV32I-NEXT: mv s3, t2
+; RV32I-NEXT: beqz a4, .LBB17_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: .LBB17_18:
+; RV32I-NEXT: neg a1, s7
+; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s7, t4, .LBB17_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: sll a1, t3, s7
+; RV32I-NEXT: mv s4, a5
+; RV32I-NEXT: bnez s7, .LBB17_21
+; RV32I-NEXT: j .LBB17_22
+; RV32I-NEXT: .LBB17_20:
+; RV32I-NEXT: sll s2, t3, a4
+; RV32I-NEXT: srl a1, t3, a1
+; RV32I-NEXT: or a1, a1, s5
+; RV32I-NEXT: mv s4, a5
+; RV32I-NEXT: beqz s7, .LBB17_22
+; RV32I-NEXT: .LBB17_21:
+; RV32I-NEXT: mv s4, a1
+; RV32I-NEXT: .LBB17_22:
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: bltu a4, s9, .LBB17_24
+; RV32I-NEXT: # %bb.23:
+; RV32I-NEXT: sw zero, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: j .LBB17_25
+; RV32I-NEXT: .LBB17_24:
+; RV32I-NEXT: sw s8, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s2, a6, s1
+; RV32I-NEXT: or s4, a7, s3
+; RV32I-NEXT: .LBB17_25:
+; RV32I-NEXT: sub ra, a1, a4
+; RV32I-NEXT: mv a7, t1
+; RV32I-NEXT: mv a6, t2
+; RV32I-NEXT: beqz a4, .LBB17_27
+; RV32I-NEXT: # %bb.26:
+; RV32I-NEXT: mv a7, s2
+; RV32I-NEXT: mv a6, s4
+; RV32I-NEXT: .LBB17_27:
+; RV32I-NEXT: neg s1, ra
+; RV32I-NEXT: sll s2, t2, s1
+; RV32I-NEXT: bltu ra, t4, .LBB17_29
+; RV32I-NEXT: # %bb.28:
+; RV32I-NEXT: srl a1, t2, ra
+; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bnez ra, .LBB17_30
+; RV32I-NEXT: j .LBB17_31
+; RV32I-NEXT: .LBB17_29:
+; RV32I-NEXT: or a1, s0, s2
+; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: beqz ra, .LBB17_31
+; RV32I-NEXT: .LBB17_30:
+; RV32I-NEXT: sw a1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB17_31:
+; RV32I-NEXT: bltu ra, t4, .LBB17_33
+; RV32I-NEXT: # %bb.32:
+; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: srl a1, a5, ra
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: bnez ra, .LBB17_34
+; RV32I-NEXT: j .LBB17_35
+; RV32I-NEXT: .LBB17_33:
+; RV32I-NEXT: srl a1, t2, s10
+; RV32I-NEXT: sw a1, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sll a1, a5, s1
+; RV32I-NEXT: or a1, t5, a1
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: beqz ra, .LBB17_35
+; RV32I-NEXT: .LBB17_34:
+; RV32I-NEXT: mv t5, a1
+; RV32I-NEXT: .LBB17_35:
+; RV32I-NEXT: sub s3, s9, ra
+; RV32I-NEXT: bltu ra, t4, .LBB17_38
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s3, t4, .LBB17_39
+; RV32I-NEXT: .LBB17_37:
+; RV32I-NEXT: sll s1, t1, s1
+; RV32I-NEXT: neg a1, s3
+; RV32I-NEXT: srl a1, t1, a1
+; RV32I-NEXT: or a1, a1, s2
+; RV32I-NEXT: j .LBB17_40
+; RV32I-NEXT: .LBB17_38:
+; RV32I-NEXT: srl a1, a5, s10
+; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s3, t4, .LBB17_37
+; RV32I-NEXT: .LBB17_39:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sll a1, t1, s3
+; RV32I-NEXT: .LBB17_40:
+; RV32I-NEXT: addi s4, ra, -64
+; RV32I-NEXT: mv s2, t2
+; RV32I-NEXT: beqz s3, .LBB17_42
+; RV32I-NEXT: # %bb.41:
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: .LBB17_42:
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv s5, a7
+; RV32I-NEXT: bltu s4, t4, .LBB17_44
+; RV32I-NEXT: # %bb.43:
+; RV32I-NEXT: srl t0, t2, s4
+; RV32I-NEXT: j .LBB17_45
+; RV32I-NEXT: .LBB17_44:
+; RV32I-NEXT: srl a1, t1, ra
+; RV32I-NEXT: neg t0, s4
+; RV32I-NEXT: sll t0, t2, t0
+; RV32I-NEXT: or t0, a1, t0
+; RV32I-NEXT: .LBB17_45:
+; RV32I-NEXT: mv s0, s10
+; RV32I-NEXT: mv a7, a6
+; RV32I-NEXT: lbu s8, 19(a0)
+; RV32I-NEXT: lbu a1, 23(a0)
+; RV32I-NEXT: mv s3, t1
+; RV32I-NEXT: beqz s4, .LBB17_47
+; RV32I-NEXT: # %bb.46:
+; RV32I-NEXT: mv s3, t0
+; RV32I-NEXT: .LBB17_47:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: lbu s10, 17(a0)
+; RV32I-NEXT: lbu t0, 18(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu t6, 22(a0)
+; RV32I-NEXT: slli s8, s8, 8
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: li a3, 64
+; RV32I-NEXT: bltu s4, t4, .LBB17_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: j .LBB17_50
+; RV32I-NEXT: .LBB17_49:
+; RV32I-NEXT: srl s4, t2, ra
+; RV32I-NEXT: .LBB17_50:
+; RV32I-NEXT: or s11, s8, t0
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: slli s10, s10, 8
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t6, a1, t6
+; RV32I-NEXT: bgeu ra, a3, .LBB17_52
+; RV32I-NEXT: # %bb.51:
+; RV32I-NEXT: or s3, t5, s1
+; RV32I-NEXT: lw a1, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s4, a1, s2
+; RV32I-NEXT: .LBB17_52:
+; RV32I-NEXT: or a1, s10, t0
+; RV32I-NEXT: slli s11, s11, 16
+; RV32I-NEXT: or t0, s9, s8
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: mv t5, t3
+; RV32I-NEXT: mv s1, a5
+; RV32I-NEXT: mv a3, a6
+; RV32I-NEXT: beqz ra, .LBB17_54
+; RV32I-NEXT: # %bb.53:
+; RV32I-NEXT: mv t5, s3
+; RV32I-NEXT: mv s1, s4
+; RV32I-NEXT: .LBB17_54:
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or s2, s11, a1
+; RV32I-NEXT: or s1, t6, t0
+; RV32I-NEXT: li a1, 64
+; RV32I-NEXT: mv a6, a7
+; RV32I-NEXT: mv a7, s0
+; RV32I-NEXT: bltu ra, a1, .LBB17_56
+; RV32I-NEXT: # %bb.55:
+; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB17_56:
+; RV32I-NEXT: srl s3, s2, a7
+; RV32I-NEXT: sll ra, s1, a4
+; RV32I-NEXT: mv a7, s5
+; RV32I-NEXT: sw t5, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a4, t4, .LBB17_58
+; RV32I-NEXT: # %bb.57:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sll a1, s2, a4
+; RV32I-NEXT: j .LBB17_59
+; RV32I-NEXT: .LBB17_58:
+; RV32I-NEXT: sll a1, s2, a4
+; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or a1, s3, ra
+; RV32I-NEXT: .LBB17_59:
+; RV32I-NEXT: lbu s9, 27(a0)
+; RV32I-NEXT: lbu t6, 31(a0)
+; RV32I-NEXT: mv t5, s1
+; RV32I-NEXT: beqz a4, .LBB17_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: mv t5, a1
+; RV32I-NEXT: .LBB17_61:
+; RV32I-NEXT: lbu s8, 25(a0)
+; RV32I-NEXT: lbu s4, 26(a0)
+; RV32I-NEXT: lbu s11, 29(a0)
+; RV32I-NEXT: lbu s10, 30(a0)
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: bltu s6, t4, .LBB17_63
+; RV32I-NEXT: # %bb.62:
+; RV32I-NEXT: srl t0, s1, s6
+; RV32I-NEXT: j .LBB17_64
+; RV32I-NEXT: .LBB17_63:
+; RV32I-NEXT: lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: sll a1, s1, a1
+; RV32I-NEXT: or t0, s3, a1
+; RV32I-NEXT: .LBB17_64:
+; RV32I-NEXT: slli s8, s8, 8
+; RV32I-NEXT: lbu s3, 24(a0)
+; RV32I-NEXT: lbu a1, 28(a0)
+; RV32I-NEXT: or s4, s9, s4
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or t6, t6, s10
+; RV32I-NEXT: mv s9, s2
+; RV32I-NEXT: beqz s6, .LBB17_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: mv s9, t0
+; RV32I-NEXT: .LBB17_66:
+; RV32I-NEXT: or a0, s8, s3
+; RV32I-NEXT: slli t0, s4, 16
+; RV32I-NEXT: or a1, s11, a1
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: bltu s6, t4, .LBB17_68
+; RV32I-NEXT: # %bb.67:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: j .LBB17_69
+; RV32I-NEXT: .LBB17_68:
+; RV32I-NEXT: srl s4, s1, s0
+; RV32I-NEXT: .LBB17_69:
+; RV32I-NEXT: li s11, 64
+; RV32I-NEXT: or s6, t0, a0
+; RV32I-NEXT: or a0, t6, a1
+; RV32I-NEXT: bltu a4, t4, .LBB17_71
+; RV32I-NEXT: # %bb.70:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a1, s6, a4
+; RV32I-NEXT: mv s10, a0
+; RV32I-NEXT: bnez a4, .LBB17_72
+; RV32I-NEXT: j .LBB17_73
+; RV32I-NEXT: .LBB17_71:
+; RV32I-NEXT: sll s3, s6, a4
+; RV32I-NEXT: srl a1, s6, s0
+; RV32I-NEXT: sll t0, a0, a4
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: mv s10, a0
+; RV32I-NEXT: beqz a4, .LBB17_73
+; RV32I-NEXT: .LBB17_72:
+; RV32I-NEXT: mv s10, a1
+; RV32I-NEXT: .LBB17_73:
+; RV32I-NEXT: bltu s7, t4, .LBB17_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: sll a1, s2, s7
+; RV32I-NEXT: mv s0, s1
+; RV32I-NEXT: bnez s7, .LBB17_76
+; RV32I-NEXT: j .LBB17_77
+; RV32I-NEXT: .LBB17_75:
+; RV32I-NEXT: sll s5, s2, a4
+; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: srl a1, s2, a1
+; RV32I-NEXT: or a1, a1, ra
+; RV32I-NEXT: mv s0, s1
+; RV32I-NEXT: beqz s7, .LBB17_77
+; RV32I-NEXT: .LBB17_76:
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: .LBB17_77:
+; RV32I-NEXT: bltu a4, s11, .LBB17_79
+; RV32I-NEXT: # %bb.78:
+; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li t5, 0
+; RV32I-NEXT: j .LBB17_80
+; RV32I-NEXT: .LBB17_79:
+; RV32I-NEXT: or s5, s9, s3
+; RV32I-NEXT: or s0, s4, s10
+; RV32I-NEXT: .LBB17_80:
+; RV32I-NEXT: addi s9, a4, -128
+; RV32I-NEXT: mv s7, s6
+; RV32I-NEXT: mv s8, a0
+; RV32I-NEXT: beqz a4, .LBB17_82
+; RV32I-NEXT: # %bb.81:
+; RV32I-NEXT: mv s7, s5
+; RV32I-NEXT: mv s8, s0
+; RV32I-NEXT: .LBB17_82:
+; RV32I-NEXT: neg s3, s9
+; RV32I-NEXT: srl s0, t3, s3
+; RV32I-NEXT: bltu s9, t4, .LBB17_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: sll a1, t3, s9
+; RV32I-NEXT: j .LBB17_85
+; RV32I-NEXT: .LBB17_84:
+; RV32I-NEXT: sll s5, t3, a4
+; RV32I-NEXT: lw a1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: .LBB17_85:
+; RV32I-NEXT: sub s4, s11, s9
+; RV32I-NEXT: mv t6, a5
+; RV32I-NEXT: beqz s9, .LBB17_87
+; RV32I-NEXT: # %bb.86:
+; RV32I-NEXT: mv t6, a1
+; RV32I-NEXT: .LBB17_87:
+; RV32I-NEXT: bltu s4, t4, .LBB17_89
+; RV32I-NEXT: # %bb.88:
+; RV32I-NEXT: srl a1, a5, s4
+; RV32I-NEXT: mv s0, t3
+; RV32I-NEXT: bnez s4, .LBB17_90
+; RV32I-NEXT: j .LBB17_91
+; RV32I-NEXT: .LBB17_89:
+; RV32I-NEXT: neg a1, s4
+; RV32I-NEXT: sll a1, a5, a1
+; RV32I-NEXT: or a1, s0, a1
+; RV32I-NEXT: mv s0, t3
+; RV32I-NEXT: beqz s4, .LBB17_91
+; RV32I-NEXT: .LBB17_90:
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: .LBB17_91:
+; RV32I-NEXT: bltu s4, t4, .LBB17_94
+; RV32I-NEXT: # %bb.92:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: li ra, 64
+; RV32I-NEXT: bgeu s9, t4, .LBB17_95
+; RV32I-NEXT: .LBB17_93:
+; RV32I-NEXT: sll s10, t1, a4
+; RV32I-NEXT: srl a1, t1, s3
+; RV32I-NEXT: lw t0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: j .LBB17_96
+; RV32I-NEXT: .LBB17_94:
+; RV32I-NEXT: srl s4, a5, s3
+; RV32I-NEXT: li ra, 64
+; RV32I-NEXT: bltu s9, t4, .LBB17_93
+; RV32I-NEXT: .LBB17_95:
+; RV32I-NEXT: li s10, 0
+; RV32I-NEXT: sll a1, t1, s9
+; RV32I-NEXT: .LBB17_96:
+; RV32I-NEXT: addi s11, s9, -64
+; RV32I-NEXT: mv s3, t2
+; RV32I-NEXT: beqz s9, .LBB17_98
+; RV32I-NEXT: # %bb.97:
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: .LBB17_98:
+; RV32I-NEXT: bltu s11, t4, .LBB17_100
+; RV32I-NEXT: # %bb.99:
+; RV32I-NEXT: li t4, 0
+; RV32I-NEXT: sll a1, t3, s11
+; RV32I-NEXT: bnez s11, .LBB17_101
+; RV32I-NEXT: j .LBB17_102
+; RV32I-NEXT: .LBB17_100:
+; RV32I-NEXT: sll t4, t3, s9
+; RV32I-NEXT: neg a1, s11
+; RV32I-NEXT: srl a1, t3, a1
+; RV32I-NEXT: sll t0, a5, s9
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: beqz s11, .LBB17_102
+; RV32I-NEXT: .LBB17_101:
+; RV32I-NEXT: mv a5, a1
+; RV32I-NEXT: .LBB17_102:
+; RV32I-NEXT: bltu s9, ra, .LBB17_104
+; RV32I-NEXT: # %bb.103:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li t6, 0
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: bnez s9, .LBB17_105
+; RV32I-NEXT: j .LBB17_106
+; RV32I-NEXT: .LBB17_104:
+; RV32I-NEXT: or t4, s0, s10
+; RV32I-NEXT: or a5, s4, s3
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: beqz s9, .LBB17_106
+; RV32I-NEXT: .LBB17_105:
+; RV32I-NEXT: mv t1, t4
+; RV32I-NEXT: mv t2, a5
+; RV32I-NEXT: .LBB17_106:
+; RV32I-NEXT: bltu a4, a1, .LBB17_108
+; RV32I-NEXT: # %bb.107:
+; RV32I-NEXT: li ra, 0
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: li a7, 0
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: bnez a4, .LBB17_109
+; RV32I-NEXT: j .LBB17_110
+; RV32I-NEXT: .LBB17_108:
+; RV32I-NEXT: lw a1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s5, a1, a5
+; RV32I-NEXT: lw a1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t6, a1, t5
+; RV32I-NEXT: lw a1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t1, a1, s7
+; RV32I-NEXT: lw a1, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t2, a1, s8
+; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: beqz a4, .LBB17_110
+; RV32I-NEXT: .LBB17_109:
+; RV32I-NEXT: mv s2, s5
+; RV32I-NEXT: mv s1, t6
+; RV32I-NEXT: mv s6, t1
+; RV32I-NEXT: mv a0, t2
+; RV32I-NEXT: .LBB17_110:
+; RV32I-NEXT: srli a4, ra, 16
+; RV32I-NEXT: lui t2, 16
+; RV32I-NEXT: srli t1, ra, 24
+; RV32I-NEXT: srli a5, a3, 16
+; RV32I-NEXT: srli t4, a3, 24
+; RV32I-NEXT: srli t0, a7, 16
+; RV32I-NEXT: srli s0, a7, 24
+; RV32I-NEXT: srli t3, a6, 16
+; RV32I-NEXT: srli s3, a6, 24
+; RV32I-NEXT: srli t6, s2, 16
+; RV32I-NEXT: srli a1, s2, 24
+; RV32I-NEXT: srli t5, s1, 16
+; RV32I-NEXT: srli s5, s1, 24
+; RV32I-NEXT: srli s4, s6, 16
+; RV32I-NEXT: srli s7, s6, 24
+; RV32I-NEXT: srli s8, a0, 16
+; RV32I-NEXT: srli s9, a0, 24
+; RV32I-NEXT: addi t2, t2, -1
+; RV32I-NEXT: and s10, ra, t2
+; RV32I-NEXT: and s11, a3, t2
+; RV32I-NEXT: srli s10, s10, 8
+; RV32I-NEXT: sb ra, 0(a2)
+; RV32I-NEXT: sb s10, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb t1, 3(a2)
+; RV32I-NEXT: and a4, a7, t2
+; RV32I-NEXT: srli t1, s11, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb t1, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t4, 7(a2)
+; RV32I-NEXT: and a3, a6, t2
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a7, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb t0, 10(a2)
+; RV32I-NEXT: sb s0, 11(a2)
+; RV32I-NEXT: and a4, s2, t2
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a6, 12(a2)
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a3, s1, t2
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s2, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb t6, 18(a2)
+; RV32I-NEXT: sb a1, 19(a2)
+; RV32I-NEXT: and a1, s6, t2
+; RV32I-NEXT: and a4, a0, t2
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s1, 20(a2)
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: sb t5, 22(a2)
+; RV32I-NEXT: sb s5, 23(a2)
+; RV32I-NEXT: sb s6, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s4, 26(a2)
+; RV32I-NEXT: sb s7, 27(a2)
+; RV32I-NEXT: sb a0, 28(a2)
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb s8, 30(a2)
+; RV32I-NEXT: sb s9, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
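
For reviewers, the i256 shift tests above reduce to a four-limb shift plus the offset scaling done in the IR (here `shl i256 %dwordOff, 6`). A minimal C sketch of that reduction, assuming the scaled offset stays below 256 (wider shifts are poison in the IR); the names `shl_256` and `shl_32bytes_dwordOff_model` and the little-endian limb layout are hypothetical, chosen only for illustration and not part of the test files:

#include <stdint.h>
#include <string.h>

/* Minimal reference model of the i256 left shift: four 64-bit limbs,
 * limb 0 least significant, bitOff < 256. */
static void shl_256(const uint64_t src[4], uint64_t bitOff, uint64_t dst[4]) {
    int limb = (int)(bitOff >> 6);  /* whole 64-bit limbs shifted in */
    int bits = (int)(bitOff & 63);  /* remaining sub-limb shift */
    uint64_t out[4] = {0, 0, 0, 0};
    for (int i = 3; i >= limb; i--) {
        uint64_t lo = src[i - limb] << bits;
        /* Bits carried out of the next-lower limb; the guard avoids the
         * undefined 64-bit shift when bits == 0 and the out-of-range
         * access when i == limb. */
        uint64_t hi = (bits != 0 && i > limb) ? src[i - limb - 1] >> (64 - bits)
                                              : 0;
        out[i] = lo | hi;
    }
    memcpy(dst, out, sizeof out);
}

/* shl_32bytes_dwordOff scales its operand exactly as the IR does:
 * bitOff = dwordOff << 6, so the sub-limb shift is always zero here. */
static void shl_32bytes_dwordOff_model(const uint64_t src[4],
                                       uint64_t dwordOff, uint64_t dst[4]) {
    shl_256(src, dwordOff << 6, dst);
}

For a dword offset the sub-limb shift `bits` is always zero, so the model degenerates to whole-limb moves; the byte- and word-offset flavors also exercise the cross-limb carry path.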
+
+define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -96
+; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: or a4, t4, t3
+; RV64I-NEXT: or a6, t6, t5
+; RV64I-NEXT: or t0, s1, s0
+; RV64I-NEXT: lbu t5, 24(a0)
+; RV64I-NEXT: lbu t6, 25(a0)
+; RV64I-NEXT: lbu s0, 26(a0)
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or t4, s3, s2
+; RV64I-NEXT: or t2, s5, s4
+; RV64I-NEXT: or t3, s7, s6
+; RV64I-NEXT: lbu s2, 28(a0)
+; RV64I-NEXT: lbu s3, 29(a0)
+; RV64I-NEXT: lbu s4, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s5, s9, s8
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu t6, 0(a1)
+; RV64I-NEXT: lbu s1, 1(a1)
+; RV64I-NEXT: lbu s7, 2(a1)
+; RV64I-NEXT: lbu s8, 3(a1)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, a0, s4
+; RV64I-NEXT: or t6, s1, t6
+; RV64I-NEXT: lbu a0, 4(a1)
+; RV64I-NEXT: lbu s1, 5(a1)
+; RV64I-NEXT: lbu s4, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s1, s1, a0
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or s4, a1, s4
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or a1, t1, a7
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or a0, t4, t0
+; RV64I-NEXT: slli t3, t3, 16
+; RV64I-NEXT: or a7, t3, t2
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: or t1, s6, s5
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t4, s0, t5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or t5, s3, s2
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or t6, s7, t6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s0, s4, s1
+; RV64I-NEXT: li t0, 64
+; RV64I-NEXT: slli t3, a5, 16
+; RV64I-NEXT: slli t2, a6, 16
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: or a7, t1, a7
+; RV64I-NEXT: or a5, t5, t4
+; RV64I-NEXT: or a6, s0, t6
+; RV64I-NEXT: slli a6, a6, 3
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sll t4, a5, t5
+; RV64I-NEXT: bltu a6, t0, .LBB18_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sra t6, a5, t1
+; RV64I-NEXT: j .LBB18_3
+; RV64I-NEXT: .LBB18_2:
+; RV64I-NEXT: srl t6, a7, a6
+; RV64I-NEXT: or t6, t6, t4
+; RV64I-NEXT: .LBB18_3:
+; RV64I-NEXT: or a3, t3, a3
+; RV64I-NEXT: slli t3, a1, 32
+; RV64I-NEXT: or t2, t2, a4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mv a1, a7
+; RV64I-NEXT: beqz a6, .LBB18_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a1, t6
+; RV64I-NEXT: .LBB18_5:
+; RV64I-NEXT: or a4, t3, a3
+; RV64I-NEXT: or a3, a0, t2
+; RV64I-NEXT: bltu a6, t0, .LBB18_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: srai a0, a5, 63
+; RV64I-NEXT: srl t3, a3, t1
+; RV64I-NEXT: j .LBB18_8
+; RV64I-NEXT: .LBB18_7:
+; RV64I-NEXT: sra a0, a5, a6
+; RV64I-NEXT: srl t1, a4, a6
+; RV64I-NEXT: sll t2, a3, t5
+; RV64I-NEXT: or t3, t1, t2
+; RV64I-NEXT: .LBB18_8:
+; RV64I-NEXT: li t1, 128
+; RV64I-NEXT: mv t2, a4
+; RV64I-NEXT: beqz a6, .LBB18_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t2, t3
+; RV64I-NEXT: .LBB18_10:
+; RV64I-NEXT: sub t6, t1, a6
+; RV64I-NEXT: bltu a6, t0, .LBB18_13
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t3, 0
+; RV64I-NEXT: bgeu t6, t0, .LBB18_14
+; RV64I-NEXT: .LBB18_12:
+; RV64I-NEXT: sll t5, a7, t5
+; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: srl s0, a7, s0
+; RV64I-NEXT: or s1, s0, t4
+; RV64I-NEXT: j .LBB18_15
+; RV64I-NEXT: .LBB18_13:
+; RV64I-NEXT: srl t3, a3, a6
+; RV64I-NEXT: bltu t6, t0, .LBB18_12
+; RV64I-NEXT: .LBB18_14:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: sub t4, t6, t0
+; RV64I-NEXT: sll s1, a7, t4
+; RV64I-NEXT: .LBB18_15:
+; RV64I-NEXT: sub s0, a6, t1
+; RV64I-NEXT: mv t4, a5
+; RV64I-NEXT: beqz t6, .LBB18_17
+; RV64I-NEXT: # %bb.16:
+; RV64I-NEXT: mv t4, s1
+; RV64I-NEXT: .LBB18_17:
+; RV64I-NEXT: bltu s0, t0, .LBB18_19
+; RV64I-NEXT: # %bb.18:
+; RV64I-NEXT: sub t6, s0, t0
+; RV64I-NEXT: sra t6, a5, t6
+; RV64I-NEXT: bnez s0, .LBB18_20
+; RV64I-NEXT: j .LBB18_21
+; RV64I-NEXT: .LBB18_19:
+; RV64I-NEXT: srl t6, a7, s0
+; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: sll s1, a5, s1
+; RV64I-NEXT: or t6, t6, s1
+; RV64I-NEXT: beqz s0, .LBB18_21
+; RV64I-NEXT: .LBB18_20:
+; RV64I-NEXT: mv a7, t6
+; RV64I-NEXT: .LBB18_21:
+; RV64I-NEXT: bltu s0, t0, .LBB18_23
+; RV64I-NEXT: # %bb.22:
+; RV64I-NEXT: srai t0, a5, 63
+; RV64I-NEXT: bltu a6, t1, .LBB18_24
+; RV64I-NEXT: j .LBB18_25
+; RV64I-NEXT: .LBB18_23:
+; RV64I-NEXT: sra t0, a5, s0
+; RV64I-NEXT: bgeu a6, t1, .LBB18_25
+; RV64I-NEXT: .LBB18_24:
+; RV64I-NEXT: or a7, t2, t5
+; RV64I-NEXT: or t0, t3, t4
+; RV64I-NEXT: .LBB18_25:
+; RV64I-NEXT: bnez a6, .LBB18_29
+; RV64I-NEXT: # %bb.26:
+; RV64I-NEXT: bltu a6, t1, .LBB18_28
+; RV64I-NEXT: .LBB18_27:
+; RV64I-NEXT: srai a1, a5, 63
+; RV64I-NEXT: mv a0, a1
+; RV64I-NEXT: .LBB18_28:
+; RV64I-NEXT: srli a5, a4, 32
+; RV64I-NEXT: srliw a6, a4, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a4, 24
+; RV64I-NEXT: srli t0, a4, 48
+; RV64I-NEXT: srli t5, a4, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a1, 32
+; RV64I-NEXT: srliw s2, a1, 16
+; RV64I-NEXT: srliw s6, a1, 24
+; RV64I-NEXT: srli s4, a1, 48
+; RV64I-NEXT: srli s7, a1, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a4, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a4, a5, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a4, a3, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a4, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a3, a7, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a3, a1, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 96
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB18_29:
+; RV64I-NEXT: mv a4, a7
+; RV64I-NEXT: mv a3, t0
+; RV64I-NEXT: bgeu a6, t1, .LBB18_27
+; RV64I-NEXT: j .LBB18_28
+;
+; RV32I-LABEL: ashr_32bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 16(a0)
+; RV32I-NEXT: lbu a4, 17(a0)
+; RV32I-NEXT: lbu a5, 18(a0)
+; RV32I-NEXT: lbu a6, 19(a0)
+; RV32I-NEXT: lbu a7, 20(a0)
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 22(a0)
+; RV32I-NEXT: lbu t2, 23(a0)
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t4, 25(a0)
+; RV32I-NEXT: lbu t5, 26(a0)
+; RV32I-NEXT: lbu t6, 27(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, t2, t1
+; RV32I-NEXT: lbu a7, 28(a0)
+; RV32I-NEXT: lbu t0, 29(a0)
+; RV32I-NEXT: lbu t1, 30(a0)
+; RV32I-NEXT: lbu t2, 31(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t3, t4, t3
+; RV32I-NEXT: or t4, t6, t5
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t5, 1(a1)
+; RV32I-NEXT: lbu t6, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: or s0, t5, a7
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or t2, a1, t6
+; RV32I-NEXT: li t5, 32
+; RV32I-NEXT: slli a7, a4, 16
+; RV32I-NEXT: slli a1, a5, 16
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli a5, t2, 16
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or a4, t1, t0
+; RV32I-NEXT: or a5, a5, s0
+; RV32I-NEXT: slli a5, a5, 3
+; RV32I-NEXT: srl s0, t2, a5
+; RV32I-NEXT: neg s6, a5
+; RV32I-NEXT: sll s1, a4, s6
+; RV32I-NEXT: bltu a5, t5, .LBB18_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra t0, a4, a5
+; RV32I-NEXT: j .LBB18_3
+; RV32I-NEXT: .LBB18_2:
+; RV32I-NEXT: or t0, s0, s1
+; RV32I-NEXT: .LBB18_3:
+; RV32I-NEXT: or t1, a7, a3
+; RV32I-NEXT: or a7, a1, a6
+; RV32I-NEXT: mv t3, t2
+; RV32I-NEXT: beqz a5, .LBB18_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv t3, t0
+; RV32I-NEXT: .LBB18_5:
+; RV32I-NEXT: srl a3, t1, a5
+; RV32I-NEXT: sll a1, a7, s6
+; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a5, t5, .LBB18_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srai t4, a4, 31
+; RV32I-NEXT: srl a1, a7, a5
+; RV32I-NEXT: j .LBB18_8
+; RV32I-NEXT: .LBB18_7:
+; RV32I-NEXT: sra t4, a4, a5
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: .LBB18_8:
+; RV32I-NEXT: li t6, 64
+; RV32I-NEXT: mv t0, t1
+; RV32I-NEXT: beqz a5, .LBB18_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t0, a1
+; RV32I-NEXT: .LBB18_10:
+; RV32I-NEXT: sub s7, t6, a5
+; RV32I-NEXT: bltu a5, t5, .LBB18_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB18_13
+; RV32I-NEXT: .LBB18_12:
+; RV32I-NEXT: srl a1, a7, a5
+; RV32I-NEXT: .LBB18_13:
+; RV32I-NEXT: sw a3, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: neg s10, s7
+; RV32I-NEXT: bltu s7, t5, .LBB18_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: sll a3, t2, s7
+; RV32I-NEXT: j .LBB18_16
+; RV32I-NEXT: .LBB18_15:
+; RV32I-NEXT: sll a6, t2, s6
+; RV32I-NEXT: srl a3, t2, s10
+; RV32I-NEXT: or a3, a3, s1
+; RV32I-NEXT: .LBB18_16:
+; RV32I-NEXT: sw t3, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi s9, a5, -64
+; RV32I-NEXT: mv t3, a4
+; RV32I-NEXT: beqz s7, .LBB18_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t3, a3
+; RV32I-NEXT: .LBB18_18:
+; RV32I-NEXT: neg s11, s9
+; RV32I-NEXT: sw s0, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t5, .LBB18_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: sra s0, a4, s9
+; RV32I-NEXT: j .LBB18_21
+; RV32I-NEXT: .LBB18_20:
+; RV32I-NEXT: sll a3, a4, s11
+; RV32I-NEXT: or s0, s0, a3
+; RV32I-NEXT: .LBB18_21:
+; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw t4, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu s3, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: mv t4, t2
+; RV32I-NEXT: beqz s9, .LBB18_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: mv t4, s0
+; RV32I-NEXT: .LBB18_23:
+; RV32I-NEXT: lbu s2, 9(a0)
+; RV32I-NEXT: lbu s1, 10(a0)
+; RV32I-NEXT: lbu s8, 13(a0)
+; RV32I-NEXT: lbu ra, 14(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: bltu s9, t5, .LBB18_25
+; RV32I-NEXT: # %bb.24:
+; RV32I-NEXT: srai s0, a4, 31
+; RV32I-NEXT: j .LBB18_26
+; RV32I-NEXT: .LBB18_25:
+; RV32I-NEXT: sra s0, a4, a5
+; RV32I-NEXT: .LBB18_26:
+; RV32I-NEXT: or s1, s3, s1
+; RV32I-NEXT: lbu s5, 8(a0)
+; RV32I-NEXT: lbu s3, 12(a0)
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s4, s8, 8
+; RV32I-NEXT: or s8, a3, ra
+; RV32I-NEXT: bgeu a5, t6, .LBB18_28
+; RV32I-NEXT: # %bb.27:
+; RV32I-NEXT: or t4, t0, a6
+; RV32I-NEXT: or s0, a1, t3
+; RV32I-NEXT: .LBB18_28:
+; RV32I-NEXT: lbu a3, 3(a0)
+; RV32I-NEXT: lbu t3, 7(a0)
+; RV32I-NEXT: or a6, s2, s5
+; RV32I-NEXT: slli s2, s1, 16
+; RV32I-NEXT: or s1, s4, s3
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: mv a1, t1
+; RV32I-NEXT: mv t0, a7
+; RV32I-NEXT: beqz a5, .LBB18_30
+; RV32I-NEXT: # %bb.29:
+; RV32I-NEXT: mv a1, t4
+; RV32I-NEXT: mv t0, s0
+; RV32I-NEXT: .LBB18_30:
+; RV32I-NEXT: slli s5, a3, 8
+; RV32I-NEXT: lbu ra, 1(a0)
+; RV32I-NEXT: lbu a3, 2(a0)
+; RV32I-NEXT: lbu s3, 5(a0)
+; RV32I-NEXT: lbu s0, 6(a0)
+; RV32I-NEXT: slli s4, t3, 8
+; RV32I-NEXT: or t4, s2, a6
+; RV32I-NEXT: or t3, s8, s1
+; RV32I-NEXT: bltu a5, t6, .LBB18_32
+; RV32I-NEXT: # %bb.31:
+; RV32I-NEXT: srai a6, a4, 31
+; RV32I-NEXT: sw a6, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw a6, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB18_32:
+; RV32I-NEXT: slli a6, ra, 8
+; RV32I-NEXT: or a3, s5, a3
+; RV32I-NEXT: lbu s1, 0(a0)
+; RV32I-NEXT: lbu a0, 4(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s0, s4, s0
+; RV32I-NEXT: srl s2, t4, a5
+; RV32I-NEXT: sll ra, t3, s6
+; RV32I-NEXT: bltu a5, t5, .LBB18_34
+; RV32I-NEXT: # %bb.33:
+; RV32I-NEXT: srl s4, t3, a5
+; RV32I-NEXT: j .LBB18_35
+; RV32I-NEXT: .LBB18_34:
+; RV32I-NEXT: or s4, s2, ra
+; RV32I-NEXT: .LBB18_35:
+; RV32I-NEXT: or a6, a6, s1
+; RV32I-NEXT: slli a3, a3, 16
+; RV32I-NEXT: or a0, s3, a0
+; RV32I-NEXT: slli s1, s0, 16
+; RV32I-NEXT: mv s5, t4
+; RV32I-NEXT: beqz a5, .LBB18_37
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: mv s5, s4
+; RV32I-NEXT: .LBB18_37:
+; RV32I-NEXT: or s0, a3, a6
+; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: bltu a5, t5, .LBB18_39
+; RV32I-NEXT: # %bb.38:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: srl a3, a0, a5
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: bnez a5, .LBB18_40
+; RV32I-NEXT: j .LBB18_41
+; RV32I-NEXT: .LBB18_39:
+; RV32I-NEXT: srl s4, t3, a5
+; RV32I-NEXT: srl a3, s0, a5
+; RV32I-NEXT: sll a6, a0, s6
+; RV32I-NEXT: or a3, a3, a6
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: beqz a5, .LBB18_41
+; RV32I-NEXT: .LBB18_40:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB18_41:
+; RV32I-NEXT: bltu a5, t5, .LBB18_44
+; RV32I-NEXT: # %bb.42:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bgeu s7, t5, .LBB18_45
+; RV32I-NEXT: .LBB18_43:
+; RV32I-NEXT: sll s3, t4, s6
+; RV32I-NEXT: srl a3, t4, s10
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: mv s10, t3
+; RV32I-NEXT: bnez s7, .LBB18_46
+; RV32I-NEXT: j .LBB18_47
+; RV32I-NEXT: .LBB18_44:
+; RV32I-NEXT: srl s1, a0, a5
+; RV32I-NEXT: bltu s7, t5, .LBB18_43
+; RV32I-NEXT: .LBB18_45:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t4, s7
+; RV32I-NEXT: mv s10, t3
+; RV32I-NEXT: beqz s7, .LBB18_47
+; RV32I-NEXT: .LBB18_46:
+; RV32I-NEXT: mv s10, a3
+; RV32I-NEXT: .LBB18_47:
+; RV32I-NEXT: bltu s9, t5, .LBB18_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: srl a3, t3, s9
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: bnez s9, .LBB18_50
+; RV32I-NEXT: j .LBB18_51
+; RV32I-NEXT: .LBB18_49:
+; RV32I-NEXT: sll a3, t3, s11
+; RV32I-NEXT: or a3, s2, a3
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: beqz s9, .LBB18_51
+; RV32I-NEXT: .LBB18_50:
+; RV32I-NEXT: mv s2, a3
+; RV32I-NEXT: .LBB18_51:
+; RV32I-NEXT: bltu s9, t5, .LBB18_53
+; RV32I-NEXT: # %bb.52:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bltu a5, t6, .LBB18_54
+; RV32I-NEXT: j .LBB18_55
+; RV32I-NEXT: .LBB18_53:
+; RV32I-NEXT: srl s7, t3, a5
+; RV32I-NEXT: bgeu a5, t6, .LBB18_55
+; RV32I-NEXT: .LBB18_54:
+; RV32I-NEXT: or s2, a6, s3
+; RV32I-NEXT: or s7, s1, s10
+; RV32I-NEXT: .LBB18_55:
+; RV32I-NEXT: li a3, 128
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: beqz a5, .LBB18_57
+; RV32I-NEXT: # %bb.56:
+; RV32I-NEXT: mv a6, s2
+; RV32I-NEXT: mv s1, s7
+; RV32I-NEXT: .LBB18_57:
+; RV32I-NEXT: sw a6, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s2, a3, a5
+; RV32I-NEXT: bltu a5, t6, .LBB18_59
+; RV32I-NEXT: # %bb.58:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: .LBB18_59:
+; RV32I-NEXT: neg s3, s2
+; RV32I-NEXT: srl a6, t1, s3
+; RV32I-NEXT: sw s4, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t5, .LBB18_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: li s11, 0
+; RV32I-NEXT: sll a3, t1, s2
+; RV32I-NEXT: j .LBB18_62
+; RV32I-NEXT: .LBB18_61:
+; RV32I-NEXT: sll s11, t1, s6
+; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: .LBB18_62:
+; RV32I-NEXT: sub s1, t6, s2
+; RV32I-NEXT: mv s8, a7
+; RV32I-NEXT: beqz s2, .LBB18_64
+; RV32I-NEXT: # %bb.63:
+; RV32I-NEXT: mv s8, a3
+; RV32I-NEXT: .LBB18_64:
+; RV32I-NEXT: bltu s1, t5, .LBB18_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: srl a3, a7, s1
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: bnez s1, .LBB18_67
+; RV32I-NEXT: j .LBB18_68
+; RV32I-NEXT: .LBB18_66:
+; RV32I-NEXT: neg a3, s1
+; RV32I-NEXT: sll a3, a7, a3
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: beqz s1, .LBB18_68
+; RV32I-NEXT: .LBB18_67:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB18_68:
+; RV32I-NEXT: bltu s1, t5, .LBB18_71
+; RV32I-NEXT: # %bb.69:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s2, t5, .LBB18_72
+; RV32I-NEXT: .LBB18_70:
+; RV32I-NEXT: sll s6, t2, s6
+; RV32I-NEXT: srl a3, t2, s3
+; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB18_73
+; RV32I-NEXT: .LBB18_71:
+; RV32I-NEXT: srl s1, a7, s3
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t5, .LBB18_70
+; RV32I-NEXT: .LBB18_72:
+; RV32I-NEXT: li s6, 0
+; RV32I-NEXT: sll a3, t2, s2
+; RV32I-NEXT: .LBB18_73:
+; RV32I-NEXT: addi s9, s2, -64
+; RV32I-NEXT: mv s5, a4
+; RV32I-NEXT: beqz s2, .LBB18_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: mv s5, a3
+; RV32I-NEXT: .LBB18_75:
+; RV32I-NEXT: bltu s9, t5, .LBB18_77
+; RV32I-NEXT: # %bb.76:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t1, s9
+; RV32I-NEXT: mv s7, a7
+; RV32I-NEXT: bnez s9, .LBB18_78
+; RV32I-NEXT: j .LBB18_79
+; RV32I-NEXT: .LBB18_77:
+; RV32I-NEXT: sll s3, t1, s2
+; RV32I-NEXT: neg a3, s9
+; RV32I-NEXT: srl a3, t1, a3
+; RV32I-NEXT: sll s4, a7, s2
+; RV32I-NEXT: or a3, a3, s4
+; RV32I-NEXT: mv s7, a7
+; RV32I-NEXT: beqz s9, .LBB18_79
+; RV32I-NEXT: .LBB18_78:
+; RV32I-NEXT: mv s7, a3
+; RV32I-NEXT: .LBB18_79:
+; RV32I-NEXT: bltu s2, t6, .LBB18_81
+; RV32I-NEXT: # %bb.80:
+; RV32I-NEXT: li s11, 0
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: j .LBB18_82
+; RV32I-NEXT: .LBB18_81:
+; RV32I-NEXT: or s3, a6, s6
+; RV32I-NEXT: or s7, s1, s5
+; RV32I-NEXT: .LBB18_82:
+; RV32I-NEXT: addi ra, a5, -128
+; RV32I-NEXT: mv s4, t2
+; RV32I-NEXT: mv s6, a4
+; RV32I-NEXT: beqz s2, .LBB18_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: mv s4, s3
+; RV32I-NEXT: mv s6, s7
+; RV32I-NEXT: .LBB18_84:
+; RV32I-NEXT: neg s9, ra
+; RV32I-NEXT: sll s3, a4, s9
+; RV32I-NEXT: bltu ra, t5, .LBB18_86
+; RV32I-NEXT: # %bb.85:
+; RV32I-NEXT: sra a3, a4, ra
+; RV32I-NEXT: mv s1, t2
+; RV32I-NEXT: bnez ra, .LBB18_87
+; RV32I-NEXT: j .LBB18_88
+; RV32I-NEXT: .LBB18_86:
+; RV32I-NEXT: lw a3, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: mv s1, t2
+; RV32I-NEXT: beqz ra, .LBB18_88
+; RV32I-NEXT: .LBB18_87:
+; RV32I-NEXT: mv s1, a3
+; RV32I-NEXT: .LBB18_88:
+; RV32I-NEXT: bltu ra, t5, .LBB18_90
+; RV32I-NEXT: # %bb.89:
+; RV32I-NEXT: srai s2, a4, 31
+; RV32I-NEXT: srl a3, a7, ra
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: bnez ra, .LBB18_91
+; RV32I-NEXT: j .LBB18_92
+; RV32I-NEXT: .LBB18_90:
+; RV32I-NEXT: sra s2, a4, a5
+; RV32I-NEXT: sll a3, a7, s9
+; RV32I-NEXT: lw a6, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: beqz ra, .LBB18_92
+; RV32I-NEXT: .LBB18_91:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB18_92:
+; RV32I-NEXT: mv s5, t0
+; RV32I-NEXT: sub s10, t6, ra
+; RV32I-NEXT: li t0, 64
+; RV32I-NEXT: bltu ra, t5, .LBB18_94
+; RV32I-NEXT: # %bb.93:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: j .LBB18_95
+; RV32I-NEXT: .LBB18_94:
+; RV32I-NEXT: srl s7, a7, a5
+; RV32I-NEXT: .LBB18_95:
+; RV32I-NEXT: mv t6, s8
+; RV32I-NEXT: mv s8, s11
+; RV32I-NEXT: bltu s10, t5, .LBB18_97
+; RV32I-NEXT: # %bb.96:
+; RV32I-NEXT: li s9, 0
+; RV32I-NEXT: sll a3, t2, s10
+; RV32I-NEXT: j .LBB18_98
+; RV32I-NEXT: .LBB18_97:
+; RV32I-NEXT: sll s9, t2, s9
+; RV32I-NEXT: neg a3, s10
+; RV32I-NEXT: srl a3, t2, a3
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: .LBB18_98:
+; RV32I-NEXT: addi s11, ra, -64
+; RV32I-NEXT: mv s3, a4
+; RV32I-NEXT: beqz s10, .LBB18_100
+; RV32I-NEXT: # %bb.99:
+; RV32I-NEXT: mv s3, a3
+; RV32I-NEXT: .LBB18_100:
+; RV32I-NEXT: bltu s11, t5, .LBB18_102
+; RV32I-NEXT: # %bb.101:
+; RV32I-NEXT: sra a3, a4, s11
+; RV32I-NEXT: bnez s11, .LBB18_103
+; RV32I-NEXT: j .LBB18_104
+; RV32I-NEXT: .LBB18_102:
+; RV32I-NEXT: srl a3, t2, ra
+; RV32I-NEXT: mv s10, s4
+; RV32I-NEXT: neg s4, s11
+; RV32I-NEXT: sll s4, a4, s4
+; RV32I-NEXT: or a3, a3, s4
+; RV32I-NEXT: mv s4, s10
+; RV32I-NEXT: beqz s11, .LBB18_104
+; RV32I-NEXT: .LBB18_103:
+; RV32I-NEXT: mv t2, a3
+; RV32I-NEXT: .LBB18_104:
+; RV32I-NEXT: bltu s11, t5, .LBB18_106
+; RV32I-NEXT: # %bb.105:
+; RV32I-NEXT: srai t5, a4, 31
+; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bltu ra, t0, .LBB18_107
+; RV32I-NEXT: j .LBB18_108
+; RV32I-NEXT: .LBB18_106:
+; RV32I-NEXT: sra t5, a4, ra
+; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bgeu ra, t0, .LBB18_108
+; RV32I-NEXT: .LBB18_107:
+; RV32I-NEXT: or t2, a6, s9
+; RV32I-NEXT: or t5, s7, s3
+; RV32I-NEXT: .LBB18_108:
+; RV32I-NEXT: li a6, 128
+; RV32I-NEXT: bnez ra, .LBB18_117
+; RV32I-NEXT: # %bb.109:
+; RV32I-NEXT: bgeu ra, t0, .LBB18_118
+; RV32I-NEXT: .LBB18_110:
+; RV32I-NEXT: bgeu a5, a6, .LBB18_112
+; RV32I-NEXT: .LBB18_111:
+; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t1, a3, s8
+; RV32I-NEXT: lw a3, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a7, a3, t6
+; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s1, a3, s4
+; RV32I-NEXT: lw a3, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s2, a3, s6
+; RV32I-NEXT: .LBB18_112:
+; RV32I-NEXT: lw ra, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: mv t0, s5
+; RV32I-NEXT: beqz a5, .LBB18_114
+; RV32I-NEXT: # %bb.113:
+; RV32I-NEXT: mv s0, t1
+; RV32I-NEXT: mv a0, a7
+; RV32I-NEXT: mv t4, s1
+; RV32I-NEXT: mv t3, s2
+; RV32I-NEXT: .LBB18_114:
+; RV32I-NEXT: bltu a5, a6, .LBB18_116
+; RV32I-NEXT: # %bb.115:
+; RV32I-NEXT: srai a1, a4, 31
+; RV32I-NEXT: mv t0, a1
+; RV32I-NEXT: mv s11, a1
+; RV32I-NEXT: mv ra, a1
+; RV32I-NEXT: .LBB18_116:
+; RV32I-NEXT: srli a4, s0, 16
+; RV32I-NEXT: lui t1, 16
+; RV32I-NEXT: srli a7, s0, 24
+; RV32I-NEXT: srli a5, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: srli a6, t4, 16
+; RV32I-NEXT: srli s2, t4, 24
+; RV32I-NEXT: srli t2, t3, 16
+; RV32I-NEXT: srli s3, t3, 24
+; RV32I-NEXT: srli s1, a1, 16
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: srli t6, t0, 16
+; RV32I-NEXT: srli s6, t0, 24
+; RV32I-NEXT: srli s5, s11, 16
+; RV32I-NEXT: srli s4, s11, 24
+; RV32I-NEXT: srli s7, ra, 16
+; RV32I-NEXT: srli s8, ra, 24
+; RV32I-NEXT: addi t1, t1, -1
+; RV32I-NEXT: and s9, s0, t1
+; RV32I-NEXT: and s10, a0, t1
+; RV32I-NEXT: srli s9, s9, 8
+; RV32I-NEXT: sb s0, 0(a2)
+; RV32I-NEXT: sb s9, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, t4, t1
+; RV32I-NEXT: srli a7, s10, 8
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb a7, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t5, 7(a2)
+; RV32I-NEXT: and a0, t3, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, a1, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t3, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, t0, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a1, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a1, s11, t1
+; RV32I-NEXT: and a3, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb t0, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb s11, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s4, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a3, 29(a2)
+; RV32I-NEXT: sb s7, 30(a2)
+; RV32I-NEXT: sb s8, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB18_117:
+; RV32I-NEXT: mv t1, t2
+; RV32I-NEXT: mv a7, t5
+; RV32I-NEXT: bltu ra, t0, .LBB18_110
+; RV32I-NEXT: .LBB18_118:
+; RV32I-NEXT: srai s1, a4, 31
+; RV32I-NEXT: mv s2, s1
+; RV32I-NEXT: bltu a5, a6, .LBB18_111
+; RV32I-NEXT: j .LBB18_112
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
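
The `ashr_32bytes` flavors follow the same structure, except that the vacated high limbs are filled from the sign bit, which is why the generated code above keeps materializing `srai a5, 63` (RV64) and `srai a4, 31` (RV32). A minimal C sketch under the same assumptions (bit offset below 256; `ashr_256` and the limb layout are hypothetical names for illustration):

#include <stdint.h>
#include <string.h>

/* Minimal reference model of the i256 arithmetic right shift: four
 * 64-bit limbs, limb 0 least significant, bitOff < 256. */
static void ashr_256(const uint64_t src[4], uint64_t bitOff, uint64_t dst[4]) {
    /* Replicate the sign bit of the top limb: 0 or all-ones. */
    uint64_t sign = (src[3] >> 63) ? ~0ULL : 0ULL;
    int limb = (int)(bitOff >> 6);
    int bits = (int)(bitOff & 63);
    uint64_t out[4];
    for (int i = 0; i < 4; i++) {
        /* Limbs read from above the top of the value are sign fill. */
        uint64_t cur = (i + limb < 4) ? src[i + limb] : sign;
        uint64_t nxt = (i + limb + 1 < 4) ? src[i + limb + 1] : sign;
        /* bits == 0 is split out to avoid the undefined shift by 64. */
        out[i] = (bits == 0) ? cur : (cur >> bits) | (nxt << (64 - bits));
    }
    memcpy(dst, out, sizeof out);
}

The byteOff flavor feeds this with `byteOff << 3`, matching the IR's `shl i256 %byteOff, 3`.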
+
+define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -96
+; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: or a4, t4, t3
+; RV64I-NEXT: or a6, t6, t5
+; RV64I-NEXT: or t0, s1, s0
+; RV64I-NEXT: lbu t5, 24(a0)
+; RV64I-NEXT: lbu t6, 25(a0)
+; RV64I-NEXT: lbu s0, 26(a0)
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or t4, s3, s2
+; RV64I-NEXT: or t2, s5, s4
+; RV64I-NEXT: or t3, s7, s6
+; RV64I-NEXT: lbu s2, 28(a0)
+; RV64I-NEXT: lbu s3, 29(a0)
+; RV64I-NEXT: lbu s4, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s5, s9, s8
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu t6, 0(a1)
+; RV64I-NEXT: lbu s1, 1(a1)
+; RV64I-NEXT: lbu s7, 2(a1)
+; RV64I-NEXT: lbu s8, 3(a1)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, a0, s4
+; RV64I-NEXT: or t6, s1, t6
+; RV64I-NEXT: lbu a0, 4(a1)
+; RV64I-NEXT: lbu s1, 5(a1)
+; RV64I-NEXT: lbu s4, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s1, s1, a0
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or s4, a1, s4
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or a1, t1, a7
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or a0, t4, t0
+; RV64I-NEXT: slli t3, t3, 16
+; RV64I-NEXT: or a7, t3, t2
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: or t1, s6, s5
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t4, s0, t5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or t5, s3, s2
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or t6, s7, t6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s0, s4, s1
+; RV64I-NEXT: li t0, 64
+; RV64I-NEXT: slli t3, a5, 16
+; RV64I-NEXT: slli t2, a6, 16
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: or a7, t1, a7
+; RV64I-NEXT: or a5, t5, t4
+; RV64I-NEXT: or a6, s0, t6
+; RV64I-NEXT: slli a6, a6, 5
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sll t4, a5, t5
+; RV64I-NEXT: bltu a6, t0, .LBB19_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sra t6, a5, t1
+; RV64I-NEXT: j .LBB19_3
+; RV64I-NEXT: .LBB19_2:
+; RV64I-NEXT: srl t6, a7, a6
+; RV64I-NEXT: or t6, t6, t4
+; RV64I-NEXT: .LBB19_3:
+; RV64I-NEXT: or a3, t3, a3
+; RV64I-NEXT: slli t3, a1, 32
+; RV64I-NEXT: or t2, t2, a4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mv a1, a7
+; RV64I-NEXT: beqz a6, .LBB19_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a1, t6
+; RV64I-NEXT: .LBB19_5:
+; RV64I-NEXT: or a4, t3, a3
+; RV64I-NEXT: or a3, a0, t2
+; RV64I-NEXT: bltu a6, t0, .LBB19_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: srai a0, a5, 63
+; RV64I-NEXT: srl t3, a3, t1
+; RV64I-NEXT: j .LBB19_8
+; RV64I-NEXT: .LBB19_7:
+; RV64I-NEXT: sra a0, a5, a6
+; RV64I-NEXT: srl t1, a4, a6
+; RV64I-NEXT: sll t2, a3, t5
+; RV64I-NEXT: or t3, t1, t2
+; RV64I-NEXT: .LBB19_8:
+; RV64I-NEXT: li t1, 128
+; RV64I-NEXT: mv t2, a4
+; RV64I-NEXT: beqz a6, .LBB19_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t2, t3
+; RV64I-NEXT: .LBB19_10:
+; RV64I-NEXT: sub t6, t1, a6
+; RV64I-NEXT: bltu a6, t0, .LBB19_13
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t3, 0
+; RV64I-NEXT: bgeu t6, t0, .LBB19_14
+; RV64I-NEXT: .LBB19_12:
+; RV64I-NEXT: sll t5, a7, t5
+; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: srl s0, a7, s0
+; RV64I-NEXT: or s1, s0, t4
+; RV64I-NEXT: j .LBB19_15
+; RV64I-NEXT: .LBB19_13:
+; RV64I-NEXT: srl t3, a3, a6
+; RV64I-NEXT: bltu t6, t0, .LBB19_12
+; RV64I-NEXT: .LBB19_14:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: sub t4, t6, t0
+; RV64I-NEXT: sll s1, a7, t4
+; RV64I-NEXT: .LBB19_15:
+; RV64I-NEXT: sub s0, a6, t1
+; RV64I-NEXT: mv t4, a5
+; RV64I-NEXT: beqz t6, .LBB19_17
+; RV64I-NEXT: # %bb.16:
+; RV64I-NEXT: mv t4, s1
+; RV64I-NEXT: .LBB19_17:
+; RV64I-NEXT: bltu s0, t0, .LBB19_19
+; RV64I-NEXT: # %bb.18:
+; RV64I-NEXT: sub t6, s0, t0
+; RV64I-NEXT: sra t6, a5, t6
+; RV64I-NEXT: bnez s0, .LBB19_20
+; RV64I-NEXT: j .LBB19_21
+; RV64I-NEXT: .LBB19_19:
+; RV64I-NEXT: srl t6, a7, s0
+; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: sll s1, a5, s1
+; RV64I-NEXT: or t6, t6, s1
+; RV64I-NEXT: beqz s0, .LBB19_21
+; RV64I-NEXT: .LBB19_20:
+; RV64I-NEXT: mv a7, t6
+; RV64I-NEXT: .LBB19_21:
+; RV64I-NEXT: bltu s0, t0, .LBB19_23
+; RV64I-NEXT: # %bb.22:
+; RV64I-NEXT: srai t0, a5, 63
+; RV64I-NEXT: bltu a6, t1, .LBB19_24
+; RV64I-NEXT: j .LBB19_25
+; RV64I-NEXT: .LBB19_23:
+; RV64I-NEXT: sra t0, a5, s0
+; RV64I-NEXT: bgeu a6, t1, .LBB19_25
+; RV64I-NEXT: .LBB19_24:
+; RV64I-NEXT: or a7, t2, t5
+; RV64I-NEXT: or t0, t3, t4
+; RV64I-NEXT: .LBB19_25:
+; RV64I-NEXT: bnez a6, .LBB19_29
+; RV64I-NEXT: # %bb.26:
+; RV64I-NEXT: bltu a6, t1, .LBB19_28
+; RV64I-NEXT: .LBB19_27:
+; RV64I-NEXT: srai a1, a5, 63
+; RV64I-NEXT: mv a0, a1
+; RV64I-NEXT: .LBB19_28:
+; RV64I-NEXT: srli a5, a4, 32
+; RV64I-NEXT: srliw a6, a4, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a4, 24
+; RV64I-NEXT: srli t0, a4, 48
+; RV64I-NEXT: srli t5, a4, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a1, 32
+; RV64I-NEXT: srliw s2, a1, 16
+; RV64I-NEXT: srliw s6, a1, 24
+; RV64I-NEXT: srli s4, a1, 48
+; RV64I-NEXT: srli s7, a1, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a4, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a4, a5, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a4, a3, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a4, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a3, a7, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a3, a1, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 96
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB19_29:
+; RV64I-NEXT: mv a4, a7
+; RV64I-NEXT: mv a3, t0
+; RV64I-NEXT: bgeu a6, t1, .LBB19_27
+; RV64I-NEXT: j .LBB19_28
+;
+; RV32I-LABEL: ashr_32bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 16(a0)
+; RV32I-NEXT: lbu a4, 17(a0)
+; RV32I-NEXT: lbu a5, 18(a0)
+; RV32I-NEXT: lbu a6, 19(a0)
+; RV32I-NEXT: lbu a7, 20(a0)
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 22(a0)
+; RV32I-NEXT: lbu t2, 23(a0)
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t4, 25(a0)
+; RV32I-NEXT: lbu t5, 26(a0)
+; RV32I-NEXT: lbu t6, 27(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, t2, t1
+; RV32I-NEXT: lbu a7, 28(a0)
+; RV32I-NEXT: lbu t0, 29(a0)
+; RV32I-NEXT: lbu t1, 30(a0)
+; RV32I-NEXT: lbu t2, 31(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t3, t4, t3
+; RV32I-NEXT: or t4, t6, t5
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t5, 1(a1)
+; RV32I-NEXT: lbu t6, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: or s0, t5, a7
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or t2, a1, t6
+; RV32I-NEXT: li t5, 32
+; RV32I-NEXT: slli a7, a4, 16
+; RV32I-NEXT: slli a1, a5, 16
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli a5, t2, 16
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or a4, t1, t0
+; RV32I-NEXT: or a5, a5, s0
+; RV32I-NEXT: slli a5, a5, 5
+; RV32I-NEXT: srl s0, t2, a5
+; RV32I-NEXT: neg s6, a5
+; RV32I-NEXT: sll s1, a4, s6
+; RV32I-NEXT: bltu a5, t5, .LBB19_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra t0, a4, a5
+; RV32I-NEXT: j .LBB19_3
+; RV32I-NEXT: .LBB19_2:
+; RV32I-NEXT: or t0, s0, s1
+; RV32I-NEXT: .LBB19_3:
+; RV32I-NEXT: or t1, a7, a3
+; RV32I-NEXT: or a7, a1, a6
+; RV32I-NEXT: mv t3, t2
+; RV32I-NEXT: beqz a5, .LBB19_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv t3, t0
+; RV32I-NEXT: .LBB19_5:
+; RV32I-NEXT: srl a3, t1, a5
+; RV32I-NEXT: sll a1, a7, s6
+; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a5, t5, .LBB19_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srai t4, a4, 31
+; RV32I-NEXT: srl a1, a7, a5
+; RV32I-NEXT: j .LBB19_8
+; RV32I-NEXT: .LBB19_7:
+; RV32I-NEXT: sra t4, a4, a5
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: .LBB19_8:
+; RV32I-NEXT: li t6, 64
+; RV32I-NEXT: mv t0, t1
+; RV32I-NEXT: beqz a5, .LBB19_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t0, a1
+; RV32I-NEXT: .LBB19_10:
+; RV32I-NEXT: sub s7, t6, a5
+; RV32I-NEXT: bltu a5, t5, .LBB19_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB19_13
+; RV32I-NEXT: .LBB19_12:
+; RV32I-NEXT: srl a1, a7, a5
+; RV32I-NEXT: .LBB19_13:
+; RV32I-NEXT: sw a3, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: neg s10, s7
+; RV32I-NEXT: bltu s7, t5, .LBB19_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: sll a3, t2, s7
+; RV32I-NEXT: j .LBB19_16
+; RV32I-NEXT: .LBB19_15:
+; RV32I-NEXT: sll a6, t2, s6
+; RV32I-NEXT: srl a3, t2, s10
+; RV32I-NEXT: or a3, a3, s1
+; RV32I-NEXT: .LBB19_16:
+; RV32I-NEXT: sw t3, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi s9, a5, -64
+; RV32I-NEXT: mv t3, a4
+; RV32I-NEXT: beqz s7, .LBB19_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t3, a3
+; RV32I-NEXT: .LBB19_18:
+; RV32I-NEXT: neg s11, s9
+; RV32I-NEXT: sw s0, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t5, .LBB19_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: sra s0, a4, s9
+; RV32I-NEXT: j .LBB19_21
+; RV32I-NEXT: .LBB19_20:
+; RV32I-NEXT: sll a3, a4, s11
+; RV32I-NEXT: or s0, s0, a3
+; RV32I-NEXT: .LBB19_21:
+; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw t4, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu s3, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: mv t4, t2
+; RV32I-NEXT: beqz s9, .LBB19_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: mv t4, s0
+; RV32I-NEXT: .LBB19_23:
+; RV32I-NEXT: lbu s2, 9(a0)
+; RV32I-NEXT: lbu s1, 10(a0)
+; RV32I-NEXT: lbu s8, 13(a0)
+; RV32I-NEXT: lbu ra, 14(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: bltu s9, t5, .LBB19_25
+; RV32I-NEXT: # %bb.24:
+; RV32I-NEXT: srai s0, a4, 31
+; RV32I-NEXT: j .LBB19_26
+; RV32I-NEXT: .LBB19_25:
+; RV32I-NEXT: sra s0, a4, a5
+; RV32I-NEXT: .LBB19_26:
+; RV32I-NEXT: or s1, s3, s1
+; RV32I-NEXT: lbu s5, 8(a0)
+; RV32I-NEXT: lbu s3, 12(a0)
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s4, s8, 8
+; RV32I-NEXT: or s8, a3, ra
+; RV32I-NEXT: bgeu a5, t6, .LBB19_28
+; RV32I-NEXT: # %bb.27:
+; RV32I-NEXT: or t4, t0, a6
+; RV32I-NEXT: or s0, a1, t3
+; RV32I-NEXT: .LBB19_28:
+; RV32I-NEXT: lbu a3, 3(a0)
+; RV32I-NEXT: lbu t3, 7(a0)
+; RV32I-NEXT: or a6, s2, s5
+; RV32I-NEXT: slli s2, s1, 16
+; RV32I-NEXT: or s1, s4, s3
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: mv a1, t1
+; RV32I-NEXT: mv t0, a7
+; RV32I-NEXT: beqz a5, .LBB19_30
+; RV32I-NEXT: # %bb.29:
+; RV32I-NEXT: mv a1, t4
+; RV32I-NEXT: mv t0, s0
+; RV32I-NEXT: .LBB19_30:
+; RV32I-NEXT: slli s5, a3, 8
+; RV32I-NEXT: lbu ra, 1(a0)
+; RV32I-NEXT: lbu a3, 2(a0)
+; RV32I-NEXT: lbu s3, 5(a0)
+; RV32I-NEXT: lbu s0, 6(a0)
+; RV32I-NEXT: slli s4, t3, 8
+; RV32I-NEXT: or t4, s2, a6
+; RV32I-NEXT: or t3, s8, s1
+; RV32I-NEXT: bltu a5, t6, .LBB19_32
+; RV32I-NEXT: # %bb.31:
+; RV32I-NEXT: srai a6, a4, 31
+; RV32I-NEXT: sw a6, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw a6, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB19_32:
+; RV32I-NEXT: slli a6, ra, 8
+; RV32I-NEXT: or a3, s5, a3
+; RV32I-NEXT: lbu s1, 0(a0)
+; RV32I-NEXT: lbu a0, 4(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s0, s4, s0
+; RV32I-NEXT: srl s2, t4, a5
+; RV32I-NEXT: sll ra, t3, s6
+; RV32I-NEXT: bltu a5, t5, .LBB19_34
+; RV32I-NEXT: # %bb.33:
+; RV32I-NEXT: srl s4, t3, a5
+; RV32I-NEXT: j .LBB19_35
+; RV32I-NEXT: .LBB19_34:
+; RV32I-NEXT: or s4, s2, ra
+; RV32I-NEXT: .LBB19_35:
+; RV32I-NEXT: or a6, a6, s1
+; RV32I-NEXT: slli a3, a3, 16
+; RV32I-NEXT: or a0, s3, a0
+; RV32I-NEXT: slli s1, s0, 16
+; RV32I-NEXT: mv s5, t4
+; RV32I-NEXT: beqz a5, .LBB19_37
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: mv s5, s4
+; RV32I-NEXT: .LBB19_37:
+; RV32I-NEXT: or s0, a3, a6
+; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: bltu a5, t5, .LBB19_39
+; RV32I-NEXT: # %bb.38:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: srl a3, a0, a5
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: bnez a5, .LBB19_40
+; RV32I-NEXT: j .LBB19_41
+; RV32I-NEXT: .LBB19_39:
+; RV32I-NEXT: srl s4, t3, a5
+; RV32I-NEXT: srl a3, s0, a5
+; RV32I-NEXT: sll a6, a0, s6
+; RV32I-NEXT: or a3, a3, a6
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: beqz a5, .LBB19_41
+; RV32I-NEXT: .LBB19_40:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB19_41:
+; RV32I-NEXT: bltu a5, t5, .LBB19_44
+; RV32I-NEXT: # %bb.42:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bgeu s7, t5, .LBB19_45
+; RV32I-NEXT: .LBB19_43:
+; RV32I-NEXT: sll s3, t4, s6
+; RV32I-NEXT: srl a3, t4, s10
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: mv s10, t3
+; RV32I-NEXT: bnez s7, .LBB19_46
+; RV32I-NEXT: j .LBB19_47
+; RV32I-NEXT: .LBB19_44:
+; RV32I-NEXT: srl s1, a0, a5
+; RV32I-NEXT: bltu s7, t5, .LBB19_43
+; RV32I-NEXT: .LBB19_45:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t4, s7
+; RV32I-NEXT: mv s10, t3
+; RV32I-NEXT: beqz s7, .LBB19_47
+; RV32I-NEXT: .LBB19_46:
+; RV32I-NEXT: mv s10, a3
+; RV32I-NEXT: .LBB19_47:
+; RV32I-NEXT: bltu s9, t5, .LBB19_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: srl a3, t3, s9
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: bnez s9, .LBB19_50
+; RV32I-NEXT: j .LBB19_51
+; RV32I-NEXT: .LBB19_49:
+; RV32I-NEXT: sll a3, t3, s11
+; RV32I-NEXT: or a3, s2, a3
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: beqz s9, .LBB19_51
+; RV32I-NEXT: .LBB19_50:
+; RV32I-NEXT: mv s2, a3
+; RV32I-NEXT: .LBB19_51:
+; RV32I-NEXT: bltu s9, t5, .LBB19_53
+; RV32I-NEXT: # %bb.52:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bltu a5, t6, .LBB19_54
+; RV32I-NEXT: j .LBB19_55
+; RV32I-NEXT: .LBB19_53:
+; RV32I-NEXT: srl s7, t3, a5
+; RV32I-NEXT: bgeu a5, t6, .LBB19_55
+; RV32I-NEXT: .LBB19_54:
+; RV32I-NEXT: or s2, a6, s3
+; RV32I-NEXT: or s7, s1, s10
+; RV32I-NEXT: .LBB19_55:
+; RV32I-NEXT: li a3, 128
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: beqz a5, .LBB19_57
+; RV32I-NEXT: # %bb.56:
+; RV32I-NEXT: mv a6, s2
+; RV32I-NEXT: mv s1, s7
+; RV32I-NEXT: .LBB19_57:
+; RV32I-NEXT: sw a6, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s2, a3, a5
+; RV32I-NEXT: bltu a5, t6, .LBB19_59
+; RV32I-NEXT: # %bb.58:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: .LBB19_59:
+; RV32I-NEXT: neg s3, s2
+; RV32I-NEXT: srl a6, t1, s3
+; RV32I-NEXT: sw s4, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t5, .LBB19_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: li s11, 0
+; RV32I-NEXT: sll a3, t1, s2
+; RV32I-NEXT: j .LBB19_62
+; RV32I-NEXT: .LBB19_61:
+; RV32I-NEXT: sll s11, t1, s6
+; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: .LBB19_62:
+; RV32I-NEXT: sub s1, t6, s2
+; RV32I-NEXT: mv s8, a7
+; RV32I-NEXT: beqz s2, .LBB19_64
+; RV32I-NEXT: # %bb.63:
+; RV32I-NEXT: mv s8, a3
+; RV32I-NEXT: .LBB19_64:
+; RV32I-NEXT: bltu s1, t5, .LBB19_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: srl a3, a7, s1
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: bnez s1, .LBB19_67
+; RV32I-NEXT: j .LBB19_68
+; RV32I-NEXT: .LBB19_66:
+; RV32I-NEXT: neg a3, s1
+; RV32I-NEXT: sll a3, a7, a3
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: beqz s1, .LBB19_68
+; RV32I-NEXT: .LBB19_67:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB19_68:
+; RV32I-NEXT: bltu s1, t5, .LBB19_71
+; RV32I-NEXT: # %bb.69:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s2, t5, .LBB19_72
+; RV32I-NEXT: .LBB19_70:
+; RV32I-NEXT: sll s6, t2, s6
+; RV32I-NEXT: srl a3, t2, s3
+; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB19_73
+; RV32I-NEXT: .LBB19_71:
+; RV32I-NEXT: srl s1, a7, s3
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t5, .LBB19_70
+; RV32I-NEXT: .LBB19_72:
+; RV32I-NEXT: li s6, 0
+; RV32I-NEXT: sll a3, t2, s2
+; RV32I-NEXT: .LBB19_73:
+; RV32I-NEXT: addi s9, s2, -64
+; RV32I-NEXT: mv s5, a4
+; RV32I-NEXT: beqz s2, .LBB19_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: mv s5, a3
+; RV32I-NEXT: .LBB19_75:
+; RV32I-NEXT: bltu s9, t5, .LBB19_77
+; RV32I-NEXT: # %bb.76:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t1, s9
+; RV32I-NEXT: mv s7, a7
+; RV32I-NEXT: bnez s9, .LBB19_78
+; RV32I-NEXT: j .LBB19_79
+; RV32I-NEXT: .LBB19_77:
+; RV32I-NEXT: sll s3, t1, s2
+; RV32I-NEXT: neg a3, s9
+; RV32I-NEXT: srl a3, t1, a3
+; RV32I-NEXT: sll s4, a7, s2
+; RV32I-NEXT: or a3, a3, s4
+; RV32I-NEXT: mv s7, a7
+; RV32I-NEXT: beqz s9, .LBB19_79
+; RV32I-NEXT: .LBB19_78:
+; RV32I-NEXT: mv s7, a3
+; RV32I-NEXT: .LBB19_79:
+; RV32I-NEXT: bltu s2, t6, .LBB19_81
+; RV32I-NEXT: # %bb.80:
+; RV32I-NEXT: li s11, 0
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: j .LBB19_82
+; RV32I-NEXT: .LBB19_81:
+; RV32I-NEXT: or s3, a6, s6
+; RV32I-NEXT: or s7, s1, s5
+; RV32I-NEXT: .LBB19_82:
+; RV32I-NEXT: addi ra, a5, -128
+; RV32I-NEXT: mv s4, t2
+; RV32I-NEXT: mv s6, a4
+; RV32I-NEXT: beqz s2, .LBB19_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: mv s4, s3
+; RV32I-NEXT: mv s6, s7
+; RV32I-NEXT: .LBB19_84:
+; RV32I-NEXT: neg s9, ra
+; RV32I-NEXT: sll s3, a4, s9
+; RV32I-NEXT: bltu ra, t5, .LBB19_86
+; RV32I-NEXT: # %bb.85:
+; RV32I-NEXT: sra a3, a4, ra
+; RV32I-NEXT: mv s1, t2
+; RV32I-NEXT: bnez ra, .LBB19_87
+; RV32I-NEXT: j .LBB19_88
+; RV32I-NEXT: .LBB19_86:
+; RV32I-NEXT: lw a3, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: mv s1, t2
+; RV32I-NEXT: beqz ra, .LBB19_88
+; RV32I-NEXT: .LBB19_87:
+; RV32I-NEXT: mv s1, a3
+; RV32I-NEXT: .LBB19_88:
+; RV32I-NEXT: bltu ra, t5, .LBB19_90
+; RV32I-NEXT: # %bb.89:
+; RV32I-NEXT: srai s2, a4, 31
+; RV32I-NEXT: srl a3, a7, ra
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: bnez ra, .LBB19_91
+; RV32I-NEXT: j .LBB19_92
+; RV32I-NEXT: .LBB19_90:
+; RV32I-NEXT: sra s2, a4, a5
+; RV32I-NEXT: sll a3, a7, s9
+; RV32I-NEXT: lw a6, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: beqz ra, .LBB19_92
+; RV32I-NEXT: .LBB19_91:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB19_92:
+; RV32I-NEXT: mv s5, t0
+; RV32I-NEXT: sub s10, t6, ra
+; RV32I-NEXT: li t0, 64
+; RV32I-NEXT: bltu ra, t5, .LBB19_94
+; RV32I-NEXT: # %bb.93:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: j .LBB19_95
+; RV32I-NEXT: .LBB19_94:
+; RV32I-NEXT: srl s7, a7, a5
+; RV32I-NEXT: .LBB19_95:
+; RV32I-NEXT: mv t6, s8
+; RV32I-NEXT: mv s8, s11
+; RV32I-NEXT: bltu s10, t5, .LBB19_97
+; RV32I-NEXT: # %bb.96:
+; RV32I-NEXT: li s9, 0
+; RV32I-NEXT: sll a3, t2, s10
+; RV32I-NEXT: j .LBB19_98
+; RV32I-NEXT: .LBB19_97:
+; RV32I-NEXT: sll s9, t2, s9
+; RV32I-NEXT: neg a3, s10
+; RV32I-NEXT: srl a3, t2, a3
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: .LBB19_98:
+; RV32I-NEXT: addi s11, ra, -64
+; RV32I-NEXT: mv s3, a4
+; RV32I-NEXT: beqz s10, .LBB19_100
+; RV32I-NEXT: # %bb.99:
+; RV32I-NEXT: mv s3, a3
+; RV32I-NEXT: .LBB19_100:
+; RV32I-NEXT: bltu s11, t5, .LBB19_102
+; RV32I-NEXT: # %bb.101:
+; RV32I-NEXT: sra a3, a4, s11
+; RV32I-NEXT: bnez s11, .LBB19_103
+; RV32I-NEXT: j .LBB19_104
+; RV32I-NEXT: .LBB19_102:
+; RV32I-NEXT: srl a3, t2, ra
+; RV32I-NEXT: mv s10, s4
+; RV32I-NEXT: neg s4, s11
+; RV32I-NEXT: sll s4, a4, s4
+; RV32I-NEXT: or a3, a3, s4
+; RV32I-NEXT: mv s4, s10
+; RV32I-NEXT: beqz s11, .LBB19_104
+; RV32I-NEXT: .LBB19_103:
+; RV32I-NEXT: mv t2, a3
+; RV32I-NEXT: .LBB19_104:
+; RV32I-NEXT: bltu s11, t5, .LBB19_106
+; RV32I-NEXT: # %bb.105:
+; RV32I-NEXT: srai t5, a4, 31
+; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bltu ra, t0, .LBB19_107
+; RV32I-NEXT: j .LBB19_108
+; RV32I-NEXT: .LBB19_106:
+; RV32I-NEXT: sra t5, a4, ra
+; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bgeu ra, t0, .LBB19_108
+; RV32I-NEXT: .LBB19_107:
+; RV32I-NEXT: or t2, a6, s9
+; RV32I-NEXT: or t5, s7, s3
+; RV32I-NEXT: .LBB19_108:
+; RV32I-NEXT: li a6, 128
+; RV32I-NEXT: bnez ra, .LBB19_117
+; RV32I-NEXT: # %bb.109:
+; RV32I-NEXT: bgeu ra, t0, .LBB19_118
+; RV32I-NEXT: .LBB19_110:
+; RV32I-NEXT: bgeu a5, a6, .LBB19_112
+; RV32I-NEXT: .LBB19_111:
+; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t1, a3, s8
+; RV32I-NEXT: lw a3, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a7, a3, t6
+; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s1, a3, s4
+; RV32I-NEXT: lw a3, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s2, a3, s6
+; RV32I-NEXT: .LBB19_112:
+; RV32I-NEXT: lw ra, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: mv t0, s5
+; RV32I-NEXT: beqz a5, .LBB19_114
+; RV32I-NEXT: # %bb.113:
+; RV32I-NEXT: mv s0, t1
+; RV32I-NEXT: mv a0, a7
+; RV32I-NEXT: mv t4, s1
+; RV32I-NEXT: mv t3, s2
+; RV32I-NEXT: .LBB19_114:
+; RV32I-NEXT: bltu a5, a6, .LBB19_116
+; RV32I-NEXT: # %bb.115:
+; RV32I-NEXT: srai a1, a4, 31
+; RV32I-NEXT: mv t0, a1
+; RV32I-NEXT: mv s11, a1
+; RV32I-NEXT: mv ra, a1
+; RV32I-NEXT: .LBB19_116:
+; RV32I-NEXT: srli a4, s0, 16
+; RV32I-NEXT: lui t1, 16
+; RV32I-NEXT: srli a7, s0, 24
+; RV32I-NEXT: srli a5, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: srli a6, t4, 16
+; RV32I-NEXT: srli s2, t4, 24
+; RV32I-NEXT: srli t2, t3, 16
+; RV32I-NEXT: srli s3, t3, 24
+; RV32I-NEXT: srli s1, a1, 16
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: srli t6, t0, 16
+; RV32I-NEXT: srli s6, t0, 24
+; RV32I-NEXT: srli s5, s11, 16
+; RV32I-NEXT: srli s4, s11, 24
+; RV32I-NEXT: srli s7, ra, 16
+; RV32I-NEXT: srli s8, ra, 24
+; RV32I-NEXT: addi t1, t1, -1
+; RV32I-NEXT: and s9, s0, t1
+; RV32I-NEXT: and s10, a0, t1
+; RV32I-NEXT: srli s9, s9, 8
+; RV32I-NEXT: sb s0, 0(a2)
+; RV32I-NEXT: sb s9, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, t4, t1
+; RV32I-NEXT: srli a7, s10, 8
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb a7, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t5, 7(a2)
+; RV32I-NEXT: and a0, t3, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, a1, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t3, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, t0, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a1, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a1, s11, t1
+; RV32I-NEXT: and a3, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb t0, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb s11, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s4, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a3, 29(a2)
+; RV32I-NEXT: sb s7, 30(a2)
+; RV32I-NEXT: sb s8, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB19_117:
+; RV32I-NEXT: mv t1, t2
+; RV32I-NEXT: mv a7, t5
+; RV32I-NEXT: bltu ra, t0, .LBB19_110
+; RV32I-NEXT: .LBB19_118:
+; RV32I-NEXT: srai s1, a4, 31
+; RV32I-NEXT: mv s2, s1
+; RV32I-NEXT: bltu a5, a6, .LBB19_111
+; RV32I-NEXT: j .LBB19_112
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes_dwordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -96
+; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or t1, t2, t1
+; RV64I-NEXT: or a4, t4, t3
+; RV64I-NEXT: or a6, t6, t5
+; RV64I-NEXT: or t0, s1, s0
+; RV64I-NEXT: lbu t5, 24(a0)
+; RV64I-NEXT: lbu t6, 25(a0)
+; RV64I-NEXT: lbu s0, 26(a0)
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: or t4, s3, s2
+; RV64I-NEXT: or t2, s5, s4
+; RV64I-NEXT: or t3, s7, s6
+; RV64I-NEXT: lbu s2, 28(a0)
+; RV64I-NEXT: lbu s3, 29(a0)
+; RV64I-NEXT: lbu s4, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s5, s9, s8
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: or t5, t6, t5
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu t6, 0(a1)
+; RV64I-NEXT: lbu s1, 1(a1)
+; RV64I-NEXT: lbu s7, 2(a1)
+; RV64I-NEXT: lbu s8, 3(a1)
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli a0, a0, 8
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, a0, s4
+; RV64I-NEXT: or t6, s1, t6
+; RV64I-NEXT: lbu a0, 4(a1)
+; RV64I-NEXT: lbu s1, 5(a1)
+; RV64I-NEXT: lbu s4, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: or s7, s8, s7
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: or s1, s1, a0
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or s4, a1, s4
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or a1, t1, a7
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: or a0, t4, t0
+; RV64I-NEXT: slli t3, t3, 16
+; RV64I-NEXT: or a7, t3, t2
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: or t1, s6, s5
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: or t4, s0, t5
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: or t5, s3, s2
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: or t6, s7, t6
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: or s0, s4, s1
+; RV64I-NEXT: li t0, 64
+; RV64I-NEXT: slli t3, a5, 16
+; RV64I-NEXT: slli t2, a6, 16
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli t5, t5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: or a7, t1, a7
+; RV64I-NEXT: or a5, t5, t4
+; RV64I-NEXT: or a6, s0, t6
+; RV64I-NEXT: slli a6, a6, 6
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sll t4, a5, t5
+; RV64I-NEXT: bltu a6, t0, .LBB20_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sra t6, a5, t1
+; RV64I-NEXT: j .LBB20_3
+; RV64I-NEXT: .LBB20_2:
+; RV64I-NEXT: srl t6, a7, a6
+; RV64I-NEXT: or t6, t6, t4
+; RV64I-NEXT: .LBB20_3:
+; RV64I-NEXT: or a3, t3, a3
+; RV64I-NEXT: slli t3, a1, 32
+; RV64I-NEXT: or t2, t2, a4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mv a1, a7
+; RV64I-NEXT: beqz a6, .LBB20_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a1, t6
+; RV64I-NEXT: .LBB20_5:
+; RV64I-NEXT: or a4, t3, a3
+; RV64I-NEXT: or a3, a0, t2
+; RV64I-NEXT: bltu a6, t0, .LBB20_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: srai a0, a5, 63
+; RV64I-NEXT: srl t3, a3, t1
+; RV64I-NEXT: j .LBB20_8
+; RV64I-NEXT: .LBB20_7:
+; RV64I-NEXT: sra a0, a5, a6
+; RV64I-NEXT: srl t1, a4, a6
+; RV64I-NEXT: sll t2, a3, t5
+; RV64I-NEXT: or t3, t1, t2
+; RV64I-NEXT: .LBB20_8:
+; RV64I-NEXT: li t1, 128
+; RV64I-NEXT: mv t2, a4
+; RV64I-NEXT: beqz a6, .LBB20_10
+; RV64I-NEXT: # %bb.9:
+; RV64I-NEXT: mv t2, t3
+; RV64I-NEXT: .LBB20_10:
+; RV64I-NEXT: sub t6, t1, a6
+; RV64I-NEXT: bltu a6, t0, .LBB20_13
+; RV64I-NEXT: # %bb.11:
+; RV64I-NEXT: li t3, 0
+; RV64I-NEXT: bgeu t6, t0, .LBB20_14
+; RV64I-NEXT: .LBB20_12:
+; RV64I-NEXT: sll t5, a7, t5
+; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: srl s0, a7, s0
+; RV64I-NEXT: or s1, s0, t4
+; RV64I-NEXT: j .LBB20_15
+; RV64I-NEXT: .LBB20_13:
+; RV64I-NEXT: srl t3, a3, a6
+; RV64I-NEXT: bltu t6, t0, .LBB20_12
+; RV64I-NEXT: .LBB20_14:
+; RV64I-NEXT: li t5, 0
+; RV64I-NEXT: sub t4, t6, t0
+; RV64I-NEXT: sll s1, a7, t4
+; RV64I-NEXT: .LBB20_15:
+; RV64I-NEXT: sub s0, a6, t1
+; RV64I-NEXT: mv t4, a5
+; RV64I-NEXT: beqz t6, .LBB20_17
+; RV64I-NEXT: # %bb.16:
+; RV64I-NEXT: mv t4, s1
+; RV64I-NEXT: .LBB20_17:
+; RV64I-NEXT: bltu s0, t0, .LBB20_19
+; RV64I-NEXT: # %bb.18:
+; RV64I-NEXT: sub t6, s0, t0
+; RV64I-NEXT: sra t6, a5, t6
+; RV64I-NEXT: bnez s0, .LBB20_20
+; RV64I-NEXT: j .LBB20_21
+; RV64I-NEXT: .LBB20_19:
+; RV64I-NEXT: srl t6, a7, s0
+; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: sll s1, a5, s1
+; RV64I-NEXT: or t6, t6, s1
+; RV64I-NEXT: beqz s0, .LBB20_21
+; RV64I-NEXT: .LBB20_20:
+; RV64I-NEXT: mv a7, t6
+; RV64I-NEXT: .LBB20_21:
+; RV64I-NEXT: bltu s0, t0, .LBB20_23
+; RV64I-NEXT: # %bb.22:
+; RV64I-NEXT: srai t0, a5, 63
+; RV64I-NEXT: bltu a6, t1, .LBB20_24
+; RV64I-NEXT: j .LBB20_25
+; RV64I-NEXT: .LBB20_23:
+; RV64I-NEXT: sra t0, a5, s0
+; RV64I-NEXT: bgeu a6, t1, .LBB20_25
+; RV64I-NEXT: .LBB20_24:
+; RV64I-NEXT: or a7, t2, t5
+; RV64I-NEXT: or t0, t3, t4
+; RV64I-NEXT: .LBB20_25:
+; RV64I-NEXT: bnez a6, .LBB20_29
+; RV64I-NEXT: # %bb.26:
+; RV64I-NEXT: bltu a6, t1, .LBB20_28
+; RV64I-NEXT: .LBB20_27:
+; RV64I-NEXT: srai a1, a5, 63
+; RV64I-NEXT: mv a0, a1
+; RV64I-NEXT: .LBB20_28:
+; RV64I-NEXT: srli a5, a4, 32
+; RV64I-NEXT: srliw a6, a4, 16
+; RV64I-NEXT: lui t2, 16
+; RV64I-NEXT: srliw t1, a4, 24
+; RV64I-NEXT: srli t0, a4, 48
+; RV64I-NEXT: srli t5, a4, 56
+; RV64I-NEXT: srli a7, a3, 32
+; RV64I-NEXT: srliw t4, a3, 16
+; RV64I-NEXT: srliw s0, a3, 24
+; RV64I-NEXT: srli t6, a3, 48
+; RV64I-NEXT: srli s3, a3, 56
+; RV64I-NEXT: srli t3, a1, 32
+; RV64I-NEXT: srliw s2, a1, 16
+; RV64I-NEXT: srliw s6, a1, 24
+; RV64I-NEXT: srli s4, a1, 48
+; RV64I-NEXT: srli s7, a1, 56
+; RV64I-NEXT: srli s1, a0, 32
+; RV64I-NEXT: srliw s5, a0, 16
+; RV64I-NEXT: srliw s8, a0, 24
+; RV64I-NEXT: srli s9, a0, 48
+; RV64I-NEXT: srli s10, a0, 56
+; RV64I-NEXT: addi t2, t2, -1
+; RV64I-NEXT: and s11, a4, t2
+; RV64I-NEXT: srli s11, s11, 8
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s11, 1(a2)
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: sb t1, 3(a2)
+; RV64I-NEXT: and a4, a5, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a5, 4(a2)
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: sb t0, 6(a2)
+; RV64I-NEXT: sb t5, 7(a2)
+; RV64I-NEXT: and a4, a3, t2
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a3, 8(a2)
+; RV64I-NEXT: sb a4, 9(a2)
+; RV64I-NEXT: sb t4, 10(a2)
+; RV64I-NEXT: sb s0, 11(a2)
+; RV64I-NEXT: and a3, a7, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a7, 12(a2)
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: sb t6, 14(a2)
+; RV64I-NEXT: sb s3, 15(a2)
+; RV64I-NEXT: and a3, a1, t2
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: sb s2, 18(a2)
+; RV64I-NEXT: sb s6, 19(a2)
+; RV64I-NEXT: and a1, t3, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb t3, 20(a2)
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: sb s4, 22(a2)
+; RV64I-NEXT: sb s7, 23(a2)
+; RV64I-NEXT: and a1, a0, t2
+; RV64I-NEXT: and a3, s1, t2
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a0, 24(a2)
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb s5, 26(a2)
+; RV64I-NEXT: sb s8, 27(a2)
+; RV64I-NEXT: sb s1, 28(a2)
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: sb s9, 30(a2)
+; RV64I-NEXT: sb s10, 31(a2)
+; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 96
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB20_29:
+; RV64I-NEXT: mv a4, a7
+; RV64I-NEXT: mv a3, t0
+; RV64I-NEXT: bgeu a6, t1, .LBB20_27
+; RV64I-NEXT: j .LBB20_28
+;
+; RV32I-LABEL: ashr_32bytes_dwordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -96
+; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 16(a0)
+; RV32I-NEXT: lbu a4, 17(a0)
+; RV32I-NEXT: lbu a5, 18(a0)
+; RV32I-NEXT: lbu a6, 19(a0)
+; RV32I-NEXT: lbu a7, 20(a0)
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 22(a0)
+; RV32I-NEXT: lbu t2, 23(a0)
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t4, 25(a0)
+; RV32I-NEXT: lbu t5, 26(a0)
+; RV32I-NEXT: lbu t6, 27(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, t2, t1
+; RV32I-NEXT: lbu a7, 28(a0)
+; RV32I-NEXT: lbu t0, 29(a0)
+; RV32I-NEXT: lbu t1, 30(a0)
+; RV32I-NEXT: lbu t2, 31(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t3, t4, t3
+; RV32I-NEXT: or t4, t6, t5
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t5, 1(a1)
+; RV32I-NEXT: lbu t6, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: or s0, t5, a7
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or t2, a1, t6
+; RV32I-NEXT: li t5, 32
+; RV32I-NEXT: slli a7, a4, 16
+; RV32I-NEXT: slli a1, a5, 16
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli a5, t2, 16
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or a4, t1, t0
+; RV32I-NEXT: or a5, a5, s0
+; RV32I-NEXT: slli a5, a5, 6
+; RV32I-NEXT: srl s0, t2, a5
+; RV32I-NEXT: neg s6, a5
+; RV32I-NEXT: sll s1, a4, s6
+; RV32I-NEXT: bltu a5, t5, .LBB20_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sra t0, a4, a5
+; RV32I-NEXT: j .LBB20_3
+; RV32I-NEXT: .LBB20_2:
+; RV32I-NEXT: or t0, s0, s1
+; RV32I-NEXT: .LBB20_3:
+; RV32I-NEXT: or t1, a7, a3
+; RV32I-NEXT: or a7, a1, a6
+; RV32I-NEXT: mv t3, t2
+; RV32I-NEXT: beqz a5, .LBB20_5
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv t3, t0
+; RV32I-NEXT: .LBB20_5:
+; RV32I-NEXT: srl a3, t1, a5
+; RV32I-NEXT: sll a1, a7, s6
+; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu a5, t5, .LBB20_7
+; RV32I-NEXT: # %bb.6:
+; RV32I-NEXT: srai t4, a4, 31
+; RV32I-NEXT: srl a1, a7, a5
+; RV32I-NEXT: j .LBB20_8
+; RV32I-NEXT: .LBB20_7:
+; RV32I-NEXT: sra t4, a4, a5
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: .LBB20_8:
+; RV32I-NEXT: li t6, 64
+; RV32I-NEXT: mv t0, t1
+; RV32I-NEXT: beqz a5, .LBB20_10
+; RV32I-NEXT: # %bb.9:
+; RV32I-NEXT: mv t0, a1
+; RV32I-NEXT: .LBB20_10:
+; RV32I-NEXT: sub s7, t6, a5
+; RV32I-NEXT: bltu a5, t5, .LBB20_12
+; RV32I-NEXT: # %bb.11:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: j .LBB20_13
+; RV32I-NEXT: .LBB20_12:
+; RV32I-NEXT: srl a1, a7, a5
+; RV32I-NEXT: .LBB20_13:
+; RV32I-NEXT: sw a3, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: neg s10, s7
+; RV32I-NEXT: bltu s7, t5, .LBB20_15
+; RV32I-NEXT: # %bb.14:
+; RV32I-NEXT: li a6, 0
+; RV32I-NEXT: sll a3, t2, s7
+; RV32I-NEXT: j .LBB20_16
+; RV32I-NEXT: .LBB20_15:
+; RV32I-NEXT: sll a6, t2, s6
+; RV32I-NEXT: srl a3, t2, s10
+; RV32I-NEXT: or a3, a3, s1
+; RV32I-NEXT: .LBB20_16:
+; RV32I-NEXT: sw t3, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi s9, a5, -64
+; RV32I-NEXT: mv t3, a4
+; RV32I-NEXT: beqz s7, .LBB20_18
+; RV32I-NEXT: # %bb.17:
+; RV32I-NEXT: mv t3, a3
+; RV32I-NEXT: .LBB20_18:
+; RV32I-NEXT: neg s11, s9
+; RV32I-NEXT: sw s0, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s9, t5, .LBB20_20
+; RV32I-NEXT: # %bb.19:
+; RV32I-NEXT: sra s0, a4, s9
+; RV32I-NEXT: j .LBB20_21
+; RV32I-NEXT: .LBB20_20:
+; RV32I-NEXT: sll a3, a4, s11
+; RV32I-NEXT: or s0, s0, a3
+; RV32I-NEXT: .LBB20_21:
+; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw t4, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu s3, 11(a0)
+; RV32I-NEXT: lbu a3, 15(a0)
+; RV32I-NEXT: mv t4, t2
+; RV32I-NEXT: beqz s9, .LBB20_23
+; RV32I-NEXT: # %bb.22:
+; RV32I-NEXT: mv t4, s0
+; RV32I-NEXT: .LBB20_23:
+; RV32I-NEXT: lbu s2, 9(a0)
+; RV32I-NEXT: lbu s1, 10(a0)
+; RV32I-NEXT: lbu s8, 13(a0)
+; RV32I-NEXT: lbu ra, 14(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: bltu s9, t5, .LBB20_25
+; RV32I-NEXT: # %bb.24:
+; RV32I-NEXT: srai s0, a4, 31
+; RV32I-NEXT: j .LBB20_26
+; RV32I-NEXT: .LBB20_25:
+; RV32I-NEXT: sra s0, a4, a5
+; RV32I-NEXT: .LBB20_26:
+; RV32I-NEXT: or s1, s3, s1
+; RV32I-NEXT: lbu s5, 8(a0)
+; RV32I-NEXT: lbu s3, 12(a0)
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s4, s8, 8
+; RV32I-NEXT: or s8, a3, ra
+; RV32I-NEXT: bgeu a5, t6, .LBB20_28
+; RV32I-NEXT: # %bb.27:
+; RV32I-NEXT: or t4, t0, a6
+; RV32I-NEXT: or s0, a1, t3
+; RV32I-NEXT: .LBB20_28:
+; RV32I-NEXT: lbu a3, 3(a0)
+; RV32I-NEXT: lbu t3, 7(a0)
+; RV32I-NEXT: or a6, s2, s5
+; RV32I-NEXT: slli s2, s1, 16
+; RV32I-NEXT: or s1, s4, s3
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: mv a1, t1
+; RV32I-NEXT: mv t0, a7
+; RV32I-NEXT: beqz a5, .LBB20_30
+; RV32I-NEXT: # %bb.29:
+; RV32I-NEXT: mv a1, t4
+; RV32I-NEXT: mv t0, s0
+; RV32I-NEXT: .LBB20_30:
+; RV32I-NEXT: slli s5, a3, 8
+; RV32I-NEXT: lbu ra, 1(a0)
+; RV32I-NEXT: lbu a3, 2(a0)
+; RV32I-NEXT: lbu s3, 5(a0)
+; RV32I-NEXT: lbu s0, 6(a0)
+; RV32I-NEXT: slli s4, t3, 8
+; RV32I-NEXT: or t4, s2, a6
+; RV32I-NEXT: or t3, s8, s1
+; RV32I-NEXT: bltu a5, t6, .LBB20_32
+; RV32I-NEXT: # %bb.31:
+; RV32I-NEXT: srai a6, a4, 31
+; RV32I-NEXT: sw a6, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw a6, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .LBB20_32:
+; RV32I-NEXT: slli a6, ra, 8
+; RV32I-NEXT: or a3, s5, a3
+; RV32I-NEXT: lbu s1, 0(a0)
+; RV32I-NEXT: lbu a0, 4(a0)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s0, s4, s0
+; RV32I-NEXT: srl s2, t4, a5
+; RV32I-NEXT: sll ra, t3, s6
+; RV32I-NEXT: bltu a5, t5, .LBB20_34
+; RV32I-NEXT: # %bb.33:
+; RV32I-NEXT: srl s4, t3, a5
+; RV32I-NEXT: j .LBB20_35
+; RV32I-NEXT: .LBB20_34:
+; RV32I-NEXT: or s4, s2, ra
+; RV32I-NEXT: .LBB20_35:
+; RV32I-NEXT: or a6, a6, s1
+; RV32I-NEXT: slli a3, a3, 16
+; RV32I-NEXT: or a0, s3, a0
+; RV32I-NEXT: slli s1, s0, 16
+; RV32I-NEXT: mv s5, t4
+; RV32I-NEXT: beqz a5, .LBB20_37
+; RV32I-NEXT: # %bb.36:
+; RV32I-NEXT: mv s5, s4
+; RV32I-NEXT: .LBB20_37:
+; RV32I-NEXT: or s0, a3, a6
+; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: bltu a5, t5, .LBB20_39
+; RV32I-NEXT: # %bb.38:
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: srl a3, a0, a5
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: bnez a5, .LBB20_40
+; RV32I-NEXT: j .LBB20_41
+; RV32I-NEXT: .LBB20_39:
+; RV32I-NEXT: srl s4, t3, a5
+; RV32I-NEXT: srl a3, s0, a5
+; RV32I-NEXT: sll a6, a0, s6
+; RV32I-NEXT: or a3, a3, a6
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: beqz a5, .LBB20_41
+; RV32I-NEXT: .LBB20_40:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB20_41:
+; RV32I-NEXT: bltu a5, t5, .LBB20_44
+; RV32I-NEXT: # %bb.42:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: bgeu s7, t5, .LBB20_45
+; RV32I-NEXT: .LBB20_43:
+; RV32I-NEXT: sll s3, t4, s6
+; RV32I-NEXT: srl a3, t4, s10
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: mv s10, t3
+; RV32I-NEXT: bnez s7, .LBB20_46
+; RV32I-NEXT: j .LBB20_47
+; RV32I-NEXT: .LBB20_44:
+; RV32I-NEXT: srl s1, a0, a5
+; RV32I-NEXT: bltu s7, t5, .LBB20_43
+; RV32I-NEXT: .LBB20_45:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t4, s7
+; RV32I-NEXT: mv s10, t3
+; RV32I-NEXT: beqz s7, .LBB20_47
+; RV32I-NEXT: .LBB20_46:
+; RV32I-NEXT: mv s10, a3
+; RV32I-NEXT: .LBB20_47:
+; RV32I-NEXT: bltu s9, t5, .LBB20_49
+; RV32I-NEXT: # %bb.48:
+; RV32I-NEXT: srl a3, t3, s9
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: bnez s9, .LBB20_50
+; RV32I-NEXT: j .LBB20_51
+; RV32I-NEXT: .LBB20_49:
+; RV32I-NEXT: sll a3, t3, s11
+; RV32I-NEXT: or a3, s2, a3
+; RV32I-NEXT: mv s2, t4
+; RV32I-NEXT: beqz s9, .LBB20_51
+; RV32I-NEXT: .LBB20_50:
+; RV32I-NEXT: mv s2, a3
+; RV32I-NEXT: .LBB20_51:
+; RV32I-NEXT: bltu s9, t5, .LBB20_53
+; RV32I-NEXT: # %bb.52:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: bltu a5, t6, .LBB20_54
+; RV32I-NEXT: j .LBB20_55
+; RV32I-NEXT: .LBB20_53:
+; RV32I-NEXT: srl s7, t3, a5
+; RV32I-NEXT: bgeu a5, t6, .LBB20_55
+; RV32I-NEXT: .LBB20_54:
+; RV32I-NEXT: or s2, a6, s3
+; RV32I-NEXT: or s7, s1, s10
+; RV32I-NEXT: .LBB20_55:
+; RV32I-NEXT: li a3, 128
+; RV32I-NEXT: mv a6, s0
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: beqz a5, .LBB20_57
+; RV32I-NEXT: # %bb.56:
+; RV32I-NEXT: mv a6, s2
+; RV32I-NEXT: mv s1, s7
+; RV32I-NEXT: .LBB20_57:
+; RV32I-NEXT: sw a6, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sub s2, a3, a5
+; RV32I-NEXT: bltu a5, t6, .LBB20_59
+; RV32I-NEXT: # %bb.58:
+; RV32I-NEXT: li s5, 0
+; RV32I-NEXT: li s4, 0
+; RV32I-NEXT: .LBB20_59:
+; RV32I-NEXT: neg s3, s2
+; RV32I-NEXT: srl a6, t1, s3
+; RV32I-NEXT: sw s4, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t5, .LBB20_61
+; RV32I-NEXT: # %bb.60:
+; RV32I-NEXT: li s11, 0
+; RV32I-NEXT: sll a3, t1, s2
+; RV32I-NEXT: j .LBB20_62
+; RV32I-NEXT: .LBB20_61:
+; RV32I-NEXT: sll s11, t1, s6
+; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: .LBB20_62:
+; RV32I-NEXT: sub s1, t6, s2
+; RV32I-NEXT: mv s8, a7
+; RV32I-NEXT: beqz s2, .LBB20_64
+; RV32I-NEXT: # %bb.63:
+; RV32I-NEXT: mv s8, a3
+; RV32I-NEXT: .LBB20_64:
+; RV32I-NEXT: bltu s1, t5, .LBB20_66
+; RV32I-NEXT: # %bb.65:
+; RV32I-NEXT: srl a3, a7, s1
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: bnez s1, .LBB20_67
+; RV32I-NEXT: j .LBB20_68
+; RV32I-NEXT: .LBB20_66:
+; RV32I-NEXT: neg a3, s1
+; RV32I-NEXT: sll a3, a7, a3
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: beqz s1, .LBB20_68
+; RV32I-NEXT: .LBB20_67:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB20_68:
+; RV32I-NEXT: bltu s1, t5, .LBB20_71
+; RV32I-NEXT: # %bb.69:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bgeu s2, t5, .LBB20_72
+; RV32I-NEXT: .LBB20_70:
+; RV32I-NEXT: sll s6, t2, s6
+; RV32I-NEXT: srl a3, t2, s3
+; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: j .LBB20_73
+; RV32I-NEXT: .LBB20_71:
+; RV32I-NEXT: srl s1, a7, s3
+; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: bltu s2, t5, .LBB20_70
+; RV32I-NEXT: .LBB20_72:
+; RV32I-NEXT: li s6, 0
+; RV32I-NEXT: sll a3, t2, s2
+; RV32I-NEXT: .LBB20_73:
+; RV32I-NEXT: addi s9, s2, -64
+; RV32I-NEXT: mv s5, a4
+; RV32I-NEXT: beqz s2, .LBB20_75
+; RV32I-NEXT: # %bb.74:
+; RV32I-NEXT: mv s5, a3
+; RV32I-NEXT: .LBB20_75:
+; RV32I-NEXT: bltu s9, t5, .LBB20_77
+; RV32I-NEXT: # %bb.76:
+; RV32I-NEXT: li s3, 0
+; RV32I-NEXT: sll a3, t1, s9
+; RV32I-NEXT: mv s7, a7
+; RV32I-NEXT: bnez s9, .LBB20_78
+; RV32I-NEXT: j .LBB20_79
+; RV32I-NEXT: .LBB20_77:
+; RV32I-NEXT: sll s3, t1, s2
+; RV32I-NEXT: neg a3, s9
+; RV32I-NEXT: srl a3, t1, a3
+; RV32I-NEXT: sll s4, a7, s2
+; RV32I-NEXT: or a3, a3, s4
+; RV32I-NEXT: mv s7, a7
+; RV32I-NEXT: beqz s9, .LBB20_79
+; RV32I-NEXT: .LBB20_78:
+; RV32I-NEXT: mv s7, a3
+; RV32I-NEXT: .LBB20_79:
+; RV32I-NEXT: bltu s2, t6, .LBB20_81
+; RV32I-NEXT: # %bb.80:
+; RV32I-NEXT: li s11, 0
+; RV32I-NEXT: li s8, 0
+; RV32I-NEXT: j .LBB20_82
+; RV32I-NEXT: .LBB20_81:
+; RV32I-NEXT: or s3, a6, s6
+; RV32I-NEXT: or s7, s1, s5
+; RV32I-NEXT: .LBB20_82:
+; RV32I-NEXT: addi ra, a5, -128
+; RV32I-NEXT: mv s4, t2
+; RV32I-NEXT: mv s6, a4
+; RV32I-NEXT: beqz s2, .LBB20_84
+; RV32I-NEXT: # %bb.83:
+; RV32I-NEXT: mv s4, s3
+; RV32I-NEXT: mv s6, s7
+; RV32I-NEXT: .LBB20_84:
+; RV32I-NEXT: neg s9, ra
+; RV32I-NEXT: sll s3, a4, s9
+; RV32I-NEXT: bltu ra, t5, .LBB20_86
+; RV32I-NEXT: # %bb.85:
+; RV32I-NEXT: sra a3, a4, ra
+; RV32I-NEXT: mv s1, t2
+; RV32I-NEXT: bnez ra, .LBB20_87
+; RV32I-NEXT: j .LBB20_88
+; RV32I-NEXT: .LBB20_86:
+; RV32I-NEXT: lw a3, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: mv s1, t2
+; RV32I-NEXT: beqz ra, .LBB20_88
+; RV32I-NEXT: .LBB20_87:
+; RV32I-NEXT: mv s1, a3
+; RV32I-NEXT: .LBB20_88:
+; RV32I-NEXT: bltu ra, t5, .LBB20_90
+; RV32I-NEXT: # %bb.89:
+; RV32I-NEXT: srai s2, a4, 31
+; RV32I-NEXT: srl a3, a7, ra
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: bnez ra, .LBB20_91
+; RV32I-NEXT: j .LBB20_92
+; RV32I-NEXT: .LBB20_90:
+; RV32I-NEXT: sra s2, a4, a5
+; RV32I-NEXT: sll a3, a7, s9
+; RV32I-NEXT: lw a6, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: mv a6, t1
+; RV32I-NEXT: beqz ra, .LBB20_92
+; RV32I-NEXT: .LBB20_91:
+; RV32I-NEXT: mv a6, a3
+; RV32I-NEXT: .LBB20_92:
+; RV32I-NEXT: mv s5, t0
+; RV32I-NEXT: sub s10, t6, ra
+; RV32I-NEXT: li t0, 64
+; RV32I-NEXT: bltu ra, t5, .LBB20_94
+; RV32I-NEXT: # %bb.93:
+; RV32I-NEXT: li s7, 0
+; RV32I-NEXT: j .LBB20_95
+; RV32I-NEXT: .LBB20_94:
+; RV32I-NEXT: srl s7, a7, a5
+; RV32I-NEXT: .LBB20_95:
+; RV32I-NEXT: mv t6, s8
+; RV32I-NEXT: mv s8, s11
+; RV32I-NEXT: bltu s10, t5, .LBB20_97
+; RV32I-NEXT: # %bb.96:
+; RV32I-NEXT: li s9, 0
+; RV32I-NEXT: sll a3, t2, s10
+; RV32I-NEXT: j .LBB20_98
+; RV32I-NEXT: .LBB20_97:
+; RV32I-NEXT: sll s9, t2, s9
+; RV32I-NEXT: neg a3, s10
+; RV32I-NEXT: srl a3, t2, a3
+; RV32I-NEXT: or a3, a3, s3
+; RV32I-NEXT: .LBB20_98:
+; RV32I-NEXT: addi s11, ra, -64
+; RV32I-NEXT: mv s3, a4
+; RV32I-NEXT: beqz s10, .LBB20_100
+; RV32I-NEXT: # %bb.99:
+; RV32I-NEXT: mv s3, a3
+; RV32I-NEXT: .LBB20_100:
+; RV32I-NEXT: bltu s11, t5, .LBB20_102
+; RV32I-NEXT: # %bb.101:
+; RV32I-NEXT: sra a3, a4, s11
+; RV32I-NEXT: bnez s11, .LBB20_103
+; RV32I-NEXT: j .LBB20_104
+; RV32I-NEXT: .LBB20_102:
+; RV32I-NEXT: srl a3, t2, ra
+; RV32I-NEXT: mv s10, s4
+; RV32I-NEXT: neg s4, s11
+; RV32I-NEXT: sll s4, a4, s4
+; RV32I-NEXT: or a3, a3, s4
+; RV32I-NEXT: mv s4, s10
+; RV32I-NEXT: beqz s11, .LBB20_104
+; RV32I-NEXT: .LBB20_103:
+; RV32I-NEXT: mv t2, a3
+; RV32I-NEXT: .LBB20_104:
+; RV32I-NEXT: bltu s11, t5, .LBB20_106
+; RV32I-NEXT: # %bb.105:
+; RV32I-NEXT: srai t5, a4, 31
+; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bltu ra, t0, .LBB20_107
+; RV32I-NEXT: j .LBB20_108
+; RV32I-NEXT: .LBB20_106:
+; RV32I-NEXT: sra t5, a4, ra
+; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: bgeu ra, t0, .LBB20_108
+; RV32I-NEXT: .LBB20_107:
+; RV32I-NEXT: or t2, a6, s9
+; RV32I-NEXT: or t5, s7, s3
+; RV32I-NEXT: .LBB20_108:
+; RV32I-NEXT: li a6, 128
+; RV32I-NEXT: bnez ra, .LBB20_117
+; RV32I-NEXT: # %bb.109:
+; RV32I-NEXT: bgeu ra, t0, .LBB20_118
+; RV32I-NEXT: .LBB20_110:
+; RV32I-NEXT: bgeu a5, a6, .LBB20_112
+; RV32I-NEXT: .LBB20_111:
+; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t1, a3, s8
+; RV32I-NEXT: lw a3, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a7, a3, t6
+; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s1, a3, s4
+; RV32I-NEXT: lw a3, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s2, a3, s6
+; RV32I-NEXT: .LBB20_112:
+; RV32I-NEXT: lw ra, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: mv t0, s5
+; RV32I-NEXT: beqz a5, .LBB20_114
+; RV32I-NEXT: # %bb.113:
+; RV32I-NEXT: mv s0, t1
+; RV32I-NEXT: mv a0, a7
+; RV32I-NEXT: mv t4, s1
+; RV32I-NEXT: mv t3, s2
+; RV32I-NEXT: .LBB20_114:
+; RV32I-NEXT: bltu a5, a6, .LBB20_116
+; RV32I-NEXT: # %bb.115:
+; RV32I-NEXT: srai a1, a4, 31
+; RV32I-NEXT: mv t0, a1
+; RV32I-NEXT: mv s11, a1
+; RV32I-NEXT: mv ra, a1
+; RV32I-NEXT: .LBB20_116:
+; RV32I-NEXT: srli a4, s0, 16
+; RV32I-NEXT: lui t1, 16
+; RV32I-NEXT: srli a7, s0, 24
+; RV32I-NEXT: srli a5, a0, 16
+; RV32I-NEXT: srli t5, a0, 24
+; RV32I-NEXT: srli a6, t4, 16
+; RV32I-NEXT: srli s2, t4, 24
+; RV32I-NEXT: srli t2, t3, 16
+; RV32I-NEXT: srli s3, t3, 24
+; RV32I-NEXT: srli s1, a1, 16
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: srli t6, t0, 16
+; RV32I-NEXT: srli s6, t0, 24
+; RV32I-NEXT: srli s5, s11, 16
+; RV32I-NEXT: srli s4, s11, 24
+; RV32I-NEXT: srli s7, ra, 16
+; RV32I-NEXT: srli s8, ra, 24
+; RV32I-NEXT: addi t1, t1, -1
+; RV32I-NEXT: and s9, s0, t1
+; RV32I-NEXT: and s10, a0, t1
+; RV32I-NEXT: srli s9, s9, 8
+; RV32I-NEXT: sb s0, 0(a2)
+; RV32I-NEXT: sb s9, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, t4, t1
+; RV32I-NEXT: srli a7, s10, 8
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb a7, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t5, 7(a2)
+; RV32I-NEXT: and a0, t3, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, a1, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t3, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, t0, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a1, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a1, s11, t1
+; RV32I-NEXT: and a3, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb t0, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb s11, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s4, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a3, 29(a2)
+; RV32I-NEXT: sb s7, 30(a2)
+; RV32I-NEXT: sb s8, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB20_117:
+; RV32I-NEXT: mv t1, t2
+; RV32I-NEXT: mv a7, t5
+; RV32I-NEXT: bltu ra, t0, .LBB20_110
+; RV32I-NEXT: .LBB20_118:
+; RV32I-NEXT: srai s1, a4, 31
+; RV32I-NEXT: mv s2, s1
+; RV32I-NEXT: bltu a5, a6, .LBB20_111
+; RV32I-NEXT: j .LBB20_112
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
>From 91ce4aa1022bcc83a6f607f4663798063cb0ab4f Mon Sep 17 00:00:00 2001
From: Luke Quinn <quic_lquinn at quicinc.com>
Date: Wed, 22 Jan 2025 08:30:40 -0800
Subject: [PATCH 2/3] [RISCV] Add GISelPredicateCode TD bindings and
hasAllNBitUsers prototypes to stage adding full support
Signed-off-by: Luke Quinn <quic_lquinn at quicinc.com>
---
.../RISCV/GISel/RISCVInstructionSelector.cpp | 18 ++++++++++++++++++
llvm/lib/Target/RISCV/RISCVInstrInfo.td | 8 ++++++--
2 files changed, 24 insertions(+), 2 deletions(-)
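Note (illustrative, not part of the patch): GISelPredicateCode attaches a
GlobalISel predicate to a PatFrag alongside its existing SelectionDAG
predicate, so patterns like binop_allhusers can be gated on the same
user analysis in both selectors. A minimal IR sketch of the situation such
a predicate checks for, assuming the eventual hasAllNBitUsers
implementation (function name here is hypothetical):

  ; Illustrative sketch only: the AND with 255 means every user of %v
  ; demands no more than its low 8 bits, so a hasAllHUsers-style query
  ; on the OR would succeed.
  define i64 @or_low_bits_only(i64 %a, i64 %b) {
    %v = or i64 %a, %b
    %low = and i64 %v, 255    ; only bits [7:0] of %v are ever used
    ret i64 %low
  }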
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index 6a42fdf3c35672..8dabda15a04a37 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -57,6 +57,20 @@ class RISCVInstructionSelector : public InstructionSelector {
const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) const;
+ static constexpr unsigned MaxRecursionDepth = 6;
+
+ bool hasAllNBitUsers(const MachineInstr &MI, unsigned Bits,
+ const unsigned Depth = 0) const;
+ bool hasAllBUsers(const MachineInstr &MI) const {
+ return hasAllNBitUsers(MI, 8);
+ }
+ bool hasAllHUsers(const MachineInstr &MI) const {
+ return hasAllNBitUsers(MI, 16);
+ }
+ bool hasAllWUsers(const MachineInstr &MI) const {
+ return hasAllNBitUsers(MI, 32);
+ }
+
bool isRegInGprb(Register Reg) const;
bool isRegInFprb(Register Reg) const;
@@ -184,6 +198,10 @@ RISCVInstructionSelector::RISCVInstructionSelector(
{
}
+bool RISCVInstructionSelector::hasAllNBitUsers(const MachineInstr &MI, unsigned Bits, const unsigned Depth) const {
+ return false;
+};
+
InstructionSelector::ComplexRendererFns
RISCVInstructionSelector::selectShiftMask(MachineOperand &Root,
unsigned ShiftWidth) const {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index fec10864f95dc6..641112ee574726 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1949,7 +1949,9 @@ class binop_allhusers<SDPatternOperator operator>
: PatFrag<(ops node:$lhs, node:$rhs),
(XLenVT (operator node:$lhs, node:$rhs)), [{
return hasAllHUsers(Node);
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return hasAllHUsers(MI); }];
+}
// PatFrag to allow ADDW/SUBW/MULW/SLLW to be selected from i64 add/sub/mul/shl
// if only the lower 32 bits of their result is used.
@@ -1957,7 +1959,9 @@ class binop_allwusers<SDPatternOperator operator>
: PatFrag<(ops node:$lhs, node:$rhs),
(i64 (operator node:$lhs, node:$rhs)), [{
return hasAllWUsers(Node);
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return hasAllWUsers(MI); }];
+}
def sexti32_allwusers : PatFrag<(ops node:$src),
(sext_inreg node:$src, i32), [{
>From 81ceb0064567bb52e11cd8714e98d0ba0a3288a4 Mon Sep 17 00:00:00 2001
From: Luke Quinn <quic_lquinn at quicinc.com>
Date: Tue, 4 Feb 2025 10:23:34 -0800
Subject: [PATCH 3/3] [RISCV] Implement hasAllNBitUsers as a functional
change; it allows generation of packw instructions along with other
generic instructions with narrow W types. The OptW pass optimization was
reduced from ISel for testing coverage
Signed-off-by: Luke Quinn <quic_lquinn at quicinc.com>
---
.../RISCV/GISel/RISCVInstructionSelector.cpp | 73 ++++++++++-
llvm/lib/Target/RISCV/RISCVInstrInfo.td | 6 +-
llvm/test/CodeGen/RISCV/GlobalISel/combine.ll | 2 +-
.../RISCV/GlobalISel/div-by-constant.ll | 6 +-
.../CodeGen/RISCV/GlobalISel/rotl-rotr.ll | 116 +++++++++---------
.../CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll | 16 +--
llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll | 76 ++++++------
.../test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll | 15 +--
llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll | 14 +--
...lar-shift-by-byte-multiple-legalization.ll | 66 +++++-----
10 files changed, 226 insertions(+), 164 deletions(-)
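Note (illustrative, not part of the patch): hasAllWUsers holds when every
user of an i64 operation demands only its low 32 bits, which is what lets
instruction selection pick the narrow W-form (e.g. addw). A minimal IR
sketch of that scenario (function name here is hypothetical):

  ; Illustrative sketch only: the shl/ashr pair sign-extends the low 32
  ; bits of %sum, so no user sees bits [63:32]; hasAllWUsers succeeds
  ; and the add can be selected as addw.
  define i64 @add_low32_only(i64 %a, i64 %b) {
    %sum = add i64 %a, %b
    %ext = shl i64 %sum, 32
    %res = ashr i64 %ext, 32   ; only bits [31:0] of %sum are demanded
    ret i64 %res
  }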
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index 8dabda15a04a37..21921452e91119 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -198,9 +198,78 @@ RISCVInstructionSelector::RISCVInstructionSelector(
{
}
-bool RISCVInstructionSelector::hasAllNBitUsers(const MachineInstr &MI, unsigned Bits, const unsigned Depth) const {
+// Mimics optimizations done in ISel and the RISCVOptWInstrs pass.
+bool RISCVInstructionSelector::hasAllNBitUsers(const MachineInstr &MI,
+ unsigned Bits,
+ const unsigned Depth) const {
+
+ assert((MI.getOpcode() == TargetOpcode::G_ADD ||
+ MI.getOpcode() == TargetOpcode::G_SUB ||
+ MI.getOpcode() == TargetOpcode::G_MUL ||
+ MI.getOpcode() == TargetOpcode::G_SHL ||
+ MI.getOpcode() == TargetOpcode::G_LSHR ||
+ MI.getOpcode() == TargetOpcode::G_AND ||
+ MI.getOpcode() == TargetOpcode::G_OR ||
+ MI.getOpcode() == TargetOpcode::G_XOR ||
+ MI.getOpcode() == TargetOpcode::G_SEXT_INREG || Depth != 0) &&
+ "Unexpected opcode");
+
+ if (Depth >= RISCVInstructionSelector::MaxRecursionDepth)
return false;
-};
+
+ auto DestReg = MI.getOperand(0).getReg();
+ for (auto &UserOp : MRI->use_nodbg_operands(DestReg)) {
+ assert(UserOp.getParent() && "UserOp must have a parent");
+ const MachineInstr &UserMI = *UserOp.getParent();
+ unsigned OpIdx = UserOp.getOperandNo();
+
+ switch (UserMI.getOpcode()) {
+ default:
+ return false;
+ case RISCV::ADDW:
+ case RISCV::ADDIW:
+ case RISCV::SUBW:
+ if (Bits >= 32)
+ break;
+ return false;
+ case RISCV::SLL:
+ case RISCV::SRA:
+ case RISCV::SRL:
+ // Shift amount operands only use log2(Xlen) bits.
+ if (OpIdx == 2 && Bits >= Log2_32(Subtarget->getXLen()))
+ break;
+ return false;
+ case RISCV::SLLI:
+ // SLLI only uses the lower (XLen - ShAmt) bits.
+ if (Bits >= Subtarget->getXLen() - UserMI.getOperand(2).getImm())
+ break;
+ return false;
+ case RISCV::ANDI:
+ if (Bits >= (unsigned)llvm::bit_width<uint64_t>(
+ (uint64_t)UserMI.getOperand(2).getImm()))
+ break;
+ goto RecCheck;
+ case RISCV::AND:
+ case RISCV::OR:
+ case RISCV::XOR:
+ RecCheck:
+ if (hasAllNBitUsers(UserMI, Bits, Depth + 1))
+ break;
+ return false;
+ case RISCV::SRLI: {
+ unsigned ShAmt = UserMI.getOperand(2).getImm();
+ // If we are shifting right by less than Bits, and users don't demand any
+ // bits that were shifted into [Bits-1:0], then we can consider this as an
+ // N-Bit user.
+ if (Bits > ShAmt && hasAllNBitUsers(UserMI, Bits - ShAmt, Depth + 1))
+ break;
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
InstructionSelector::ComplexRendererFns
RISCVInstructionSelector::selectShiftMask(MachineOperand &Root,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 641112ee574726..54fee1ac3130e1 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1950,14 +1950,14 @@ class binop_allhusers<SDPatternOperator operator>
(XLenVT (operator node:$lhs, node:$rhs)), [{
return hasAllHUsers(Node);
}]> {
- let GISelPredicateCode = [{ return hasAllHUsers(MI); }];
+ let GISelPredicateCode = [{ return hasAllHUsers(MI); }];
}
// PatFrag to allow ADDW/SUBW/MULW/SLLW to be selected from i64 add/sub/mul/shl
// if only the lower 32 bits of their result is used.
class binop_allwusers<SDPatternOperator operator>
- : PatFrag<(ops node:$lhs, node:$rhs),
- (i64 (operator node:$lhs, node:$rhs)), [{
+ : PatFrag<(ops node:$lhs, node:$rhs), (i64 (operator node:$lhs, node:$rhs)),
+ [{
return hasAllWUsers(Node);
}]> {
let GISelPredicateCode = [{ return hasAllWUsers(MI); }];
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll b/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
index 360e84d37ec858..61d1fa5a5b9f4b 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
@@ -20,7 +20,7 @@ define i32 @constant_to_rhs(i32 %x) {
; RV64-O0: # %bb.0:
; RV64-O0-NEXT: mv a1, a0
; RV64-O0-NEXT: li a0, 1
-; RV64-O0-NEXT: add a0, a0, a1
+; RV64-O0-NEXT: addw a0, a0, a1
; RV64-O0-NEXT: sext.w a0, a0
; RV64-O0-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
index e3616a79add9f2..f62902cdd14d9e 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
@@ -66,7 +66,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV64IM-NEXT: srli a2, a2, 32
; RV64IM-NEXT: mul a1, a2, a1
; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: sub a0, a0, a1
+; RV64IM-NEXT: subw a0, a0, a1
; RV64IM-NEXT: srliw a0, a0, 1
; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: srliw a0, a0, 2
@@ -79,7 +79,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV64IMZB-NEXT: zext.w a2, a0
; RV64IMZB-NEXT: mul a1, a2, a1
; RV64IMZB-NEXT: srli a1, a1, 32
-; RV64IMZB-NEXT: sub a0, a0, a1
+; RV64IMZB-NEXT: subw a0, a0, a1
; RV64IMZB-NEXT: srliw a0, a0, 1
; RV64IMZB-NEXT: add a0, a0, a1
; RV64IMZB-NEXT: srliw a0, a0, 2
@@ -265,7 +265,7 @@ define i8 @udiv8_constant_add(i8 %a) nounwind {
; RV64-NEXT: andi a2, a0, 255
; RV64-NEXT: mul a1, a2, a1
; RV64-NEXT: srli a1, a1, 8
-; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: subw a0, a0, a1
; RV64-NEXT: andi a0, a0, 255
; RV64-NEXT: srli a0, a0, 1
; RV64-NEXT: add a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
index 46d1661983c6ac..8a786fc9993d2f 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
@@ -29,7 +29,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotl_32:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: sllw a1, a0, a1
; RV64I-NEXT: srlw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -55,7 +55,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotl_32:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: sllw a1, a0, a1
; RV64XTHEADBB-NEXT: srlw a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -78,7 +78,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotr_32:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: srlw a1, a0, a1
; RV64I-NEXT: sllw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -104,7 +104,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotr_32:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: srlw a1, a0, a1
; RV64XTHEADBB-NEXT: sllw a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -167,7 +167,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotl_64:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -276,7 +276,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotl_64:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: sll a1, a0, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -340,7 +340,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotr_64:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -451,7 +451,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotr_64:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: srl a1, a0, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -474,7 +474,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotl_32_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: sllw a1, a0, a1
; RV64I-NEXT: srlw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -490,7 +490,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64ZBB-LABEL: rotl_32_mask:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: negw a2, a1
; RV64ZBB-NEXT: sllw a1, a0, a1
; RV64ZBB-NEXT: srlw a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -506,7 +506,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotl_32_mask:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: sllw a1, a0, a1
; RV64XTHEADBB-NEXT: srlw a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -531,7 +531,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64I-LABEL: rotl_32_mask_and_63_and_31:
; RV64I: # %bb.0:
; RV64I-NEXT: sllw a2, a0, a1
-; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: negw a1, a1
; RV64I-NEXT: srlw a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -547,7 +547,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64ZBB-LABEL: rotl_32_mask_and_63_and_31:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: sllw a2, a0, a1
-; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: negw a1, a1
; RV64ZBB-NEXT: srlw a0, a0, a1
; RV64ZBB-NEXT: or a0, a2, a0
; RV64ZBB-NEXT: ret
@@ -563,7 +563,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_32_mask_and_63_and_31:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sllw a2, a0, a1
-; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: negw a1, a1
; RV64XTHEADBB-NEXT: srlw a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -632,7 +632,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotr_32_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: srlw a1, a0, a1
; RV64I-NEXT: sllw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -648,7 +648,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64ZBB-LABEL: rotr_32_mask:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: negw a2, a1
; RV64ZBB-NEXT: srlw a1, a0, a1
; RV64ZBB-NEXT: sllw a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -664,7 +664,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotr_32_mask:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: srlw a1, a0, a1
; RV64XTHEADBB-NEXT: sllw a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -689,7 +689,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64I-LABEL: rotr_32_mask_and_63_and_31:
; RV64I: # %bb.0:
; RV64I-NEXT: srlw a2, a0, a1
-; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: negw a1, a1
; RV64I-NEXT: sllw a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -705,7 +705,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64ZBB-LABEL: rotr_32_mask_and_63_and_31:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: srlw a2, a0, a1
-; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: negw a1, a1
; RV64ZBB-NEXT: sllw a0, a0, a1
; RV64ZBB-NEXT: or a0, a2, a0
; RV64ZBB-NEXT: ret
@@ -721,7 +721,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_32_mask_and_63_and_31:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srlw a2, a0, a1
-; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: negw a1, a1
; RV64XTHEADBB-NEXT: sllw a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -829,7 +829,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotl_64_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -884,7 +884,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64ZBB-LABEL: rotl_64_mask:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: negw a2, a1
; RV64ZBB-NEXT: sll a1, a0, a1
; RV64ZBB-NEXT: srl a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -939,7 +939,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotl_64_mask:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: sll a1, a0, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -1005,7 +1005,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64I-LABEL: rotl_64_mask_and_127_and_63:
; RV64I: # %bb.0:
; RV64I-NEXT: sll a2, a0, a1
-; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: negw a1, a1
; RV64I-NEXT: srl a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -1062,7 +1062,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64ZBB-LABEL: rotl_64_mask_and_127_and_63:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: sll a2, a0, a1
-; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: negw a1, a1
; RV64ZBB-NEXT: srl a0, a0, a1
; RV64ZBB-NEXT: or a0, a2, a0
; RV64ZBB-NEXT: ret
@@ -1119,7 +1119,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_64_mask_and_127_and_63:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sll a2, a0, a1
-; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: negw a1, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -1277,7 +1277,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotr_64_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -1331,7 +1331,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64ZBB-LABEL: rotr_64_mask:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: neg a2, a1
+; RV64ZBB-NEXT: negw a2, a1
; RV64ZBB-NEXT: srl a1, a0, a1
; RV64ZBB-NEXT: sll a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -1385,7 +1385,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotr_64_mask:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: neg a2, a1
+; RV64XTHEADBB-NEXT: negw a2, a1
; RV64XTHEADBB-NEXT: srl a1, a0, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -1451,7 +1451,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64I-LABEL: rotr_64_mask_and_127_and_63:
; RV64I: # %bb.0:
; RV64I-NEXT: srl a2, a0, a1
-; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: negw a1, a1
; RV64I-NEXT: sll a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -1508,7 +1508,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64ZBB-LABEL: rotr_64_mask_and_127_and_63:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: srl a2, a0, a1
-; RV64ZBB-NEXT: neg a1, a1
+; RV64ZBB-NEXT: negw a1, a1
; RV64ZBB-NEXT: sll a0, a0, a1
; RV64ZBB-NEXT: or a0, a2, a0
; RV64ZBB-NEXT: ret
@@ -1565,7 +1565,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_64_mask_and_127_and_63:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srl a2, a0, a1
-; RV64XTHEADBB-NEXT: neg a1, a1
+; RV64XTHEADBB-NEXT: negw a1, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -1701,7 +1701,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64I: # %bb.0:
; RV64I-NEXT: andi a3, a2, 31
; RV64I-NEXT: sllw a4, a0, a2
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: srlw a0, a0, a3
; RV64I-NEXT: or a0, a4, a0
; RV64I-NEXT: sllw a1, a1, a2
@@ -1737,7 +1737,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: andi a3, a2, 31
; RV64XTHEADBB-NEXT: sllw a4, a0, a2
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: srlw a0, a0, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
; RV64XTHEADBB-NEXT: sllw a1, a1, a2
@@ -1822,7 +1822,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64I: # %bb.0:
; RV64I-NEXT: andi a3, a2, 63
; RV64I-NEXT: sll a4, a0, a2
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: srl a0, a0, a3
; RV64I-NEXT: or a0, a4, a0
; RV64I-NEXT: sll a1, a1, a2
@@ -1972,7 +1972,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: andi a3, a2, 63
; RV64XTHEADBB-NEXT: sll a4, a0, a2
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: srl a0, a0, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
; RV64XTHEADBB-NEXT: sll a1, a1, a2
@@ -2002,7 +2002,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64I: # %bb.0:
; RV64I-NEXT: andi a3, a2, 31
; RV64I-NEXT: srlw a4, a0, a2
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: sllw a0, a0, a3
; RV64I-NEXT: or a0, a4, a0
; RV64I-NEXT: sllw a1, a1, a2
@@ -2038,7 +2038,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: andi a3, a2, 31
; RV64XTHEADBB-NEXT: srlw a4, a0, a2
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: sllw a0, a0, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
; RV64XTHEADBB-NEXT: sllw a1, a1, a2
@@ -2125,7 +2125,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64I: # %bb.0:
; RV64I-NEXT: andi a3, a2, 63
; RV64I-NEXT: srl a4, a0, a2
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: sll a0, a0, a3
; RV64I-NEXT: or a0, a4, a0
; RV64I-NEXT: sll a1, a1, a2
@@ -2279,7 +2279,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: andi a3, a2, 63
; RV64XTHEADBB-NEXT: srl a4, a0, a2
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: sll a0, a0, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
; RV64XTHEADBB-NEXT: sll a1, a1, a2
@@ -2312,8 +2312,8 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64I-NEXT: andi a3, a2, 31
; RV64I-NEXT: sllw a4, a0, a2
; RV64I-NEXT: sllw a2, a1, a2
-; RV64I-NEXT: neg a5, a3
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a5, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: srlw a0, a0, a5
; RV64I-NEXT: srlw a1, a1, a3
; RV64I-NEXT: or a0, a4, a0
@@ -2353,8 +2353,8 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64XTHEADBB-NEXT: andi a3, a2, 31
; RV64XTHEADBB-NEXT: sllw a4, a0, a2
; RV64XTHEADBB-NEXT: sllw a2, a1, a2
-; RV64XTHEADBB-NEXT: neg a5, a3
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a5, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: srlw a0, a0, a5
; RV64XTHEADBB-NEXT: srlw a1, a1, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
@@ -2464,7 +2464,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64I-NEXT: andi a3, a2, 63
; RV64I-NEXT: sll a4, a0, a2
; RV64I-NEXT: sll a2, a1, a2
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: srl a0, a0, a3
; RV64I-NEXT: srl a1, a1, a3
; RV64I-NEXT: or a0, a4, a0
@@ -2664,7 +2664,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64XTHEADBB-NEXT: andi a3, a2, 63
; RV64XTHEADBB-NEXT: sll a4, a0, a2
; RV64XTHEADBB-NEXT: sll a2, a1, a2
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: srl a0, a0, a3
; RV64XTHEADBB-NEXT: srl a1, a1, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
@@ -2697,8 +2697,8 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64I-NEXT: andi a3, a2, 31
; RV64I-NEXT: srlw a4, a0, a2
; RV64I-NEXT: srlw a2, a1, a2
-; RV64I-NEXT: neg a5, a3
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a5, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: sllw a0, a0, a5
; RV64I-NEXT: sllw a1, a1, a3
; RV64I-NEXT: or a0, a4, a0
@@ -2738,8 +2738,8 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64XTHEADBB-NEXT: andi a3, a2, 31
; RV64XTHEADBB-NEXT: srlw a4, a0, a2
; RV64XTHEADBB-NEXT: srlw a2, a1, a2
-; RV64XTHEADBB-NEXT: neg a5, a3
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a5, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: sllw a0, a0, a5
; RV64XTHEADBB-NEXT: sllw a1, a1, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
@@ -2850,7 +2850,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64I-NEXT: andi a3, a2, 63
; RV64I-NEXT: srl a4, a0, a2
; RV64I-NEXT: srl a2, a1, a2
-; RV64I-NEXT: neg a3, a3
+; RV64I-NEXT: negw a3, a3
; RV64I-NEXT: sll a0, a0, a3
; RV64I-NEXT: sll a1, a1, a3
; RV64I-NEXT: or a0, a4, a0
@@ -3052,7 +3052,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64XTHEADBB-NEXT: andi a3, a2, 63
; RV64XTHEADBB-NEXT: srl a4, a0, a2
; RV64XTHEADBB-NEXT: srl a2, a1, a2
-; RV64XTHEADBB-NEXT: neg a3, a3
+; RV64XTHEADBB-NEXT: negw a3, a3
; RV64XTHEADBB-NEXT: sll a0, a0, a3
; RV64XTHEADBB-NEXT: sll a1, a1, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
@@ -3116,7 +3116,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
; RV64I-LABEL: rotl_64_zext:
; RV64I: # %bb.0:
; RV64I-NEXT: li a2, 64
-; RV64I-NEXT: sub a2, a2, a1
+; RV64I-NEXT: subw a2, a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -3171,7 +3171,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
; RV64ZBB-LABEL: rotl_64_zext:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: li a2, 64
-; RV64ZBB-NEXT: sub a2, a2, a1
+; RV64ZBB-NEXT: subw a2, a2, a1
; RV64ZBB-NEXT: sll a1, a0, a1
; RV64ZBB-NEXT: srl a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -3226,7 +3226,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_64_zext:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: li a2, 64
-; RV64XTHEADBB-NEXT: sub a2, a2, a1
+; RV64XTHEADBB-NEXT: subw a2, a2, a1
; RV64XTHEADBB-NEXT: sll a1, a0, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -3289,7 +3289,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
; RV64I-LABEL: rotr_64_zext:
; RV64I: # %bb.0:
; RV64I-NEXT: li a2, 64
-; RV64I-NEXT: sub a2, a2, a1
+; RV64I-NEXT: subw a2, a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -3343,7 +3343,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
; RV64ZBB-LABEL: rotr_64_zext:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: li a2, 64
-; RV64ZBB-NEXT: sub a2, a2, a1
+; RV64ZBB-NEXT: subw a2, a2, a1
; RV64ZBB-NEXT: srl a1, a0, a1
; RV64ZBB-NEXT: sll a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -3397,7 +3397,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_64_zext:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: li a2, 64
-; RV64XTHEADBB-NEXT: sub a2, a2, a1
+; RV64XTHEADBB-NEXT: subw a2, a2, a1
; RV64XTHEADBB-NEXT: srl a1, a0, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll
index a29219bfde06bb..79d08772e88532 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll
@@ -107,7 +107,7 @@ declare i32 @llvm.fshl.i32(i32, i32, i32)
define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: rol_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: sllw a1, a0, a1
; RV64I-NEXT: srlw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -125,7 +125,7 @@ define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind {
define void @rol_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind {
; RV64I-LABEL: rol_i32_nosext:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a3, a1
+; RV64I-NEXT: negw a3, a1
; RV64I-NEXT: sllw a1, a0, a1
; RV64I-NEXT: srlw a0, a0, a3
; RV64I-NEXT: or a0, a1, a0
@@ -146,7 +146,7 @@ define signext i32 @rol_i32_neg_constant_rhs(i32 signext %a) nounwind {
; RV64I-LABEL: rol_i32_neg_constant_rhs:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, -2
-; RV64I-NEXT: neg a2, a0
+; RV64I-NEXT: negw a2, a0
; RV64I-NEXT: sllw a0, a1, a0
; RV64I-NEXT: srlw a1, a1, a2
; RV64I-NEXT: or a0, a0, a1
@@ -166,7 +166,7 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
define i64 @rol_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: rol_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -185,7 +185,7 @@ declare i32 @llvm.fshr.i32(i32, i32, i32)
define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: ror_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: srlw a1, a0, a1
; RV64I-NEXT: sllw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -203,7 +203,7 @@ define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind {
define void @ror_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind {
; RV64I-LABEL: ror_i32_nosext:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a3, a1
+; RV64I-NEXT: negw a3, a1
; RV64I-NEXT: srlw a1, a0, a1
; RV64I-NEXT: sllw a0, a0, a3
; RV64I-NEXT: or a0, a1, a0
@@ -224,7 +224,7 @@ define signext i32 @ror_i32_neg_constant_rhs(i32 signext %a) nounwind {
; RV64I-LABEL: ror_i32_neg_constant_rhs:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, -2
-; RV64I-NEXT: neg a2, a0
+; RV64I-NEXT: negw a2, a0
; RV64I-NEXT: srlw a0, a1, a0
; RV64I-NEXT: sllw a1, a1, a2
; RV64I-NEXT: or a0, a0, a1
@@ -244,7 +244,7 @@ declare i64 @llvm.fshr.i64(i64, i64, i64)
define i64 @ror_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: ror_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
index 9df319e73a11a3..9a6c718703a27a 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
@@ -31,13 +31,13 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -88,13 +88,13 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -103,7 +103,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srliw a0, a0, 24
; RV64I-NEXT: li a1, 32
-; RV64I-NEXT: sub a0, a1, a0
+; RV64I-NEXT: subw a0, a1, a0
; RV64I-NEXT: j .LBB1_3
; RV64I-NEXT: .LBB1_2:
; RV64I-NEXT: li a0, 32
@@ -153,13 +153,13 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -168,7 +168,7 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srliw a0, a0, 24
; RV64I-NEXT: li a1, 32
-; RV64I-NEXT: sub a1, a1, a0
+; RV64I-NEXT: subw a1, a1, a0
; RV64I-NEXT: .LBB2_2: # %cond.end
; RV64I-NEXT: subw a0, s0, a1
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
@@ -212,13 +212,13 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -283,13 +283,13 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -412,13 +412,13 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -455,13 +455,13 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -497,13 +497,13 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -553,13 +553,13 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -672,13 +672,13 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -709,13 +709,13 @@ define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
@@ -750,13 +750,13 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: sraiw a1, a0, 4
; RV64I-NEXT: addw a0, a1, a0
; RV64I-NEXT: lui a1, 4112
; RV64I-NEXT: addiw a2, a2, -241
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
index bf430c618afca2..558424b53be951 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
@@ -16,9 +16,7 @@ define signext i32 @pack_i32(i32 signext %a, i32 signext %b) nounwind {
;
; RV64ZBKB-LABEL: pack_i32:
; RV64ZBKB: # %bb.0:
-; RV64ZBKB-NEXT: zext.h a0, a0
-; RV64ZBKB-NEXT: slliw a1, a1, 16
-; RV64ZBKB-NEXT: or a0, a1, a0
+; RV64ZBKB-NEXT: packw a0, a0, a1
; RV64ZBKB-NEXT: ret
%shl = and i32 %a, 65535
%shl1 = shl i32 %b, 16
@@ -37,9 +35,7 @@ define signext i32 @pack_i32_2(i16 zeroext %a, i16 zeroext %b) nounwind {
;
; RV64ZBKB-LABEL: pack_i32_2:
; RV64ZBKB: # %bb.0:
-; RV64ZBKB-NEXT: slli a1, a1, 16
-; RV64ZBKB-NEXT: or a0, a1, a0
-; RV64ZBKB-NEXT: sext.w a0, a0
+; RV64ZBKB-NEXT: packw a0, a0, a1
; RV64ZBKB-NEXT: ret
%zexta = zext i16 %a to i32
%zextb = zext i16 %b to i32
@@ -60,8 +56,7 @@ define signext i32 @pack_i32_3(i16 zeroext %0, i16 zeroext %1, i32 signext %2) {
;
; RV64ZBKB-LABEL: pack_i32_3:
; RV64ZBKB: # %bb.0:
-; RV64ZBKB-NEXT: slli a0, a0, 16
-; RV64ZBKB-NEXT: or a0, a0, a1
+; RV64ZBKB-NEXT: packw a0, a1, a0
; RV64ZBKB-NEXT: addw a0, a0, a2
; RV64ZBKB-NEXT: ret
%4 = zext i16 %0 to i32
@@ -343,9 +338,7 @@ define signext i32 @pack_i32_allWUsers(i16 zeroext %0, i16 zeroext %1, i16 zeroe
; RV64ZBKB: # %bb.0:
; RV64ZBKB-NEXT: add a0, a1, a0
; RV64ZBKB-NEXT: zext.h a0, a0
-; RV64ZBKB-NEXT: slli a0, a0, 16
-; RV64ZBKB-NEXT: or a0, a0, a2
-; RV64ZBKB-NEXT: sext.w a0, a0
+; RV64ZBKB-NEXT: packw a0, a2, a0
; RV64ZBKB-NEXT: ret
%4 = add i16 %1, %0
%5 = zext i16 %4 to i32
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
index 71a5ecc77a1b08..8b262db56ccd27 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
@@ -330,7 +330,7 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: li a3, 64
; RV64I-NEXT: bltu a2, a3, .LBB6_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sub a4, a2, a3
+; RV64I-NEXT: subw a4, a2, a3
; RV64I-NEXT: srl a4, a1, a4
; RV64I-NEXT: bnez a2, .LBB6_3
; RV64I-NEXT: j .LBB6_4
@@ -476,7 +476,7 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: li a3, 64
; RV64I-NEXT: bltu a2, a3, .LBB7_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sub a4, a2, a3
+; RV64I-NEXT: subw a4, a2, a3
; RV64I-NEXT: sra a4, a1, a4
; RV64I-NEXT: bnez a2, .LBB7_3
; RV64I-NEXT: j .LBB7_4
@@ -615,7 +615,7 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: bltu a2, a4, .LBB8_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a0, 0
-; RV64I-NEXT: sub a4, a2, a4
+; RV64I-NEXT: subw a4, a2, a4
; RV64I-NEXT: sll a3, a3, a4
; RV64I-NEXT: bnez a2, .LBB8_3
; RV64I-NEXT: j .LBB8_4
@@ -685,7 +685,7 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind {
;
; RV64I-LABEL: fshr64_minsize:
; RV64I: # %bb.0:
-; RV64I-NEXT: neg a2, a1
+; RV64I-NEXT: negw a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -914,7 +914,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: li a4, 64
; RV64I-NEXT: bltu a5, a4, .LBB10_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sub a3, a5, a4
+; RV64I-NEXT: subw a3, a5, a4
; RV64I-NEXT: srl a6, a1, a3
; RV64I-NEXT: j .LBB10_3
; RV64I-NEXT: .LBB10_2:
@@ -928,7 +928,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: # %bb.4:
; RV64I-NEXT: mv a3, a6
; RV64I-NEXT: .LBB10_5:
-; RV64I-NEXT: neg a7, a2
+; RV64I-NEXT: negw a7, a2
; RV64I-NEXT: bltu a5, a4, .LBB10_7
; RV64I-NEXT: # %bb.6:
; RV64I-NEXT: li a2, 0
@@ -940,7 +940,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: bltu a6, a4, .LBB10_10
; RV64I-NEXT: # %bb.9:
; RV64I-NEXT: li a5, 0
-; RV64I-NEXT: sub a4, a6, a4
+; RV64I-NEXT: subw a4, a6, a4
; RV64I-NEXT: sll a0, a0, a4
; RV64I-NEXT: bnez a6, .LBB10_11
; RV64I-NEXT: j .LBB10_12
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
index 4ede6932428989..bc002fee4417c5 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -758,7 +758,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB6_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: subw a5, a1, a4
; RV64I-NEXT: srl a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB6_3
; RV64I-NEXT: j .LBB6_4
@@ -1091,7 +1091,7 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB7_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: subw a5, a1, a4
; RV64I-NEXT: srl a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB7_3
; RV64I-NEXT: j .LBB7_4
@@ -1425,7 +1425,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu a3, a5, .LBB8_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a1, 0
-; RV64I-NEXT: sub a5, a3, a5
+; RV64I-NEXT: subw a5, a3, a5
; RV64I-NEXT: sll a4, a4, a5
; RV64I-NEXT: bnez a3, .LBB8_3
; RV64I-NEXT: j .LBB8_4
@@ -1754,7 +1754,7 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: bltu a3, a5, .LBB9_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a1, 0
-; RV64I-NEXT: sub a5, a3, a5
+; RV64I-NEXT: subw a5, a3, a5
; RV64I-NEXT: sll a4, a4, a5
; RV64I-NEXT: bnez a3, .LBB9_3
; RV64I-NEXT: j .LBB9_4
@@ -2083,7 +2083,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB10_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: subw a5, a1, a4
; RV64I-NEXT: sra a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB10_3
; RV64I-NEXT: j .LBB10_4
@@ -2416,7 +2416,7 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB11_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sub a5, a1, a4
+; RV64I-NEXT: subw a5, a1, a4
; RV64I-NEXT: sra a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB11_3
; RV64I-NEXT: j .LBB11_4
@@ -2796,7 +2796,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 3
-; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: subw t1, a5, a7
; RV64I-NEXT: negw t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB12_2
@@ -2851,7 +2851,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu t6, a7, .LBB12_12
; RV64I-NEXT: .LBB12_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: sub t3, t6, a7
+; RV64I-NEXT: subw t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB12_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -2862,7 +2862,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: .LBB12_17:
; RV64I-NEXT: bltu s0, a7, .LBB12_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: sub t6, s0, a7
+; RV64I-NEXT: subw t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB12_20
; RV64I-NEXT: j .LBB12_21
@@ -3720,7 +3720,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 5
-; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: subw t1, a5, a7
; RV64I-NEXT: negw t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB13_2
@@ -3775,7 +3775,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bltu t6, a7, .LBB13_12
; RV64I-NEXT: .LBB13_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: sub t3, t6, a7
+; RV64I-NEXT: subw t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB13_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -3786,7 +3786,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: .LBB13_17:
; RV64I-NEXT: bltu s0, a7, .LBB13_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: sub t6, s0, a7
+; RV64I-NEXT: subw t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB13_20
; RV64I-NEXT: j .LBB13_21
@@ -4644,7 +4644,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 6
-; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: subw t1, a5, a7
; RV64I-NEXT: negw t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB14_2
@@ -4699,7 +4699,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bltu t6, a7, .LBB14_12
; RV64I-NEXT: .LBB14_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: sub t3, t6, a7
+; RV64I-NEXT: subw t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB14_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -4710,7 +4710,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: .LBB14_17:
; RV64I-NEXT: bltu s0, a7, .LBB14_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: sub t6, s0, a7
+; RV64I-NEXT: subw t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB14_20
; RV64I-NEXT: j .LBB14_21
@@ -5542,7 +5542,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 3
-; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: subw t2, a6, t0
; RV64I-NEXT: negw t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB15_2
@@ -5585,7 +5585,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB15_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: sub s0, a4, t0
+; RV64I-NEXT: subw s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB15_8
; RV64I-NEXT: .LBB15_7:
@@ -5637,7 +5637,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu s0, t0, .LBB15_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: sub t0, s0, t0
+; RV64I-NEXT: subw t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB15_21
; RV64I-NEXT: j .LBB15_22
@@ -6456,7 +6456,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 5
-; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: subw t2, a6, t0
; RV64I-NEXT: negw t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB16_2
@@ -6499,7 +6499,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB16_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: sub s0, a4, t0
+; RV64I-NEXT: subw s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB16_8
; RV64I-NEXT: .LBB16_7:
@@ -6551,7 +6551,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: bltu s0, t0, .LBB16_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: sub t0, s0, t0
+; RV64I-NEXT: subw t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB16_21
; RV64I-NEXT: j .LBB16_22
@@ -7370,7 +7370,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 6
-; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: subw t2, a6, t0
; RV64I-NEXT: negw t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB17_2
@@ -7413,7 +7413,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB17_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: sub s0, a4, t0
+; RV64I-NEXT: subw s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB17_8
; RV64I-NEXT: .LBB17_7:
@@ -7465,7 +7465,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: bltu s0, t0, .LBB17_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: sub t0, s0, t0
+; RV64I-NEXT: subw t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB17_21
; RV64I-NEXT: j .LBB17_22
@@ -8310,7 +8310,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 3
-; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: subw t1, a6, t0
; RV64I-NEXT: negw t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB18_2
@@ -8365,7 +8365,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu t6, t0, .LBB18_12
; RV64I-NEXT: .LBB18_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: sub t4, t6, t0
+; RV64I-NEXT: subw t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB18_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -8376,7 +8376,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: .LBB18_17:
; RV64I-NEXT: bltu s0, t0, .LBB18_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: sub t6, s0, t0
+; RV64I-NEXT: subw t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB18_20
; RV64I-NEXT: j .LBB18_21
@@ -9241,7 +9241,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 5
-; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: subw t1, a6, t0
; RV64I-NEXT: negw t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB19_2
@@ -9296,7 +9296,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bltu t6, t0, .LBB19_12
; RV64I-NEXT: .LBB19_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: sub t4, t6, t0
+; RV64I-NEXT: subw t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB19_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -9307,7 +9307,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: .LBB19_17:
; RV64I-NEXT: bltu s0, t0, .LBB19_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: sub t6, s0, t0
+; RV64I-NEXT: subw t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB19_20
; RV64I-NEXT: j .LBB19_21
@@ -10172,7 +10172,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 6
-; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: subw t1, a6, t0
; RV64I-NEXT: negw t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB20_2
@@ -10227,7 +10227,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bltu t6, t0, .LBB20_12
; RV64I-NEXT: .LBB20_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: sub t4, t6, t0
+; RV64I-NEXT: subw t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB20_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -10238,7 +10238,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: .LBB20_17:
; RV64I-NEXT: bltu s0, t0, .LBB20_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: sub t6, s0, t0
+; RV64I-NEXT: subw t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB20_20
; RV64I-NEXT: j .LBB20_21