[llvm] f27017a - [LoongArch] Align functions and loops better according to uarch

Weining Lu via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 18 20:26:32 PDT 2023


Author: WANG Xuerui
Date: 2023-07-19T11:25:57+08:00
New Revision: f27017a06313455fa346f0aae95a27100202c728

URL: https://github.com/llvm/llvm-project/commit/f27017a06313455fa346f0aae95a27100202c728
DIFF: https://github.com/llvm/llvm-project/commit/f27017a06313455fa346f0aae95a27100202c728.diff

LOG: [LoongArch] Align functions and loops better according to uarch

The LA464 micro-architecture is very sensitive to the alignment of hot
code, with performance variation of up to ~12% in the go1 benchmark
suite of the Go language (as observed by me during my work on the Go
loong64 port).
[[ https://go.dev/cl/479816 | Manual alignment of certain loops ]] and [[ https://go.dev/cl/479817 | automatic alignment of loop heads ]]
help a lot there, reducing much of the random variation and generally
increasing performance, so we naturally want to do the same here.

Practically, LA464 is the only LoongArch micro-architecture in wide use,
and it is the only one we currently support. The first "4" in "LA464"
stands for "4-issue"; in particular, its instruction fetch and decode
stages are 4-wide, so functions and branch targets should preferably be
aligned to at least 16 bytes for best throughput.
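The arithmetic behind the 16-byte figure can be sketched as follows (an
illustration, not part of the patch): LoongArch instructions are a fixed
4 bytes, so a 4-wide fetch stage consumes 16-byte aligned fetch groups,
and an entry point that is not 16-byte aligned wastes the leading slots
of its first fetch group.

```python
# Sketch (assumed model, not from the patch): 4-byte fixed-width
# instructions fetched by a 4-wide front end, i.e. 16-byte fetch groups.
INSN_BYTES = 4
FETCH_WIDTH = 4
GROUP_BYTES = INSN_BYTES * FETCH_WIDTH  # 16

def wasted_fetch_slots(entry_addr: int) -> int:
    """Instruction slots lost in the first fetch group when execution
    enters at entry_addr: slots before the entry point carry no useful
    instructions."""
    return (entry_addr % GROUP_BYTES) // INSN_BYTES

# A 16-byte aligned entry wastes no slots; an entry 12 bytes into a
# fetch group wastes 3 of the 4 slots on the first fetch.
print(wasted_fetch_slots(0x1000))  # 0
print(wasted_fetch_slots(0x100C))  # 3
```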

The Loongson team has benchmarked various combinations of function,
loop, and branch target alignments with GCC.
[[ https://gcc.gnu.org/pipermail/gcc-patches/2023-May/619980.html | The results ]]
show that "16-byte label alignment together with 32-byte function
alignment gives best results in terms of SPEC score". A "label" in GCC
means a branch target; while we don't currently align branch targets,
we do align loops, so in this patch we default to 32-byte function
alignment and 16-byte loop alignment.
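Concretely, the 16-byte loop alignment with MaxBytesForAlignment = 16
shows up in the updated tests as `.p2align 4, , 16`. As a sketch of the
directive's semantics (per the usual GNU-as reading of
`.p2align ALIGN, FILL, MAX`, stated here as an assumption rather than
taken from the patch): pad the current offset up to a 2^4 = 16 byte
boundary, but only if the padding required does not exceed the MAX
budget.

```python
# Sketch (assumed .p2align semantics): align `offset` to 2**align_pow2
# bytes, but skip the alignment entirely if it would cost more than
# max_bytes of padding.
def p2align(offset: int, align_pow2: int, max_bytes: int) -> int:
    align = 1 << align_pow2
    pad = -offset % align  # bytes needed to reach the next boundary
    return offset + pad if pad <= max_bytes else offset

# ".p2align 4, , 16": a loop head at offset 20 is padded 12 bytes up to
# 32, well within the 16-byte budget.
print(p2align(20, 4, 16))  # 32
# With a tighter budget the padding would be skipped instead.
print(p2align(20, 4, 8))   # 20
```

Note that for 16-byte alignment the padding never exceeds 15 bytes, so a
16-byte budget always permits it; the MAX operand only starts suppressing
padding at larger alignments or smaller budgets.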

Reviewed By: SixWeining

Differential Revision: https://reviews.llvm.org/D148622

Added: 
    llvm/test/CodeGen/LoongArch/preferred-alignments.ll

Modified: 
    llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
    llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp
    llvm/lib/Target/LoongArch/LoongArchSubtarget.h
    llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
    llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
    llvm/test/CodeGen/LoongArch/ir-instruction/br.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 62ce24d942fa63..4977b8e6f9878a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -199,6 +199,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
 
   // Function alignments.
   setMinFunctionAlignment(Align(4));
+  // Set preferred alignments.
+  setPrefFunctionAlignment(Subtarget.getPrefFunctionAlignment());
+  setPrefLoopAlignment(Subtarget.getPrefLoopAlignment());
+  setMaxBytesForAlignment(Subtarget.getMaxBytesForAlignment());
 
   setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::OR);

diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp
index a0136440ec0ee5..ffcde7dd1fa741 100644
--- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp
@@ -35,6 +35,7 @@ LoongArchSubtarget &LoongArchSubtarget::initializeSubtargetDependencies(
     TuneCPU = CPU;
 
   ParseSubtargetFeatures(CPU, TuneCPU, FS);
+  initializeProperties(TuneCPU);
   if (Is64Bit) {
     GRLenVT = MVT::i64;
     GRLen = 64;
@@ -54,6 +55,32 @@ LoongArchSubtarget &LoongArchSubtarget::initializeSubtargetDependencies(
   return *this;
 }
 
+void LoongArchSubtarget::initializeProperties(StringRef TuneCPU) {
+  // Initialize CPU specific properties. We should add a tablegen feature for
+  // this in the future so we can specify it together with the subtarget
+  // features.
+
+  // TODO: Check TuneCPU and override defaults (that are for LA464) once we
+  // support optimizing for more uarchs.
+
+  // Default to the alignment settings empirically confirmed to perform best
+  // on LA464, with 4-wide instruction fetch and decode stages. These settings
+  // can also be overridden in initializeProperties.
+  //
+  // We default to such higher-than-minimum alignments because we assume that:
+  //
+  // * these settings should benefit most existing uarchs/users,
+  // * future general-purpose LoongArch cores are likely to have issue widths
+  //   equal to or wider than 4,
+  // * instruction sequences best for LA464 should not pessimize other future
+  //   uarchs, and
+  // * narrower cores would not suffer much (aside from slightly increased
+  //   ICache footprint maybe), compared to the gains everywhere else.
+  PrefFunctionAlignment = Align(32);
+  PrefLoopAlignment = Align(16);
+  MaxBytesForAlignment = 16;
+}
+
 LoongArchSubtarget::LoongArchSubtarget(const Triple &TT, StringRef CPU,
                                        StringRef TuneCPU, StringRef FS,
                                        StringRef ABIName,

diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
index 6ff562acf11b32..0fbe23f2f62d9d 100644
--- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
+++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
@@ -52,6 +52,10 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo {
   LoongArchTargetLowering TLInfo;
   SelectionDAGTargetInfo TSInfo;
 
+  Align PrefFunctionAlignment;
+  Align PrefLoopAlignment;
+  unsigned MaxBytesForAlignment;
+
   /// Initializes using the passed in CPU and feature strings so that we can
   /// use initializer lists for subtarget initialization.
   LoongArchSubtarget &initializeSubtargetDependencies(const Triple &TT,
@@ -60,6 +64,9 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo {
                                                       StringRef FS,
                                                       StringRef ABIName);
 
+  /// Initialize properties based on the selected processor family.
+  void initializeProperties(StringRef TuneCPU);
+
 public:
   // Initializes the data members to match that of the specified triple.
   LoongArchSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU,
@@ -97,6 +104,9 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo {
   unsigned getGRLen() const { return GRLen; }
   LoongArchABI::ABI getTargetABI() const { return TargetABI; }
   bool isXRaySupported() const override { return is64Bit(); }
+  Align getPrefFunctionAlignment() const { return PrefFunctionAlignment; }
+  Align getPrefLoopAlignment() const { return PrefLoopAlignment; }
+  unsigned getMaxBytesForAlignment() const { return MaxBytesForAlignment; }
 };
 } // end namespace llvm
 

diff --git a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
index 67823343d1c809..ad264ac53fa250 100644
--- a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
@@ -13,6 +13,7 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; LA64-NEXT:    andi $a0, $a0, 24
 ; LA64-NEXT:    nor $a4, $a4, $zero
 ; LA64-NEXT:    andi $a1, $a1, 255
+; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB0_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB0_3 Depth 2
@@ -66,6 +67,7 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; LA64-NEXT:    andi $a0, $a0, 24
 ; LA64-NEXT:    nor $a4, $a4, $zero
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
+; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB1_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB1_3 Depth 2
@@ -111,6 +113,7 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    ld.w $a3, $a0, 0
 ; LA64-NEXT:    bstrpick.d $a2, $a1, 31, 0
+; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB2_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB2_3 Depth 2
@@ -150,6 +153,7 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; LA64-LABEL: atomicrmw_uinc_wrap_i64:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    ld.d $a2, $a0, 0
+; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB3_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB3_3 Depth 2
@@ -195,6 +199,7 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; LA64-NEXT:    andi $a0, $a0, 24
 ; LA64-NEXT:    nor $a4, $a4, $zero
 ; LA64-NEXT:    andi $a5, $a1, 255
+; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB4_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB4_3 Depth 2
@@ -253,6 +258,7 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; LA64-NEXT:    andi $a0, $a0, 24
 ; LA64-NEXT:    nor $a4, $a4, $zero
 ; LA64-NEXT:    bstrpick.d $a5, $a1, 15, 0
+; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB5_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB5_3 Depth 2
@@ -303,6 +309,7 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    ld.w $a4, $a0, 0
 ; LA64-NEXT:    bstrpick.d $a3, $a1, 31, 0
+; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB6_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB6_3 Depth 2
@@ -347,6 +354,7 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; LA64-LABEL: atomicrmw_udec_wrap_i64:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    ld.d $a2, $a0, 0
+; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB7_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB7_3 Depth 2

diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
index 55ca08b5ed0e12..9767717395b67e 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
@@ -11,6 +11,7 @@ define float @float_fadd_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    addi.w $a1, $zero, 1
 ; LA64F-NEXT:    movgr2fr.w $fa1, $a1
 ; LA64F-NEXT:    ffint.s.w $fa1, $fa1
+; LA64F-NEXT:    .p2align 4, , 16
 ; LA64F-NEXT:  .LBB0_1: # %atomicrmw.start
 ; LA64F-NEXT:    # =>This Loop Header: Depth=1
 ; LA64F-NEXT:    # Child Loop BB0_3 Depth 2
@@ -46,6 +47,7 @@ define float @float_fadd_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    addi.w $a1, $zero, 1
 ; LA64D-NEXT:    movgr2fr.w $fa1, $a1
 ; LA64D-NEXT:    ffint.s.w $fa1, $fa1
+; LA64D-NEXT:    .p2align 4, , 16
 ; LA64D-NEXT:  .LBB0_1: # %atomicrmw.start
 ; LA64D-NEXT:    # =>This Loop Header: Depth=1
 ; LA64D-NEXT:    # Child Loop BB0_3 Depth 2
@@ -85,6 +87,7 @@ define float @float_fsub_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI1_0)
 ; LA64F-NEXT:    addi.d $a1, $a1, %pc_lo12(.LCPI1_0)
 ; LA64F-NEXT:    fld.s $fa1, $a1, 0
+; LA64F-NEXT:    .p2align 4, , 16
 ; LA64F-NEXT:  .LBB1_1: # %atomicrmw.start
 ; LA64F-NEXT:    # =>This Loop Header: Depth=1
 ; LA64F-NEXT:    # Child Loop BB1_3 Depth 2
@@ -120,6 +123,7 @@ define float @float_fsub_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI1_0)
 ; LA64D-NEXT:    addi.d $a1, $a1, %pc_lo12(.LCPI1_0)
 ; LA64D-NEXT:    fld.s $fa1, $a1, 0
+; LA64D-NEXT:    .p2align 4, , 16
 ; LA64D-NEXT:  .LBB1_1: # %atomicrmw.start
 ; LA64D-NEXT:    # =>This Loop Header: Depth=1
 ; LA64D-NEXT:    # Child Loop BB1_3 Depth 2
@@ -159,6 +163,7 @@ define float @float_fmin_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    addi.w $a1, $zero, 1
 ; LA64F-NEXT:    movgr2fr.w $fa1, $a1
 ; LA64F-NEXT:    ffint.s.w $fa1, $fa1
+; LA64F-NEXT:    .p2align 4, , 16
 ; LA64F-NEXT:  .LBB2_1: # %atomicrmw.start
 ; LA64F-NEXT:    # =>This Loop Header: Depth=1
 ; LA64F-NEXT:    # Child Loop BB2_3 Depth 2
@@ -195,6 +200,7 @@ define float @float_fmin_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    addi.w $a1, $zero, 1
 ; LA64D-NEXT:    movgr2fr.w $fa1, $a1
 ; LA64D-NEXT:    ffint.s.w $fa1, $fa1
+; LA64D-NEXT:    .p2align 4, , 16
 ; LA64D-NEXT:  .LBB2_1: # %atomicrmw.start
 ; LA64D-NEXT:    # =>This Loop Header: Depth=1
 ; LA64D-NEXT:    # Child Loop BB2_3 Depth 2
@@ -235,6 +241,7 @@ define float @float_fmax_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    addi.w $a1, $zero, 1
 ; LA64F-NEXT:    movgr2fr.w $fa1, $a1
 ; LA64F-NEXT:    ffint.s.w $fa1, $fa1
+; LA64F-NEXT:    .p2align 4, , 16
 ; LA64F-NEXT:  .LBB3_1: # %atomicrmw.start
 ; LA64F-NEXT:    # =>This Loop Header: Depth=1
 ; LA64F-NEXT:    # Child Loop BB3_3 Depth 2
@@ -271,6 +278,7 @@ define float @float_fmax_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    addi.w $a1, $zero, 1
 ; LA64D-NEXT:    movgr2fr.w $fa1, $a1
 ; LA64D-NEXT:    ffint.s.w $fa1, $fa1
+; LA64D-NEXT:    .p2align 4, , 16
 ; LA64D-NEXT:  .LBB3_1: # %atomicrmw.start
 ; LA64D-NEXT:    # =>This Loop Header: Depth=1
 ; LA64D-NEXT:    # Child Loop BB3_3 Depth 2
@@ -322,6 +330,7 @@ define double @double_fadd_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    addi.d $s2, $sp, 16
 ; LA64F-NEXT:    addi.d $s3, $sp, 8
 ; LA64F-NEXT:    ori $s4, $zero, 2
+; LA64F-NEXT:    .p2align 4, , 16
 ; LA64F-NEXT:  .LBB4_1: # %atomicrmw.start
 ; LA64F-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA64F-NEXT:    st.d $a0, $sp, 16
@@ -368,6 +377,7 @@ define double @double_fadd_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    addi.d $s1, $sp, 16
 ; LA64D-NEXT:    addi.d $s2, $sp, 8
 ; LA64D-NEXT:    ori $s3, $zero, 2
+; LA64D-NEXT:    .p2align 4, , 16
 ; LA64D-NEXT:  .LBB4_1: # %atomicrmw.start
 ; LA64D-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA64D-NEXT:    fst.d $fa0, $sp, 16
@@ -414,6 +424,7 @@ define double @double_fsub_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    addi.d $s2, $sp, 16
 ; LA64F-NEXT:    addi.d $s3, $sp, 8
 ; LA64F-NEXT:    ori $s4, $zero, 2
+; LA64F-NEXT:    .p2align 4, , 16
 ; LA64F-NEXT:  .LBB5_1: # %atomicrmw.start
 ; LA64F-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA64F-NEXT:    st.d $a0, $sp, 16
@@ -460,6 +471,7 @@ define double @double_fsub_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    addi.d $s1, $sp, 16
 ; LA64D-NEXT:    addi.d $s2, $sp, 8
 ; LA64D-NEXT:    ori $s3, $zero, 2
+; LA64D-NEXT:    .p2align 4, , 16
 ; LA64D-NEXT:  .LBB5_1: # %atomicrmw.start
 ; LA64D-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA64D-NEXT:    fst.d $fa0, $sp, 16
@@ -506,6 +518,7 @@ define double @double_fmin_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    addi.d $s2, $sp, 16
 ; LA64F-NEXT:    addi.d $s3, $sp, 8
 ; LA64F-NEXT:    ori $s4, $zero, 2
+; LA64F-NEXT:    .p2align 4, , 16
 ; LA64F-NEXT:  .LBB6_1: # %atomicrmw.start
 ; LA64F-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA64F-NEXT:    st.d $a0, $sp, 16
@@ -552,6 +565,7 @@ define double @double_fmin_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    addi.d $s1, $sp, 16
 ; LA64D-NEXT:    addi.d $s2, $sp, 8
 ; LA64D-NEXT:    ori $s3, $zero, 2
+; LA64D-NEXT:    .p2align 4, , 16
 ; LA64D-NEXT:  .LBB6_1: # %atomicrmw.start
 ; LA64D-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA64D-NEXT:    fst.d $fa0, $sp, 16
@@ -599,6 +613,7 @@ define double @double_fmax_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    addi.d $s2, $sp, 16
 ; LA64F-NEXT:    addi.d $s3, $sp, 8
 ; LA64F-NEXT:    ori $s4, $zero, 2
+; LA64F-NEXT:    .p2align 4, , 16
 ; LA64F-NEXT:  .LBB7_1: # %atomicrmw.start
 ; LA64F-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA64F-NEXT:    st.d $a0, $sp, 16
@@ -645,6 +660,7 @@ define double @double_fmax_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    addi.d $s1, $sp, 16
 ; LA64D-NEXT:    addi.d $s2, $sp, 8
 ; LA64D-NEXT:    ori $s3, $zero, 2
+; LA64D-NEXT:    .p2align 4, , 16
 ; LA64D-NEXT:  .LBB7_1: # %atomicrmw.start
 ; LA64D-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA64D-NEXT:    fst.d $fa0, $sp, 16

diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/br.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/br.ll
index 7ab4788e6a31a1..91cf72bb2a76e4 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/br.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/br.ll
@@ -5,6 +5,7 @@
 define void @foo() noreturn nounwind {
 ; ALL-LABEL: foo:
 ; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    .p2align 4, , 16
 ; ALL-NEXT:  .LBB0_1: # %loop
 ; ALL-NEXT:    # =>This Inner Loop Header: Depth=1
 ; ALL-NEXT:    b .LBB0_1

diff --git a/llvm/test/CodeGen/LoongArch/preferred-alignments.ll b/llvm/test/CodeGen/LoongArch/preferred-alignments.ll
new file mode 100644
index 00000000000000..6dac525874ec3a
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/preferred-alignments.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 < %s | FileCheck --check-prefix=LA464 %s
+; RUN: llc --mtriple=loongarch64 --mcpu=la464 < %s | FileCheck --check-prefix=LA464 %s
+
+define signext i32 @sum(ptr noalias nocapture noundef readonly %0, i32 noundef signext %1) {
+; LA464-LABEL: sum:
+; LA464:       # %bb.0:
+; LA464-NEXT:    ori $a2, $zero, 1
+; LA464-NEXT:    blt $a1, $a2, .LBB0_4
+; LA464-NEXT:  # %bb.1:
+; LA464-NEXT:    bstrpick.d $a2, $a1, 31, 0
+; LA464-NEXT:    move $a1, $zero
+; LA464-NEXT:    .p2align 4, , 16
+; LA464-NEXT:  .LBB0_2: # =>This Inner Loop Header: Depth=1
+; LA464-NEXT:    ld.w $a3, $a0, 0
+; LA464-NEXT:    add.d $a1, $a3, $a1
+; LA464-NEXT:    addi.d $a0, $a0, 4
+; LA464-NEXT:    addi.d $a2, $a2, -1
+; LA464-NEXT:    bnez $a2, .LBB0_2
+; LA464-NEXT:  # %bb.3:
+; LA464-NEXT:    addi.w $a0, $a1, 0
+; LA464-NEXT:    ret
+; LA464-NEXT:  .LBB0_4:
+; LA464-NEXT:    move $a1, $zero
+; LA464-NEXT:    addi.w $a0, $a1, 0
+; LA464-NEXT:    ret
+  %3 = icmp sgt i32 %1, 0
+  br i1 %3, label %4, label %6
+
+4:                                                ; preds = %2
+  %5 = zext i32 %1 to i64
+  br label %8
+
+6:                                                ; preds = %8, %2
+  %7 = phi i32 [ 0, %2 ], [ %13, %8 ]
+  ret i32 %7
+
+8:                                                ; preds = %4, %8
+  %9 = phi i64 [ 0, %4 ], [ %14, %8 ]
+  %10 = phi i32 [ 0, %4 ], [ %13, %8 ]
+  %11 = getelementptr inbounds i32, ptr %0, i64 %9
+  %12 = load i32, ptr %11, align 4
+  %13 = add nsw i32 %12, %10
+  %14 = add nuw nsw i64 %9, 1
+  %15 = icmp eq i64 %14, %5
+  br i1 %15, label %6, label %8
+}

More information about the llvm-commits mailing list