[llvm] 66749ce - [ARM] Add Thumb LSR codegen tests. NFC

David Green via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 16 06:24:57 PST 2023


Author: David Green
Date: 2023-02-16T14:24:51Z
New Revision: 66749ce92707b578a17b36cb479ae44cdc314640

URL: https://github.com/llvm/llvm-project/commit/66749ce92707b578a17b36cb479ae44cdc314640
DIFF: https://github.com/llvm/llvm-project/commit/66749ce92707b578a17b36cb479ae44cdc314640.diff

LOG: [ARM] Add Thumb LSR codegen tests. NFC

This is the same routine generated in two different ways, which end up with
the loads in different orders. The first, with ordered loads, currently does
better than the second, but it needn't if the filtering in LSR is improved.

Added: 
    llvm/test/CodeGen/Thumb/arm_q15_to_q31.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/Thumb/arm_q15_to_q31.ll b/llvm/test/CodeGen/Thumb/arm_q15_to_q31.ll
new file mode 100644
index 0000000000000..9feb7cc282528
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb/arm_q15_to_q31.ll
@@ -0,0 +1,849 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+; RUN: llc -mtriple=thumbv8m.base-arm-none-eabi < %s | FileCheck %s
+
+define void @arm_q15_to_q31(ptr nocapture noundef readonly %pSrc, ptr nocapture noundef writeonly %pDst, i32 noundef %blockSize) {
+; CHECK-LABEL: arm_q15_to_q31:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    mov r7, r2
+; CHECK-NEXT:    lsrs r3, r2, #2
+; CHECK-NEXT:    beq .LBB0_6
+; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
+; CHECK-NEXT:    movs r5, #3
+; CHECK-NEXT:    ands r5, r3
+; CHECK-NEXT:    subs r2, r3, #1
+; CHECK-NEXT:    cbz r5, .LBB0_4
+; CHECK-NEXT:  @ %bb.2: @ %while.body.prol
+; CHECK-NEXT:    str r2, [sp] @ 4-byte Spill
+; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    ldrh r2, [r0]
+; CHECK-NEXT:    ldrh r7, [r0, #2]
+; CHECK-NEXT:    ldrh r4, [r0, #4]
+; CHECK-NEXT:    ldrh r6, [r0, #6]
+; CHECK-NEXT:    lsls r6, r6, #16
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    lsls r7, r7, #16
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    stm r1!, {r2, r7}
+; CHECK-NEXT:    str r4, [r1]
+; CHECK-NEXT:    str r6, [r1, #4]
+; CHECK-NEXT:    subs r1, #8
+; CHECK-NEXT:    cmp r5, #1
+; CHECK-NEXT:    bne .LBB0_11
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:    adds r1, #16
+; CHECK-NEXT:    adds r0, #8
+; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT:    mov r3, r2
+; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:  .LBB0_4: @ %while.body.prol.loopexit
+; CHECK-NEXT:    cmp r2, #3
+; CHECK-NEXT:    blo .LBB0_6
+; CHECK-NEXT:  .LBB0_5: @ %while.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldrh r2, [r0]
+; CHECK-NEXT:    ldrh r4, [r0, #2]
+; CHECK-NEXT:    ldrh r5, [r0, #4]
+; CHECK-NEXT:    ldrh r6, [r0, #6]
+; CHECK-NEXT:    lsls r6, r6, #16
+; CHECK-NEXT:    str r6, [r1, #12]
+; CHECK-NEXT:    lsls r5, r5, #16
+; CHECK-NEXT:    str r5, [r1, #8]
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    str r4, [r1, #4]
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    str r2, [r1]
+; CHECK-NEXT:    ldrh r2, [r0, #8]
+; CHECK-NEXT:    ldrh r4, [r0, #10]
+; CHECK-NEXT:    ldrh r5, [r0, #12]
+; CHECK-NEXT:    ldrh r6, [r0, #14]
+; CHECK-NEXT:    lsls r6, r6, #16
+; CHECK-NEXT:    str r6, [r1, #28]
+; CHECK-NEXT:    lsls r5, r5, #16
+; CHECK-NEXT:    str r5, [r1, #24]
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    str r4, [r1, #20]
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    str r2, [r1, #16]
+; CHECK-NEXT:    ldrh r2, [r0, #16]
+; CHECK-NEXT:    ldrh r4, [r0, #18]
+; CHECK-NEXT:    ldrh r5, [r0, #20]
+; CHECK-NEXT:    ldrh r6, [r0, #22]
+; CHECK-NEXT:    lsls r6, r6, #16
+; CHECK-NEXT:    str r6, [r1, #44]
+; CHECK-NEXT:    lsls r5, r5, #16
+; CHECK-NEXT:    str r5, [r1, #40]
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    str r4, [r1, #36]
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    str r2, [r1, #32]
+; CHECK-NEXT:    ldrh r2, [r0, #24]
+; CHECK-NEXT:    ldrh r4, [r0, #26]
+; CHECK-NEXT:    ldrh r5, [r0, #28]
+; CHECK-NEXT:    ldrh r6, [r0, #30]
+; CHECK-NEXT:    lsls r6, r6, #16
+; CHECK-NEXT:    str r6, [r1, #60]
+; CHECK-NEXT:    lsls r5, r5, #16
+; CHECK-NEXT:    str r5, [r1, #56]
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    str r4, [r1, #52]
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    str r2, [r1, #48]
+; CHECK-NEXT:    adds r1, #64
+; CHECK-NEXT:    adds r0, #32
+; CHECK-NEXT:    subs r3, r3, #4
+; CHECK-NEXT:    bne .LBB0_5
+; CHECK-NEXT:  .LBB0_6: @ %while.end
+; CHECK-NEXT:    movs r2, #3
+; CHECK-NEXT:    ands r7, r2
+; CHECK-NEXT:    beq .LBB0_10
+; CHECK-NEXT:  @ %bb.7: @ %while.body12
+; CHECK-NEXT:    ldrh r2, [r0]
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    str r2, [r1]
+; CHECK-NEXT:    cmp r7, #1
+; CHECK-NEXT:    beq .LBB0_10
+; CHECK-NEXT:  @ %bb.8: @ %while.body12.1
+; CHECK-NEXT:    ldrh r2, [r0, #2]
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    str r2, [r1, #4]
+; CHECK-NEXT:    cmp r7, #2
+; CHECK-NEXT:    beq .LBB0_10
+; CHECK-NEXT:  @ %bb.9: @ %while.body12.2
+; CHECK-NEXT:    ldrh r0, [r0, #4]
+; CHECK-NEXT:    lsls r0, r0, #16
+; CHECK-NEXT:    str r0, [r1, #8]
+; CHECK-NEXT:  .LBB0_10: @ %while.end17
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:  .LBB0_11: @ %while.body.prol.1
+; CHECK-NEXT:    ldrh r2, [r0, #8]
+; CHECK-NEXT:    ldrh r4, [r0, #10]
+; CHECK-NEXT:    ldrh r6, [r0, #12]
+; CHECK-NEXT:    ldrh r7, [r0, #14]
+; CHECK-NEXT:    lsls r7, r7, #16
+; CHECK-NEXT:    lsls r6, r6, #16
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    str r2, [r1, #16]
+; CHECK-NEXT:    str r4, [r1, #20]
+; CHECK-NEXT:    str r6, [r1, #24]
+; CHECK-NEXT:    str r7, [r1, #28]
+; CHECK-NEXT:    cmp r5, #2
+; CHECK-NEXT:    bne .LBB0_13
+; CHECK-NEXT:  @ %bb.12:
+; CHECK-NEXT:    subs r3, r3, #2
+; CHECK-NEXT:    adds r1, #32
+; CHECK-NEXT:    adds r0, #16
+; CHECK-NEXT:    b .LBB0_14
+; CHECK-NEXT:  .LBB0_13: @ %while.body.prol.2
+; CHECK-NEXT:    ldrh r2, [r0, #16]
+; CHECK-NEXT:    ldrh r4, [r0, #18]
+; CHECK-NEXT:    ldrh r5, [r0, #20]
+; CHECK-NEXT:    ldrh r6, [r0, #22]
+; CHECK-NEXT:    lsls r6, r6, #16
+; CHECK-NEXT:    lsls r5, r5, #16
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    mov r7, r1
+; CHECK-NEXT:    adds r7, #32
+; CHECK-NEXT:    stm r7!, {r2, r4, r5, r6}
+; CHECK-NEXT:    subs r3, r3, #3
+; CHECK-NEXT:    adds r1, #48
+; CHECK-NEXT:    adds r0, #24
+; CHECK-NEXT:  .LBB0_14: @ %while.body.prol.loopexit
+; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT:    cmp r2, #3
+; CHECK-NEXT:    bhs .LBB0_5
+; CHECK-NEXT:    b .LBB0_6
+entry:                                            ; widens %blockSize i16 values at %pSrc into i32 values at %pDst, each shifted left by 16
+  %cmp.not19 = icmp ult i32 %blockSize, 4  ; fewer than 4 elements: no full group of four, go straight to the leftover handling
+  br i1 %cmp.not19, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  %shr = lshr i32 %blockSize, 2  ; number of complete 4-element groups
+  %0 = add nsw i32 %shr, -1  ; groups - 1, compared against 3 in while.body.prol.loopexit
+  %xtraiter = and i32 %shr, 3  ; groups mod 4: how many groups the .prol blocks handle before the 4x-unrolled loop
+  %lcmp.mod.not = icmp eq i32 %xtraiter, 0
+  br i1 %lcmp.mod.not, label %while.body.prol.loopexit, label %while.body.prol
+
+while.body.prol:                                  ; preds = %while.body.preheader
+  %pIn.0.val.prol = load i16, ptr %pSrc, align 2  ; prologue group 0: source elements 0..3, loaded in ascending address order
+  %1 = getelementptr i8, ptr %pSrc, i32 2  ; byte offset 2, i.e. element 1
+  %pIn.0.val13.prol = load i16, ptr %1, align 2
+  %conv.i.prol = sext i16 %pIn.0.val13.prol to i32
+  %shl.i.prol = shl nsw i32 %conv.i.prol, 16
+  %conv22.i.prol = zext i16 %pIn.0.val.prol to i32
+  %add.ptr2.prol = getelementptr inbounds i16, ptr %pSrc, i32 4
+  %add.ptr3.prol = getelementptr inbounds i16, ptr %pSrc, i32 2
+  %add.ptr3.val.prol = load i16, ptr %add.ptr3.prol, align 2
+  %2 = getelementptr i16, ptr %pSrc, i32 3
+  %add.ptr3.val14.prol = load i16, ptr %2, align 2
+  %conv.i15.prol = sext i16 %add.ptr3.val14.prol to i32
+  %shl.i16.prol = shl nsw i32 %conv.i15.prol, 16
+  %conv22.i17.prol = zext i16 %add.ptr3.val.prol to i32
+  %shl.prol = shl nuw i32 %conv22.i.prol, 16
+  %shl5.prol = shl nuw i32 %conv22.i17.prol, 16
+  %incdec.ptr.prol = getelementptr inbounds i32, ptr %pDst, i32 1
+  store i32 %shl.prol, ptr %pDst, align 4  ; results go to %pDst[0..3] in element order
+  %incdec.ptr7.prol = getelementptr inbounds i32, ptr %pDst, i32 2
+  store i32 %shl.i.prol, ptr %incdec.ptr.prol, align 4
+  %incdec.ptr8.prol = getelementptr inbounds i32, ptr %pDst, i32 3
+  store i32 %shl5.prol, ptr %incdec.ptr7.prol, align 4
+  %incdec.ptr9.prol = getelementptr inbounds i32, ptr %pDst, i32 4
+  store i32 %shl.i16.prol, ptr %incdec.ptr8.prol, align 4
+  %dec.prol = add nsw i32 %shr, -1  ; groups left after one prologue group
+  %prol.iter.cmp.not = icmp eq i32 %xtraiter, 1
+  br i1 %prol.iter.cmp.not, label %while.body.prol.loopexit, label %while.body.prol.1
+
+while.body.prol.1:                                ; preds = %while.body.prol
+  %pIn.0.val.prol.1 = load i16, ptr %add.ptr2.prol, align 2  ; prologue group 1: source elements 4..7 -> %pDst[4..7]
+  %3 = getelementptr i16, ptr %pSrc, i32 5
+  %pIn.0.val13.prol.1 = load i16, ptr %3, align 2
+  %conv.i.prol.1 = sext i16 %pIn.0.val13.prol.1 to i32
+  %shl.i.prol.1 = shl nsw i32 %conv.i.prol.1, 16
+  %conv22.i.prol.1 = zext i16 %pIn.0.val.prol.1 to i32
+  %add.ptr2.prol.1 = getelementptr inbounds i16, ptr %pSrc, i32 8
+  %add.ptr3.prol.1 = getelementptr inbounds i16, ptr %pSrc, i32 6
+  %add.ptr3.val.prol.1 = load i16, ptr %add.ptr3.prol.1, align 2
+  %4 = getelementptr i16, ptr %pSrc, i32 7
+  %add.ptr3.val14.prol.1 = load i16, ptr %4, align 2
+  %conv.i15.prol.1 = sext i16 %add.ptr3.val14.prol.1 to i32
+  %shl.i16.prol.1 = shl nsw i32 %conv.i15.prol.1, 16
+  %conv22.i17.prol.1 = zext i16 %add.ptr3.val.prol.1 to i32
+  %shl.prol.1 = shl nuw i32 %conv22.i.prol.1, 16
+  %shl5.prol.1 = shl nuw i32 %conv22.i17.prol.1, 16
+  %incdec.ptr.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 5
+  store i32 %shl.prol.1, ptr %incdec.ptr9.prol, align 4
+  %incdec.ptr7.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 6
+  store i32 %shl.i.prol.1, ptr %incdec.ptr.prol.1, align 4
+  %incdec.ptr8.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 7
+  store i32 %shl5.prol.1, ptr %incdec.ptr7.prol.1, align 4
+  %incdec.ptr9.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 8
+  store i32 %shl.i16.prol.1, ptr %incdec.ptr8.prol.1, align 4
+  %dec.prol.1 = add nsw i32 %shr, -2  ; groups left after two prologue groups
+  %prol.iter.cmp.1.not = icmp eq i32 %xtraiter, 2
+  br i1 %prol.iter.cmp.1.not, label %while.body.prol.loopexit, label %while.body.prol.2
+
+while.body.prol.2:                                ; preds = %while.body.prol.1
+  %pIn.0.val.prol.2 = load i16, ptr %add.ptr2.prol.1, align 2  ; prologue group 2: source elements 8..11 -> %pDst[8..11]
+  %5 = getelementptr i16, ptr %pSrc, i32 9
+  %pIn.0.val13.prol.2 = load i16, ptr %5, align 2
+  %conv.i.prol.2 = sext i16 %pIn.0.val13.prol.2 to i32
+  %shl.i.prol.2 = shl nsw i32 %conv.i.prol.2, 16
+  %conv22.i.prol.2 = zext i16 %pIn.0.val.prol.2 to i32
+  %add.ptr2.prol.2 = getelementptr inbounds i16, ptr %pSrc, i32 12
+  %add.ptr3.prol.2 = getelementptr inbounds i16, ptr %pSrc, i32 10
+  %add.ptr3.val.prol.2 = load i16, ptr %add.ptr3.prol.2, align 2
+  %6 = getelementptr i16, ptr %pSrc, i32 11
+  %add.ptr3.val14.prol.2 = load i16, ptr %6, align 2
+  %conv.i15.prol.2 = sext i16 %add.ptr3.val14.prol.2 to i32
+  %shl.i16.prol.2 = shl nsw i32 %conv.i15.prol.2, 16
+  %conv22.i17.prol.2 = zext i16 %add.ptr3.val.prol.2 to i32
+  %shl.prol.2 = shl nuw i32 %conv22.i.prol.2, 16
+  %shl5.prol.2 = shl nuw i32 %conv22.i17.prol.2, 16
+  %incdec.ptr.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 9
+  store i32 %shl.prol.2, ptr %incdec.ptr9.prol.1, align 4
+  %incdec.ptr7.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 10
+  store i32 %shl.i.prol.2, ptr %incdec.ptr.prol.2, align 4
+  %incdec.ptr8.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 11
+  store i32 %shl5.prol.2, ptr %incdec.ptr7.prol.2, align 4
+  %incdec.ptr9.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 12
+  store i32 %shl.i16.prol.2, ptr %incdec.ptr8.prol.2, align 4
+  %dec.prol.2 = add nsw i32 %shr, -3  ; groups left after three prologue groups
+  br label %while.body.prol.loopexit
+
+while.body.prol.loopexit:                         ; preds = %while.body.prol, %while.body.prol.1, %while.body.prol.2, %while.body.preheader
+  %add.ptr2.lcssa.unr = phi ptr [ undef, %while.body.preheader ], [ %add.ptr2.prol, %while.body.prol ], [ %add.ptr2.prol.1, %while.body.prol.1 ], [ %add.ptr2.prol.2, %while.body.prol.2 ]  ; only read in %while.end; the undef (preheader) input never gets there because %xtraiter == 0 implies %shr >= 4, which branches to %while.body
+  %incdec.ptr9.lcssa.unr = phi ptr [ undef, %while.body.preheader ], [ %incdec.ptr9.prol, %while.body.prol ], [ %incdec.ptr9.prol.1, %while.body.prol.1 ], [ %incdec.ptr9.prol.2, %while.body.prol.2 ]
+  %pDst.addr.022.unr = phi ptr [ %pDst, %while.body.preheader ], [ %incdec.ptr9.prol, %while.body.prol ], [ %incdec.ptr9.prol.1, %while.body.prol.1 ], [ %incdec.ptr9.prol.2, %while.body.prol.2 ]  ; destination pointer after the prologue groups
+  %blkCnt.021.unr = phi i32 [ %shr, %while.body.preheader ], [ %dec.prol, %while.body.prol ], [ %dec.prol.1, %while.body.prol.1 ], [ %dec.prol.2, %while.body.prol.2 ]  ; groups still to do after the prologue
+  %pIn.020.unr = phi ptr [ %pSrc, %while.body.preheader ], [ %add.ptr2.prol, %while.body.prol ], [ %add.ptr2.prol.1, %while.body.prol.1 ], [ %add.ptr2.prol.2, %while.body.prol.2 ]  ; source pointer after the prologue groups
+  %7 = icmp ult i32 %0, 3  ; fewer than 4 groups in total: the prologue handled them all, skip the unrolled loop
+  br i1 %7, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.body.prol.loopexit, %while.body
+  %pDst.addr.022 = phi ptr [ %incdec.ptr9.3, %while.body ], [ %pDst.addr.022.unr, %while.body.prol.loopexit ]
+  %blkCnt.021 = phi i32 [ %dec.3, %while.body ], [ %blkCnt.021.unr, %while.body.prol.loopexit ]
+  %pIn.020 = phi ptr [ %add.ptr2.3, %while.body ], [ %pIn.020.unr, %while.body.prol.loopexit ]
+  %pIn.0.val = load i16, ptr %pIn.020, align 2  ; 4x-unrolled body, 16 elements per trip; first group: elements 0..3 relative to %pIn.020
+  %8 = getelementptr i8, ptr %pIn.020, i32 2  ; byte offset 2, i.e. element 1
+  %pIn.0.val13 = load i16, ptr %8, align 2
+  %conv.i = sext i16 %pIn.0.val13 to i32
+  %shl.i = shl nsw i32 %conv.i, 16
+  %conv22.i = zext i16 %pIn.0.val to i32
+  %add.ptr2 = getelementptr inbounds i16, ptr %pIn.020, i32 4
+  %add.ptr3 = getelementptr inbounds i16, ptr %pIn.020, i32 2
+  %add.ptr3.val = load i16, ptr %add.ptr3, align 2
+  %9 = getelementptr i16, ptr %pIn.020, i32 3
+  %add.ptr3.val14 = load i16, ptr %9, align 2
+  %conv.i15 = sext i16 %add.ptr3.val14 to i32
+  %shl.i16 = shl nsw i32 %conv.i15, 16
+  %conv22.i17 = zext i16 %add.ptr3.val to i32
+  %shl = shl nuw i32 %conv22.i, 16
+  %shl5 = shl nuw i32 %conv22.i17, 16
+  %incdec.ptr = getelementptr inbounds i32, ptr %pDst.addr.022, i32 1
+  store i32 %shl, ptr %pDst.addr.022, align 4
+  %incdec.ptr7 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 2
+  store i32 %shl.i, ptr %incdec.ptr, align 4
+  %incdec.ptr8 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 3
+  store i32 %shl5, ptr %incdec.ptr7, align 4
+  %incdec.ptr9 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 4
+  store i32 %shl.i16, ptr %incdec.ptr8, align 4
+  %pIn.0.val.1 = load i16, ptr %add.ptr2, align 2  ; second group: elements 4..7 relative to %pIn.020
+  %10 = getelementptr i16, ptr %pIn.020, i32 5
+  %pIn.0.val13.1 = load i16, ptr %10, align 2
+  %conv.i.1 = sext i16 %pIn.0.val13.1 to i32
+  %shl.i.1 = shl nsw i32 %conv.i.1, 16
+  %conv22.i.1 = zext i16 %pIn.0.val.1 to i32
+  %add.ptr2.1 = getelementptr inbounds i16, ptr %pIn.020, i32 8
+  %add.ptr3.1 = getelementptr inbounds i16, ptr %pIn.020, i32 6
+  %add.ptr3.val.1 = load i16, ptr %add.ptr3.1, align 2
+  %11 = getelementptr i16, ptr %pIn.020, i32 7
+  %add.ptr3.val14.1 = load i16, ptr %11, align 2
+  %conv.i15.1 = sext i16 %add.ptr3.val14.1 to i32
+  %shl.i16.1 = shl nsw i32 %conv.i15.1, 16
+  %conv22.i17.1 = zext i16 %add.ptr3.val.1 to i32
+  %shl.1 = shl nuw i32 %conv22.i.1, 16
+  %shl5.1 = shl nuw i32 %conv22.i17.1, 16
+  %incdec.ptr.1 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 5
+  store i32 %shl.1, ptr %incdec.ptr9, align 4
+  %incdec.ptr7.1 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 6
+  store i32 %shl.i.1, ptr %incdec.ptr.1, align 4
+  %incdec.ptr8.1 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 7
+  store i32 %shl5.1, ptr %incdec.ptr7.1, align 4
+  %incdec.ptr9.1 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 8
+  store i32 %shl.i16.1, ptr %incdec.ptr8.1, align 4
+  %pIn.0.val.2 = load i16, ptr %add.ptr2.1, align 2  ; third group: elements 8..11 relative to %pIn.020
+  %12 = getelementptr i16, ptr %pIn.020, i32 9
+  %pIn.0.val13.2 = load i16, ptr %12, align 2
+  %conv.i.2 = sext i16 %pIn.0.val13.2 to i32
+  %shl.i.2 = shl nsw i32 %conv.i.2, 16
+  %conv22.i.2 = zext i16 %pIn.0.val.2 to i32
+  %add.ptr2.2 = getelementptr inbounds i16, ptr %pIn.020, i32 12
+  %add.ptr3.2 = getelementptr inbounds i16, ptr %pIn.020, i32 10
+  %add.ptr3.val.2 = load i16, ptr %add.ptr3.2, align 2
+  %13 = getelementptr i16, ptr %pIn.020, i32 11
+  %add.ptr3.val14.2 = load i16, ptr %13, align 2
+  %conv.i15.2 = sext i16 %add.ptr3.val14.2 to i32
+  %shl.i16.2 = shl nsw i32 %conv.i15.2, 16
+  %conv22.i17.2 = zext i16 %add.ptr3.val.2 to i32
+  %shl.2 = shl nuw i32 %conv22.i.2, 16
+  %shl5.2 = shl nuw i32 %conv22.i17.2, 16
+  %incdec.ptr.2 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 9
+  store i32 %shl.2, ptr %incdec.ptr9.1, align 4
+  %incdec.ptr7.2 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 10
+  store i32 %shl.i.2, ptr %incdec.ptr.2, align 4
+  %incdec.ptr8.2 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 11
+  store i32 %shl5.2, ptr %incdec.ptr7.2, align 4
+  %incdec.ptr9.2 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 12
+  store i32 %shl.i16.2, ptr %incdec.ptr8.2, align 4
+  %pIn.0.val.3 = load i16, ptr %add.ptr2.2, align 2  ; fourth group: elements 12..15 relative to %pIn.020
+  %14 = getelementptr i16, ptr %pIn.020, i32 13
+  %pIn.0.val13.3 = load i16, ptr %14, align 2
+  %conv.i.3 = sext i16 %pIn.0.val13.3 to i32
+  %shl.i.3 = shl nsw i32 %conv.i.3, 16
+  %conv22.i.3 = zext i16 %pIn.0.val.3 to i32
+  %add.ptr2.3 = getelementptr inbounds i16, ptr %pIn.020, i32 16
+  %add.ptr3.3 = getelementptr inbounds i16, ptr %pIn.020, i32 14
+  %add.ptr3.val.3 = load i16, ptr %add.ptr3.3, align 2
+  %15 = getelementptr i16, ptr %pIn.020, i32 15
+  %add.ptr3.val14.3 = load i16, ptr %15, align 2
+  %conv.i15.3 = sext i16 %add.ptr3.val14.3 to i32
+  %shl.i16.3 = shl nsw i32 %conv.i15.3, 16
+  %conv22.i17.3 = zext i16 %add.ptr3.val.3 to i32
+  %shl.3 = shl nuw i32 %conv22.i.3, 16
+  %shl5.3 = shl nuw i32 %conv22.i17.3, 16
+  %incdec.ptr.3 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 13
+  store i32 %shl.3, ptr %incdec.ptr9.2, align 4
+  %incdec.ptr7.3 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 14
+  store i32 %shl.i.3, ptr %incdec.ptr.3, align 4
+  %incdec.ptr8.3 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 15
+  store i32 %shl5.3, ptr %incdec.ptr7.3, align 4
+  %incdec.ptr9.3 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 16
+  store i32 %shl.i16.3, ptr %incdec.ptr8.3, align 4
+  %dec.3 = add nsw i32 %blkCnt.021, -4  ; four groups consumed per trip
+  %cmp.not.3 = icmp eq i32 %dec.3, 0
+  br i1 %cmp.not.3, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body.prol.loopexit, %while.body, %entry
+  %pIn.0.lcssa = phi ptr [ %pSrc, %entry ], [ %add.ptr2.lcssa.unr, %while.body.prol.loopexit ], [ %add.ptr2.3, %while.body ]
+  %pDst.addr.0.lcssa = phi ptr [ %pDst, %entry ], [ %incdec.ptr9.lcssa.unr, %while.body.prol.loopexit ], [ %incdec.ptr9.3, %while.body ]
+  %rem = and i32 %blockSize, 3  ; 0..3 leftover elements that do not fill a group of four
+  %cmp11.not24 = icmp eq i32 %rem, 0
+  br i1 %cmp11.not24, label %while.end17, label %while.body12
+
+while.body12:                                     ; preds = %while.end
+  %16 = load i16, ptr %pIn.0.lcssa, align 2  ; leftover element 0
+  %conv = sext i16 %16 to i32
+  %shl14 = shl nsw i32 %conv, 16
+  store i32 %shl14, ptr %pDst.addr.0.lcssa, align 4
+  %cmp11.not = icmp eq i32 %rem, 1
+  br i1 %cmp11.not, label %while.end17, label %while.body12.1
+
+while.body12.1:                                   ; preds = %while.body12
+  %incdec.ptr15 = getelementptr inbounds i32, ptr %pDst.addr.0.lcssa, i32 1
+  %incdec.ptr13 = getelementptr inbounds i16, ptr %pIn.0.lcssa, i32 1
+  %17 = load i16, ptr %incdec.ptr13, align 2  ; leftover element 1
+  %conv.1 = sext i16 %17 to i32
+  %shl14.1 = shl nsw i32 %conv.1, 16
+  store i32 %shl14.1, ptr %incdec.ptr15, align 4
+  %cmp11.not.1 = icmp eq i32 %rem, 2
+  br i1 %cmp11.not.1, label %while.end17, label %while.body12.2
+
+while.body12.2:                                   ; preds = %while.body12.1
+  %incdec.ptr15.1 = getelementptr inbounds i32, ptr %pDst.addr.0.lcssa, i32 2
+  %incdec.ptr13.1 = getelementptr inbounds i16, ptr %pIn.0.lcssa, i32 2
+  %18 = load i16, ptr %incdec.ptr13.1, align 2  ; leftover element 2
+  %conv.2 = sext i16 %18 to i32
+  %shl14.2 = shl nsw i32 %conv.2, 16
+  store i32 %shl14.2, ptr %incdec.ptr15.1, align 4
+  br label %while.end17
+
+while.end17:                                      ; preds = %while.body12, %while.body12.1, %while.body12.2, %while.end
+  ret void
+}
+
+define void @arm_q15_to_q31_altorder(ptr nocapture noundef readonly %pSrc, ptr nocapture noundef writeonly %pDst, i32 noundef %blockSize) {
+; CHECK-LABEL: arm_q15_to_q31_altorder:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    mov r7, r2
+; CHECK-NEXT:    lsrs r3, r2, #2
+; CHECK-NEXT:    bne .LBB1_1
+; CHECK-NEXT:    b .LBB1_12
+; CHECK-NEXT:  .LBB1_1: @ %while.body.preheader
+; CHECK-NEXT:    movs r5, #3
+; CHECK-NEXT:    ands r5, r3
+; CHECK-NEXT:    subs r2, r3, #1
+; CHECK-NEXT:    cbz r5, .LBB1_4
+; CHECK-NEXT:  @ %bb.2: @ %while.body.prol
+; CHECK-NEXT:    str r2, [sp] @ 4-byte Spill
+; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    ldrh r2, [r0]
+; CHECK-NEXT:    ldrh r7, [r0, #2]
+; CHECK-NEXT:    ldrh r4, [r0, #4]
+; CHECK-NEXT:    ldrh r6, [r0, #6]
+; CHECK-NEXT:    lsls r6, r6, #16
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    lsls r7, r7, #16
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    stm r1!, {r2, r7}
+; CHECK-NEXT:    str r4, [r1]
+; CHECK-NEXT:    str r6, [r1, #4]
+; CHECK-NEXT:    subs r1, #8
+; CHECK-NEXT:    cmp r5, #1
+; CHECK-NEXT:    bne .LBB1_5
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:    adds r1, #16
+; CHECK-NEXT:    adds r0, #8
+; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT:    mov r3, r2
+; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:  .LBB1_4: @ %while.body.prol.loopexit
+; CHECK-NEXT:    cmp r2, #3
+; CHECK-NEXT:    bhs .LBB1_9
+; CHECK-NEXT:    b .LBB1_12
+; CHECK-NEXT:  .LBB1_5: @ %while.body.prol.1
+; CHECK-NEXT:    ldrh r2, [r0, #8]
+; CHECK-NEXT:    ldrh r4, [r0, #10]
+; CHECK-NEXT:    ldrh r6, [r0, #12]
+; CHECK-NEXT:    ldrh r7, [r0, #14]
+; CHECK-NEXT:    lsls r7, r7, #16
+; CHECK-NEXT:    lsls r6, r6, #16
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    str r2, [r1, #16]
+; CHECK-NEXT:    str r4, [r1, #20]
+; CHECK-NEXT:    str r6, [r1, #24]
+; CHECK-NEXT:    str r7, [r1, #28]
+; CHECK-NEXT:    cmp r5, #2
+; CHECK-NEXT:    bne .LBB1_7
+; CHECK-NEXT:  @ %bb.6:
+; CHECK-NEXT:    subs r3, r3, #2
+; CHECK-NEXT:    adds r1, #32
+; CHECK-NEXT:    adds r0, #16
+; CHECK-NEXT:    b .LBB1_8
+; CHECK-NEXT:  .LBB1_7: @ %while.body.prol.2
+; CHECK-NEXT:    ldrh r2, [r0, #16]
+; CHECK-NEXT:    ldrh r4, [r0, #18]
+; CHECK-NEXT:    ldrh r5, [r0, #20]
+; CHECK-NEXT:    ldrh r6, [r0, #22]
+; CHECK-NEXT:    lsls r6, r6, #16
+; CHECK-NEXT:    lsls r5, r5, #16
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    mov r7, r1
+; CHECK-NEXT:    adds r7, #32
+; CHECK-NEXT:    stm r7!, {r2, r4, r5, r6}
+; CHECK-NEXT:    subs r3, r3, #3
+; CHECK-NEXT:    adds r1, #48
+; CHECK-NEXT:    adds r0, #24
+; CHECK-NEXT:  .LBB1_8: @ %while.body.prol.loopexit
+; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT:    cmp r2, #3
+; CHECK-NEXT:    blo .LBB1_12
+; CHECK-NEXT:  .LBB1_9: @ %while.body.preheader1
+; CHECK-NEXT:    adds r0, #30
+; CHECK-NEXT:  .LBB1_10: @ %while.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    mov r2, r0
+; CHECK-NEXT:    subs r2, #30
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    subs r4, #28
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    subs r5, #26
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    subs r6, #24
+; CHECK-NEXT:    ldrh r6, [r6]
+; CHECK-NEXT:    lsls r6, r6, #16
+; CHECK-NEXT:    str r6, [r1, #12]
+; CHECK-NEXT:    lsls r5, r5, #16
+; CHECK-NEXT:    str r5, [r1, #8]
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    str r4, [r1, #4]
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    str r2, [r1]
+; CHECK-NEXT:    mov r2, r0
+; CHECK-NEXT:    subs r2, #22
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    subs r4, #20
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    subs r5, #18
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    subs r6, #16
+; CHECK-NEXT:    ldrh r6, [r6]
+; CHECK-NEXT:    lsls r6, r6, #16
+; CHECK-NEXT:    str r6, [r1, #28]
+; CHECK-NEXT:    lsls r5, r5, #16
+; CHECK-NEXT:    str r5, [r1, #24]
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    str r4, [r1, #20]
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    str r2, [r1, #16]
+; CHECK-NEXT:    mov r2, r0
+; CHECK-NEXT:    subs r2, #14
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    subs r4, #12
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    subs r5, #10
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    subs r6, #8
+; CHECK-NEXT:    ldrh r6, [r6]
+; CHECK-NEXT:    lsls r6, r6, #16
+; CHECK-NEXT:    str r6, [r1, #44]
+; CHECK-NEXT:    lsls r5, r5, #16
+; CHECK-NEXT:    str r5, [r1, #40]
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    str r4, [r1, #36]
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    str r2, [r1, #32]
+; CHECK-NEXT:    subs r2, r0, #6
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    subs r4, r0, #4
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    subs r5, r0, #2
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    ldrh r6, [r0]
+; CHECK-NEXT:    lsls r6, r6, #16
+; CHECK-NEXT:    str r6, [r1, #60]
+; CHECK-NEXT:    lsls r5, r5, #16
+; CHECK-NEXT:    str r5, [r1, #56]
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    str r4, [r1, #52]
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    str r2, [r1, #48]
+; CHECK-NEXT:    adds r1, #64
+; CHECK-NEXT:    adds r0, #32
+; CHECK-NEXT:    subs r3, r3, #4
+; CHECK-NEXT:    bne .LBB1_10
+; CHECK-NEXT:  @ %bb.11: @ %while.end.loopexit
+; CHECK-NEXT:    subs r0, #30
+; CHECK-NEXT:  .LBB1_12: @ %while.end
+; CHECK-NEXT:    movs r2, #3
+; CHECK-NEXT:    ands r7, r2
+; CHECK-NEXT:    beq .LBB1_16
+; CHECK-NEXT:  @ %bb.13: @ %while.body12
+; CHECK-NEXT:    ldrh r2, [r0]
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    str r2, [r1]
+; CHECK-NEXT:    cmp r7, #1
+; CHECK-NEXT:    beq .LBB1_16
+; CHECK-NEXT:  @ %bb.14: @ %while.body12.1
+; CHECK-NEXT:    ldrh r2, [r0, #2]
+; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    str r2, [r1, #4]
+; CHECK-NEXT:    cmp r7, #2
+; CHECK-NEXT:    beq .LBB1_16
+; CHECK-NEXT:  @ %bb.15: @ %while.body12.2
+; CHECK-NEXT:    ldrh r0, [r0, #4]
+; CHECK-NEXT:    lsls r0, r0, #16
+; CHECK-NEXT:    str r0, [r1, #8]
+; CHECK-NEXT:  .LBB1_16: @ %while.end17
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+entry:
+  %cmp.not18 = icmp ult i32 %blockSize, 4
+  br i1 %cmp.not18, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  %shr = lshr i32 %blockSize, 2
+  %0 = add nsw i32 %shr, -1
+  %xtraiter = and i32 %shr, 3
+  %lcmp.mod.not = icmp eq i32 %xtraiter, 0
+  br i1 %lcmp.mod.not, label %while.body.prol.loopexit, label %while.body.prol
+
+while.body.prol:                                  ; preds = %while.body.preheader
+  %arrayidx.i.prol = getelementptr inbounds i16, ptr %pSrc, i32 1
+  %1 = load i16, ptr %arrayidx.i.prol, align 2
+  %conv.i.prol = sext i16 %1 to i32
+  %shl.i.prol = shl nsw i32 %conv.i.prol, 16
+  %2 = load i16, ptr %pSrc, align 2
+  %conv22.i.prol = zext i16 %2 to i32
+  %add.ptr2.prol = getelementptr inbounds i16, ptr %pSrc, i32 4
+  %add.ptr3.prol = getelementptr inbounds i16, ptr %pSrc, i32 2
+  %arrayidx.i13.prol = getelementptr inbounds i16, ptr %pSrc, i32 3
+  %3 = load i16, ptr %arrayidx.i13.prol, align 2
+  %conv.i14.prol = sext i16 %3 to i32
+  %shl.i15.prol = shl nsw i32 %conv.i14.prol, 16
+  %4 = load i16, ptr %add.ptr3.prol, align 2
+  %conv22.i16.prol = zext i16 %4 to i32
+  %shl.prol = shl nuw i32 %conv22.i.prol, 16
+  %shl5.prol = shl nuw i32 %conv22.i16.prol, 16
+  %incdec.ptr.prol = getelementptr inbounds i32, ptr %pDst, i32 1
+  store i32 %shl.prol, ptr %pDst, align 4
+  %incdec.ptr7.prol = getelementptr inbounds i32, ptr %pDst, i32 2
+  store i32 %shl.i.prol, ptr %incdec.ptr.prol, align 4
+  %incdec.ptr8.prol = getelementptr inbounds i32, ptr %pDst, i32 3
+  store i32 %shl5.prol, ptr %incdec.ptr7.prol, align 4
+  %incdec.ptr9.prol = getelementptr inbounds i32, ptr %pDst, i32 4
+  store i32 %shl.i15.prol, ptr %incdec.ptr8.prol, align 4
+  %dec.prol = add nsw i32 %shr, -1
+  %prol.iter.cmp.not = icmp eq i32 %xtraiter, 1
+  br i1 %prol.iter.cmp.not, label %while.body.prol.loopexit, label %while.body.prol.1
+
+while.body.prol.1:                                ; preds = %while.body.prol
+  %arrayidx.i.prol.1 = getelementptr inbounds i16, ptr %pSrc, i32 5
+  %5 = load i16, ptr %arrayidx.i.prol.1, align 2
+  %conv.i.prol.1 = sext i16 %5 to i32
+  %shl.i.prol.1 = shl nsw i32 %conv.i.prol.1, 16
+  %6 = load i16, ptr %add.ptr2.prol, align 2
+  %conv22.i.prol.1 = zext i16 %6 to i32
+  %add.ptr2.prol.1 = getelementptr inbounds i16, ptr %pSrc, i32 8
+  %add.ptr3.prol.1 = getelementptr inbounds i16, ptr %pSrc, i32 6
+  %arrayidx.i13.prol.1 = getelementptr inbounds i16, ptr %pSrc, i32 7
+  %7 = load i16, ptr %arrayidx.i13.prol.1, align 2
+  %conv.i14.prol.1 = sext i16 %7 to i32
+  %shl.i15.prol.1 = shl nsw i32 %conv.i14.prol.1, 16
+  %8 = load i16, ptr %add.ptr3.prol.1, align 2
+  %conv22.i16.prol.1 = zext i16 %8 to i32
+  %shl.prol.1 = shl nuw i32 %conv22.i.prol.1, 16
+  %shl5.prol.1 = shl nuw i32 %conv22.i16.prol.1, 16
+  %incdec.ptr.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 5
+  store i32 %shl.prol.1, ptr %incdec.ptr9.prol, align 4
+  %incdec.ptr7.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 6
+  store i32 %shl.i.prol.1, ptr %incdec.ptr.prol.1, align 4
+  %incdec.ptr8.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 7
+  store i32 %shl5.prol.1, ptr %incdec.ptr7.prol.1, align 4
+  %incdec.ptr9.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 8
+  store i32 %shl.i15.prol.1, ptr %incdec.ptr8.prol.1, align 4
+  %dec.prol.1 = add nsw i32 %shr, -2
+  %prol.iter.cmp.1.not = icmp eq i32 %xtraiter, 2
+  br i1 %prol.iter.cmp.1.not, label %while.body.prol.loopexit, label %while.body.prol.2
+
+while.body.prol.2:                                ; preds = %while.body.prol.1
+  %arrayidx.i.prol.2 = getelementptr inbounds i16, ptr %pSrc, i32 9
+  %9 = load i16, ptr %arrayidx.i.prol.2, align 2
+  %conv.i.prol.2 = sext i16 %9 to i32
+  %shl.i.prol.2 = shl nsw i32 %conv.i.prol.2, 16
+  %10 = load i16, ptr %add.ptr2.prol.1, align 2
+  %conv22.i.prol.2 = zext i16 %10 to i32
+  %add.ptr2.prol.2 = getelementptr inbounds i16, ptr %pSrc, i32 12
+  %add.ptr3.prol.2 = getelementptr inbounds i16, ptr %pSrc, i32 10
+  %arrayidx.i13.prol.2 = getelementptr inbounds i16, ptr %pSrc, i32 11
+  %11 = load i16, ptr %arrayidx.i13.prol.2, align 2
+  %conv.i14.prol.2 = sext i16 %11 to i32
+  %shl.i15.prol.2 = shl nsw i32 %conv.i14.prol.2, 16
+  %12 = load i16, ptr %add.ptr3.prol.2, align 2
+  %conv22.i16.prol.2 = zext i16 %12 to i32
+  %shl.prol.2 = shl nuw i32 %conv22.i.prol.2, 16
+  %shl5.prol.2 = shl nuw i32 %conv22.i16.prol.2, 16
+  %incdec.ptr.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 9
+  store i32 %shl.prol.2, ptr %incdec.ptr9.prol.1, align 4
+  %incdec.ptr7.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 10
+  store i32 %shl.i.prol.2, ptr %incdec.ptr.prol.2, align 4
+  %incdec.ptr8.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 11
+  store i32 %shl5.prol.2, ptr %incdec.ptr7.prol.2, align 4
+  %incdec.ptr9.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 12
+  store i32 %shl.i15.prol.2, ptr %incdec.ptr8.prol.2, align 4
+  %dec.prol.2 = add nsw i32 %shr, -3
+  br label %while.body.prol.loopexit
+
+while.body.prol.loopexit:                         ; preds = %while.body.prol, %while.body.prol.1, %while.body.prol.2, %while.body.preheader
+  %add.ptr2.lcssa.unr = phi ptr [ undef, %while.body.preheader ], [ %add.ptr2.prol, %while.body.prol ], [ %add.ptr2.prol.1, %while.body.prol.1 ], [ %add.ptr2.prol.2, %while.body.prol.2 ]
+  %incdec.ptr9.lcssa.unr = phi ptr [ undef, %while.body.preheader ], [ %incdec.ptr9.prol, %while.body.prol ], [ %incdec.ptr9.prol.1, %while.body.prol.1 ], [ %incdec.ptr9.prol.2, %while.body.prol.2 ]
+  %pDst.addr.021.unr = phi ptr [ %pDst, %while.body.preheader ], [ %incdec.ptr9.prol, %while.body.prol ], [ %incdec.ptr9.prol.1, %while.body.prol.1 ], [ %incdec.ptr9.prol.2, %while.body.prol.2 ]
+  %blkCnt.020.unr = phi i32 [ %shr, %while.body.preheader ], [ %dec.prol, %while.body.prol ], [ %dec.prol.1, %while.body.prol.1 ], [ %dec.prol.2, %while.body.prol.2 ]
+  %pIn.019.unr = phi ptr [ %pSrc, %while.body.preheader ], [ %add.ptr2.prol, %while.body.prol ], [ %add.ptr2.prol.1, %while.body.prol.1 ], [ %add.ptr2.prol.2, %while.body.prol.2 ]
+  %13 = icmp ult i32 %0, 3
+  br i1 %13, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.body.prol.loopexit, %while.body
+  %pDst.addr.021 = phi ptr [ %incdec.ptr9.3, %while.body ], [ %pDst.addr.021.unr, %while.body.prol.loopexit ]
+  %blkCnt.020 = phi i32 [ %dec.3, %while.body ], [ %blkCnt.020.unr, %while.body.prol.loopexit ]
+  %pIn.019 = phi ptr [ %add.ptr2.3, %while.body ], [ %pIn.019.unr, %while.body.prol.loopexit ]
+  %arrayidx.i = getelementptr inbounds i16, ptr %pIn.019, i32 1
+  %14 = load i16, ptr %arrayidx.i, align 2
+  %conv.i = sext i16 %14 to i32
+  %shl.i = shl nsw i32 %conv.i, 16
+  %15 = load i16, ptr %pIn.019, align 2
+  %conv22.i = zext i16 %15 to i32
+  %add.ptr2 = getelementptr inbounds i16, ptr %pIn.019, i32 4
+  %add.ptr3 = getelementptr inbounds i16, ptr %pIn.019, i32 2
+  %arrayidx.i13 = getelementptr inbounds i16, ptr %pIn.019, i32 3
+  %16 = load i16, ptr %arrayidx.i13, align 2
+  %conv.i14 = sext i16 %16 to i32
+  %shl.i15 = shl nsw i32 %conv.i14, 16
+  %17 = load i16, ptr %add.ptr3, align 2
+  %conv22.i16 = zext i16 %17 to i32
+  %shl = shl nuw i32 %conv22.i, 16
+  %shl5 = shl nuw i32 %conv22.i16, 16
+  %incdec.ptr = getelementptr inbounds i32, ptr %pDst.addr.021, i32 1
+  store i32 %shl, ptr %pDst.addr.021, align 4
+  %incdec.ptr7 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 2
+  store i32 %shl.i, ptr %incdec.ptr, align 4
+  %incdec.ptr8 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 3
+  store i32 %shl5, ptr %incdec.ptr7, align 4
+  %incdec.ptr9 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 4
+  store i32 %shl.i15, ptr %incdec.ptr8, align 4
+  %arrayidx.i.1 = getelementptr inbounds i16, ptr %pIn.019, i32 5
+  %18 = load i16, ptr %arrayidx.i.1, align 2
+  %conv.i.1 = sext i16 %18 to i32
+  %shl.i.1 = shl nsw i32 %conv.i.1, 16
+  %19 = load i16, ptr %add.ptr2, align 2
+  %conv22.i.1 = zext i16 %19 to i32
+  %add.ptr2.1 = getelementptr inbounds i16, ptr %pIn.019, i32 8
+  %add.ptr3.1 = getelementptr inbounds i16, ptr %pIn.019, i32 6
+  %arrayidx.i13.1 = getelementptr inbounds i16, ptr %pIn.019, i32 7
+  %20 = load i16, ptr %arrayidx.i13.1, align 2
+  %conv.i14.1 = sext i16 %20 to i32
+  %shl.i15.1 = shl nsw i32 %conv.i14.1, 16
+  %21 = load i16, ptr %add.ptr3.1, align 2
+  %conv22.i16.1 = zext i16 %21 to i32
+  %shl.1 = shl nuw i32 %conv22.i.1, 16
+  %shl5.1 = shl nuw i32 %conv22.i16.1, 16
+  %incdec.ptr.1 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 5
+  store i32 %shl.1, ptr %incdec.ptr9, align 4
+  %incdec.ptr7.1 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 6
+  store i32 %shl.i.1, ptr %incdec.ptr.1, align 4
+  %incdec.ptr8.1 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 7
+  store i32 %shl5.1, ptr %incdec.ptr7.1, align 4
+  %incdec.ptr9.1 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 8
+  store i32 %shl.i15.1, ptr %incdec.ptr8.1, align 4
+  %arrayidx.i.2 = getelementptr inbounds i16, ptr %pIn.019, i32 9
+  %22 = load i16, ptr %arrayidx.i.2, align 2
+  %conv.i.2 = sext i16 %22 to i32
+  %shl.i.2 = shl nsw i32 %conv.i.2, 16
+  %23 = load i16, ptr %add.ptr2.1, align 2
+  %conv22.i.2 = zext i16 %23 to i32
+  %add.ptr2.2 = getelementptr inbounds i16, ptr %pIn.019, i32 12
+  %add.ptr3.2 = getelementptr inbounds i16, ptr %pIn.019, i32 10
+  %arrayidx.i13.2 = getelementptr inbounds i16, ptr %pIn.019, i32 11
+  %24 = load i16, ptr %arrayidx.i13.2, align 2
+  %conv.i14.2 = sext i16 %24 to i32
+  %shl.i15.2 = shl nsw i32 %conv.i14.2, 16
+  %25 = load i16, ptr %add.ptr3.2, align 2
+  %conv22.i16.2 = zext i16 %25 to i32
+  %shl.2 = shl nuw i32 %conv22.i.2, 16
+  %shl5.2 = shl nuw i32 %conv22.i16.2, 16
+  %incdec.ptr.2 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 9
+  store i32 %shl.2, ptr %incdec.ptr9.1, align 4
+  %incdec.ptr7.2 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 10
+  store i32 %shl.i.2, ptr %incdec.ptr.2, align 4
+  %incdec.ptr8.2 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 11
+  store i32 %shl5.2, ptr %incdec.ptr7.2, align 4
+  %incdec.ptr9.2 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 12
+  store i32 %shl.i15.2, ptr %incdec.ptr8.2, align 4
+  %arrayidx.i.3 = getelementptr inbounds i16, ptr %pIn.019, i32 13
+  %26 = load i16, ptr %arrayidx.i.3, align 2
+  %conv.i.3 = sext i16 %26 to i32
+  %shl.i.3 = shl nsw i32 %conv.i.3, 16
+  %27 = load i16, ptr %add.ptr2.2, align 2
+  %conv22.i.3 = zext i16 %27 to i32
+  %add.ptr2.3 = getelementptr inbounds i16, ptr %pIn.019, i32 16
+  %add.ptr3.3 = getelementptr inbounds i16, ptr %pIn.019, i32 14
+  %arrayidx.i13.3 = getelementptr inbounds i16, ptr %pIn.019, i32 15
+  %28 = load i16, ptr %arrayidx.i13.3, align 2
+  %conv.i14.3 = sext i16 %28 to i32
+  %shl.i15.3 = shl nsw i32 %conv.i14.3, 16
+  %29 = load i16, ptr %add.ptr3.3, align 2
+  %conv22.i16.3 = zext i16 %29 to i32
+  %shl.3 = shl nuw i32 %conv22.i.3, 16
+  %shl5.3 = shl nuw i32 %conv22.i16.3, 16
+  %incdec.ptr.3 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 13
+  store i32 %shl.3, ptr %incdec.ptr9.2, align 4
+  %incdec.ptr7.3 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 14
+  store i32 %shl.i.3, ptr %incdec.ptr.3, align 4
+  %incdec.ptr8.3 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 15
+  store i32 %shl5.3, ptr %incdec.ptr7.3, align 4
+  %incdec.ptr9.3 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 16
+  store i32 %shl.i15.3, ptr %incdec.ptr8.3, align 4
+  %dec.3 = add nsw i32 %blkCnt.020, -4
+  %cmp.not.3 = icmp eq i32 %dec.3, 0
+  br i1 %cmp.not.3, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body.prol.loopexit, %while.body, %entry
+  %pIn.0.lcssa = phi ptr [ %pSrc, %entry ], [ %add.ptr2.lcssa.unr, %while.body.prol.loopexit ], [ %add.ptr2.3, %while.body ]
+  %pDst.addr.0.lcssa = phi ptr [ %pDst, %entry ], [ %incdec.ptr9.lcssa.unr, %while.body.prol.loopexit ], [ %incdec.ptr9.3, %while.body ]
+  %rem = and i32 %blockSize, 3
+  %cmp11.not23 = icmp eq i32 %rem, 0
+  br i1 %cmp11.not23, label %while.end17, label %while.body12
+
+while.body12:                                     ; preds = %while.end
+  %30 = load i16, ptr %pIn.0.lcssa, align 2
+  %conv = sext i16 %30 to i32
+  %shl14 = shl nsw i32 %conv, 16
+  store i32 %shl14, ptr %pDst.addr.0.lcssa, align 4
+  %cmp11.not = icmp eq i32 %rem, 1
+  br i1 %cmp11.not, label %while.end17, label %while.body12.1
+
+while.body12.1:                                   ; preds = %while.body12
+  %incdec.ptr15 = getelementptr inbounds i32, ptr %pDst.addr.0.lcssa, i32 1
+  %incdec.ptr13 = getelementptr inbounds i16, ptr %pIn.0.lcssa, i32 1
+  %31 = load i16, ptr %incdec.ptr13, align 2
+  %conv.1 = sext i16 %31 to i32
+  %shl14.1 = shl nsw i32 %conv.1, 16
+  store i32 %shl14.1, ptr %incdec.ptr15, align 4
+  %cmp11.not.1 = icmp eq i32 %rem, 2
+  br i1 %cmp11.not.1, label %while.end17, label %while.body12.2
+
+while.body12.2:                                   ; preds = %while.body12.1
+  %incdec.ptr15.1 = getelementptr inbounds i32, ptr %pDst.addr.0.lcssa, i32 2
+  %incdec.ptr13.1 = getelementptr inbounds i16, ptr %pIn.0.lcssa, i32 2
+  %32 = load i16, ptr %incdec.ptr13.1, align 2
+  %conv.2 = sext i16 %32 to i32
+  %shl14.2 = shl nsw i32 %conv.2, 16
+  store i32 %shl14.2, ptr %incdec.ptr15.1, align 4
+  br label %while.end17
+
+while.end17:                                      ; preds = %while.body12, %while.body12.1, %while.body12.2, %while.end
+  ret void
+}


        


More information about the llvm-commits mailing list