[llvm] c0eb942 - [Test] Precommit tests for PR50555
Anton Afanasyev via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 9 06:56:41 PDT 2021
Author: Anton Afanasyev
Date: 2021-08-09T16:55:27+03:00
New Revision: c0eb94231ebd96ce40b2d4d8955028950d940360
URL: https://github.com/llvm/llvm-project/commit/c0eb94231ebd96ce40b2d4d8955028950d940360
DIFF: https://github.com/llvm/llvm-project/commit/c0eb94231ebd96ce40b2d4d8955028950d940360.diff
LOG: [Test] Precommit tests for PR50555
Added:
llvm/test/Transforms/InstCombine/pr50555.ll
llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
Modified:
Removed:
################################################################################
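Context for the tests below: every chain starts from zero-extended i8 data, so the i32 intermediates stay small. In the one-add pattern, b + (b >> 1) is at most 255 + 127 = 382; in the two-add pattern, ((b + c) + ((b + c) >> 1)) >> 2 is at most 191. Either way the final trunc to i16 loses nothing, which is presumably the narrowing opportunity PR50555 is after.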
diff --git a/llvm/test/Transforms/InstCombine/pr50555.ll b/llvm/test/Transforms/InstCombine/pr50555.ll
new file mode 100644
index 0000000000000..f779806585792
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/pr50555.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define void @trunc_one_add(i16* %a, i8 %b) {
+; CHECK-LABEL: @trunc_one_add(
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[B:%.*]] to i32
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ZEXT]], 1
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[SHR]], [[ZEXT]]
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ADD]] to i16
+; CHECK-NEXT: store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT: ret void
+;
+ %zext = zext i8 %b to i32
+ %shr = lshr i32 %zext, 1
+ %add = add nsw i32 %zext, %shr
+ %trunc = trunc i32 %add to i16
+ store i16 %trunc, i16* %a, align 2
+ ret void
+}
+
+define void @trunc_two_adds(i16* %a, i8 %b, i8 %c) {
+; CHECK-LABEL: @trunc_two_adds(
+; CHECK-NEXT: [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i32
+; CHECK-NEXT: [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i32
+; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i32 [[ZEXT1]], [[ZEXT2]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[ADD1]], 1
+; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i32 [[ADD1]], [[SHR1]]
+; CHECK-NEXT: [[SHR2:%.*]] = lshr i32 [[ADD2]], 2
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHR2]] to i16
+; CHECK-NEXT: store i16 [[TRUNC]], i16* [[A:%.*]], align 2
+; CHECK-NEXT: ret void
+;
+ %zext1 = zext i8 %b to i32
+ %zext2 = zext i8 %c to i32
+ %add1 = add nuw nsw i32 %zext1, %zext2
+ %shr1 = lshr i32 %add1, 1
+ %add2 = add nuw nsw i32 %add1, %shr1
+ %shr2 = lshr i32 %add2, 2
+ %trunc = trunc i32 %shr2 to i16
+ store i16 %trunc, i16* %a, align 2
+ ret void
+}
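For reference, both InstCombine tests above do their arithmetic in i32 only to truncate the result to i16, and as noted earlier every intermediate fits in 16 bits. A fold along the lines of PR50555 could narrow the whole chain to the store type. A hand-written sketch of what @trunc_one_add might become (illustrative only, not actual opt output):

define void @trunc_one_add_narrowed(i16* %a, i8 %b) {
  %zext = zext i8 %b to i16          ; extend straight to the store type
  %shr = lshr i16 %zext, 1           ; b >> 1, at most 127
  %add = add nuw nsw i16 %zext, %shr ; b + (b >> 1), at most 382, fits i16
  store i16 %add, i16* %a, align 2
  ret void
}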
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
new file mode 100644
index 0000000000000..7b9efd6110628
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll
@@ -0,0 +1,522 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-- -instcombine -slp-vectorizer -dce -S | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -mtriple=x86_64-- -mcpu=corei7-avx -instcombine -slp-vectorizer -dce -S | FileCheck %s --check-prefixes=AVX
+
+define void @trunc_through_one_add(i16* noalias %0, i8* noalias readonly %1) {
+; SSE-LABEL: @trunc_through_one_add(
+; SSE-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <4 x i8>*
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
+; SSE-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
+; SSE-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 1, i32 1, i32 1, i32 1>
+; SSE-NEXT: [[TMP7:%.*]] = add nuw nsw <4 x i32> [[TMP6]], [[TMP5]]
+; SSE-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2>
+; SSE-NEXT: [[TMP9:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i16>
+; SSE-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP0:%.*]] to <4 x i16>*
+; SSE-NEXT: store <4 x i16> [[TMP9]], <4 x i16>* [[TMP10]], align 2
+; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 4
+; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 4
+; SSE-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to <4 x i8>*
+; SSE-NEXT: [[TMP14:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
+; SSE-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[TMP14]] to <4 x i32>
+; SSE-NEXT: [[TMP16:%.*]] = lshr <4 x i32> [[TMP15]], <i32 1, i32 1, i32 1, i32 1>
+; SSE-NEXT: [[TMP17:%.*]] = add nuw nsw <4 x i32> [[TMP16]], [[TMP15]]
+; SSE-NEXT: [[TMP18:%.*]] = lshr <4 x i32> [[TMP17]], <i32 2, i32 2, i32 2, i32 2>
+; SSE-NEXT: [[TMP19:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i16>
+; SSE-NEXT: [[TMP20:%.*]] = bitcast i16* [[TMP12]] to <4 x i16>*
+; SSE-NEXT: store <4 x i16> [[TMP19]], <4 x i16>* [[TMP20]], align 2
+; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
+; SSE-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to <4 x i8>*
+; SSE-NEXT: [[TMP24:%.*]] = load <4 x i8>, <4 x i8>* [[TMP23]], align 1
+; SSE-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32>
+; SSE-NEXT: [[TMP26:%.*]] = lshr <4 x i32> [[TMP25]], <i32 1, i32 1, i32 1, i32 1>
+; SSE-NEXT: [[TMP27:%.*]] = add nuw nsw <4 x i32> [[TMP26]], [[TMP25]]
+; SSE-NEXT: [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]], <i32 2, i32 2, i32 2, i32 2>
+; SSE-NEXT: [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i16>
+; SSE-NEXT: [[TMP30:%.*]] = bitcast i16* [[TMP22]] to <4 x i16>*
+; SSE-NEXT: store <4 x i16> [[TMP29]], <4 x i16>* [[TMP30]], align 2
+; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 12
+; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 12
+; SSE-NEXT: [[TMP33:%.*]] = bitcast i8* [[TMP31]] to <4 x i8>*
+; SSE-NEXT: [[TMP34:%.*]] = load <4 x i8>, <4 x i8>* [[TMP33]], align 1
+; SSE-NEXT: [[TMP35:%.*]] = zext <4 x i8> [[TMP34]] to <4 x i32>
+; SSE-NEXT: [[TMP36:%.*]] = lshr <4 x i32> [[TMP35]], <i32 1, i32 1, i32 1, i32 1>
+; SSE-NEXT: [[TMP37:%.*]] = add nuw nsw <4 x i32> [[TMP36]], [[TMP35]]
+; SSE-NEXT: [[TMP38:%.*]] = lshr <4 x i32> [[TMP37]], <i32 2, i32 2, i32 2, i32 2>
+; SSE-NEXT: [[TMP39:%.*]] = trunc <4 x i32> [[TMP38]] to <4 x i16>
+; SSE-NEXT: [[TMP40:%.*]] = bitcast i16* [[TMP32]] to <4 x i16>*
+; SSE-NEXT: store <4 x i16> [[TMP39]], <4 x i16>* [[TMP40]], align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @trunc_through_one_add(
+; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; AVX-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
+; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP5]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX-NEXT: [[TMP7:%.*]] = add nuw nsw <8 x i32> [[TMP6]], [[TMP5]]
+; AVX-NEXT: [[TMP8:%.*]] = lshr <8 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; AVX-NEXT: [[TMP9:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i16>
+; AVX-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
+; AVX-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* [[TMP10]], align 2
+; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
+; AVX-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to <8 x i8>*
+; AVX-NEXT: [[TMP14:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1
+; AVX-NEXT: [[TMP15:%.*]] = zext <8 x i8> [[TMP14]] to <8 x i32>
+; AVX-NEXT: [[TMP16:%.*]] = lshr <8 x i32> [[TMP15]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX-NEXT: [[TMP17:%.*]] = add nuw nsw <8 x i32> [[TMP16]], [[TMP15]]
+; AVX-NEXT: [[TMP18:%.*]] = lshr <8 x i32> [[TMP17]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; AVX-NEXT: [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i16>
+; AVX-NEXT: [[TMP20:%.*]] = bitcast i16* [[TMP12]] to <8 x i16>*
+; AVX-NEXT: store <8 x i16> [[TMP19]], <8 x i16>* [[TMP20]], align 2
+; AVX-NEXT: ret void
+;
+ %3 = load i8, i8* %1, align 1
+ %4 = zext i8 %3 to i32
+ %5 = lshr i32 %4, 1
+ %6 = add nuw nsw i32 %5, %4
+ %7 = lshr i32 %6, 2
+ %8 = trunc i32 %7 to i16
+ store i16 %8, i16* %0, align 2
+ %9 = getelementptr inbounds i8, i8* %1, i64 1
+ %10 = load i8, i8* %9, align 1
+ %11 = zext i8 %10 to i32
+ %12 = lshr i32 %11, 1
+ %13 = add nuw nsw i32 %12, %11
+ %14 = lshr i32 %13, 2
+ %15 = trunc i32 %14 to i16
+ %16 = getelementptr inbounds i16, i16* %0, i64 1
+ store i16 %15, i16* %16, align 2
+ %17 = getelementptr inbounds i8, i8* %1, i64 2
+ %18 = load i8, i8* %17, align 1
+ %19 = zext i8 %18 to i32
+ %20 = lshr i32 %19, 1
+ %21 = add nuw nsw i32 %20, %19
+ %22 = lshr i32 %21, 2
+ %23 = trunc i32 %22 to i16
+ %24 = getelementptr inbounds i16, i16* %0, i64 2
+ store i16 %23, i16* %24, align 2
+ %25 = getelementptr inbounds i8, i8* %1, i64 3
+ %26 = load i8, i8* %25, align 1
+ %27 = zext i8 %26 to i32
+ %28 = lshr i32 %27, 1
+ %29 = add nuw nsw i32 %28, %27
+ %30 = lshr i32 %29, 2
+ %31 = trunc i32 %30 to i16
+ %32 = getelementptr inbounds i16, i16* %0, i64 3
+ store i16 %31, i16* %32, align 2
+ %33 = getelementptr inbounds i8, i8* %1, i64 4
+ %34 = load i8, i8* %33, align 1
+ %35 = zext i8 %34 to i32
+ %36 = lshr i32 %35, 1
+ %37 = add nuw nsw i32 %36, %35
+ %38 = lshr i32 %37, 2
+ %39 = trunc i32 %38 to i16
+ %40 = getelementptr inbounds i16, i16* %0, i64 4
+ store i16 %39, i16* %40, align 2
+ %41 = getelementptr inbounds i8, i8* %1, i64 5
+ %42 = load i8, i8* %41, align 1
+ %43 = zext i8 %42 to i32
+ %44 = lshr i32 %43, 1
+ %45 = add nuw nsw i32 %44, %43
+ %46 = lshr i32 %45, 2
+ %47 = trunc i32 %46 to i16
+ %48 = getelementptr inbounds i16, i16* %0, i64 5
+ store i16 %47, i16* %48, align 2
+ %49 = getelementptr inbounds i8, i8* %1, i64 6
+ %50 = load i8, i8* %49, align 1
+ %51 = zext i8 %50 to i32
+ %52 = lshr i32 %51, 1
+ %53 = add nuw nsw i32 %52, %51
+ %54 = lshr i32 %53, 2
+ %55 = trunc i32 %54 to i16
+ %56 = getelementptr inbounds i16, i16* %0, i64 6
+ store i16 %55, i16* %56, align 2
+ %57 = getelementptr inbounds i8, i8* %1, i64 7
+ %58 = load i8, i8* %57, align 1
+ %59 = zext i8 %58 to i32
+ %60 = lshr i32 %59, 1
+ %61 = add nuw nsw i32 %60, %59
+ %62 = lshr i32 %61, 2
+ %63 = trunc i32 %62 to i16
+ %64 = getelementptr inbounds i16, i16* %0, i64 7
+ store i16 %63, i16* %64, align 2
+ %65 = getelementptr inbounds i8, i8* %1, i64 8
+ %66 = load i8, i8* %65, align 1
+ %67 = zext i8 %66 to i32
+ %68 = lshr i32 %67, 1
+ %69 = add nuw nsw i32 %68, %67
+ %70 = lshr i32 %69, 2
+ %71 = trunc i32 %70 to i16
+ %72 = getelementptr inbounds i16, i16* %0, i64 8
+ store i16 %71, i16* %72, align 2
+ %73 = getelementptr inbounds i8, i8* %1, i64 9
+ %74 = load i8, i8* %73, align 1
+ %75 = zext i8 %74 to i32
+ %76 = lshr i32 %75, 1
+ %77 = add nuw nsw i32 %76, %75
+ %78 = lshr i32 %77, 2
+ %79 = trunc i32 %78 to i16
+ %80 = getelementptr inbounds i16, i16* %0, i64 9
+ store i16 %79, i16* %80, align 2
+ %81 = getelementptr inbounds i8, i8* %1, i64 10
+ %82 = load i8, i8* %81, align 1
+ %83 = zext i8 %82 to i32
+ %84 = lshr i32 %83, 1
+ %85 = add nuw nsw i32 %84, %83
+ %86 = lshr i32 %85, 2
+ %87 = trunc i32 %86 to i16
+ %88 = getelementptr inbounds i16, i16* %0, i64 10
+ store i16 %87, i16* %88, align 2
+ %89 = getelementptr inbounds i8, i8* %1, i64 11
+ %90 = load i8, i8* %89, align 1
+ %91 = zext i8 %90 to i32
+ %92 = lshr i32 %91, 1
+ %93 = add nuw nsw i32 %92, %91
+ %94 = lshr i32 %93, 2
+ %95 = trunc i32 %94 to i16
+ %96 = getelementptr inbounds i16, i16* %0, i64 11
+ store i16 %95, i16* %96, align 2
+ %97 = getelementptr inbounds i8, i8* %1, i64 12
+ %98 = load i8, i8* %97, align 1
+ %99 = zext i8 %98 to i32
+ %100 = lshr i32 %99, 1
+ %101 = add nuw nsw i32 %100, %99
+ %102 = lshr i32 %101, 2
+ %103 = trunc i32 %102 to i16
+ %104 = getelementptr inbounds i16, i16* %0, i64 12
+ store i16 %103, i16* %104, align 2
+ %105 = getelementptr inbounds i8, i8* %1, i64 13
+ %106 = load i8, i8* %105, align 1
+ %107 = zext i8 %106 to i32
+ %108 = lshr i32 %107, 1
+ %109 = add nuw nsw i32 %108, %107
+ %110 = lshr i32 %109, 2
+ %111 = trunc i32 %110 to i16
+ %112 = getelementptr inbounds i16, i16* %0, i64 13
+ store i16 %111, i16* %112, align 2
+ %113 = getelementptr inbounds i8, i8* %1, i64 14
+ %114 = load i8, i8* %113, align 1
+ %115 = zext i8 %114 to i32
+ %116 = lshr i32 %115, 1
+ %117 = add nuw nsw i32 %116, %115
+ %118 = lshr i32 %117, 2
+ %119 = trunc i32 %118 to i16
+ %120 = getelementptr inbounds i16, i16* %0, i64 14
+ store i16 %119, i16* %120, align 2
+ %121 = getelementptr inbounds i8, i8* %1, i64 15
+ %122 = load i8, i8* %121, align 1
+ %123 = zext i8 %122 to i32
+ %124 = lshr i32 %123, 1
+ %125 = add nuw nsw i32 %124, %123
+ %126 = lshr i32 %125, 2
+ %127 = trunc i32 %126 to i16
+ %128 = getelementptr inbounds i16, i16* %0, i64 15
+ store i16 %127, i16* %128, align 2
+ ret void
+}
+
+define void @trunc_through_two_adds(i16* noalias %0, i8* noalias readonly %1, i8* noalias readonly %2) {
+; SSE-LABEL: @trunc_through_two_adds(
+; SSE-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <4 x i8>*
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
+; SSE-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32>
+; SSE-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <4 x i8>*
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
+; SSE-NEXT: [[TMP9:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32>
+; SSE-NEXT: [[TMP10:%.*]] = add nuw nsw <4 x i32> [[TMP9]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = lshr <4 x i32> [[TMP10]], <i32 1, i32 1, i32 1, i32 1>
+; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <4 x i32> [[TMP11]], [[TMP10]]
+; SSE-NEXT: [[TMP13:%.*]] = lshr <4 x i32> [[TMP12]], <i32 2, i32 2, i32 2, i32 2>
+; SSE-NEXT: [[TMP14:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i16>
+; SSE-NEXT: [[TMP15:%.*]] = bitcast i16* [[TMP0:%.*]] to <4 x i16>*
+; SSE-NEXT: store <4 x i16> [[TMP14]], <4 x i16>* [[TMP15]], align 2
+; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 4
+; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 4
+; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 4
+; SSE-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
+; SSE-NEXT: [[TMP20:%.*]] = load <4 x i8>, <4 x i8>* [[TMP19]], align 1
+; SSE-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32>
+; SSE-NEXT: [[TMP22:%.*]] = bitcast i8* [[TMP17]] to <4 x i8>*
+; SSE-NEXT: [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1
+; SSE-NEXT: [[TMP24:%.*]] = zext <4 x i8> [[TMP23]] to <4 x i32>
+; SSE-NEXT: [[TMP25:%.*]] = add nuw nsw <4 x i32> [[TMP24]], [[TMP21]]
+; SSE-NEXT: [[TMP26:%.*]] = lshr <4 x i32> [[TMP25]], <i32 1, i32 1, i32 1, i32 1>
+; SSE-NEXT: [[TMP27:%.*]] = add nuw nsw <4 x i32> [[TMP26]], [[TMP25]]
+; SSE-NEXT: [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]], <i32 2, i32 2, i32 2, i32 2>
+; SSE-NEXT: [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i16>
+; SSE-NEXT: [[TMP30:%.*]] = bitcast i16* [[TMP18]] to <4 x i16>*
+; SSE-NEXT: store <4 x i16> [[TMP29]], <4 x i16>* [[TMP30]], align 2
+; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
+; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
+; SSE-NEXT: [[TMP34:%.*]] = bitcast i8* [[TMP31]] to <4 x i8>*
+; SSE-NEXT: [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1
+; SSE-NEXT: [[TMP36:%.*]] = zext <4 x i8> [[TMP35]] to <4 x i32>
+; SSE-NEXT: [[TMP37:%.*]] = bitcast i8* [[TMP32]] to <4 x i8>*
+; SSE-NEXT: [[TMP38:%.*]] = load <4 x i8>, <4 x i8>* [[TMP37]], align 1
+; SSE-NEXT: [[TMP39:%.*]] = zext <4 x i8> [[TMP38]] to <4 x i32>
+; SSE-NEXT: [[TMP40:%.*]] = add nuw nsw <4 x i32> [[TMP39]], [[TMP36]]
+; SSE-NEXT: [[TMP41:%.*]] = lshr <4 x i32> [[TMP40]], <i32 1, i32 1, i32 1, i32 1>
+; SSE-NEXT: [[TMP42:%.*]] = add nuw nsw <4 x i32> [[TMP41]], [[TMP40]]
+; SSE-NEXT: [[TMP43:%.*]] = lshr <4 x i32> [[TMP42]], <i32 2, i32 2, i32 2, i32 2>
+; SSE-NEXT: [[TMP44:%.*]] = trunc <4 x i32> [[TMP43]] to <4 x i16>
+; SSE-NEXT: [[TMP45:%.*]] = bitcast i16* [[TMP33]] to <4 x i16>*
+; SSE-NEXT: store <4 x i16> [[TMP44]], <4 x i16>* [[TMP45]], align 2
+; SSE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 12
+; SSE-NEXT: [[TMP47:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 12
+; SSE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 12
+; SSE-NEXT: [[TMP49:%.*]] = bitcast i8* [[TMP46]] to <4 x i8>*
+; SSE-NEXT: [[TMP50:%.*]] = load <4 x i8>, <4 x i8>* [[TMP49]], align 1
+; SSE-NEXT: [[TMP51:%.*]] = zext <4 x i8> [[TMP50]] to <4 x i32>
+; SSE-NEXT: [[TMP52:%.*]] = bitcast i8* [[TMP47]] to <4 x i8>*
+; SSE-NEXT: [[TMP53:%.*]] = load <4 x i8>, <4 x i8>* [[TMP52]], align 1
+; SSE-NEXT: [[TMP54:%.*]] = zext <4 x i8> [[TMP53]] to <4 x i32>
+; SSE-NEXT: [[TMP55:%.*]] = add nuw nsw <4 x i32> [[TMP54]], [[TMP51]]
+; SSE-NEXT: [[TMP56:%.*]] = lshr <4 x i32> [[TMP55]], <i32 1, i32 1, i32 1, i32 1>
+; SSE-NEXT: [[TMP57:%.*]] = add nuw nsw <4 x i32> [[TMP56]], [[TMP55]]
+; SSE-NEXT: [[TMP58:%.*]] = lshr <4 x i32> [[TMP57]], <i32 2, i32 2, i32 2, i32 2>
+; SSE-NEXT: [[TMP59:%.*]] = trunc <4 x i32> [[TMP58]] to <4 x i16>
+; SSE-NEXT: [[TMP60:%.*]] = bitcast i16* [[TMP48]] to <4 x i16>*
+; SSE-NEXT: store <4 x i16> [[TMP59]], <4 x i16>* [[TMP60]], align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @trunc_through_two_adds(
+; AVX-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1:%.*]] to <8 x i8>*
+; AVX-NEXT: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]], align 1
+; AVX-NEXT: [[TMP6:%.*]] = zext <8 x i8> [[TMP5]] to <8 x i32>
+; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2:%.*]] to <8 x i8>*
+; AVX-NEXT: [[TMP8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
+; AVX-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i32>
+; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <8 x i32> [[TMP9]], [[TMP6]]
+; AVX-NEXT: [[TMP11:%.*]] = lshr <8 x i32> [[TMP10]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX-NEXT: [[TMP12:%.*]] = add nuw nsw <8 x i32> [[TMP11]], [[TMP10]]
+; AVX-NEXT: [[TMP13:%.*]] = lshr <8 x i32> [[TMP12]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; AVX-NEXT: [[TMP14:%.*]] = trunc <8 x i32> [[TMP13]] to <8 x i16>
+; AVX-NEXT: [[TMP15:%.*]] = bitcast i16* [[TMP0:%.*]] to <8 x i16>*
+; AVX-NEXT: store <8 x i16> [[TMP14]], <8 x i16>* [[TMP15]], align 2
+; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
+; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 8
+; AVX-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <8 x i8>*
+; AVX-NEXT: [[TMP20:%.*]] = load <8 x i8>, <8 x i8>* [[TMP19]], align 1
+; AVX-NEXT: [[TMP21:%.*]] = zext <8 x i8> [[TMP20]] to <8 x i32>
+; AVX-NEXT: [[TMP22:%.*]] = bitcast i8* [[TMP17]] to <8 x i8>*
+; AVX-NEXT: [[TMP23:%.*]] = load <8 x i8>, <8 x i8>* [[TMP22]], align 1
+; AVX-NEXT: [[TMP24:%.*]] = zext <8 x i8> [[TMP23]] to <8 x i32>
+; AVX-NEXT: [[TMP25:%.*]] = add nuw nsw <8 x i32> [[TMP24]], [[TMP21]]
+; AVX-NEXT: [[TMP26:%.*]] = lshr <8 x i32> [[TMP25]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX-NEXT: [[TMP27:%.*]] = add nuw nsw <8 x i32> [[TMP26]], [[TMP25]]
+; AVX-NEXT: [[TMP28:%.*]] = lshr <8 x i32> [[TMP27]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; AVX-NEXT: [[TMP29:%.*]] = trunc <8 x i32> [[TMP28]] to <8 x i16>
+; AVX-NEXT: [[TMP30:%.*]] = bitcast i16* [[TMP18]] to <8 x i16>*
+; AVX-NEXT: store <8 x i16> [[TMP29]], <8 x i16>* [[TMP30]], align 2
+; AVX-NEXT: ret void
+;
+ %4 = load i8, i8* %1, align 1
+ %5 = zext i8 %4 to i32
+ %6 = load i8, i8* %2, align 1
+ %7 = zext i8 %6 to i32
+ %8 = add nuw nsw i32 %7, %5
+ %9 = lshr i32 %8, 1
+ %10 = add nuw nsw i32 %9, %8
+ %11 = lshr i32 %10, 2
+ %12 = trunc i32 %11 to i16
+ store i16 %12, i16* %0, align 2
+ %13 = getelementptr inbounds i8, i8* %1, i64 1
+ %14 = load i8, i8* %13, align 1
+ %15 = zext i8 %14 to i32
+ %16 = getelementptr inbounds i8, i8* %2, i64 1
+ %17 = load i8, i8* %16, align 1
+ %18 = zext i8 %17 to i32
+ %19 = add nuw nsw i32 %18, %15
+ %20 = lshr i32 %19, 1
+ %21 = add nuw nsw i32 %20, %19
+ %22 = lshr i32 %21, 2
+ %23 = trunc i32 %22 to i16
+ %24 = getelementptr inbounds i16, i16* %0, i64 1
+ store i16 %23, i16* %24, align 2
+ %25 = getelementptr inbounds i8, i8* %1, i64 2
+ %26 = load i8, i8* %25, align 1
+ %27 = zext i8 %26 to i32
+ %28 = getelementptr inbounds i8, i8* %2, i64 2
+ %29 = load i8, i8* %28, align 1
+ %30 = zext i8 %29 to i32
+ %31 = add nuw nsw i32 %30, %27
+ %32 = lshr i32 %31, 1
+ %33 = add nuw nsw i32 %32, %31
+ %34 = lshr i32 %33, 2
+ %35 = trunc i32 %34 to i16
+ %36 = getelementptr inbounds i16, i16* %0, i64 2
+ store i16 %35, i16* %36, align 2
+ %37 = getelementptr inbounds i8, i8* %1, i64 3
+ %38 = load i8, i8* %37, align 1
+ %39 = zext i8 %38 to i32
+ %40 = getelementptr inbounds i8, i8* %2, i64 3
+ %41 = load i8, i8* %40, align 1
+ %42 = zext i8 %41 to i32
+ %43 = add nuw nsw i32 %42, %39
+ %44 = lshr i32 %43, 1
+ %45 = add nuw nsw i32 %44, %43
+ %46 = lshr i32 %45, 2
+ %47 = trunc i32 %46 to i16
+ %48 = getelementptr inbounds i16, i16* %0, i64 3
+ store i16 %47, i16* %48, align 2
+ %49 = getelementptr inbounds i8, i8* %1, i64 4
+ %50 = load i8, i8* %49, align 1
+ %51 = zext i8 %50 to i32
+ %52 = getelementptr inbounds i8, i8* %2, i64 4
+ %53 = load i8, i8* %52, align 1
+ %54 = zext i8 %53 to i32
+ %55 = add nuw nsw i32 %54, %51
+ %56 = lshr i32 %55, 1
+ %57 = add nuw nsw i32 %56, %55
+ %58 = lshr i32 %57, 2
+ %59 = trunc i32 %58 to i16
+ %60 = getelementptr inbounds i16, i16* %0, i64 4
+ store i16 %59, i16* %60, align 2
+ %61 = getelementptr inbounds i8, i8* %1, i64 5
+ %62 = load i8, i8* %61, align 1
+ %63 = zext i8 %62 to i32
+ %64 = getelementptr inbounds i8, i8* %2, i64 5
+ %65 = load i8, i8* %64, align 1
+ %66 = zext i8 %65 to i32
+ %67 = add nuw nsw i32 %66, %63
+ %68 = lshr i32 %67, 1
+ %69 = add nuw nsw i32 %68, %67
+ %70 = lshr i32 %69, 2
+ %71 = trunc i32 %70 to i16
+ %72 = getelementptr inbounds i16, i16* %0, i64 5
+ store i16 %71, i16* %72, align 2
+ %73 = getelementptr inbounds i8, i8* %1, i64 6
+ %74 = load i8, i8* %73, align 1
+ %75 = zext i8 %74 to i32
+ %76 = getelementptr inbounds i8, i8* %2, i64 6
+ %77 = load i8, i8* %76, align 1
+ %78 = zext i8 %77 to i32
+ %79 = add nuw nsw i32 %78, %75
+ %80 = lshr i32 %79, 1
+ %81 = add nuw nsw i32 %80, %79
+ %82 = lshr i32 %81, 2
+ %83 = trunc i32 %82 to i16
+ %84 = getelementptr inbounds i16, i16* %0, i64 6
+ store i16 %83, i16* %84, align 2
+ %85 = getelementptr inbounds i8, i8* %1, i64 7
+ %86 = load i8, i8* %85, align 1
+ %87 = zext i8 %86 to i32
+ %88 = getelementptr inbounds i8, i8* %2, i64 7
+ %89 = load i8, i8* %88, align 1
+ %90 = zext i8 %89 to i32
+ %91 = add nuw nsw i32 %90, %87
+ %92 = lshr i32 %91, 1
+ %93 = add nuw nsw i32 %92, %91
+ %94 = lshr i32 %93, 2
+ %95 = trunc i32 %94 to i16
+ %96 = getelementptr inbounds i16, i16* %0, i64 7
+ store i16 %95, i16* %96, align 2
+ %97 = getelementptr inbounds i8, i8* %1, i64 8
+ %98 = load i8, i8* %97, align 1
+ %99 = zext i8 %98 to i32
+ %100 = getelementptr inbounds i8, i8* %2, i64 8
+ %101 = load i8, i8* %100, align 1
+ %102 = zext i8 %101 to i32
+ %103 = add nuw nsw i32 %102, %99
+ %104 = lshr i32 %103, 1
+ %105 = add nuw nsw i32 %104, %103
+ %106 = lshr i32 %105, 2
+ %107 = trunc i32 %106 to i16
+ %108 = getelementptr inbounds i16, i16* %0, i64 8
+ store i16 %107, i16* %108, align 2
+ %109 = getelementptr inbounds i8, i8* %1, i64 9
+ %110 = load i8, i8* %109, align 1
+ %111 = zext i8 %110 to i32
+ %112 = getelementptr inbounds i8, i8* %2, i64 9
+ %113 = load i8, i8* %112, align 1
+ %114 = zext i8 %113 to i32
+ %115 = add nuw nsw i32 %114, %111
+ %116 = lshr i32 %115, 1
+ %117 = add nuw nsw i32 %116, %115
+ %118 = lshr i32 %117, 2
+ %119 = trunc i32 %118 to i16
+ %120 = getelementptr inbounds i16, i16* %0, i64 9
+ store i16 %119, i16* %120, align 2
+ %121 = getelementptr inbounds i8, i8* %1, i64 10
+ %122 = load i8, i8* %121, align 1
+ %123 = zext i8 %122 to i32
+ %124 = getelementptr inbounds i8, i8* %2, i64 10
+ %125 = load i8, i8* %124, align 1
+ %126 = zext i8 %125 to i32
+ %127 = add nuw nsw i32 %126, %123
+ %128 = lshr i32 %127, 1
+ %129 = add nuw nsw i32 %128, %127
+ %130 = lshr i32 %129, 2
+ %131 = trunc i32 %130 to i16
+ %132 = getelementptr inbounds i16, i16* %0, i64 10
+ store i16 %131, i16* %132, align 2
+ %133 = getelementptr inbounds i8, i8* %1, i64 11
+ %134 = load i8, i8* %133, align 1
+ %135 = zext i8 %134 to i32
+ %136 = getelementptr inbounds i8, i8* %2, i64 11
+ %137 = load i8, i8* %136, align 1
+ %138 = zext i8 %137 to i32
+ %139 = add nuw nsw i32 %138, %135
+ %140 = lshr i32 %139, 1
+ %141 = add nuw nsw i32 %140, %139
+ %142 = lshr i32 %141, 2
+ %143 = trunc i32 %142 to i16
+ %144 = getelementptr inbounds i16, i16* %0, i64 11
+ store i16 %143, i16* %144, align 2
+ %145 = getelementptr inbounds i8, i8* %1, i64 12
+ %146 = load i8, i8* %145, align 1
+ %147 = zext i8 %146 to i32
+ %148 = getelementptr inbounds i8, i8* %2, i64 12
+ %149 = load i8, i8* %148, align 1
+ %150 = zext i8 %149 to i32
+ %151 = add nuw nsw i32 %150, %147
+ %152 = lshr i32 %151, 1
+ %153 = add nuw nsw i32 %152, %151
+ %154 = lshr i32 %153, 2
+ %155 = trunc i32 %154 to i16
+ %156 = getelementptr inbounds i16, i16* %0, i64 12
+ store i16 %155, i16* %156, align 2
+ %157 = getelementptr inbounds i8, i8* %1, i64 13
+ %158 = load i8, i8* %157, align 1
+ %159 = zext i8 %158 to i32
+ %160 = getelementptr inbounds i8, i8* %2, i64 13
+ %161 = load i8, i8* %160, align 1
+ %162 = zext i8 %161 to i32
+ %163 = add nuw nsw i32 %162, %159
+ %164 = lshr i32 %163, 1
+ %165 = add nuw nsw i32 %164, %163
+ %166 = lshr i32 %165, 2
+ %167 = trunc i32 %166 to i16
+ %168 = getelementptr inbounds i16, i16* %0, i64 13
+ store i16 %167, i16* %168, align 2
+ %169 = getelementptr inbounds i8, i8* %1, i64 14
+ %170 = load i8, i8* %169, align 1
+ %171 = zext i8 %170 to i32
+ %172 = getelementptr inbounds i8, i8* %2, i64 14
+ %173 = load i8, i8* %172, align 1
+ %174 = zext i8 %173 to i32
+ %175 = add nuw nsw i32 %174, %171
+ %176 = lshr i32 %175, 1
+ %177 = add nuw nsw i32 %176, %175
+ %178 = lshr i32 %177, 2
+ %179 = trunc i32 %178 to i16
+ %180 = getelementptr inbounds i16, i16* %0, i64 14
+ store i16 %179, i16* %180, align 2
+ %181 = getelementptr inbounds i8, i8* %1, i64 15
+ %182 = load i8, i8* %181, align 1
+ %183 = zext i8 %182 to i32
+ %184 = getelementptr inbounds i8, i8* %2, i64 15
+ %185 = load i8, i8* %184, align 1
+ %186 = zext i8 %185 to i32
+ %187 = add nuw nsw i32 %186, %183
+ %188 = lshr i32 %187, 1
+ %189 = add nuw nsw i32 %188, %187
+ %190 = lshr i32 %189, 2
+ %191 = trunc i32 %190 to i16
+ %192 = getelementptr inbounds i16, i16* %0, i64 15
+ store i16 %191, i16* %192, align 2
+ ret void
+}
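The SLP checks above show the vectorizer keeping the arithmetic in <4 x i32> (SSE) or <8 x i32> (AVX) lanes even though only 16 bits of each lane are live. If the chain were narrowed before vectorization, each 128-bit register would carry twice as many lanes. A hand-written sketch of one narrowed store group from @trunc_through_one_add (illustrative only, assuming the PR50555 fold; %src and %dst are hypothetical names, not actual opt output):

define void @narrowed_iteration(<8 x i16>* %dst, <8 x i8>* %src) {
  %v = load <8 x i8>, <8 x i8>* %src, align 1
  %z = zext <8 x i8> %v to <8 x i16>   ; narrow extend, 8 lanes per 128 bits
  %s = lshr <8 x i16> %z, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %a = add nuw nsw <8 x i16> %s, %z    ; per-lane max 255 + 127 = 382, fits i16
  %r = lshr <8 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  store <8 x i16> %r, <8 x i16>* %dst, align 2
  ret void
}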