[llvm] 96d2d25 - [SLP][X86] Add test coverage for PR47491 / Issue #46835

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sun May 8 03:24:56 PDT 2022


Author: Simon Pilgrim
Date: 2022-05-08T11:24:46+01:00
New Revision: 96d2d2508e4d71062eb3cf72ece556d3ec1371cc

URL: https://github.com/llvm/llvm-project/commit/96d2d2508e4d71062eb3cf72ece556d3ec1371cc
DIFF: https://github.com/llvm/llvm-project/commit/96d2d2508e4d71062eb3cf72ece556d3ec1371cc.diff

LOG: [SLP][X86] Add test coverage for PR47491 / Issue #46835

D124284 should help us vectorize the sub-128-bit vector cases
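For reference, a minimal sketch of the <8 x i8> form that sub-128-bit SLP
vectorization would ideally produce for the 8-element add test below. This is
an assumed illustration of the goal, not the current output of the pass, and
value names and instruction order are illustrative:

    ; hypothetical vectorized IR, for illustration only
    define void @add8(ptr noalias %r, ptr noalias %a) {
    entry:
      %0 = load <8 x i8>, ptr %a, align 1
      %1 = load <8 x i8>, ptr %r, align 1
      %2 = add <8 x i8> %1, %0
      store <8 x i8> %2, ptr %r, align 1
      ret void
    }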

Added: 
    llvm/test/Transforms/SLPVectorizer/X86/arith-add-load.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-mul-load.ll

Modified: 
    

Removed: 
    


################################################################################
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-load.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-load.ll
new file mode 100644
index 0000000000000..d76a4c5034f04
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-load.ll
@@ -0,0 +1,574 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64    | FileCheck %s --check-prefix=SSE
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefix=SSE
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefix=AVX
+
+; // PR47491
+; void pr(char* r, char* a){
+;   for (int i = 0; i < 8; i++){
+;       r[i] += a[i];
+;   }
+; }
+
+define void @add4(ptr noalias nocapture noundef %r, ptr noalias nocapture noundef readonly %a) {
+; SSE-LABEL: @add4(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1
+; SSE-NEXT:    [[TMP1:%.*]] = load i8, ptr [[R:%.*]], align 1
+; SSE-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]]
+; SSE-NEXT:    store i8 [[ADD]], ptr [[R]], align 1
+; SSE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
+; SSE-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 1
+; SSE-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1
+; SSE-NEXT:    [[ADD_1:%.*]] = add i8 [[TMP3]], [[TMP2]]
+; SSE-NEXT:    store i8 [[ADD_1]], ptr [[ARRAYIDX2_1]], align 1
+; SSE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
+; SSE-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 2
+; SSE-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1
+; SSE-NEXT:    [[ADD_2:%.*]] = add i8 [[TMP5]], [[TMP4]]
+; SSE-NEXT:    store i8 [[ADD_2]], ptr [[ARRAYIDX2_2]], align 1
+; SSE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 3
+; SSE-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 3
+; SSE-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1
+; SSE-NEXT:    [[ADD_3:%.*]] = add i8 [[TMP7]], [[TMP6]]
+; SSE-NEXT:    store i8 [[ADD_3]], ptr [[ARRAYIDX2_3]], align 1
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @add4(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1
+; AVX-NEXT:    [[TMP1:%.*]] = load i8, ptr [[R:%.*]], align 1
+; AVX-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]]
+; AVX-NEXT:    store i8 [[ADD]], ptr [[R]], align 1
+; AVX-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
+; AVX-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 1
+; AVX-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1
+; AVX-NEXT:    [[ADD_1:%.*]] = add i8 [[TMP3]], [[TMP2]]
+; AVX-NEXT:    store i8 [[ADD_1]], ptr [[ARRAYIDX2_1]], align 1
+; AVX-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
+; AVX-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 2
+; AVX-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1
+; AVX-NEXT:    [[ADD_2:%.*]] = add i8 [[TMP5]], [[TMP4]]
+; AVX-NEXT:    store i8 [[ADD_2]], ptr [[ARRAYIDX2_2]], align 1
+; AVX-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 3
+; AVX-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 3
+; AVX-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1
+; AVX-NEXT:    [[ADD_3:%.*]] = add i8 [[TMP7]], [[TMP6]]
+; AVX-NEXT:    store i8 [[ADD_3]], ptr [[ARRAYIDX2_3]], align 1
+; AVX-NEXT:    ret void
+;
+entry:
+  %0 = load i8, ptr %a, align 1
+  %1 = load i8, ptr %r, align 1
+  %add = add i8 %1, %0
+  store i8 %add, ptr %r, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 1
+  %2 = load i8, ptr %arrayidx.1, align 1
+  %arrayidx2.1 = getelementptr inbounds i8, ptr %r, i64 1
+  %3 = load i8, ptr %arrayidx2.1, align 1
+  %add.1 = add i8 %3, %2
+  store i8 %add.1, ptr %arrayidx2.1, align 1
+  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 2
+  %4 = load i8, ptr %arrayidx.2, align 1
+  %arrayidx2.2 = getelementptr inbounds i8, ptr %r, i64 2
+  %5 = load i8, ptr %arrayidx2.2, align 1
+  %add.2 = add i8 %5, %4
+  store i8 %add.2, ptr %arrayidx2.2, align 1
+  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 3
+  %6 = load i8, ptr %arrayidx.3, align 1
+  %arrayidx2.3 = getelementptr inbounds i8, ptr %r, i64 3
+  %7 = load i8, ptr %arrayidx2.3, align 1
+  %add.3 = add i8 %7, %6
+  store i8 %add.3, ptr %arrayidx2.3, align 1
+  ret void
+}
+
+define void @add8(ptr noalias nocapture noundef %r, ptr noalias nocapture noundef readonly %a) {
+; SSE-LABEL: @add8(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1
+; SSE-NEXT:    [[TMP1:%.*]] = load i8, ptr [[R:%.*]], align 1
+; SSE-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]]
+; SSE-NEXT:    store i8 [[ADD]], ptr [[R]], align 1
+; SSE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
+; SSE-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 1
+; SSE-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1
+; SSE-NEXT:    [[ADD_1:%.*]] = add i8 [[TMP3]], [[TMP2]]
+; SSE-NEXT:    store i8 [[ADD_1]], ptr [[ARRAYIDX2_1]], align 1
+; SSE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
+; SSE-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 2
+; SSE-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1
+; SSE-NEXT:    [[ADD_2:%.*]] = add i8 [[TMP5]], [[TMP4]]
+; SSE-NEXT:    store i8 [[ADD_2]], ptr [[ARRAYIDX2_2]], align 1
+; SSE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 3
+; SSE-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 3
+; SSE-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1
+; SSE-NEXT:    [[ADD_3:%.*]] = add i8 [[TMP7]], [[TMP6]]
+; SSE-NEXT:    store i8 [[ADD_3]], ptr [[ARRAYIDX2_3]], align 1
+; SSE-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4
+; SSE-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_4]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 4
+; SSE-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX2_4]], align 1
+; SSE-NEXT:    [[ADD_4:%.*]] = add i8 [[TMP9]], [[TMP8]]
+; SSE-NEXT:    store i8 [[ADD_4]], ptr [[ARRAYIDX2_4]], align 1
+; SSE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 5
+; SSE-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_5]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 5
+; SSE-NEXT:    [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX2_5]], align 1
+; SSE-NEXT:    [[ADD_5:%.*]] = add i8 [[TMP11]], [[TMP10]]
+; SSE-NEXT:    store i8 [[ADD_5]], ptr [[ARRAYIDX2_5]], align 1
+; SSE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 6
+; SSE-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_6]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 6
+; SSE-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX2_6]], align 1
+; SSE-NEXT:    [[ADD_6:%.*]] = add i8 [[TMP13]], [[TMP12]]
+; SSE-NEXT:    store i8 [[ADD_6]], ptr [[ARRAYIDX2_6]], align 1
+; SSE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 7
+; SSE-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX_7]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 7
+; SSE-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX2_7]], align 1
+; SSE-NEXT:    [[ADD_7:%.*]] = add i8 [[TMP15]], [[TMP14]]
+; SSE-NEXT:    store i8 [[ADD_7]], ptr [[ARRAYIDX2_7]], align 1
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @add8(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1
+; AVX-NEXT:    [[TMP1:%.*]] = load i8, ptr [[R:%.*]], align 1
+; AVX-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]]
+; AVX-NEXT:    store i8 [[ADD]], ptr [[R]], align 1
+; AVX-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
+; AVX-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 1
+; AVX-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1
+; AVX-NEXT:    [[ADD_1:%.*]] = add i8 [[TMP3]], [[TMP2]]
+; AVX-NEXT:    store i8 [[ADD_1]], ptr [[ARRAYIDX2_1]], align 1
+; AVX-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
+; AVX-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 2
+; AVX-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1
+; AVX-NEXT:    [[ADD_2:%.*]] = add i8 [[TMP5]], [[TMP4]]
+; AVX-NEXT:    store i8 [[ADD_2]], ptr [[ARRAYIDX2_2]], align 1
+; AVX-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 3
+; AVX-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 3
+; AVX-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1
+; AVX-NEXT:    [[ADD_3:%.*]] = add i8 [[TMP7]], [[TMP6]]
+; AVX-NEXT:    store i8 [[ADD_3]], ptr [[ARRAYIDX2_3]], align 1
+; AVX-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4
+; AVX-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_4]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 4
+; AVX-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX2_4]], align 1
+; AVX-NEXT:    [[ADD_4:%.*]] = add i8 [[TMP9]], [[TMP8]]
+; AVX-NEXT:    store i8 [[ADD_4]], ptr [[ARRAYIDX2_4]], align 1
+; AVX-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 5
+; AVX-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_5]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 5
+; AVX-NEXT:    [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX2_5]], align 1
+; AVX-NEXT:    [[ADD_5:%.*]] = add i8 [[TMP11]], [[TMP10]]
+; AVX-NEXT:    store i8 [[ADD_5]], ptr [[ARRAYIDX2_5]], align 1
+; AVX-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 6
+; AVX-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_6]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 6
+; AVX-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX2_6]], align 1
+; AVX-NEXT:    [[ADD_6:%.*]] = add i8 [[TMP13]], [[TMP12]]
+; AVX-NEXT:    store i8 [[ADD_6]], ptr [[ARRAYIDX2_6]], align 1
+; AVX-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 7
+; AVX-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX_7]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 7
+; AVX-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX2_7]], align 1
+; AVX-NEXT:    [[ADD_7:%.*]] = add i8 [[TMP15]], [[TMP14]]
+; AVX-NEXT:    store i8 [[ADD_7]], ptr [[ARRAYIDX2_7]], align 1
+; AVX-NEXT:    ret void
+;
+entry:
+  %0 = load i8, ptr %a, align 1
+  %1 = load i8, ptr %r, align 1
+  %add = add i8 %1, %0
+  store i8 %add, ptr %r, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 1
+  %2 = load i8, ptr %arrayidx.1, align 1
+  %arrayidx2.1 = getelementptr inbounds i8, ptr %r, i64 1
+  %3 = load i8, ptr %arrayidx2.1, align 1
+  %add.1 = add i8 %3, %2
+  store i8 %add.1, ptr %arrayidx2.1, align 1
+  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 2
+  %4 = load i8, ptr %arrayidx.2, align 1
+  %arrayidx2.2 = getelementptr inbounds i8, ptr %r, i64 2
+  %5 = load i8, ptr %arrayidx2.2, align 1
+  %add.2 = add i8 %5, %4
+  store i8 %add.2, ptr %arrayidx2.2, align 1
+  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 3
+  %6 = load i8, ptr %arrayidx.3, align 1
+  %arrayidx2.3 = getelementptr inbounds i8, ptr %r, i64 3
+  %7 = load i8, ptr %arrayidx2.3, align 1
+  %add.3 = add i8 %7, %6
+  store i8 %add.3, ptr %arrayidx2.3, align 1
+  %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 4
+  %8 = load i8, ptr %arrayidx.4, align 1
+  %arrayidx2.4 = getelementptr inbounds i8, ptr %r, i64 4
+  %9 = load i8, ptr %arrayidx2.4, align 1
+  %add.4 = add i8 %9, %8
+  store i8 %add.4, ptr %arrayidx2.4, align 1
+  %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 5
+  %10 = load i8, ptr %arrayidx.5, align 1
+  %arrayidx2.5 = getelementptr inbounds i8, ptr %r, i64 5
+  %11 = load i8, ptr %arrayidx2.5, align 1
+  %add.5 = add i8 %11, %10
+  store i8 %add.5, ptr %arrayidx2.5, align 1
+  %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 6
+  %12 = load i8, ptr %arrayidx.6, align 1
+  %arrayidx2.6 = getelementptr inbounds i8, ptr %r, i64 6
+  %13 = load i8, ptr %arrayidx2.6, align 1
+  %add.6 = add i8 %13, %12
+  store i8 %add.6, ptr %arrayidx2.6, align 1
+  %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 7
+  %14 = load i8, ptr %arrayidx.7, align 1
+  %arrayidx2.7 = getelementptr inbounds i8, ptr %r, i64 7
+  %15 = load i8, ptr %arrayidx2.7, align 1
+  %add.7 = add i8 %15, %14
+  store i8 %add.7, ptr %arrayidx2.7, align 1
+  ret void
+}
+
+define void @add16(ptr noalias nocapture noundef %r, ptr noalias nocapture noundef readonly %a) {
+; SSE-LABEL: @add16(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[A:%.*]], align 1
+; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[R:%.*]], align 1
+; SSE-NEXT:    [[TMP2:%.*]] = add <16 x i8> [[TMP1]], [[TMP0]]
+; SSE-NEXT:    store <16 x i8> [[TMP2]], ptr [[R]], align 1
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @add16(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[A:%.*]], align 1
+; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[R:%.*]], align 1
+; AVX-NEXT:    [[TMP2:%.*]] = add <16 x i8> [[TMP1]], [[TMP0]]
+; AVX-NEXT:    store <16 x i8> [[TMP2]], ptr [[R]], align 1
+; AVX-NEXT:    ret void
+;
+entry:
+  %0 = load i8, ptr %a, align 1
+  %1 = load i8, ptr %r, align 1
+  %add = add i8 %1, %0
+  store i8 %add, ptr %r, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 1
+  %2 = load i8, ptr %arrayidx.1, align 1
+  %arrayidx2.1 = getelementptr inbounds i8, ptr %r, i64 1
+  %3 = load i8, ptr %arrayidx2.1, align 1
+  %add.1 = add i8 %3, %2
+  store i8 %add.1, ptr %arrayidx2.1, align 1
+  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 2
+  %4 = load i8, ptr %arrayidx.2, align 1
+  %arrayidx2.2 = getelementptr inbounds i8, ptr %r, i64 2
+  %5 = load i8, ptr %arrayidx2.2, align 1
+  %add.2 = add i8 %5, %4
+  store i8 %add.2, ptr %arrayidx2.2, align 1
+  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 3
+  %6 = load i8, ptr %arrayidx.3, align 1
+  %arrayidx2.3 = getelementptr inbounds i8, ptr %r, i64 3
+  %7 = load i8, ptr %arrayidx2.3, align 1
+  %add.3 = add i8 %7, %6
+  store i8 %add.3, ptr %arrayidx2.3, align 1
+  %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 4
+  %8 = load i8, ptr %arrayidx.4, align 1
+  %arrayidx2.4 = getelementptr inbounds i8, ptr %r, i64 4
+  %9 = load i8, ptr %arrayidx2.4, align 1
+  %add.4 = add i8 %9, %8
+  store i8 %add.4, ptr %arrayidx2.4, align 1
+  %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 5
+  %10 = load i8, ptr %arrayidx.5, align 1
+  %arrayidx2.5 = getelementptr inbounds i8, ptr %r, i64 5
+  %11 = load i8, ptr %arrayidx2.5, align 1
+  %add.5 = add i8 %11, %10
+  store i8 %add.5, ptr %arrayidx2.5, align 1
+  %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 6
+  %12 = load i8, ptr %arrayidx.6, align 1
+  %arrayidx2.6 = getelementptr inbounds i8, ptr %r, i64 6
+  %13 = load i8, ptr %arrayidx2.6, align 1
+  %add.6 = add i8 %13, %12
+  store i8 %add.6, ptr %arrayidx2.6, align 1
+  %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 7
+  %14 = load i8, ptr %arrayidx.7, align 1
+  %arrayidx2.7 = getelementptr inbounds i8, ptr %r, i64 7
+  %15 = load i8, ptr %arrayidx2.7, align 1
+  %add.7 = add i8 %15, %14
+  store i8 %add.7, ptr %arrayidx2.7, align 1
+  %arrayidx.8 = getelementptr inbounds i8, ptr %a, i64 8
+  %16 = load i8, ptr %arrayidx.8, align 1
+  %arrayidx2.8 = getelementptr inbounds i8, ptr %r, i64 8
+  %17 = load i8, ptr %arrayidx2.8, align 1
+  %add.8 = add i8 %17, %16
+  store i8 %add.8, ptr %arrayidx2.8, align 1
+  %arrayidx.9 = getelementptr inbounds i8, ptr %a, i64 9
+  %18 = load i8, ptr %arrayidx.9, align 1
+  %arrayidx2.9 = getelementptr inbounds i8, ptr %r, i64 9
+  %19 = load i8, ptr %arrayidx2.9, align 1
+  %add.9 = add i8 %19, %18
+  store i8 %add.9, ptr %arrayidx2.9, align 1
+  %arrayidx.10 = getelementptr inbounds i8, ptr %a, i64 10
+  %20 = load i8, ptr %arrayidx.10, align 1
+  %arrayidx2.10 = getelementptr inbounds i8, ptr %r, i64 10
+  %21 = load i8, ptr %arrayidx2.10, align 1
+  %add.10 = add i8 %21, %20
+  store i8 %add.10, ptr %arrayidx2.10, align 1
+  %arrayidx.11 = getelementptr inbounds i8, ptr %a, i64 11
+  %22 = load i8, ptr %arrayidx.11, align 1
+  %arrayidx2.11 = getelementptr inbounds i8, ptr %r, i64 11
+  %23 = load i8, ptr %arrayidx2.11, align 1
+  %add.11 = add i8 %23, %22
+  store i8 %add.11, ptr %arrayidx2.11, align 1
+  %arrayidx.12 = getelementptr inbounds i8, ptr %a, i64 12
+  %24 = load i8, ptr %arrayidx.12, align 1
+  %arrayidx2.12 = getelementptr inbounds i8, ptr %r, i64 12
+  %25 = load i8, ptr %arrayidx2.12, align 1
+  %add.12 = add i8 %25, %24
+  store i8 %add.12, ptr %arrayidx2.12, align 1
+  %arrayidx.13 = getelementptr inbounds i8, ptr %a, i64 13
+  %26 = load i8, ptr %arrayidx.13, align 1
+  %arrayidx2.13 = getelementptr inbounds i8, ptr %r, i64 13
+  %27 = load i8, ptr %arrayidx2.13, align 1
+  %add.13 = add i8 %27, %26
+  store i8 %add.13, ptr %arrayidx2.13, align 1
+  %arrayidx.14 = getelementptr inbounds i8, ptr %a, i64 14
+  %28 = load i8, ptr %arrayidx.14, align 1
+  %arrayidx2.14 = getelementptr inbounds i8, ptr %r, i64 14
+  %29 = load i8, ptr %arrayidx2.14, align 1
+  %add.14 = add i8 %29, %28
+  store i8 %add.14, ptr %arrayidx2.14, align 1
+  %arrayidx.15 = getelementptr inbounds i8, ptr %a, i64 15
+  %30 = load i8, ptr %arrayidx.15, align 1
+  %arrayidx2.15 = getelementptr inbounds i8, ptr %r, i64 15
+  %31 = load i8, ptr %arrayidx2.15, align 1
+  %add.15 = add i8 %31, %30
+  store i8 %add.15, ptr %arrayidx2.15, align 1
+  ret void
+}
+
+define void @add32(ptr noalias nocapture noundef %r, ptr noalias nocapture noundef readonly %a) {
+; SSE-LABEL: @add32(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[A:%.*]], align 1
+; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[R:%.*]], align 1
+; SSE-NEXT:    [[TMP2:%.*]] = add <16 x i8> [[TMP1]], [[TMP0]]
+; SSE-NEXT:    store <16 x i8> [[TMP2]], ptr [[R]], align 1
+; SSE-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16
+; SSE-NEXT:    [[ARRAYIDX2_16:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 16
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX_16]], align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2_16]], align 1
+; SSE-NEXT:    [[TMP5:%.*]] = add <16 x i8> [[TMP4]], [[TMP3]]
+; SSE-NEXT:    store <16 x i8> [[TMP5]], ptr [[ARRAYIDX2_16]], align 1
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @add32(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[A:%.*]], align 1
+; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr [[R:%.*]], align 1
+; AVX-NEXT:    [[TMP2:%.*]] = add <32 x i8> [[TMP1]], [[TMP0]]
+; AVX-NEXT:    store <32 x i8> [[TMP2]], ptr [[R]], align 1
+; AVX-NEXT:    ret void
+;
+entry:
+  %0 = load i8, ptr %a, align 1
+  %1 = load i8, ptr %r, align 1
+  %add = add i8 %1, %0
+  store i8 %add, ptr %r, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 1
+  %2 = load i8, ptr %arrayidx.1, align 1
+  %arrayidx2.1 = getelementptr inbounds i8, ptr %r, i64 1
+  %3 = load i8, ptr %arrayidx2.1, align 1
+  %add.1 = add i8 %3, %2
+  store i8 %add.1, ptr %arrayidx2.1, align 1
+  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 2
+  %4 = load i8, ptr %arrayidx.2, align 1
+  %arrayidx2.2 = getelementptr inbounds i8, ptr %r, i64 2
+  %5 = load i8, ptr %arrayidx2.2, align 1
+  %add.2 = add i8 %5, %4
+  store i8 %add.2, ptr %arrayidx2.2, align 1
+  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 3
+  %6 = load i8, ptr %arrayidx.3, align 1
+  %arrayidx2.3 = getelementptr inbounds i8, ptr %r, i64 3
+  %7 = load i8, ptr %arrayidx2.3, align 1
+  %add.3 = add i8 %7, %6
+  store i8 %add.3, ptr %arrayidx2.3, align 1
+  %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 4
+  %8 = load i8, ptr %arrayidx.4, align 1
+  %arrayidx2.4 = getelementptr inbounds i8, ptr %r, i64 4
+  %9 = load i8, ptr %arrayidx2.4, align 1
+  %add.4 = add i8 %9, %8
+  store i8 %add.4, ptr %arrayidx2.4, align 1
+  %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 5
+  %10 = load i8, ptr %arrayidx.5, align 1
+  %arrayidx2.5 = getelementptr inbounds i8, ptr %r, i64 5
+  %11 = load i8, ptr %arrayidx2.5, align 1
+  %add.5 = add i8 %11, %10
+  store i8 %add.5, ptr %arrayidx2.5, align 1
+  %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 6
+  %12 = load i8, ptr %arrayidx.6, align 1
+  %arrayidx2.6 = getelementptr inbounds i8, ptr %r, i64 6
+  %13 = load i8, ptr %arrayidx2.6, align 1
+  %add.6 = add i8 %13, %12
+  store i8 %add.6, ptr %arrayidx2.6, align 1
+  %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 7
+  %14 = load i8, ptr %arrayidx.7, align 1
+  %arrayidx2.7 = getelementptr inbounds i8, ptr %r, i64 7
+  %15 = load i8, ptr %arrayidx2.7, align 1
+  %add.7 = add i8 %15, %14
+  store i8 %add.7, ptr %arrayidx2.7, align 1
+  %arrayidx.8 = getelementptr inbounds i8, ptr %a, i64 8
+  %16 = load i8, ptr %arrayidx.8, align 1
+  %arrayidx2.8 = getelementptr inbounds i8, ptr %r, i64 8
+  %17 = load i8, ptr %arrayidx2.8, align 1
+  %add.8 = add i8 %17, %16
+  store i8 %add.8, ptr %arrayidx2.8, align 1
+  %arrayidx.9 = getelementptr inbounds i8, ptr %a, i64 9
+  %18 = load i8, ptr %arrayidx.9, align 1
+  %arrayidx2.9 = getelementptr inbounds i8, ptr %r, i64 9
+  %19 = load i8, ptr %arrayidx2.9, align 1
+  %add.9 = add i8 %19, %18
+  store i8 %add.9, ptr %arrayidx2.9, align 1
+  %arrayidx.10 = getelementptr inbounds i8, ptr %a, i64 10
+  %20 = load i8, ptr %arrayidx.10, align 1
+  %arrayidx2.10 = getelementptr inbounds i8, ptr %r, i64 10
+  %21 = load i8, ptr %arrayidx2.10, align 1
+  %add.10 = add i8 %21, %20
+  store i8 %add.10, ptr %arrayidx2.10, align 1
+  %arrayidx.11 = getelementptr inbounds i8, ptr %a, i64 11
+  %22 = load i8, ptr %arrayidx.11, align 1
+  %arrayidx2.11 = getelementptr inbounds i8, ptr %r, i64 11
+  %23 = load i8, ptr %arrayidx2.11, align 1
+  %add.11 = add i8 %23, %22
+  store i8 %add.11, ptr %arrayidx2.11, align 1
+  %arrayidx.12 = getelementptr inbounds i8, ptr %a, i64 12
+  %24 = load i8, ptr %arrayidx.12, align 1
+  %arrayidx2.12 = getelementptr inbounds i8, ptr %r, i64 12
+  %25 = load i8, ptr %arrayidx2.12, align 1
+  %add.12 = add i8 %25, %24
+  store i8 %add.12, ptr %arrayidx2.12, align 1
+  %arrayidx.13 = getelementptr inbounds i8, ptr %a, i64 13
+  %26 = load i8, ptr %arrayidx.13, align 1
+  %arrayidx2.13 = getelementptr inbounds i8, ptr %r, i64 13
+  %27 = load i8, ptr %arrayidx2.13, align 1
+  %add.13 = add i8 %27, %26
+  store i8 %add.13, ptr %arrayidx2.13, align 1
+  %arrayidx.14 = getelementptr inbounds i8, ptr %a, i64 14
+  %28 = load i8, ptr %arrayidx.14, align 1
+  %arrayidx2.14 = getelementptr inbounds i8, ptr %r, i64 14
+  %29 = load i8, ptr %arrayidx2.14, align 1
+  %add.14 = add i8 %29, %28
+  store i8 %add.14, ptr %arrayidx2.14, align 1
+  %arrayidx.15 = getelementptr inbounds i8, ptr %a, i64 15
+  %30 = load i8, ptr %arrayidx.15, align 1
+  %arrayidx2.15 = getelementptr inbounds i8, ptr %r, i64 15
+  %31 = load i8, ptr %arrayidx2.15, align 1
+  %add.15 = add i8 %31, %30
+  store i8 %add.15, ptr %arrayidx2.15, align 1
+  %arrayidx.16 = getelementptr inbounds i8, ptr %a, i64 16
+  %32 = load i8, ptr %arrayidx.16, align 1
+  %arrayidx2.16 = getelementptr inbounds i8, ptr %r, i64 16
+  %33 = load i8, ptr %arrayidx2.16, align 1
+  %add.16 = add i8 %33, %32
+  store i8 %add.16, ptr %arrayidx2.16, align 1
+  %arrayidx.17 = getelementptr inbounds i8, ptr %a, i64 17
+  %34 = load i8, ptr %arrayidx.17, align 1
+  %arrayidx2.17 = getelementptr inbounds i8, ptr %r, i64 17
+  %35 = load i8, ptr %arrayidx2.17, align 1
+  %add.17 = add i8 %35, %34
+  store i8 %add.17, ptr %arrayidx2.17, align 1
+  %arrayidx.18 = getelementptr inbounds i8, ptr %a, i64 18
+  %36 = load i8, ptr %arrayidx.18, align 1
+  %arrayidx2.18 = getelementptr inbounds i8, ptr %r, i64 18
+  %37 = load i8, ptr %arrayidx2.18, align 1
+  %add.18 = add i8 %37, %36
+  store i8 %add.18, ptr %arrayidx2.18, align 1
+  %arrayidx.19 = getelementptr inbounds i8, ptr %a, i64 19
+  %38 = load i8, ptr %arrayidx.19, align 1
+  %arrayidx2.19 = getelementptr inbounds i8, ptr %r, i64 19
+  %39 = load i8, ptr %arrayidx2.19, align 1
+  %add.19 = add i8 %39, %38
+  store i8 %add.19, ptr %arrayidx2.19, align 1
+  %arrayidx.20 = getelementptr inbounds i8, ptr %a, i64 20
+  %40 = load i8, ptr %arrayidx.20, align 1
+  %arrayidx2.20 = getelementptr inbounds i8, ptr %r, i64 20
+  %41 = load i8, ptr %arrayidx2.20, align 1
+  %add.20 = add i8 %41, %40
+  store i8 %add.20, ptr %arrayidx2.20, align 1
+  %arrayidx.21 = getelementptr inbounds i8, ptr %a, i64 21
+  %42 = load i8, ptr %arrayidx.21, align 1
+  %arrayidx2.21 = getelementptr inbounds i8, ptr %r, i64 21
+  %43 = load i8, ptr %arrayidx2.21, align 1
+  %add.21 = add i8 %43, %42
+  store i8 %add.21, ptr %arrayidx2.21, align 1
+  %arrayidx.22 = getelementptr inbounds i8, ptr %a, i64 22
+  %44 = load i8, ptr %arrayidx.22, align 1
+  %arrayidx2.22 = getelementptr inbounds i8, ptr %r, i64 22
+  %45 = load i8, ptr %arrayidx2.22, align 1
+  %add.22 = add i8 %45, %44
+  store i8 %add.22, ptr %arrayidx2.22, align 1
+  %arrayidx.23 = getelementptr inbounds i8, ptr %a, i64 23
+  %46 = load i8, ptr %arrayidx.23, align 1
+  %arrayidx2.23 = getelementptr inbounds i8, ptr %r, i64 23
+  %47 = load i8, ptr %arrayidx2.23, align 1
+  %add.23 = add i8 %47, %46
+  store i8 %add.23, ptr %arrayidx2.23, align 1
+  %arrayidx.24 = getelementptr inbounds i8, ptr %a, i64 24
+  %48 = load i8, ptr %arrayidx.24, align 1
+  %arrayidx2.24 = getelementptr inbounds i8, ptr %r, i64 24
+  %49 = load i8, ptr %arrayidx2.24, align 1
+  %add.24 = add i8 %49, %48
+  store i8 %add.24, ptr %arrayidx2.24, align 1
+  %arrayidx.25 = getelementptr inbounds i8, ptr %a, i64 25
+  %50 = load i8, ptr %arrayidx.25, align 1
+  %arrayidx2.25 = getelementptr inbounds i8, ptr %r, i64 25
+  %51 = load i8, ptr %arrayidx2.25, align 1
+  %add.25 = add i8 %51, %50
+  store i8 %add.25, ptr %arrayidx2.25, align 1
+  %arrayidx.26 = getelementptr inbounds i8, ptr %a, i64 26
+  %52 = load i8, ptr %arrayidx.26, align 1
+  %arrayidx2.26 = getelementptr inbounds i8, ptr %r, i64 26
+  %53 = load i8, ptr %arrayidx2.26, align 1
+  %add.26 = add i8 %53, %52
+  store i8 %add.26, ptr %arrayidx2.26, align 1
+  %arrayidx.27 = getelementptr inbounds i8, ptr %a, i64 27
+  %54 = load i8, ptr %arrayidx.27, align 1
+  %arrayidx2.27 = getelementptr inbounds i8, ptr %r, i64 27
+  %55 = load i8, ptr %arrayidx2.27, align 1
+  %add.27 = add i8 %55, %54
+  store i8 %add.27, ptr %arrayidx2.27, align 1
+  %arrayidx.28 = getelementptr inbounds i8, ptr %a, i64 28
+  %56 = load i8, ptr %arrayidx.28, align 1
+  %arrayidx2.28 = getelementptr inbounds i8, ptr %r, i64 28
+  %57 = load i8, ptr %arrayidx2.28, align 1
+  %add.28 = add i8 %57, %56
+  store i8 %add.28, ptr %arrayidx2.28, align 1
+  %arrayidx.29 = getelementptr inbounds i8, ptr %a, i64 29
+  %58 = load i8, ptr %arrayidx.29, align 1
+  %arrayidx2.29 = getelementptr inbounds i8, ptr %r, i64 29
+  %59 = load i8, ptr %arrayidx2.29, align 1
+  %add.29 = add i8 %59, %58
+  store i8 %add.29, ptr %arrayidx2.29, align 1
+  %arrayidx.30 = getelementptr inbounds i8, ptr %a, i64 30
+  %60 = load i8, ptr %arrayidx.30, align 1
+  %arrayidx2.30 = getelementptr inbounds i8, ptr %r, i64 30
+  %61 = load i8, ptr %arrayidx2.30, align 1
+  %add.30 = add i8 %61, %60
+  store i8 %add.30, ptr %arrayidx2.30, align 1
+  %arrayidx.31 = getelementptr inbounds i8, ptr %a, i64 31
+  %62 = load i8, ptr %arrayidx.31, align 1
+  %arrayidx2.31 = getelementptr inbounds i8, ptr %r, i64 31
+  %63 = load i8, ptr %arrayidx2.31, align 1
+  %add.31 = add i8 %63, %62
+  store i8 %add.31, ptr %arrayidx2.31, align 1
+  ret void
+}

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-load.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-load.ll
new file mode 100644
index 0000000000000..42463832c8831
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-load.ll
@@ -0,0 +1,574 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64    | FileCheck %s --check-prefix=SSE
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefix=SSE
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefix=AVX
+
+; // PR47491
+; void pr(char* r, char* a){
+;   for (int i = 0; i < 8; i++){
+;       r[i] *= a[i];
+;   }
+; }
+
+define void @mul4(ptr noalias nocapture noundef %r, ptr noalias nocapture noundef readonly %a) {
+; SSE-LABEL: @mul4(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1
+; SSE-NEXT:    [[TMP1:%.*]] = load i8, ptr [[R:%.*]], align 1
+; SSE-NEXT:    [[MUL:%.*]] = mul i8 [[TMP1]], [[TMP0]]
+; SSE-NEXT:    store i8 [[MUL]], ptr [[R]], align 1
+; SSE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
+; SSE-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 1
+; SSE-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1
+; SSE-NEXT:    [[MUL_1:%.*]] = mul i8 [[TMP3]], [[TMP2]]
+; SSE-NEXT:    store i8 [[MUL_1]], ptr [[ARRAYIDX2_1]], align 1
+; SSE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
+; SSE-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 2
+; SSE-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1
+; SSE-NEXT:    [[MUL_2:%.*]] = mul i8 [[TMP5]], [[TMP4]]
+; SSE-NEXT:    store i8 [[MUL_2]], ptr [[ARRAYIDX2_2]], align 1
+; SSE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 3
+; SSE-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 3
+; SSE-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1
+; SSE-NEXT:    [[MUL_3:%.*]] = mul i8 [[TMP7]], [[TMP6]]
+; SSE-NEXT:    store i8 [[MUL_3]], ptr [[ARRAYIDX2_3]], align 1
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @mul4(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1
+; AVX-NEXT:    [[TMP1:%.*]] = load i8, ptr [[R:%.*]], align 1
+; AVX-NEXT:    [[MUL:%.*]] = mul i8 [[TMP1]], [[TMP0]]
+; AVX-NEXT:    store i8 [[MUL]], ptr [[R]], align 1
+; AVX-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
+; AVX-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 1
+; AVX-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1
+; AVX-NEXT:    [[MUL_1:%.*]] = mul i8 [[TMP3]], [[TMP2]]
+; AVX-NEXT:    store i8 [[MUL_1]], ptr [[ARRAYIDX2_1]], align 1
+; AVX-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
+; AVX-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 2
+; AVX-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1
+; AVX-NEXT:    [[MUL_2:%.*]] = mul i8 [[TMP5]], [[TMP4]]
+; AVX-NEXT:    store i8 [[MUL_2]], ptr [[ARRAYIDX2_2]], align 1
+; AVX-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 3
+; AVX-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 3
+; AVX-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1
+; AVX-NEXT:    [[MUL_3:%.*]] = mul i8 [[TMP7]], [[TMP6]]
+; AVX-NEXT:    store i8 [[MUL_3]], ptr [[ARRAYIDX2_3]], align 1
+; AVX-NEXT:    ret void
+;
+entry:
+  %0 = load i8, ptr %a, align 1
+  %1 = load i8, ptr %r, align 1
+  %mul = mul i8 %1, %0
+  store i8 %mul, ptr %r, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 1
+  %2 = load i8, ptr %arrayidx.1, align 1
+  %arrayidx2.1 = getelementptr inbounds i8, ptr %r, i64 1
+  %3 = load i8, ptr %arrayidx2.1, align 1
+  %mul.1 = mul i8 %3, %2
+  store i8 %mul.1, ptr %arrayidx2.1, align 1
+  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 2
+  %4 = load i8, ptr %arrayidx.2, align 1
+  %arrayidx2.2 = getelementptr inbounds i8, ptr %r, i64 2
+  %5 = load i8, ptr %arrayidx2.2, align 1
+  %mul.2 = mul i8 %5, %4
+  store i8 %mul.2, ptr %arrayidx2.2, align 1
+  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 3
+  %6 = load i8, ptr %arrayidx.3, align 1
+  %arrayidx2.3 = getelementptr inbounds i8, ptr %r, i64 3
+  %7 = load i8, ptr %arrayidx2.3, align 1
+  %mul.3 = mul i8 %7, %6
+  store i8 %mul.3, ptr %arrayidx2.3, align 1
+  ret void
+}
+
+define void @mul8(ptr noalias nocapture noundef %r, ptr noalias nocapture noundef readonly %a) {
+; SSE-LABEL: @mul8(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1
+; SSE-NEXT:    [[TMP1:%.*]] = load i8, ptr [[R:%.*]], align 1
+; SSE-NEXT:    [[MUL:%.*]] = mul i8 [[TMP1]], [[TMP0]]
+; SSE-NEXT:    store i8 [[MUL]], ptr [[R]], align 1
+; SSE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
+; SSE-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 1
+; SSE-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1
+; SSE-NEXT:    [[MUL_1:%.*]] = mul i8 [[TMP3]], [[TMP2]]
+; SSE-NEXT:    store i8 [[MUL_1]], ptr [[ARRAYIDX2_1]], align 1
+; SSE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
+; SSE-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 2
+; SSE-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1
+; SSE-NEXT:    [[MUL_2:%.*]] = mul i8 [[TMP5]], [[TMP4]]
+; SSE-NEXT:    store i8 [[MUL_2]], ptr [[ARRAYIDX2_2]], align 1
+; SSE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 3
+; SSE-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 3
+; SSE-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1
+; SSE-NEXT:    [[MUL_3:%.*]] = mul i8 [[TMP7]], [[TMP6]]
+; SSE-NEXT:    store i8 [[MUL_3]], ptr [[ARRAYIDX2_3]], align 1
+; SSE-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4
+; SSE-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_4]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 4
+; SSE-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX2_4]], align 1
+; SSE-NEXT:    [[MUL_4:%.*]] = mul i8 [[TMP9]], [[TMP8]]
+; SSE-NEXT:    store i8 [[MUL_4]], ptr [[ARRAYIDX2_4]], align 1
+; SSE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 5
+; SSE-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_5]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 5
+; SSE-NEXT:    [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX2_5]], align 1
+; SSE-NEXT:    [[MUL_5:%.*]] = mul i8 [[TMP11]], [[TMP10]]
+; SSE-NEXT:    store i8 [[MUL_5]], ptr [[ARRAYIDX2_5]], align 1
+; SSE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 6
+; SSE-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_6]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 6
+; SSE-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX2_6]], align 1
+; SSE-NEXT:    [[MUL_6:%.*]] = mul i8 [[TMP13]], [[TMP12]]
+; SSE-NEXT:    store i8 [[MUL_6]], ptr [[ARRAYIDX2_6]], align 1
+; SSE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 7
+; SSE-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX_7]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 7
+; SSE-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX2_7]], align 1
+; SSE-NEXT:    [[MUL_7:%.*]] = mul i8 [[TMP15]], [[TMP14]]
+; SSE-NEXT:    store i8 [[MUL_7]], ptr [[ARRAYIDX2_7]], align 1
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @mul8(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1
+; AVX-NEXT:    [[TMP1:%.*]] = load i8, ptr [[R:%.*]], align 1
+; AVX-NEXT:    [[MUL:%.*]] = mul i8 [[TMP1]], [[TMP0]]
+; AVX-NEXT:    store i8 [[MUL]], ptr [[R]], align 1
+; AVX-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
+; AVX-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 1
+; AVX-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1
+; AVX-NEXT:    [[MUL_1:%.*]] = mul i8 [[TMP3]], [[TMP2]]
+; AVX-NEXT:    store i8 [[MUL_1]], ptr [[ARRAYIDX2_1]], align 1
+; AVX-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
+; AVX-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 2
+; AVX-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1
+; AVX-NEXT:    [[MUL_2:%.*]] = mul i8 [[TMP5]], [[TMP4]]
+; AVX-NEXT:    store i8 [[MUL_2]], ptr [[ARRAYIDX2_2]], align 1
+; AVX-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 3
+; AVX-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 3
+; AVX-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1
+; AVX-NEXT:    [[MUL_3:%.*]] = mul i8 [[TMP7]], [[TMP6]]
+; AVX-NEXT:    store i8 [[MUL_3]], ptr [[ARRAYIDX2_3]], align 1
+; AVX-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4
+; AVX-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_4]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 4
+; AVX-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX2_4]], align 1
+; AVX-NEXT:    [[MUL_4:%.*]] = mul i8 [[TMP9]], [[TMP8]]
+; AVX-NEXT:    store i8 [[MUL_4]], ptr [[ARRAYIDX2_4]], align 1
+; AVX-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 5
+; AVX-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_5]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 5
+; AVX-NEXT:    [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX2_5]], align 1
+; AVX-NEXT:    [[MUL_5:%.*]] = mul i8 [[TMP11]], [[TMP10]]
+; AVX-NEXT:    store i8 [[MUL_5]], ptr [[ARRAYIDX2_5]], align 1
+; AVX-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 6
+; AVX-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_6]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 6
+; AVX-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX2_6]], align 1
+; AVX-NEXT:    [[MUL_6:%.*]] = mul i8 [[TMP13]], [[TMP12]]
+; AVX-NEXT:    store i8 [[MUL_6]], ptr [[ARRAYIDX2_6]], align 1
+; AVX-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 7
+; AVX-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX_7]], align 1
+; AVX-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 7
+; AVX-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX2_7]], align 1
+; AVX-NEXT:    [[MUL_7:%.*]] = mul i8 [[TMP15]], [[TMP14]]
+; AVX-NEXT:    store i8 [[MUL_7]], ptr [[ARRAYIDX2_7]], align 1
+; AVX-NEXT:    ret void
+;
+entry:
+  %0 = load i8, ptr %a, align 1
+  %1 = load i8, ptr %r, align 1
+  %mul = mul i8 %1, %0
+  store i8 %mul, ptr %r, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 1
+  %2 = load i8, ptr %arrayidx.1, align 1
+  %arrayidx2.1 = getelementptr inbounds i8, ptr %r, i64 1
+  %3 = load i8, ptr %arrayidx2.1, align 1
+  %mul.1 = mul i8 %3, %2
+  store i8 %mul.1, ptr %arrayidx2.1, align 1
+  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 2
+  %4 = load i8, ptr %arrayidx.2, align 1
+  %arrayidx2.2 = getelementptr inbounds i8, ptr %r, i64 2
+  %5 = load i8, ptr %arrayidx2.2, align 1
+  %mul.2 = mul i8 %5, %4
+  store i8 %mul.2, ptr %arrayidx2.2, align 1
+  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 3
+  %6 = load i8, ptr %arrayidx.3, align 1
+  %arrayidx2.3 = getelementptr inbounds i8, ptr %r, i64 3
+  %7 = load i8, ptr %arrayidx2.3, align 1
+  %mul.3 = mul i8 %7, %6
+  store i8 %mul.3, ptr %arrayidx2.3, align 1
+  %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 4
+  %8 = load i8, ptr %arrayidx.4, align 1
+  %arrayidx2.4 = getelementptr inbounds i8, ptr %r, i64 4
+  %9 = load i8, ptr %arrayidx2.4, align 1
+  %mul.4 = mul i8 %9, %8
+  store i8 %mul.4, ptr %arrayidx2.4, align 1
+  %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 5
+  %10 = load i8, ptr %arrayidx.5, align 1
+  %arrayidx2.5 = getelementptr inbounds i8, ptr %r, i64 5
+  %11 = load i8, ptr %arrayidx2.5, align 1
+  %mul.5 = mul i8 %11, %10
+  store i8 %mul.5, ptr %arrayidx2.5, align 1
+  %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 6
+  %12 = load i8, ptr %arrayidx.6, align 1
+  %arrayidx2.6 = getelementptr inbounds i8, ptr %r, i64 6
+  %13 = load i8, ptr %arrayidx2.6, align 1
+  %mul.6 = mul i8 %13, %12
+  store i8 %mul.6, ptr %arrayidx2.6, align 1
+  %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 7
+  %14 = load i8, ptr %arrayidx.7, align 1
+  %arrayidx2.7 = getelementptr inbounds i8, ptr %r, i64 7
+  %15 = load i8, ptr %arrayidx2.7, align 1
+  %mul.7 = mul i8 %15, %14
+  store i8 %mul.7, ptr %arrayidx2.7, align 1
+  ret void
+}
+
+define void @mul16(ptr noalias nocapture noundef %r, ptr noalias nocapture noundef readonly %a) {
+; SSE-LABEL: @mul16(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[A:%.*]], align 1
+; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[R:%.*]], align 1
+; SSE-NEXT:    [[TMP2:%.*]] = mul <16 x i8> [[TMP1]], [[TMP0]]
+; SSE-NEXT:    store <16 x i8> [[TMP2]], ptr [[R]], align 1
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @mul16(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[A:%.*]], align 1
+; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[R:%.*]], align 1
+; AVX-NEXT:    [[TMP2:%.*]] = mul <16 x i8> [[TMP1]], [[TMP0]]
+; AVX-NEXT:    store <16 x i8> [[TMP2]], ptr [[R]], align 1
+; AVX-NEXT:    ret void
+;
+entry:
+  %0 = load i8, ptr %a, align 1
+  %1 = load i8, ptr %r, align 1
+  %mul = mul i8 %1, %0
+  store i8 %mul, ptr %r, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 1
+  %2 = load i8, ptr %arrayidx.1, align 1
+  %arrayidx2.1 = getelementptr inbounds i8, ptr %r, i64 1
+  %3 = load i8, ptr %arrayidx2.1, align 1
+  %mul.1 = mul i8 %3, %2
+  store i8 %mul.1, ptr %arrayidx2.1, align 1
+  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 2
+  %4 = load i8, ptr %arrayidx.2, align 1
+  %arrayidx2.2 = getelementptr inbounds i8, ptr %r, i64 2
+  %5 = load i8, ptr %arrayidx2.2, align 1
+  %mul.2 = mul i8 %5, %4
+  store i8 %mul.2, ptr %arrayidx2.2, align 1
+  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 3
+  %6 = load i8, ptr %arrayidx.3, align 1
+  %arrayidx2.3 = getelementptr inbounds i8, ptr %r, i64 3
+  %7 = load i8, ptr %arrayidx2.3, align 1
+  %mul.3 = mul i8 %7, %6
+  store i8 %mul.3, ptr %arrayidx2.3, align 1
+  %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 4
+  %8 = load i8, ptr %arrayidx.4, align 1
+  %arrayidx2.4 = getelementptr inbounds i8, ptr %r, i64 4
+  %9 = load i8, ptr %arrayidx2.4, align 1
+  %mul.4 = mul i8 %9, %8
+  store i8 %mul.4, ptr %arrayidx2.4, align 1
+  %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 5
+  %10 = load i8, ptr %arrayidx.5, align 1
+  %arrayidx2.5 = getelementptr inbounds i8, ptr %r, i64 5
+  %11 = load i8, ptr %arrayidx2.5, align 1
+  %mul.5 = mul i8 %11, %10
+  store i8 %mul.5, ptr %arrayidx2.5, align 1
+  %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 6
+  %12 = load i8, ptr %arrayidx.6, align 1
+  %arrayidx2.6 = getelementptr inbounds i8, ptr %r, i64 6
+  %13 = load i8, ptr %arrayidx2.6, align 1
+  %mul.6 = mul i8 %13, %12
+  store i8 %mul.6, ptr %arrayidx2.6, align 1
+  %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 7
+  %14 = load i8, ptr %arrayidx.7, align 1
+  %arrayidx2.7 = getelementptr inbounds i8, ptr %r, i64 7
+  %15 = load i8, ptr %arrayidx2.7, align 1
+  %mul.7 = mul i8 %15, %14
+  store i8 %mul.7, ptr %arrayidx2.7, align 1
+  %arrayidx.8 = getelementptr inbounds i8, ptr %a, i64 8
+  %16 = load i8, ptr %arrayidx.8, align 1
+  %arrayidx2.8 = getelementptr inbounds i8, ptr %r, i64 8
+  %17 = load i8, ptr %arrayidx2.8, align 1
+  %mul.8 = mul i8 %17, %16
+  store i8 %mul.8, ptr %arrayidx2.8, align 1
+  %arrayidx.9 = getelementptr inbounds i8, ptr %a, i64 9
+  %18 = load i8, ptr %arrayidx.9, align 1
+  %arrayidx2.9 = getelementptr inbounds i8, ptr %r, i64 9
+  %19 = load i8, ptr %arrayidx2.9, align 1
+  %mul.9 = mul i8 %19, %18
+  store i8 %mul.9, ptr %arrayidx2.9, align 1
+  %arrayidx.10 = getelementptr inbounds i8, ptr %a, i64 10
+  %20 = load i8, ptr %arrayidx.10, align 1
+  %arrayidx2.10 = getelementptr inbounds i8, ptr %r, i64 10
+  %21 = load i8, ptr %arrayidx2.10, align 1
+  %mul.10 = mul i8 %21, %20
+  store i8 %mul.10, ptr %arrayidx2.10, align 1
+  %arrayidx.11 = getelementptr inbounds i8, ptr %a, i64 11
+  %22 = load i8, ptr %arrayidx.11, align 1
+  %arrayidx2.11 = getelementptr inbounds i8, ptr %r, i64 11
+  %23 = load i8, ptr %arrayidx2.11, align 1
+  %mul.11 = mul i8 %23, %22
+  store i8 %mul.11, ptr %arrayidx2.11, align 1
+  %arrayidx.12 = getelementptr inbounds i8, ptr %a, i64 12
+  %24 = load i8, ptr %arrayidx.12, align 1
+  %arrayidx2.12 = getelementptr inbounds i8, ptr %r, i64 12
+  %25 = load i8, ptr %arrayidx2.12, align 1
+  %mul.12 = mul i8 %25, %24
+  store i8 %mul.12, ptr %arrayidx2.12, align 1
+  %arrayidx.13 = getelementptr inbounds i8, ptr %a, i64 13
+  %26 = load i8, ptr %arrayidx.13, align 1
+  %arrayidx2.13 = getelementptr inbounds i8, ptr %r, i64 13
+  %27 = load i8, ptr %arrayidx2.13, align 1
+  %mul.13 = mul i8 %27, %26
+  store i8 %mul.13, ptr %arrayidx2.13, align 1
+  %arrayidx.14 = getelementptr inbounds i8, ptr %a, i64 14
+  %28 = load i8, ptr %arrayidx.14, align 1
+  %arrayidx2.14 = getelementptr inbounds i8, ptr %r, i64 14
+  %29 = load i8, ptr %arrayidx2.14, align 1
+  %mul.14 = mul i8 %29, %28
+  store i8 %mul.14, ptr %arrayidx2.14, align 1
+  %arrayidx.15 = getelementptr inbounds i8, ptr %a, i64 15
+  %30 = load i8, ptr %arrayidx.15, align 1
+  %arrayidx2.15 = getelementptr inbounds i8, ptr %r, i64 15
+  %31 = load i8, ptr %arrayidx2.15, align 1
+  %mul.15 = mul i8 %31, %30
+  store i8 %mul.15, ptr %arrayidx2.15, align 1
+  ret void
+}
+
+define void @mul32(ptr noalias nocapture noundef %r, ptr noalias nocapture noundef readonly %a) {
+; SSE-LABEL: @mul32(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[A:%.*]], align 1
+; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[R:%.*]], align 1
+; SSE-NEXT:    [[TMP2:%.*]] = mul <16 x i8> [[TMP1]], [[TMP0]]
+; SSE-NEXT:    store <16 x i8> [[TMP2]], ptr [[R]], align 1
+; SSE-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16
+; SSE-NEXT:    [[ARRAYIDX2_16:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 16
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX_16]], align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2_16]], align 1
+; SSE-NEXT:    [[TMP5:%.*]] = mul <16 x i8> [[TMP4]], [[TMP3]]
+; SSE-NEXT:    store <16 x i8> [[TMP5]], ptr [[ARRAYIDX2_16]], align 1
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @mul32(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[A:%.*]], align 1
+; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr [[R:%.*]], align 1
+; AVX-NEXT:    [[TMP2:%.*]] = mul <32 x i8> [[TMP1]], [[TMP0]]
+; AVX-NEXT:    store <32 x i8> [[TMP2]], ptr [[R]], align 1
+; AVX-NEXT:    ret void
+;
+entry:
+  %0 = load i8, ptr %a, align 1
+  %1 = load i8, ptr %r, align 1
+  %mul = mul i8 %1, %0
+  store i8 %mul, ptr %r, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 1
+  %2 = load i8, ptr %arrayidx.1, align 1
+  %arrayidx2.1 = getelementptr inbounds i8, ptr %r, i64 1
+  %3 = load i8, ptr %arrayidx2.1, align 1
+  %mul.1 = mul i8 %3, %2
+  store i8 %mul.1, ptr %arrayidx2.1, align 1
+  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 2
+  %4 = load i8, ptr %arrayidx.2, align 1
+  %arrayidx2.2 = getelementptr inbounds i8, ptr %r, i64 2
+  %5 = load i8, ptr %arrayidx2.2, align 1
+  %mul.2 = mul i8 %5, %4
+  store i8 %mul.2, ptr %arrayidx2.2, align 1
+  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 3
+  %6 = load i8, ptr %arrayidx.3, align 1
+  %arrayidx2.3 = getelementptr inbounds i8, ptr %r, i64 3
+  %7 = load i8, ptr %arrayidx2.3, align 1
+  %mul.3 = mul i8 %7, %6
+  store i8 %mul.3, ptr %arrayidx2.3, align 1
+  %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 4
+  %8 = load i8, ptr %arrayidx.4, align 1
+  %arrayidx2.4 = getelementptr inbounds i8, ptr %r, i64 4
+  %9 = load i8, ptr %arrayidx2.4, align 1
+  %mul.4 = mul i8 %9, %8
+  store i8 %mul.4, ptr %arrayidx2.4, align 1
+  %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 5
+  %10 = load i8, ptr %arrayidx.5, align 1
+  %arrayidx2.5 = getelementptr inbounds i8, ptr %r, i64 5
+  %11 = load i8, ptr %arrayidx2.5, align 1
+  %mul.5 = mul i8 %11, %10
+  store i8 %mul.5, ptr %arrayidx2.5, align 1
+  %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 6
+  %12 = load i8, ptr %arrayidx.6, align 1
+  %arrayidx2.6 = getelementptr inbounds i8, ptr %r, i64 6
+  %13 = load i8, ptr %arrayidx2.6, align 1
+  %mul.6 = mul i8 %13, %12
+  store i8 %mul.6, ptr %arrayidx2.6, align 1
+  %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 7
+  %14 = load i8, ptr %arrayidx.7, align 1
+  %arrayidx2.7 = getelementptr inbounds i8, ptr %r, i64 7
+  %15 = load i8, ptr %arrayidx2.7, align 1
+  %mul.7 = mul i8 %15, %14
+  store i8 %mul.7, ptr %arrayidx2.7, align 1
+  %arrayidx.8 = getelementptr inbounds i8, ptr %a, i64 8
+  %16 = load i8, ptr %arrayidx.8, align 1
+  %arrayidx2.8 = getelementptr inbounds i8, ptr %r, i64 8
+  %17 = load i8, ptr %arrayidx2.8, align 1
+  %mul.8 = mul i8 %17, %16
+  store i8 %mul.8, ptr %arrayidx2.8, align 1
+  %arrayidx.9 = getelementptr inbounds i8, ptr %a, i64 9
+  %18 = load i8, ptr %arrayidx.9, align 1
+  %arrayidx2.9 = getelementptr inbounds i8, ptr %r, i64 9
+  %19 = load i8, ptr %arrayidx2.9, align 1
+  %mul.9 = mul i8 %19, %18
+  store i8 %mul.9, ptr %arrayidx2.9, align 1
+  %arrayidx.10 = getelementptr inbounds i8, ptr %a, i64 10
+  %20 = load i8, ptr %arrayidx.10, align 1
+  %arrayidx2.10 = getelementptr inbounds i8, ptr %r, i64 10
+  %21 = load i8, ptr %arrayidx2.10, align 1
+  %mul.10 = mul i8 %21, %20
+  store i8 %mul.10, ptr %arrayidx2.10, align 1
+  %arrayidx.11 = getelementptr inbounds i8, ptr %a, i64 11
+  %22 = load i8, ptr %arrayidx.11, align 1
+  %arrayidx2.11 = getelementptr inbounds i8, ptr %r, i64 11
+  %23 = load i8, ptr %arrayidx2.11, align 1
+  %mul.11 = mul i8 %23, %22
+  store i8 %mul.11, ptr %arrayidx2.11, align 1
+  %arrayidx.12 = getelementptr inbounds i8, ptr %a, i64 12
+  %24 = load i8, ptr %arrayidx.12, align 1
+  %arrayidx2.12 = getelementptr inbounds i8, ptr %r, i64 12
+  %25 = load i8, ptr %arrayidx2.12, align 1
+  %mul.12 = mul i8 %25, %24
+  store i8 %mul.12, ptr %arrayidx2.12, align 1
+  %arrayidx.13 = getelementptr inbounds i8, ptr %a, i64 13
+  %26 = load i8, ptr %arrayidx.13, align 1
+  %arrayidx2.13 = getelementptr inbounds i8, ptr %r, i64 13
+  %27 = load i8, ptr %arrayidx2.13, align 1
+  %mul.13 = mul i8 %27, %26
+  store i8 %mul.13, ptr %arrayidx2.13, align 1
+  %arrayidx.14 = getelementptr inbounds i8, ptr %a, i64 14
+  %28 = load i8, ptr %arrayidx.14, align 1
+  %arrayidx2.14 = getelementptr inbounds i8, ptr %r, i64 14
+  %29 = load i8, ptr %arrayidx2.14, align 1
+  %mul.14 = mul i8 %29, %28
+  store i8 %mul.14, ptr %arrayidx2.14, align 1
+  %arrayidx.15 = getelementptr inbounds i8, ptr %a, i64 15
+  %30 = load i8, ptr %arrayidx.15, align 1
+  %arrayidx2.15 = getelementptr inbounds i8, ptr %r, i64 15
+  %31 = load i8, ptr %arrayidx2.15, align 1
+  %mul.15 = mul i8 %31, %30
+  store i8 %mul.15, ptr %arrayidx2.15, align 1
+  %arrayidx.16 = getelementptr inbounds i8, ptr %a, i64 16
+  %32 = load i8, ptr %arrayidx.16, align 1
+  %arrayidx2.16 = getelementptr inbounds i8, ptr %r, i64 16
+  %33 = load i8, ptr %arrayidx2.16, align 1
+  %mul.16 = mul i8 %33, %32
+  store i8 %mul.16, ptr %arrayidx2.16, align 1
+  %arrayidx.17 = getelementptr inbounds i8, ptr %a, i64 17
+  %34 = load i8, ptr %arrayidx.17, align 1
+  %arrayidx2.17 = getelementptr inbounds i8, ptr %r, i64 17
+  %35 = load i8, ptr %arrayidx2.17, align 1
+  %mul.17 = mul i8 %35, %34
+  store i8 %mul.17, ptr %arrayidx2.17, align 1
+  %arrayidx.18 = getelementptr inbounds i8, ptr %a, i64 18
+  %36 = load i8, ptr %arrayidx.18, align 1
+  %arrayidx2.18 = getelementptr inbounds i8, ptr %r, i64 18
+  %37 = load i8, ptr %arrayidx2.18, align 1
+  %mul.18 = mul i8 %37, %36
+  store i8 %mul.18, ptr %arrayidx2.18, align 1
+  %arrayidx.19 = getelementptr inbounds i8, ptr %a, i64 19
+  %38 = load i8, ptr %arrayidx.19, align 1
+  %arrayidx2.19 = getelementptr inbounds i8, ptr %r, i64 19
+  %39 = load i8, ptr %arrayidx2.19, align 1
+  %mul.19 = mul i8 %39, %38
+  store i8 %mul.19, ptr %arrayidx2.19, align 1
+  %arrayidx.20 = getelementptr inbounds i8, ptr %a, i64 20
+  %40 = load i8, ptr %arrayidx.20, align 1
+  %arrayidx2.20 = getelementptr inbounds i8, ptr %r, i64 20
+  %41 = load i8, ptr %arrayidx2.20, align 1
+  %mul.20 = mul i8 %41, %40
+  store i8 %mul.20, ptr %arrayidx2.20, align 1
+  %arrayidx.21 = getelementptr inbounds i8, ptr %a, i64 21
+  %42 = load i8, ptr %arrayidx.21, align 1
+  %arrayidx2.21 = getelementptr inbounds i8, ptr %r, i64 21
+  %43 = load i8, ptr %arrayidx2.21, align 1
+  %mul.21 = mul i8 %43, %42
+  store i8 %mul.21, ptr %arrayidx2.21, align 1
+  %arrayidx.22 = getelementptr inbounds i8, ptr %a, i64 22
+  %44 = load i8, ptr %arrayidx.22, align 1
+  %arrayidx2.22 = getelementptr inbounds i8, ptr %r, i64 22
+  %45 = load i8, ptr %arrayidx2.22, align 1
+  %mul.22 = mul i8 %45, %44
+  store i8 %mul.22, ptr %arrayidx2.22, align 1
+  %arrayidx.23 = getelementptr inbounds i8, ptr %a, i64 23
+  %46 = load i8, ptr %arrayidx.23, align 1
+  %arrayidx2.23 = getelementptr inbounds i8, ptr %r, i64 23
+  %47 = load i8, ptr %arrayidx2.23, align 1
+  %mul.23 = mul i8 %47, %46
+  store i8 %mul.23, ptr %arrayidx2.23, align 1
+  %arrayidx.24 = getelementptr inbounds i8, ptr %a, i64 24
+  %48 = load i8, ptr %arrayidx.24, align 1
+  %arrayidx2.24 = getelementptr inbounds i8, ptr %r, i64 24
+  %49 = load i8, ptr %arrayidx2.24, align 1
+  %mul.24 = mul i8 %49, %48
+  store i8 %mul.24, ptr %arrayidx2.24, align 1
+  %arrayidx.25 = getelementptr inbounds i8, ptr %a, i64 25
+  %50 = load i8, ptr %arrayidx.25, align 1
+  %arrayidx2.25 = getelementptr inbounds i8, ptr %r, i64 25
+  %51 = load i8, ptr %arrayidx2.25, align 1
+  %mul.25 = mul i8 %51, %50
+  store i8 %mul.25, ptr %arrayidx2.25, align 1
+  %arrayidx.26 = getelementptr inbounds i8, ptr %a, i64 26
+  %52 = load i8, ptr %arrayidx.26, align 1
+  %arrayidx2.26 = getelementptr inbounds i8, ptr %r, i64 26
+  %53 = load i8, ptr %arrayidx2.26, align 1
+  %mul.26 = mul i8 %53, %52
+  store i8 %mul.26, ptr %arrayidx2.26, align 1
+  %arrayidx.27 = getelementptr inbounds i8, ptr %a, i64 27
+  %54 = load i8, ptr %arrayidx.27, align 1
+  %arrayidx2.27 = getelementptr inbounds i8, ptr %r, i64 27
+  %55 = load i8, ptr %arrayidx2.27, align 1
+  %mul.27 = mul i8 %55, %54
+  store i8 %mul.27, ptr %arrayidx2.27, align 1
+  %arrayidx.28 = getelementptr inbounds i8, ptr %a, i64 28
+  %56 = load i8, ptr %arrayidx.28, align 1
+  %arrayidx2.28 = getelementptr inbounds i8, ptr %r, i64 28
+  %57 = load i8, ptr %arrayidx2.28, align 1
+  %mul.28 = mul i8 %57, %56
+  store i8 %mul.28, ptr %arrayidx2.28, align 1
+  %arrayidx.29 = getelementptr inbounds i8, ptr %a, i64 29
+  %58 = load i8, ptr %arrayidx.29, align 1
+  %arrayidx2.29 = getelementptr inbounds i8, ptr %r, i64 29
+  %59 = load i8, ptr %arrayidx2.29, align 1
+  %mul.29 = mul i8 %59, %58
+  store i8 %mul.29, ptr %arrayidx2.29, align 1
+  %arrayidx.30 = getelementptr inbounds i8, ptr %a, i64 30
+  %60 = load i8, ptr %arrayidx.30, align 1
+  %arrayidx2.30 = getelementptr inbounds i8, ptr %r, i64 30
+  %61 = load i8, ptr %arrayidx2.30, align 1
+  %mul.30 = mul i8 %61, %60
+  store i8 %mul.30, ptr %arrayidx2.30, align 1
+  %arrayidx.31 = getelementptr inbounds i8, ptr %a, i64 31
+  %62 = load i8, ptr %arrayidx.31, align 1
+  %arrayidx2.31 = getelementptr inbounds i8, ptr %r, i64 31
+  %63 = load i8, ptr %arrayidx2.31, align 1
+  %mul.31 = mul i8 %63, %62
+  store i8 %mul.31, ptr %arrayidx2.31, align 1
+  ret void
+}

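The mul tests follow the same shape. A minimal sketch, again assumed rather
than current pass output, of the sub-128-bit form for the 8-element mul test
above:

    ; hypothetical vectorized IR, for illustration only
    define void @mul8(ptr noalias %r, ptr noalias %a) {
    entry:
      %0 = load <8 x i8>, ptr %a, align 1
      %1 = load <8 x i8>, ptr %r, align 1
      %2 = mul <8 x i8> %1, %0
      store <8 x i8> %2, ptr %r, align 1
      ret void
    }

Note that x86 has no direct byte-multiply instruction, so whether this form is
chosen depends on the cost model.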