[llvm] r374183 - [SLP] respect target register width for GEP vectorization (PR43578)

Wed Oct 9 09:32:49 PDT 2019

Author: spatel
Date: Wed Oct  9 09:32:49 2019
New Revision: 374183

URL: http://llvm.org/viewvc/llvm-project?rev=374183&view=rev
Log:
[SLP] respect target register width for GEP vectorization (PR43578)

We failed to account for the target register width (max vector factor)
when vectorizing starting from GEPs. This causes vectorization to
proceed to obviously illegal widths as in:
https://bugs.llvm.org/show_bug.cgi?id=43578

For x86, this also means that SLP can produce rogue AVX or AVX512
code even when the user specifies a narrower vector width.

The AArch64 test in ext-trunc.ll appears to be better using the
narrower width. I'm not exactly sure what getelementptr.ll is trying
to do, but it's testing with "-slp-threshold=-18", so I'm not worried
about those diffs. The x86 test is an over-reduction from SPEC h264;
this patch appears to restore the perf loss caused by SLP when using
-march=haswell.

Differential Revision: https://reviews.llvm.org/D68667

Modified:
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/trunk/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
    llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/load-merge.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=374183&r1=374182&r2=374183&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Wed Oct  9 09:32:49 2019
@@ -6981,10 +6981,16 @@ bool SLPVectorizerPass::vectorizeGEPIndi
     LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                       << Entry.second.size() << ".\n");
 
-    // We process the getelementptr list in chunks of 16 (like we do for
-    // stores) to minimize compile-time.
-    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
-      auto Len = std::min<unsigned>(BE - BI, 16);
+    // Process the GEP list in chunks suitable for the target's supported
+    // vector size. If a vector register can't hold 1 element, we are done.
+    unsigned MaxVecRegSize = R.getMaxVecRegSize();
+    unsigned EltSize = R.getVectorElementSize(Entry.second[0]);
+    if (MaxVecRegSize < EltSize)
+      continue;
+
+    unsigned MaxElts = MaxVecRegSize / EltSize;
+    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
+      auto Len = std::min<unsigned>(BE - BI, MaxElts);
       auto GEPList = makeArrayRef(&Entry.second[BI], Len);
 
       // Initialize a set a candidate getelementptrs. Note that we use a

Modified: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll?rev=374183&r1=374182&r2=374183&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll Wed Oct  9 09:32:49 2019
@@ -61,23 +61,25 @@ define void @test2(<4 x i16> %a, <4 x i1
 ; CHECK-NEXT:    [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
-; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[C0:%.*]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[C1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[C2:%.*]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[C3:%.*]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP0]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
-; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
+; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
+; CHECK-NEXT:    [[A0:%.*]] = add i64 [[S0]], [[C0:%.*]]
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[A0]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, i64* [[GEP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]]
+; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
+; CHECK-NEXT:    [[S1:%.*]] = sext i32 [[E1]] to i64
+; CHECK-NEXT:    [[A1:%.*]] = add i64 [[S1]], [[C1:%.*]]
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A1]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, i64* [[GEP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP8]]
+; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
+; CHECK-NEXT:    [[S2:%.*]] = sext i32 [[E2]] to i64
+; CHECK-NEXT:    [[A2:%.*]] = add i64 [[S2]], [[C2:%.*]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A2]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, i64* [[GEP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP9]]
+; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
+; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
+; CHECK-NEXT:    [[A3:%.*]] = add i64 [[S3]], [[C3:%.*]]
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A3]]
 ; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, i64* [[GEP3]]
 ; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
 ; CHECK-NEXT:    ret void

Modified: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll?rev=374183&r1=374182&r2=374183&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll Wed Oct  9 09:32:49 2019
@@ -23,10 +23,7 @@ target triple = "aarch64--linux-gnu"
 ; }
 ;
 
-; YAML:      --- !Passed
-; YAML-NEXT: Pass:            slp-vectorizer
-; YAML-NEXT: Name:            VectorizedList
-; YAML-NEXT: Function:        getelementptr_4x32
+; YAML-LABEL: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
 ; YAML-NEXT:   - Cost:            '11'
@@ -39,7 +36,7 @@ target triple = "aarch64--linux-gnu"
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '16'
+; YAML-NEXT:   - Cost:            '6'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'
 
@@ -49,49 +46,50 @@ define i32 @getelementptr_4x32(i32* noca
 ; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, i32 [[X:%.*]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[Y:%.*]], i32 2
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[Z:%.*]], i32 3
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 undef>, i32 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP21:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP22:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP21]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP22]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[TMP5]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[T4]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[T4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <2 x i32> [[TMP7]], [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[T6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1
 ; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[T8:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <2 x i32> [[TMP7]], [[TMP2]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4
 ; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
-; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP18]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[ADD11]], i32 1
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
-; CHECK-NEXT:    [[TMP21]] = add nsw <2 x i32> [[TMP19]], [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP22]], [[N]]
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[ADD11]], i32 1
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
+; CHECK-NEXT:    [[TMP22]] = add nsw <2 x i32> [[TMP20]], [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i32> [[TMP22]], i32 0
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP23]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
 entry:
@@ -133,10 +131,7 @@ for.body:
   br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
 }
 
-; YAML:      --- !Passed
-; YAML-NEXT: Pass:            slp-vectorizer
-; YAML-NEXT: Name:            VectorizedList
-; YAML-NEXT: Function:        getelementptr_2x32
+; YAML-LABEL: Function:        getelementptr_2x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
 ; YAML-NEXT:   - Cost:            '11'

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/load-merge.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/load-merge.ll?rev=374183&r1=374182&r2=374183&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/load-merge.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/load-merge.ll Wed Oct  9 09:32:49 2019
@@ -153,19 +153,24 @@ define void @PR43578_prefer128(i32* %r,
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 1
 ; CHECK-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 2
 ; CHECK-NEXT:    [[Q3:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[P0]] to <4 x i64>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* [[TMP1]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[Q0]] to <4 x i64>*
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP3]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = sub nsw <4 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
-; CHECK-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
-; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
-; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[P0]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[P2]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[Q0]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 2
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[Q2]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 2
+; CHECK-NEXT:    [[TMP9:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
+; CHECK-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
+; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0
+; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1
+; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP14]]
 ; CHECK-NEXT:    ret void
 ;
   %p0 = getelementptr inbounds i64, i64* %p, i64 0