[llvm] 6da3cfc - [Thumb2] Convert some tests to opaque pointers (NFC)

Sergei Barannikov via llvm-commits <llvm-commits at lists.llvm.org>
Sun Jan 29 19:34:04 PST 2023


Author: Sergei Barannikov
Date: 2023-01-30T06:32:32+03:00
New Revision: 6da3cfc357dc473f10169928437e113d34c7d283

URL: https://github.com/llvm/llvm-project/commit/6da3cfc357dc473f10169928437e113d34c7d283
DIFF: https://github.com/llvm/llvm-project/commit/6da3cfc357dc473f10169928437e113d34c7d283.diff

LOG: [Thumb2] Convert some tests to opaque pointers (NFC)
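
For readers outside the migration effort: under opaque pointers, every pointer type is written as plain "ptr" with no pointee type, so pointer-to-pointer bitcasts fold away and overloaded intrinsics are mangled by address space alone. A minimal before/after sketch of the pattern applied throughout this commit (the function name is hypothetical, not taken from the tests):

    ; Typed-pointer form, as the tests read before this commit:
    define void @store_vec(i32* %p) {
      %cast = bitcast i32* %p to <4 x i32>*
      store <4 x i32> zeroinitializer, <4 x i32>* %cast, align 4
      ret void
    }

    ; Opaque-pointer form: the bitcast disappears and the pointee
    ; type no longer appears anywhere in the pointer type:
    define void @store_vec(ptr %p) {
      store <4 x i32> zeroinitializer, ptr %p, align 4
      ret void
    }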

Added: 
    

Modified: 
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/begin-vpt-without-inst.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/v2i1-upgrade.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/begin-vpt-without-inst.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/begin-vpt-without-inst.mir
index f79169a94bc3b..372fc6108129c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/begin-vpt-without-inst.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/begin-vpt-without-inst.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -opaque-pointers=0 -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s
 
 --- |
   @arr = external dso_local local_unnamed_addr global [0 x i32], align 4
@@ -15,7 +15,7 @@
 
   vector.ph:                                        ; preds = %vector.ph.preheader, %vector.ph
     %i.addr.012 = phi i32 [ %math, %vector.ph ], [ %i, %vector.ph.preheader ]
-    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> <i32 2, i32 2, i32 2, i32 2>, <4 x i32>* bitcast ([0 x i32]* @arr to <4 x i32>*), i32 4, <4 x i1> %active.lane.mask)
+    call void @llvm.masked.store.v4i32.p0(<4 x i32> <i32 2, i32 2, i32 2, i32 2>, ptr @arr, i32 4, <4 x i1> %active.lane.mask)
     %0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %i.addr.012, i32 1)
     %math = extractvalue { i32, i1 } %0, 0
     %ov = extractvalue { i32, i1 } %0, 1
@@ -26,7 +26,7 @@
   }
 
   declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
-  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+  declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
   declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32)
 
 ...
@@ -75,7 +75,7 @@ body:             |
   ; CHECK:   liveins: $vpr, $q0, $r0, $r1
   ; CHECK:   renamable $r0, $cpsr = tADDi8 killed renamable $r0, 1, 14 /* CC::al */, $noreg
   ; CHECK:   MVE_VPST 8, implicit $vpr
-  ; CHECK:   MVE_VSTRWU32 renamable $q0, renamable $r1, 0, 1, renamable $vpr, $noreg :: (store (s128) into `<4 x i32>* bitcast ([0 x i32]* @arr to <4 x i32>*)`, align 4)
+  ; CHECK:   MVE_VSTRWU32 renamable $q0, renamable $r1, 0, 1, renamable $vpr, $noreg :: (store (s128) into @arr, align 4)
   ; CHECK:   tBcc %bb.2, 3 /* CC::lo */, killed $cpsr
   ; CHECK: bb.3.for.end5:
   ; CHECK:   tBX_RET 14 /* CC::al */, $noreg
@@ -105,7 +105,7 @@ body:             |
 
     renamable $r0, $cpsr = tADDi8 killed renamable $r0, 1, 14 /* CC::al */, $noreg
     MVE_VPST 8, implicit $vpr
-    MVE_VSTRWU32 renamable $q0, renamable $r1, 0, 1, renamable $vpr, $noreg :: (store (s128) into `<4 x i32>* bitcast ([0 x i32]* @arr to <4 x i32>*)`, align 4)
+    MVE_VSTRWU32 renamable $q0, renamable $r1, 0, 1, renamable $vpr, $noreg :: (store (s128) into @arr, align 4)
     tBcc %bb.2, 3 /* CC::lo */, killed $cpsr
 
   bb.3.for.end5:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
index 2eaba6d0dca9c..99d169e63e5a5 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -opaque-pointers=0 --arm-memtransfer-tploop=allow -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
+; RUN: llc --arm-memtransfer-tploop=allow -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
 
-define void @test_memcpy(i32* nocapture %x, i32* nocapture readonly %y, i32 %n, i32 %m) {
+define void @test_memcpy(ptr nocapture %x, ptr nocapture readonly %y, i32 %n, i32 %m) {
 ; CHECK-LABEL: test_memcpy:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
@@ -42,49 +42,46 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
 
 for.body:                                         ; preds = %entry, %for.body
   %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
-  %x.addr.010 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
-  %y.addr.09 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
-  %0 = bitcast i32* %x.addr.010 to i8*
-  %1 = bitcast i32* %y.addr.09 to i8*
-  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %m, i1 false)
-  %add.ptr = getelementptr inbounds i32, i32* %x.addr.010, i32 %m
-  %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.09, i32 %m
+  %x.addr.010 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
+  %y.addr.09 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
+  tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %x.addr.010, ptr align 4 %y.addr.09, i32 %m, i1 false)
+  %add.ptr = getelementptr inbounds i32, ptr %x.addr.010, i32 %m
+  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.09, i32 %m
   %inc = add nuw nsw i32 %i.011, 1
   %exitcond.not = icmp eq i32 %inc, %n
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
-define void @test_memset(i32* nocapture %x, i32 %n, i32 %m) {
+define void @test_memset(ptr nocapture %x, i32 %n, i32 %m) {
 ; CHECK-LABEL: test_memset:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    cmp r1, #1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r4, pc}
-; CHECK-NEXT:  .LBB1_1: @ %for.body.preheader
-; CHECK-NEXT:    lsl.w r12, r2, #2
+; CHECK-NEXT:    poplt {r7, pc}
+; CHECK-NEXT:  .LBB1_1:
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    b .LBB1_2
 ; CHECK-NEXT:  .LBB1_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB1_4 Depth 2
-; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r12, r0
 ; CHECK-NEXT:    wlstp.8 lr, r2, .LBB1_3
 ; CHECK-NEXT:    b .LBB1_4
 ; CHECK-NEXT:  .LBB1_3: @ %for.body
 ; CHECK-NEXT:    @ in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    add r0, r12
+; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
 ; CHECK-NEXT:    subs r1, #1
 ; CHECK-NEXT:    beq .LBB1_5
 ; CHECK-NEXT:    b .LBB1_2
 ; CHECK-NEXT:  .LBB1_4: @ Parent Loop BB1_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vstrb.8 q0, [r4], #16
+; CHECK-NEXT:    vstrb.8 q0, [r12], #16
 ; CHECK-NEXT:    letp lr, .LBB1_4
 ; CHECK-NEXT:    b .LBB1_3
 ; CHECK-NEXT:  .LBB1_5: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp5 = icmp sgt i32 %n, 0
   br i1 %cmp5, label %for.body, label %for.cond.cleanup
@@ -94,16 +91,15 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
 
 for.body:                                         ; preds = %entry, %for.body
   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
-  %x.addr.06 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
-  %0 = bitcast i32* %x.addr.06 to i8*
-  tail call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 0, i32 %m, i1 false)
-  %add.ptr = getelementptr inbounds i32, i32* %x.addr.06, i32 %m
+  %x.addr.06 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
+  tail call void @llvm.memset.p0.i32(ptr align 4 %x.addr.06, i8 0, i32 %m, i1 false)
+  %add.ptr = getelementptr inbounds i32, ptr %x.addr.06, i32 %m
   %inc = add nuw nsw i32 %i.07, 1
   %exitcond.not = icmp eq i32 %inc, %n
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
-define void @test_memmove(i32* nocapture %x, i32* nocapture readonly %y, i32 %n, i32 %m) {
+define void @test_memmove(ptr nocapture %x, ptr nocapture readonly %y, i32 %n, i32 %m) {
 ; CHECK-LABEL: test_memmove:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
@@ -140,20 +136,18 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
 
 for.body:                                         ; preds = %entry, %for.body
   %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
-  %x.addr.010 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
-  %y.addr.09 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
-  %0 = bitcast i32* %x.addr.010 to i8*
-  %1 = bitcast i32* %y.addr.09 to i8*
-  tail call void @llvm.memmove.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %m, i1 false)
-  %add.ptr = getelementptr inbounds i32, i32* %x.addr.010, i32 %m
-  %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.09, i32 %m
+  %x.addr.010 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
+  %y.addr.09 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
+  tail call void @llvm.memmove.p0.p0.i32(ptr align 4 %x.addr.010, ptr align 4 %y.addr.09, i32 %m, i1 false)
+  %add.ptr = getelementptr inbounds i32, ptr %x.addr.010, i32 %m
+  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.09, i32 %m
   %inc = add nuw nsw i32 %i.011, 1
   %exitcond.not = icmp eq i32 %inc, %n
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
 
-define void @test_memcpy16(i32* nocapture %x, i32* nocapture readonly %y, i32 %n) {
+define void @test_memcpy16(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
 ; CHECK-LABEL: test_memcpy16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
@@ -183,19 +177,17 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
 
 for.body:                                         ; preds = %entry, %for.body
   %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
-  %x.addr.08 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
-  %y.addr.07 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
-  %0 = bitcast i32* %x.addr.08 to i8*
-  %1 = bitcast i32* %y.addr.07 to i8*
-  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* nonnull align 4 dereferenceable(16) %0, i8* nonnull align 4 dereferenceable(16) %1, i32 16, i1 false)
-  %add.ptr = getelementptr inbounds i32, i32* %x.addr.08, i32 16
-  %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.07, i32 16
+  %x.addr.08 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
+  %y.addr.07 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
+  tail call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 4 dereferenceable(16) %x.addr.08, ptr nonnull align 4 dereferenceable(16) %y.addr.07, i32 16, i1 false)
+  %add.ptr = getelementptr inbounds i32, ptr %x.addr.08, i32 16
+  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.07, i32 16
   %inc = add nuw nsw i32 %i.09, 1
   %exitcond.not = icmp eq i32 %inc, %n
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
-define void @test_memset16(i32* nocapture %x, i32 %n) {
+define void @test_memset16(ptr nocapture %x, i32 %n) {
 ; CHECK-LABEL: test_memset16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
@@ -223,16 +215,15 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
 
 for.body:                                         ; preds = %entry, %for.body
   %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
-  %x.addr.05 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
-  %0 = bitcast i32* %x.addr.05 to i8*
-  tail call void @llvm.memset.p0i8.i32(i8* nonnull align 4 dereferenceable(16) %0, i8 0, i32 16, i1 false)
-  %add.ptr = getelementptr inbounds i32, i32* %x.addr.05, i32 16
+  %x.addr.05 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
+  tail call void @llvm.memset.p0.i32(ptr nonnull align 4 dereferenceable(16) %x.addr.05, i8 0, i32 16, i1 false)
+  %add.ptr = getelementptr inbounds i32, ptr %x.addr.05, i32 16
   %inc = add nuw nsw i32 %i.06, 1
   %exitcond.not = icmp eq i32 %inc, %n
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
-define void @test_memmove16(i32* nocapture %x, i32* nocapture readonly %y, i32 %n) {
+define void @test_memmove16(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
 ; CHECK-LABEL: test_memmove16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
@@ -262,19 +253,17 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
 
 for.body:                                         ; preds = %entry, %for.body
   %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
-  %x.addr.08 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
-  %y.addr.07 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
-  %0 = bitcast i32* %x.addr.08 to i8*
-  %1 = bitcast i32* %y.addr.07 to i8*
-  tail call void @llvm.memmove.p0i8.p0i8.i32(i8* nonnull align 4 dereferenceable(16) %0, i8* nonnull align 4 dereferenceable(16) %1, i32 16, i1 false)
-  %add.ptr = getelementptr inbounds i32, i32* %x.addr.08, i32 16
-  %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.07, i32 16
+  %x.addr.08 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
+  %y.addr.07 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
+  tail call void @llvm.memmove.p0.p0.i32(ptr nonnull align 4 dereferenceable(16) %x.addr.08, ptr nonnull align 4 dereferenceable(16) %y.addr.07, i32 16, i1 false)
+  %add.ptr = getelementptr inbounds i32, ptr %x.addr.08, i32 16
+  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.07, i32 16
   %inc = add nuw nsw i32 %i.09, 1
   %exitcond.not = icmp eq i32 %inc, %n
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
-define void @test_memset_preheader(i8* %x, i8* %y, i32 %n) {
+define void @test_memset_preheader(ptr %x, ptr %y, i32 %n) {
 ; CHECK-LABEL: test_memset_preheader:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
@@ -308,28 +297,28 @@ entry:
   br i1 %cmp6, label %prehead, label %for.cond.cleanup
 
 prehead:
-  call void @llvm.memset.p0i8.i32(i8* %x, i8 0, i32 %n, i1 false)
+  call void @llvm.memset.p0.i32(ptr %x, i8 0, i32 %n, i1 false)
   br label %for.body
 
 for.body:                                         ; preds = %entry, %for.body
   %i.09 = phi i32 [ %inc, %for.body ], [ 0, %prehead ]
-  %x.addr.08 = phi i8* [ %add.ptr, %for.body ], [ %x, %prehead ]
-  %y.addr.07 = phi i8* [ %add.ptr1, %for.body ], [ %y, %prehead ]
-  %add.ptr = getelementptr inbounds i8, i8* %x.addr.08, i32 1
-  %add.ptr1 = getelementptr inbounds i8, i8* %y.addr.07, i32 1
-  %l = load i8, i8* %x.addr.08
-  store i8 %l, i8* %y.addr.07
+  %x.addr.08 = phi ptr [ %add.ptr, %for.body ], [ %x, %prehead ]
+  %y.addr.07 = phi ptr [ %add.ptr1, %for.body ], [ %y, %prehead ]
+  %add.ptr = getelementptr inbounds i8, ptr %x.addr.08, i32 1
+  %add.ptr1 = getelementptr inbounds i8, ptr %y.addr.07, i32 1
+  %l = load i8, ptr %x.addr.08
+  store i8 %l, ptr %y.addr.07
   %inc = add nuw nsw i32 %i.09, 1
   %exitcond.not = icmp eq i32 %inc, %n
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 
 for.cond.cleanup:                                 ; preds = %for.body, %entry
-  call void @llvm.memset.p0i8.i32(i8* %x, i8 0, i32 %n, i1 false)
+  call void @llvm.memset.p0.i32(ptr %x, i8 0, i32 %n, i1 false)
   ret void
 }
 
 
 
-declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg)
-declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)
-declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i1 immarg)
+declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg)
+declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg)
+declare void @llvm.memmove.p0.p0.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1 immarg)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
index a687eac32dfce..9ef5a46edf934 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -opaque-pointers=0 -mtriple=thumbv8.1m.main -mattr=+mve --verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve --verify-machineinstrs %s -o - | FileCheck %s
 
 %struct.arm_2d_size_t = type { i16, i16 }
-define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(i16* noalias nocapture %phwTargetBase, i16 signext %iTargetStride, %struct.arm_2d_size_t* noalias nocapture readonly %ptCopySize, i16 zeroext %hwColour, i32 %chRatio) {
+define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture %phwTargetBase, i16 signext %iTargetStride, ptr noalias nocapture readonly %ptCopySize, i16 zeroext %hwColour, i32 %chRatio) {
 ; CHECK-LABEL: __arm_2d_impl_rgb16_colour_filling_with_alpha:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
@@ -37,12 +37,11 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(i16* noalias nocaptur
 ; CHECK-NEXT:    subs r3, #8
 ; CHECK-NEXT:    movs r4, #1
 ; CHECK-NEXT:    vdup.16 q0, r5
-; CHECK-NEXT:    lsls r1, r1, #1
+; CHECK-NEXT:    vdup.16 q5, r6
 ; CHECK-NEXT:    add.w r3, r4, r3, lsr #3
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.i16 q0, #0xf800
 ; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    vdup.16 q5, r6
 ; CHECK-NEXT:    vmov.i16 q7, #0x78
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
@@ -94,7 +93,7 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(i16* noalias nocaptur
 ; CHECK-NEXT:  @ %bb.4: @ %for.cond3.for.cond.cleanup7_crit_edge.us
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    adds r4, #1
-; CHECK-NEXT:    add r0, r1
+; CHECK-NEXT:    add.w r0, r0, r1, lsl #1
 ; CHECK-NEXT:    cmp r4, r12
 ; CHECK-NEXT:    bne .LBB0_2
 ; CHECK-NEXT:  .LBB0_5: @ %for.cond.cleanup
@@ -103,8 +102,8 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(i16* noalias nocaptur
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
-  %iHeight = getelementptr inbounds %struct.arm_2d_size_t, %struct.arm_2d_size_t* %ptCopySize, i32 0, i32 1
-  %0 = load i16, i16* %iHeight, align 2
+  %iHeight = getelementptr inbounds %struct.arm_2d_size_t, ptr %ptCopySize, i32 0, i32 1
+  %0 = load i16, ptr %iHeight, align 2
   %conv1 = sext i16 %0 to i32
   %and.i = shl i16 %hwColour, 3
   %shl.i = and i16 %and.i, 248
@@ -119,8 +118,7 @@ entry:
   br i1 %cmp61, label %for.cond3.preheader.lr.ph, label %for.cond.cleanup
 
 for.cond3.preheader.lr.ph:                        ; preds = %entry
-  %iWidth = getelementptr inbounds %struct.arm_2d_size_t, %struct.arm_2d_size_t* %ptCopySize, i32 0, i32 0
-  %6 = load i16, i16* %iWidth, align 2
+  %6 = load i16, ptr %ptCopySize, align 2
   %conv4 = sext i16 %6 to i32
   %cmp558 = icmp sgt i16 %6, 0
   br i1 %cmp558, label %for.cond3.preheader.us.preheader, label %for.cond.cleanup
@@ -142,42 +140,40 @@ for.cond3.preheader.us.preheader:                 ; preds = %for.cond3.preheader
   br label %vector.ph
 
 vector.ph:                                        ; preds = %for.cond3.for.cond.cleanup7_crit_edge.us, %for.cond3.preheader.us.preheader
-  %phwTargetBase.addr.063.us = phi i16* [ %add.ptr.us, %for.cond3.for.cond.cleanup7_crit_edge.us ], [ %phwTargetBase, %for.cond3.preheader.us.preheader ]
+  %phwTargetBase.addr.063.us = phi ptr [ %add.ptr.us, %for.cond3.for.cond.cleanup7_crit_edge.us ], [ %phwTargetBase, %for.cond3.preheader.us.preheader ]
   %y.062.us = phi i32 [ %inc32.us, %for.cond3.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond3.preheader.us.preheader ]
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %next.gep = getelementptr i16, i16* %phwTargetBase.addr.063.us, i32 %index
+  %next.gep = getelementptr i16, ptr %phwTargetBase.addr.063.us, i32 %index
   %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %conv4)
-  %7 = bitcast i16* %next.gep to <8 x i16>*
-  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %7, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
-  %8 = shl <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-  %9 = and <8 x i16> %8, <i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248>
-  %10 = lshr <8 x i16> %wide.masked.load, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
-  %11 = and <8 x i16> %10, <i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120>
-  %12 = lshr <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-  %13 = and <8 x i16> %12, <i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252>
-  %14 = mul <8 x i16> %9, %broadcast.splat76
-  %15 = add <8 x i16> %14, %broadcast.splat78
-  %16 = lshr <8 x i16> %15, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
-  %17 = mul <8 x i16> %13, %broadcast.splat76
-  %18 = add <8 x i16> %17, %broadcast.splat80
-  %19 = mul <8 x i16> %11, %broadcast.splat76
-  %20 = add <8 x i16> %19, %broadcast.splat82
-  %21 = lshr <8 x i16> %18, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
-  %22 = and <8 x i16> %21, <i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016>
-  %23 = or <8 x i16> %22, %16
-  %24 = and <8 x i16> %20, <i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048>
-  %25 = or <8 x i16> %23, %24
-  %26 = bitcast i16* %next.gep to <8 x i16>*
-  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %25, <8 x i16>* %26, i32 2, <8 x i1> %active.lane.mask)
+  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
+  %7 = shl <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %8 = and <8 x i16> %7, <i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248>
+  %9 = lshr <8 x i16> %wide.masked.load, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
+  %10 = and <8 x i16> %9, <i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120>
+  %11 = lshr <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %12 = and <8 x i16> %11, <i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252>
+  %13 = mul <8 x i16> %8, %broadcast.splat76
+  %14 = add <8 x i16> %13, %broadcast.splat78
+  %15 = lshr <8 x i16> %14, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
+  %16 = mul <8 x i16> %12, %broadcast.splat76
+  %17 = add <8 x i16> %16, %broadcast.splat80
+  %18 = mul <8 x i16> %10, %broadcast.splat76
+  %19 = add <8 x i16> %18, %broadcast.splat82
+  %20 = lshr <8 x i16> %17, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+  %21 = and <8 x i16> %20, <i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016>
+  %22 = or <8 x i16> %21, %15
+  %23 = and <8 x i16> %19, <i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048>
+  %24 = or <8 x i16> %22, %23
+  call void @llvm.masked.store.v8i16.p0(<8 x i16> %24, ptr %next.gep, i32 2, <8 x i1> %active.lane.mask)
   %index.next = add i32 %index, 8
-  %27 = icmp eq i32 %index.next, %n.vec
-  br i1 %27, label %for.cond3.for.cond.cleanup7_crit_edge.us, label %vector.body
+  %25 = icmp eq i32 %index.next, %n.vec
+  br i1 %25, label %for.cond3.for.cond.cleanup7_crit_edge.us, label %vector.body
 
 for.cond3.for.cond.cleanup7_crit_edge.us:         ; preds = %vector.body
-  %add.ptr.us = getelementptr inbounds i16, i16* %phwTargetBase.addr.063.us, i32 %conv30
+  %add.ptr.us = getelementptr inbounds i16, ptr %phwTargetBase.addr.063.us, i32 %conv30
   %inc32.us = add nuw nsw i32 %y.062.us, 1
   %exitcond66.not = icmp eq i32 %inc32.us, %conv1
   br i1 %exitcond66.not, label %for.cond.cleanup, label %vector.ph
@@ -185,7 +181,7 @@ for.cond3.for.cond.cleanup7_crit_edge.us:         ; preds = %vector.body
 for.cond.cleanup:                                 ; preds = %for.cond3.for.cond.cleanup7_crit_edge.us, %for.cond3.preheader.lr.ph, %entry
   ret void
 }
-define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(i16* noalias nocapture %phwTargetBase, i16 signext %iTargetStride, %struct.arm_2d_size_t* noalias nocapture readonly %ptCopySize, i16 zeroext %hwColour, i32 %chRatio) "target-cpu"="cortex-m55" {
+define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias nocapture %phwTargetBase, i16 signext %iTargetStride, ptr noalias nocapture readonly %ptCopySize, i16 zeroext %hwColour, i32 %chRatio) "target-cpu"="cortex-m55" {
 ; CHECK-LABEL: __arm_2d_impl_rgb16_colour_filling_with_alpha_sched:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
@@ -219,7 +215,6 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(i16* noalias no
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vdup.16 q0, r5
 ; CHECK-NEXT:    rsb.w r3, r7, #256
-; CHECK-NEXT:    lsls r7, r1, #1
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vdup.16 q0, r6
 ; CHECK-NEXT:    vmov.i16 q2, #0xf8
@@ -266,8 +261,8 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(i16* noalias no
 ; CHECK-NEXT:    letp lr, .LBB1_4
 ; CHECK-NEXT:  @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us
 ; CHECK-NEXT:    @ in Loop: Header=BB1_3 Depth=1
+; CHECK-NEXT:    add.w r0, r0, r1, lsl #1
 ; CHECK-NEXT:    adds r4, #1
-; CHECK-NEXT:    add r0, r7
 ; CHECK-NEXT:    cmp r4, r12
 ; CHECK-NEXT:    bne .LBB1_3
 ; CHECK-NEXT:  .LBB1_6: @ %for.cond.cleanup
@@ -276,8 +271,8 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(i16* noalias no
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
-  %iHeight = getelementptr inbounds %struct.arm_2d_size_t, %struct.arm_2d_size_t* %ptCopySize, i32 0, i32 1
-  %0 = load i16, i16* %iHeight, align 2
+  %iHeight = getelementptr inbounds %struct.arm_2d_size_t, ptr %ptCopySize, i32 0, i32 1
+  %0 = load i16, ptr %iHeight, align 2
   %conv1 = sext i16 %0 to i32
   %and.i = shl i16 %hwColour, 3
   %shl.i = and i16 %and.i, 248
@@ -292,8 +287,7 @@ entry:
   br i1 %cmp61, label %for.cond3.preheader.lr.ph, label %for.cond.cleanup
 
 for.cond3.preheader.lr.ph:                        ; preds = %entry
-  %iWidth = getelementptr inbounds %struct.arm_2d_size_t, %struct.arm_2d_size_t* %ptCopySize, i32 0, i32 0
-  %6 = load i16, i16* %iWidth, align 2
+  %6 = load i16, ptr %ptCopySize, align 2
   %conv4 = sext i16 %6 to i32
   %cmp558 = icmp sgt i16 %6, 0
   br i1 %cmp558, label %for.cond3.preheader.us.preheader, label %for.cond.cleanup
@@ -315,42 +309,40 @@ for.cond3.preheader.us.preheader:                 ; preds = %for.cond3.preheader
   br label %vector.ph
 
 vector.ph:                                        ; preds = %for.cond3.for.cond.cleanup7_crit_edge.us, %for.cond3.preheader.us.preheader
-  %phwTargetBase.addr.063.us = phi i16* [ %add.ptr.us, %for.cond3.for.cond.cleanup7_crit_edge.us ], [ %phwTargetBase, %for.cond3.preheader.us.preheader ]
+  %phwTargetBase.addr.063.us = phi ptr [ %add.ptr.us, %for.cond3.for.cond.cleanup7_crit_edge.us ], [ %phwTargetBase, %for.cond3.preheader.us.preheader ]
   %y.062.us = phi i32 [ %inc32.us, %for.cond3.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond3.preheader.us.preheader ]
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %next.gep = getelementptr i16, i16* %phwTargetBase.addr.063.us, i32 %index
+  %next.gep = getelementptr i16, ptr %phwTargetBase.addr.063.us, i32 %index
   %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %conv4)
-  %7 = bitcast i16* %next.gep to <8 x i16>*
-  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %7, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
-  %8 = shl <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-  %9 = and <8 x i16> %8, <i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248>
-  %10 = lshr <8 x i16> %wide.masked.load, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
-  %11 = and <8 x i16> %10, <i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120>
-  %12 = lshr <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-  %13 = and <8 x i16> %12, <i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252>
-  %14 = mul <8 x i16> %9, %broadcast.splat76
-  %15 = add <8 x i16> %14, %broadcast.splat78
-  %16 = lshr <8 x i16> %15, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
-  %17 = mul <8 x i16> %13, %broadcast.splat76
-  %18 = add <8 x i16> %17, %broadcast.splat80
-  %19 = mul <8 x i16> %11, %broadcast.splat76
-  %20 = add <8 x i16> %19, %broadcast.splat82
-  %21 = lshr <8 x i16> %18, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
-  %22 = and <8 x i16> %21, <i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016>
-  %23 = or <8 x i16> %22, %16
-  %24 = and <8 x i16> %20, <i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048>
-  %25 = or <8 x i16> %23, %24
-  %26 = bitcast i16* %next.gep to <8 x i16>*
-  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %25, <8 x i16>* %26, i32 2, <8 x i1> %active.lane.mask)
+  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
+  %7 = shl <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %8 = and <8 x i16> %7, <i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248>
+  %9 = lshr <8 x i16> %wide.masked.load, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
+  %10 = and <8 x i16> %9, <i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120>
+  %11 = lshr <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %12 = and <8 x i16> %11, <i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252>
+  %13 = mul <8 x i16> %8, %broadcast.splat76
+  %14 = add <8 x i16> %13, %broadcast.splat78
+  %15 = lshr <8 x i16> %14, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
+  %16 = mul <8 x i16> %12, %broadcast.splat76
+  %17 = add <8 x i16> %16, %broadcast.splat80
+  %18 = mul <8 x i16> %10, %broadcast.splat76
+  %19 = add <8 x i16> %18, %broadcast.splat82
+  %20 = lshr <8 x i16> %17, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+  %21 = and <8 x i16> %20, <i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016>
+  %22 = or <8 x i16> %21, %15
+  %23 = and <8 x i16> %19, <i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048>
+  %24 = or <8 x i16> %22, %23
+  call void @llvm.masked.store.v8i16.p0(<8 x i16> %24, ptr %next.gep, i32 2, <8 x i1> %active.lane.mask)
   %index.next = add i32 %index, 8
-  %27 = icmp eq i32 %index.next, %n.vec
-  br i1 %27, label %for.cond3.for.cond.cleanup7_crit_edge.us, label %vector.body
+  %25 = icmp eq i32 %index.next, %n.vec
+  br i1 %25, label %for.cond3.for.cond.cleanup7_crit_edge.us, label %vector.body
 
 for.cond3.for.cond.cleanup7_crit_edge.us:         ; preds = %vector.body
-  %add.ptr.us = getelementptr inbounds i16, i16* %phwTargetBase.addr.063.us, i32 %conv30
+  %add.ptr.us = getelementptr inbounds i16, ptr %phwTargetBase.addr.063.us, i32 %conv30
   %inc32.us = add nuw nsw i32 %y.062.us, 1
   %exitcond66.not = icmp eq i32 %inc32.us, %conv1
   br i1 %exitcond66.not, label %for.cond.cleanup, label %vector.ph
@@ -360,5 +352,5 @@ for.cond.cleanup:                                 ; preds = %for.cond3.for.cond.
 }
 
 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
-declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) #2
-declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) #3
+declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>) #2
+declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>) #3

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
index c626b3b4d12fe..fa6a66b95f654 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
@@ -1,4 +1,4 @@
-; RUN: opt -opaque-pointers=0 -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s
+; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s
 
 ; CHECK-LABEL: reduction_i32
 ; CHECK: phi i32 [ 0, %vector.ph ]
@@ -7,9 +7,9 @@
 ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ]
 ; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]])
 ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
-; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
-; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
-define i16 @reduction_i32(i16* nocapture readonly %A, i16* nocapture readonly %B, i32 %N) {
+; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp2, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
+; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp5, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
+define i16 @reduction_i32(ptr nocapture readonly %A, ptr nocapture readonly %B, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
@@ -28,13 +28,11 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ]
   %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp8, %vector.body ]
   %3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ]
-  %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
+  %tmp2 = getelementptr inbounds i16, ptr %A, i32 %index
   %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-  %tmp4 = bitcast i16* %tmp2 to <8 x i16>*
-  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
-  %tmp5 = getelementptr inbounds i16, i16* %B, i32 %index
-  %tmp6 = bitcast i16* %tmp5 to <8 x i16>*
-  %wide.masked.load3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
+  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp2, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
+  %tmp5 = getelementptr inbounds i16, ptr %B, i32 %index
+  %wide.masked.load3 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp5, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
   %tmp7 = add <8 x i16> %wide.masked.load, %vec.phi
   %tmp8 = add <8 x i16> %tmp7, %wide.masked.load3
   %index.next = add i32 %index, 8
@@ -69,8 +67,8 @@ for.cond.cleanup:
 ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ]
 ; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]])
 ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
-; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
-define i16 @reduction_i32_with_scalar(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
+; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp2, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
+define i16 @reduction_i32_with_scalar(ptr nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
@@ -91,10 +89,9 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ]
   %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp6, %vector.body ]
   %3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ]
-  %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
+  %tmp2 = getelementptr inbounds i16, ptr %A, i32 %index
   %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-  %tmp4 = bitcast i16* %tmp2 to <8 x i16>*
-  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
+  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp2, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
   %tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4
   %tmp6 = add <8 x i16> %tmp5, %wide.masked.load
   %index.next = add nuw nsw i32 %index, 8
@@ -130,7 +127,7 @@ for.cond.cleanup:
 ; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32
 ; CHECK:     ret
 ;
-define i16 @reduction_not_guarded(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
+define i16 @reduction_not_guarded(ptr nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
 entry:
   %tmp = add i32 %N, -1
   %n.rnd.up = add nuw nsw i32 %tmp, 8
@@ -147,10 +144,9 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %index = phi i32 [ 0, %entry], [ %index.next, %vector.body ]
   %vec.phi = phi <8 x i16> [ zeroinitializer, %entry], [ %tmp6, %vector.body ]
   %3 = phi i32 [ %start, %entry ], [ %4, %vector.body ]
-  %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
+  %tmp2 = getelementptr inbounds i16, ptr %A, i32 %index
   %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-  %tmp4 = bitcast i16* %tmp2 to <8 x i16>*
-  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
+  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp2, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
   %tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4
   %tmp6 = add <8 x i16> %tmp5, %wide.masked.load
   %index.next = add nuw nsw i32 %index, 8
@@ -175,7 +171,7 @@ middle.block:                                     ; preds = %vector.body
 ; CHECK:       @llvm.arm.mve.vctp
 ; CHECK-NOT:   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask
 ;
-define dso_local void @Correlation(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 {
+define dso_local void @Correlation(ptr nocapture readonly %Input, ptr nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 {
 entry:
   %conv = sext i16 %N to i32
   %cmp36 = icmp sgt i16 %N, 0
@@ -189,7 +185,7 @@ for.body.lr.ph:
 
 for.body:
   %lsr.iv51 = phi i32 [ %lsr.iv.next, %for.end ], [ %0, %for.body.lr.ph ]
-  %lsr.iv46 = phi i16* [ %scevgep47, %for.end ], [ %Input, %for.body.lr.ph ]
+  %lsr.iv46 = phi ptr [ %scevgep47, %for.end ], [ %Input, %for.body.lr.ph ]
   %i.037 = phi i32 [ 0, %for.body.lr.ph ], [ %inc16, %for.end ]
   %1 = mul nsw i32 %i.037, -1
   %2 = add i32 %0, %1
@@ -207,17 +203,15 @@ vector.ph:                                        ; preds = %for.body
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
-  %lsr.iv48 = phi i16* [ %scevgep49, %vector.body ], [ %lsr.iv46, %vector.ph ]
-  %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ]
+  %lsr.iv48 = phi ptr [ %scevgep49, %vector.body ], [ %lsr.iv46, %vector.ph ]
+  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %Input, %vector.ph ]
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %16, %vector.body ]
   %9 = phi i32 [ %start, %vector.ph ], [ %17, %vector.body ]
-  %lsr.iv4850 = bitcast i16* %lsr.iv48 to <4 x i16>*
-  %lsr.iv45 = bitcast i16* %lsr.iv to <4 x i16>*
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %8)
-  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv45, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef)
+  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef)
   %10 = sext <4 x i16> %wide.masked.load to <4 x i32>
-  %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv4850, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef)
+  %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv48, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef)
   %11 = sext <4 x i16> %wide.masked.load42 to <4 x i32>
   %12 = mul nsw <4 x i32> %11, %10
   %13 = insertelement <4 x i32> undef, i32 %conv1032, i32 0
@@ -225,8 +219,8 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %15 = ashr <4 x i32> %12, %14
   %16 = add <4 x i32> %15, %vec.phi
   %index.next = add i32 %index, 4
-  %scevgep = getelementptr i16, i16* %lsr.iv, i32 4
-  %scevgep49 = getelementptr i16, i16* %lsr.iv48, i32 4
+  %scevgep = getelementptr i16, ptr %lsr.iv, i32 4
+  %scevgep49 = getelementptr i16, ptr %lsr.iv48, i32 4
   %17 = call i32 @llvm.loop.decrement.reg.i32(i32 %9, i32 1)
   %18 = icmp ne i32 %17, 0
   br i1 %18, label %vector.body, label %middle.block
@@ -240,10 +234,10 @@ for.end:                                          ; preds = %middle.block, %for.
   %Sum.0.lcssa = phi i32 [ 0, %for.body ], [ %20, %middle.block ]
   %21 = lshr i32 %Sum.0.lcssa, 16
   %conv13 = trunc i32 %21 to i16
-  %arrayidx14 = getelementptr inbounds i16, i16* %Output, i32 %i.037
-  store i16 %conv13, i16* %arrayidx14, align 2
+  %arrayidx14 = getelementptr inbounds i16, ptr %Output, i32 %i.037
+  store i16 %conv13, ptr %arrayidx14, align 2
   %inc16 = add nuw nsw i32 %i.037, 1
-  %scevgep47 = getelementptr i16, i16* %lsr.iv46, i32 1
+  %scevgep47 = getelementptr i16, ptr %lsr.iv46, i32 1
   %lsr.iv.next = add i32 %lsr.iv51, -1
   %exitcond39 = icmp eq i32 %inc16, %conv
   br i1 %exitcond39, label %for.end17, label %for.body
@@ -252,11 +246,11 @@ for.end17:                                        ; preds = %for.end, %entry
   ret void
 }
 
-declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)
 declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
-declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32 immarg, <4 x i1>, <4 x i16>)

diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/v2i1-upgrade.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/v2i1-upgrade.ll
index a5d4830f5e62f..75595e6e5876b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/v2i1-upgrade.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/v2i1-upgrade.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -opaque-pointers=0 -S -o - %s | FileCheck %s
+; RUN: opt -S -o - %s | FileCheck %s
 
 declare <4 x i1> @llvm.arm.mve.vctp64(i32)
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
@@ -9,10 +9,10 @@ declare <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32>, <
 
 declare <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <4 x i1>)
 declare { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <4 x i1>)
-declare <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64*, <2 x i64>, i32, i32, i32, <4 x i1>)
+declare <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0.v2i64.v4i1(ptr, <2 x i64>, i32, i32, i32, <4 x i1>)
 declare void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <2 x i64>, <4 x i1>)
 declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <2 x i64>, <4 x i1>)
-declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64*, <2 x i64>, <2 x i64>, i32, i32, <4 x i1>)
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v2i64.v2i64.v4i1(ptr, <2 x i64>, <2 x i64>, i32, i32, <4 x i1>)
 
 declare <2 x i64> @llvm.arm.cde.vcx1q.predicated.v2i64.v4i1(i32 immarg, <2 x i64>, i32 immarg, <4 x i1>)
 declare <2 x i64> @llvm.arm.cde.vcx1qa.predicated.v2i64.v4i1(i32 immarg, <2 x i64>, i32 immarg, <4 x i1>)
@@ -112,45 +112,43 @@ entry:
   ret <2 x i64> %2
 }
 
-define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_s64(<2 x i64>* %addr, i16 zeroext %p) {
+define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_s64(ptr %addr, i16 zeroext %p) {
 ; CHECK-LABEL: @test_vldrdq_gather_base_wb_z_s64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[ADDR:%.*]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP2]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP3]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> [[TMP0]], i32 664, <2 x i1> [[TMP4]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP5]], 1
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* [[ADDR]], align 8
+; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr [[ADDR]], align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP5]], 0
 ; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
 ;
 entry:
-  %0 = load <2 x i64>, <2 x i64>* %addr, align 8
+  %0 = load <2 x i64>, ptr %addr, align 8
   %1 = zext i16 %p to i32
   %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
   %3 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 664, <4 x i1> %2)
   %4 = extractvalue { <2 x i64>, <2 x i64> } %3, 1
-  store <2 x i64> %4, <2 x i64>* %addr, align 8
+  store <2 x i64> %4, ptr %addr, align 8
   %5 = extractvalue { <2 x i64>, <2 x i64> } %3, 0
   ret <2 x i64> %5
 }
 
-define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_z_s64(i64* %base, <2 x i64> %offset, i16 zeroext %p) {
+define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_z_s64(ptr %base, <2 x i64> %offset, i16 zeroext %p) {
 ; CHECK-LABEL: @test_vldrdq_gather_offset_z_s64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP2]])
-; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v2i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 0, <2 x i1> [[TMP3]])
-; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0.v2i64.v4i1(ptr [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 0, <4 x i1> [[TMP1]])
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
 ;
 entry:
   %0 = zext i16 %p to i32
   %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 0, <4 x i1> %1)
+  %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0.v2i64.v4i1(ptr %base, <2 x i64> %offset, i32 64, i32 0, i32 0, <4 x i1> %1)
   ret <2 x i64> %2
 }
 
@@ -171,41 +169,39 @@ entry:
   ret void
 }
 
-define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_p_s64(<2 x i64>* %addr, <2 x i64> %value, i16 zeroext %p) {
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_p_s64(ptr %addr, <2 x i64> %value, i16 zeroext %p) {
 ; CHECK-LABEL: @test_vstrdq_scatter_base_wb_p_s64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[ADDR:%.*]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP2]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP3]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> [[TMP0]], i32 248, <2 x i64> [[VALUE:%.*]], <2 x i1> [[TMP4]])
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], <2 x i64>* [[ADDR]], align 8
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[ADDR]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %0 = load <2 x i64>, <2 x i64>* %addr, align 8
+  %0 = load <2 x i64>, ptr %addr, align 8
   %1 = zext i16 %p to i32
   %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
   %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 248, <2 x i64> %value, <4 x i1> %2)
-  store <2 x i64> %3, <2 x i64>* %addr, align 8
+  store <2 x i64> %3, ptr %addr, align 8
   ret void
 }
 
-define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_p_s64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_p_s64(ptr %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
 ; CHECK-LABEL: @test_vstrdq_scatter_offset_p_s64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP2]])
-; CHECK-NEXT:    call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v2i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0, <2 x i1> [[TMP3]])
+; CHECK-NEXT:    call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v2i64.v2i64.v4i1(ptr [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0, <4 x i1> [[TMP1]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %0 = zext i16 %p to i32
   %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <4 x i1> %1)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v2i64.v2i64.v4i1(ptr %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <4 x i1> %1)
   ret void
 }
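
Two rewrite rules account for nearly every hunk above. First, pointer bitcasts vanish, including constant-expression casts of globals: "bitcast ([0 x i32]* @arr to <4 x i32>*)" simply becomes "ptr @arr". Second, the overloaded-intrinsic name suffix keeps only the address space of each pointer parameter, dropping the pointee. Restating the masked-store pair from begin-vpt-without-inst.mir to make the mangling rule explicit:

    ; typed-pointer mangling: a pointer argument contributes "p0" plus its pointee type
    declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
    ; opaque-pointer mangling: only the address space "p0" remains
    declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)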