[llvm] [RISCV] Match prefetch address with offset (PR #66072)

Wang Pengcheng via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 18 04:36:56 PDT 2023


https://github.com/wangpc-pp updated https://github.com/llvm/llvm-project/pull/66072

>From d06e9024c650a2c8636934be72cd8e50096f5ba7 Mon Sep 17 00:00:00 2001
From: wangpc <wangpengcheng.pp at bytedance.com>
Date: Tue, 12 Sep 2023 19:17:49 +0800
Subject: [PATCH 1/3] [RISCV] Add prefetch test with offsetable addresses

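These tests capture the current codegen, where each constant offset is
materialized with a separate addi and the prefetch uses a zero offset.
A later patch should be able to fold the offset into the prefetch itself.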
---
 llvm/test/CodeGen/RISCV/prefetch.ll | 1151 +++++++++++++++++++++++++++
 1 file changed, 1151 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/prefetch.ll b/llvm/test/CodeGen/RISCV/prefetch.ll
index 39732636d298b0f..655f84f1bdfe0a4 100644
--- a/llvm/test/CodeGen/RISCV/prefetch.ll
+++ b/llvm/test/CodeGen/RISCV/prefetch.ll
@@ -40,6 +40,104 @@ define void @test_prefetch_read_locality_0(ptr %a) nounwind {
   ret void
 }
 
+define void @test_prefetch_offsetable_read_locality_0(ptr %a) nounwind {
+; RV32I-LABEL: test_prefetch_offsetable_read_locality_0:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_prefetch_offsetable_read_locality_0:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
+;
+; RV32ZICBOP-LABEL: test_prefetch_offsetable_read_locality_0:
+; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    addi a1, a0, -2048
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a2, a0, 2016
+; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, 32
+; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, -32
+; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV32ZICBOP-NEXT:    addi a1, a1, -1
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 2047
+; RV32ZICBOP-NEXT:    addi a1, a1, 1
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 16
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a0, a0, -16
+; RV32ZICBOP-NEXT:    prefetch.r 0(a0)
+; RV32ZICBOP-NEXT:    ret
+;
+; RV64ZICBOP-LABEL: test_prefetch_offsetable_read_locality_0:
+; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    addi a1, a0, -2048
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a2, a0, 2016
+; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, 32
+; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, -32
+; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOP-NEXT:    addi a1, a1, -1
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 2047
+; RV64ZICBOP-NEXT:    addi a1, a1, 1
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 16
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a0, a0, -16
+; RV64ZICBOP-NEXT:    prefetch.r 0(a0)
+; RV64ZICBOP-NEXT:    ret
+;
+; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_read_locality_0:
+; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, 1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a0, a0, -16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    ret
+  %valid_addr0 = getelementptr i8, ptr %a, i64 -2048
+  call void @llvm.prefetch(ptr %valid_addr0, i32 0, i32 0, i32 1)
+  %valid_addr1 = getelementptr i8, ptr %a, i64 2016
+  call void @llvm.prefetch(ptr %valid_addr1, i32 0, i32 0, i32 1)
+  %valid_addr2 = getelementptr i8, ptr %a, i64 32
+  call void @llvm.prefetch(ptr %valid_addr2, i32 0, i32 0, i32 1)
+  %valid_addr3 = getelementptr i8, ptr %a, i64 -32
+  call void @llvm.prefetch(ptr %valid_addr3, i32 0, i32 0, i32 1)
+  %invalid_addr0 = getelementptr i8, ptr %a, i64 -2049
+  call void @llvm.prefetch(ptr %invalid_addr0, i32 0, i32 0, i32 1)
+  %invalid_addr1 = getelementptr i8, ptr %a, i64 2048
+  call void @llvm.prefetch(ptr %invalid_addr1, i32 0, i32 0, i32 1)
+  %invalid_addr2 = getelementptr i8, ptr %a, i64 16
+  call void @llvm.prefetch(ptr %invalid_addr2, i32 0, i32 0, i32 1)
+  %invalid_addr3 = getelementptr i8, ptr %a, i64 -16
+  call void @llvm.prefetch(ptr %invalid_addr3, i32 0, i32 0, i32 1)
+  ret void
+}
+
 define void @test_prefetch_write_locality_0(ptr %a) nounwind {
 ; RV32I-LABEL: test_prefetch_write_locality_0:
 ; RV32I:       # %bb.0:
@@ -68,6 +166,104 @@ define void @test_prefetch_write_locality_0(ptr %a) nounwind {
   ret void
 }
 
+define void @test_prefetch_offsetable_write_locality_0(ptr %a) nounwind {
+; RV32I-LABEL: test_prefetch_offsetable_write_locality_0:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_prefetch_offsetable_write_locality_0:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
+;
+; RV32ZICBOP-LABEL: test_prefetch_offsetable_write_locality_0:
+; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    addi a1, a0, -2048
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a2, a0, 2016
+; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, 32
+; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, -32
+; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV32ZICBOP-NEXT:    addi a1, a1, -1
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 2047
+; RV32ZICBOP-NEXT:    addi a1, a1, 1
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 16
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a0, a0, -16
+; RV32ZICBOP-NEXT:    prefetch.w 0(a0)
+; RV32ZICBOP-NEXT:    ret
+;
+; RV64ZICBOP-LABEL: test_prefetch_offsetable_write_locality_0:
+; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    addi a1, a0, -2048
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a2, a0, 2016
+; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, 32
+; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, -32
+; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOP-NEXT:    addi a1, a1, -1
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 2047
+; RV64ZICBOP-NEXT:    addi a1, a1, 1
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 16
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a0, a0, -16
+; RV64ZICBOP-NEXT:    prefetch.w 0(a0)
+; RV64ZICBOP-NEXT:    ret
+;
+; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_write_locality_0:
+; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, 1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a0, a0, -16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    ret
+  %valid_addr0 = getelementptr i8, ptr %a, i64 -2048
+  call void @llvm.prefetch(ptr %valid_addr0, i32 1, i32 0, i32 1)
+  %valid_addr1 = getelementptr i8, ptr %a, i64 2016
+  call void @llvm.prefetch(ptr %valid_addr1, i32 1, i32 0, i32 1)
+  %valid_addr2 = getelementptr i8, ptr %a, i64 32
+  call void @llvm.prefetch(ptr %valid_addr2, i32 1, i32 0, i32 1)
+  %valid_addr3 = getelementptr i8, ptr %a, i64 -32
+  call void @llvm.prefetch(ptr %valid_addr3, i32 1, i32 0, i32 1)
+  %invalid_addr0 = getelementptr i8, ptr %a, i64 -2049
+  call void @llvm.prefetch(ptr %invalid_addr0, i32 1, i32 0, i32 1)
+  %invalid_addr1 = getelementptr i8, ptr %a, i64 2048
+  call void @llvm.prefetch(ptr %invalid_addr1, i32 1, i32 0, i32 1)
+  %invalid_addr2 = getelementptr i8, ptr %a, i64 16
+  call void @llvm.prefetch(ptr %invalid_addr2, i32 1, i32 0, i32 1)
+  %invalid_addr3 = getelementptr i8, ptr %a, i64 -16
+  call void @llvm.prefetch(ptr %invalid_addr3, i32 1, i32 0, i32 1)
+  ret void
+}
+
 define void @test_prefetch_instruction_locality_0(ptr %a) nounwind {
 ; RV32I-LABEL: test_prefetch_instruction_locality_0:
 ; RV32I:       # %bb.0:
@@ -96,6 +292,104 @@ define void @test_prefetch_instruction_locality_0(ptr %a) nounwind {
   ret void
 }
 
+define void @test_prefetch_offsetable_instruction_locality_0(ptr %a) nounwind {
+; RV32I-LABEL: test_prefetch_offsetable_instruction_locality_0:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_prefetch_offsetable_instruction_locality_0:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
+;
+; RV32ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_0:
+; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    addi a1, a0, -2048
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a2, a0, 2016
+; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, 32
+; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, -32
+; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV32ZICBOP-NEXT:    addi a1, a1, -1
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 2047
+; RV32ZICBOP-NEXT:    addi a1, a1, 1
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 16
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a0, a0, -16
+; RV32ZICBOP-NEXT:    prefetch.i 0(a0)
+; RV32ZICBOP-NEXT:    ret
+;
+; RV64ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_0:
+; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    addi a1, a0, -2048
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a2, a0, 2016
+; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, 32
+; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, -32
+; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOP-NEXT:    addi a1, a1, -1
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 2047
+; RV64ZICBOP-NEXT:    addi a1, a1, 1
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 16
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a0, a0, -16
+; RV64ZICBOP-NEXT:    prefetch.i 0(a0)
+; RV64ZICBOP-NEXT:    ret
+;
+; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_instruction_locality_0:
+; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, 1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a0, a0, -16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    ret
+  %valid_addr0 = getelementptr i8, ptr %a, i64 -2048
+  call void @llvm.prefetch(ptr %valid_addr0, i32 0, i32 0, i32 0)
+  %valid_addr1 = getelementptr i8, ptr %a, i64 2016
+  call void @llvm.prefetch(ptr %valid_addr1, i32 0, i32 0, i32 0)
+  %valid_addr2 = getelementptr i8, ptr %a, i64 32
+  call void @llvm.prefetch(ptr %valid_addr2, i32 0, i32 0, i32 0)
+  %valid_addr3 = getelementptr i8, ptr %a, i64 -32
+  call void @llvm.prefetch(ptr %valid_addr3, i32 0, i32 0, i32 0)
+  %invalid_addr0 = getelementptr i8, ptr %a, i64 -2049
+  call void @llvm.prefetch(ptr %invalid_addr0, i32 0, i32 0, i32 0)
+  %invalid_addr1 = getelementptr i8, ptr %a, i64 2048
+  call void @llvm.prefetch(ptr %invalid_addr1, i32 0, i32 0, i32 0)
+  %invalid_addr2 = getelementptr i8, ptr %a, i64 16
+  call void @llvm.prefetch(ptr %invalid_addr2, i32 0, i32 0, i32 0)
+  %invalid_addr3 = getelementptr i8, ptr %a, i64 -16
+  call void @llvm.prefetch(ptr %invalid_addr3, i32 0, i32 0, i32 0)
+  ret void
+}
+
 define void @test_prefetch_read_locality_1(ptr %a) nounwind {
 ; RV32I-LABEL: test_prefetch_read_locality_1:
 ; RV32I:       # %bb.0:
@@ -124,6 +418,104 @@ define void @test_prefetch_read_locality_1(ptr %a) nounwind {
   ret void
 }
 
+define void @test_prefetch_offsetable_read_locality_1(ptr %a) nounwind {
+; RV32I-LABEL: test_prefetch_offsetable_read_locality_1:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_prefetch_offsetable_read_locality_1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
+;
+; RV32ZICBOP-LABEL: test_prefetch_offsetable_read_locality_1:
+; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    addi a1, a0, -2048
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a2, a0, 2016
+; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, 32
+; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, -32
+; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV32ZICBOP-NEXT:    addi a1, a1, -1
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 2047
+; RV32ZICBOP-NEXT:    addi a1, a1, 1
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 16
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a0, a0, -16
+; RV32ZICBOP-NEXT:    prefetch.r 0(a0)
+; RV32ZICBOP-NEXT:    ret
+;
+; RV64ZICBOP-LABEL: test_prefetch_offsetable_read_locality_1:
+; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    addi a1, a0, -2048
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a2, a0, 2016
+; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, 32
+; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, -32
+; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOP-NEXT:    addi a1, a1, -1
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 2047
+; RV64ZICBOP-NEXT:    addi a1, a1, 1
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 16
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a0, a0, -16
+; RV64ZICBOP-NEXT:    prefetch.r 0(a0)
+; RV64ZICBOP-NEXT:    ret
+;
+; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_read_locality_1:
+; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, 1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a0, a0, -16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    ret
+  %valid_addr0 = getelementptr i8, ptr %a, i64 -2048
+  call void @llvm.prefetch(ptr %valid_addr0, i32 0, i32 1, i32 1)
+  %valid_addr1 = getelementptr i8, ptr %a, i64 2016
+  call void @llvm.prefetch(ptr %valid_addr1, i32 0, i32 1, i32 1)
+  %valid_addr2 = getelementptr i8, ptr %a, i64 32
+  call void @llvm.prefetch(ptr %valid_addr2, i32 0, i32 1, i32 1)
+  %valid_addr3 = getelementptr i8, ptr %a, i64 -32
+  call void @llvm.prefetch(ptr %valid_addr3, i32 0, i32 1, i32 1)
+  %invalid_addr0 = getelementptr i8, ptr %a, i64 -2049
+  call void @llvm.prefetch(ptr %invalid_addr0, i32 0, i32 1, i32 1)
+  %invalid_addr1 = getelementptr i8, ptr %a, i64 2048
+  call void @llvm.prefetch(ptr %invalid_addr1, i32 0, i32 1, i32 1)
+  %invalid_addr2 = getelementptr i8, ptr %a, i64 16
+  call void @llvm.prefetch(ptr %invalid_addr2, i32 0, i32 1, i32 1)
+  %invalid_addr3 = getelementptr i8, ptr %a, i64 -16
+  call void @llvm.prefetch(ptr %invalid_addr3, i32 0, i32 1, i32 1)
+  ret void
+}
+
 define void @test_prefetch_write_locality_1(ptr %a) nounwind {
 ; RV32I-LABEL: test_prefetch_write_locality_1:
 ; RV32I:       # %bb.0:
@@ -152,6 +544,104 @@ define void @test_prefetch_write_locality_1(ptr %a) nounwind {
   ret void
 }
 
+define void @test_prefetch_offsetable_write_locality_1(ptr %a) nounwind {
+; RV32I-LABEL: test_prefetch_offsetable_write_locality_1:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_prefetch_offsetable_write_locality_1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
+;
+; RV32ZICBOP-LABEL: test_prefetch_offsetable_write_locality_1:
+; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    addi a1, a0, -2048
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a2, a0, 2016
+; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, 32
+; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, -32
+; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV32ZICBOP-NEXT:    addi a1, a1, -1
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 2047
+; RV32ZICBOP-NEXT:    addi a1, a1, 1
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 16
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a0, a0, -16
+; RV32ZICBOP-NEXT:    prefetch.w 0(a0)
+; RV32ZICBOP-NEXT:    ret
+;
+; RV64ZICBOP-LABEL: test_prefetch_offsetable_write_locality_1:
+; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    addi a1, a0, -2048
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a2, a0, 2016
+; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, 32
+; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, -32
+; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOP-NEXT:    addi a1, a1, -1
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 2047
+; RV64ZICBOP-NEXT:    addi a1, a1, 1
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 16
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a0, a0, -16
+; RV64ZICBOP-NEXT:    prefetch.w 0(a0)
+; RV64ZICBOP-NEXT:    ret
+;
+; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_write_locality_1:
+; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, 1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a0, a0, -16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    ret
+  %valid_addr0 = getelementptr i8, ptr %a, i64 -2048
+  call void @llvm.prefetch(ptr %valid_addr0, i32 1, i32 1, i32 1)
+  %valid_addr1 = getelementptr i8, ptr %a, i64 2016
+  call void @llvm.prefetch(ptr %valid_addr1, i32 1, i32 1, i32 1)
+  %valid_addr2 = getelementptr i8, ptr %a, i64 32
+  call void @llvm.prefetch(ptr %valid_addr2, i32 1, i32 1, i32 1)
+  %valid_addr3 = getelementptr i8, ptr %a, i64 -32
+  call void @llvm.prefetch(ptr %valid_addr3, i32 1, i32 1, i32 1)
+  %invalid_addr0 = getelementptr i8, ptr %a, i64 -2049
+  call void @llvm.prefetch(ptr %invalid_addr0, i32 1, i32 1, i32 1)
+  %invalid_addr1 = getelementptr i8, ptr %a, i64 2048
+  call void @llvm.prefetch(ptr %invalid_addr1, i32 1, i32 1, i32 1)
+  %invalid_addr2 = getelementptr i8, ptr %a, i64 16
+  call void @llvm.prefetch(ptr %invalid_addr2, i32 1, i32 1, i32 1)
+  %invalid_addr3 = getelementptr i8, ptr %a, i64 -16
+  call void @llvm.prefetch(ptr %invalid_addr3, i32 1, i32 1, i32 1)
+  ret void
+}
+
 define void @test_prefetch_instruction_locality_1(ptr %a) nounwind {
 ; RV32I-LABEL: test_prefetch_instruction_locality_1:
 ; RV32I:       # %bb.0:
@@ -180,6 +670,104 @@ define void @test_prefetch_instruction_locality_1(ptr %a) nounwind {
   ret void
 }
 
+define void @test_prefetch_offsetable_instruction_locality_1(ptr %a) nounwind {
+; RV32I-LABEL: test_prefetch_offsetable_instruction_locality_1:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_prefetch_offsetable_instruction_locality_1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
+;
+; RV32ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_1:
+; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    addi a1, a0, -2048
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a2, a0, 2016
+; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, 32
+; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, -32
+; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV32ZICBOP-NEXT:    addi a1, a1, -1
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 2047
+; RV32ZICBOP-NEXT:    addi a1, a1, 1
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 16
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a0, a0, -16
+; RV32ZICBOP-NEXT:    prefetch.i 0(a0)
+; RV32ZICBOP-NEXT:    ret
+;
+; RV64ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_1:
+; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    addi a1, a0, -2048
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a2, a0, 2016
+; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, 32
+; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, -32
+; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOP-NEXT:    addi a1, a1, -1
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 2047
+; RV64ZICBOP-NEXT:    addi a1, a1, 1
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 16
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a0, a0, -16
+; RV64ZICBOP-NEXT:    prefetch.i 0(a0)
+; RV64ZICBOP-NEXT:    ret
+;
+; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_instruction_locality_1:
+; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, 1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a0, a0, -16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    ret
+  %valid_addr0 = getelementptr i8, ptr %a, i64 -2048
+  call void @llvm.prefetch(ptr %valid_addr0, i32 0, i32 1, i32 0)
+  %valid_addr1 = getelementptr i8, ptr %a, i64 2016
+  call void @llvm.prefetch(ptr %valid_addr1, i32 0, i32 1, i32 0)
+  %valid_addr2 = getelementptr i8, ptr %a, i64 32
+  call void @llvm.prefetch(ptr %valid_addr2, i32 0, i32 1, i32 0)
+  %valid_addr3 = getelementptr i8, ptr %a, i64 -32
+  call void @llvm.prefetch(ptr %valid_addr3, i32 0, i32 1, i32 0)
+  %invalid_addr0 = getelementptr i8, ptr %a, i64 -2049
+  call void @llvm.prefetch(ptr %invalid_addr0, i32 0, i32 1, i32 0)
+  %invalid_addr1 = getelementptr i8, ptr %a, i64 2048
+  call void @llvm.prefetch(ptr %invalid_addr1, i32 0, i32 1, i32 0)
+  %invalid_addr2 = getelementptr i8, ptr %a, i64 16
+  call void @llvm.prefetch(ptr %invalid_addr2, i32 0, i32 1, i32 0)
+  %invalid_addr3 = getelementptr i8, ptr %a, i64 -16
+  call void @llvm.prefetch(ptr %invalid_addr3, i32 0, i32 1, i32 0)
+  ret void
+}
+
 define void @test_prefetch_read_locality_2(ptr %a) nounwind {
 ; RV32I-LABEL: test_prefetch_read_locality_2:
 ; RV32I:       # %bb.0:
@@ -208,6 +796,104 @@ define void @test_prefetch_read_locality_2(ptr %a) nounwind {
   ret void
 }
 
+define void @test_prefetch_offsetable_read_locality_2(ptr %a) nounwind {
+; RV32I-LABEL: test_prefetch_offsetable_read_locality_2:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_prefetch_offsetable_read_locality_2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
+;
+; RV32ZICBOP-LABEL: test_prefetch_offsetable_read_locality_2:
+; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    addi a1, a0, -2048
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a2, a0, 2016
+; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, 32
+; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, -32
+; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV32ZICBOP-NEXT:    addi a1, a1, -1
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 2047
+; RV32ZICBOP-NEXT:    addi a1, a1, 1
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 16
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a0, a0, -16
+; RV32ZICBOP-NEXT:    prefetch.r 0(a0)
+; RV32ZICBOP-NEXT:    ret
+;
+; RV64ZICBOP-LABEL: test_prefetch_offsetable_read_locality_2:
+; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    addi a1, a0, -2048
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a2, a0, 2016
+; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, 32
+; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, -32
+; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOP-NEXT:    addi a1, a1, -1
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 2047
+; RV64ZICBOP-NEXT:    addi a1, a1, 1
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 16
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a0, a0, -16
+; RV64ZICBOP-NEXT:    prefetch.r 0(a0)
+; RV64ZICBOP-NEXT:    ret
+;
+; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_read_locality_2:
+; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, 1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a0, a0, -16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    ret
+  %valid_addr0 = getelementptr i8, ptr %a, i64 -2048
+  call void @llvm.prefetch(ptr %valid_addr0, i32 0, i32 2, i32 1)
+  %valid_addr1 = getelementptr i8, ptr %a, i64 2016
+  call void @llvm.prefetch(ptr %valid_addr1, i32 0, i32 2, i32 1)
+  %valid_addr2 = getelementptr i8, ptr %a, i64 32
+  call void @llvm.prefetch(ptr %valid_addr2, i32 0, i32 2, i32 1)
+  %valid_addr3 = getelementptr i8, ptr %a, i64 -32
+  call void @llvm.prefetch(ptr %valid_addr3, i32 0, i32 2, i32 1)
+  %invalid_addr0 = getelementptr i8, ptr %a, i64 -2049
+  call void @llvm.prefetch(ptr %invalid_addr0, i32 0, i32 2, i32 1)
+  %invalid_addr1 = getelementptr i8, ptr %a, i64 2048
+  call void @llvm.prefetch(ptr %invalid_addr1, i32 0, i32 2, i32 1)
+  %invalid_addr2 = getelementptr i8, ptr %a, i64 16
+  call void @llvm.prefetch(ptr %invalid_addr2, i32 0, i32 2, i32 1)
+  %invalid_addr3 = getelementptr i8, ptr %a, i64 -16
+  call void @llvm.prefetch(ptr %invalid_addr3, i32 0, i32 2, i32 1)
+  ret void
+}
+
 define void @test_prefetch_write_locality_2(ptr %a) nounwind {
 ; RV32I-LABEL: test_prefetch_write_locality_2:
 ; RV32I:       # %bb.0:
@@ -236,6 +922,104 @@ define void @test_prefetch_write_locality_2(ptr %a) nounwind {
   ret void
 }
 
+define void @test_prefetch_offsetable_write_locality_2(ptr %a) nounwind {
+; RV32I-LABEL: test_prefetch_offsetable_write_locality_2:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_prefetch_offsetable_write_locality_2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
+;
+; RV32ZICBOP-LABEL: test_prefetch_offsetable_write_locality_2:
+; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    addi a1, a0, -2048
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a2, a0, 2016
+; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, 32
+; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, -32
+; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV32ZICBOP-NEXT:    addi a1, a1, -1
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 2047
+; RV32ZICBOP-NEXT:    addi a1, a1, 1
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 16
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a0, a0, -16
+; RV32ZICBOP-NEXT:    prefetch.w 0(a0)
+; RV32ZICBOP-NEXT:    ret
+;
+; RV64ZICBOP-LABEL: test_prefetch_offsetable_write_locality_2:
+; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    addi a1, a0, -2048
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a2, a0, 2016
+; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, 32
+; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, -32
+; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOP-NEXT:    addi a1, a1, -1
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 2047
+; RV64ZICBOP-NEXT:    addi a1, a1, 1
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 16
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a0, a0, -16
+; RV64ZICBOP-NEXT:    prefetch.w 0(a0)
+; RV64ZICBOP-NEXT:    ret
+;
+; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_write_locality_2:
+; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, 1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a0, a0, -16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    ret
+  %valid_addr0 = getelementptr i8, ptr %a, i64 -2048
+  call void @llvm.prefetch(ptr %valid_addr0, i32 1, i32 2, i32 1)
+  %valid_addr1 = getelementptr i8, ptr %a, i64 2016
+  call void @llvm.prefetch(ptr %valid_addr1, i32 1, i32 2, i32 1)
+  %valid_addr2 = getelementptr i8, ptr %a, i64 32
+  call void @llvm.prefetch(ptr %valid_addr2, i32 1, i32 2, i32 1)
+  %valid_addr3 = getelementptr i8, ptr %a, i64 -32
+  call void @llvm.prefetch(ptr %valid_addr3, i32 1, i32 2, i32 1)
+  %invalid_addr0 = getelementptr i8, ptr %a, i64 -2049
+  call void @llvm.prefetch(ptr %invalid_addr0, i32 1, i32 2, i32 1)
+  %invalid_addr1 = getelementptr i8, ptr %a, i64 2048
+  call void @llvm.prefetch(ptr %invalid_addr1, i32 1, i32 2, i32 1)
+  %invalid_addr2 = getelementptr i8, ptr %a, i64 16
+  call void @llvm.prefetch(ptr %invalid_addr2, i32 1, i32 2, i32 1)
+  %invalid_addr3 = getelementptr i8, ptr %a, i64 -16
+  call void @llvm.prefetch(ptr %invalid_addr3, i32 1, i32 2, i32 1)
+  ret void
+}
+
 define void @test_prefetch_instruction_locality_2(ptr %a) nounwind {
 ; RV32I-LABEL: test_prefetch_instruction_locality_2:
 ; RV32I:       # %bb.0:
@@ -264,6 +1048,103 @@ define void @test_prefetch_instruction_locality_2(ptr %a) nounwind {
   ret void
 }
 
+define void @test_prefetch_offsetable_instruction_locality_2(ptr %a) nounwind {
+; RV32I-LABEL: test_prefetch_offsetable_instruction_locality_2:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_prefetch_offsetable_instruction_locality_2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
+;
+; RV32ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_2:
+; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    addi a1, a0, -2048
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a2, a0, 2016
+; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, 32
+; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, -32
+; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV32ZICBOP-NEXT:    addi a1, a1, -1
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 2047
+; RV32ZICBOP-NEXT:    addi a1, a1, 1
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 16
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a0, a0, -16
+; RV32ZICBOP-NEXT:    prefetch.i 0(a0)
+; RV32ZICBOP-NEXT:    ret
+;
+; RV64ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_2:
+; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    addi a1, a0, -2048
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a2, a0, 2016
+; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, 32
+; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, -32
+; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOP-NEXT:    addi a1, a1, -1
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 2047
+; RV64ZICBOP-NEXT:    addi a1, a1, 1
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 16
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a0, a0, -16
+; RV64ZICBOP-NEXT:    prefetch.i 0(a0)
+; RV64ZICBOP-NEXT:    ret
+;
+; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_instruction_locality_2:
+; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, 1
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a0, a0, -16
+; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    ret
+  %valid_addr0 = getelementptr i8, ptr %a, i64 -2048
+  call void @llvm.prefetch(ptr %valid_addr0, i32 0, i32 2, i32 0)
+  %valid_addr1 = getelementptr i8, ptr %a, i64 2016
+  call void @llvm.prefetch(ptr %valid_addr1, i32 0, i32 2, i32 0)
+  %valid_addr2 = getelementptr i8, ptr %a, i64 32
+  call void @llvm.prefetch(ptr %valid_addr2, i32 0, i32 2, i32 0)
+  %valid_addr3 = getelementptr i8, ptr %a, i64 -32
+  call void @llvm.prefetch(ptr %valid_addr3, i32 0, i32 2, i32 0)
+  %invalid_addr0 = getelementptr i8, ptr %a, i64 -2049
+  call void @llvm.prefetch(ptr %invalid_addr0, i32 0, i32 2, i32 0)
+  %invalid_addr1 = getelementptr i8, ptr %a, i64 2048
+  call void @llvm.prefetch(ptr %invalid_addr1, i32 0, i32 2, i32 0)
+  %invalid_addr2 = getelementptr i8, ptr %a, i64 16
+  call void @llvm.prefetch(ptr %invalid_addr2, i32 0, i32 2, i32 0)
+  %invalid_addr3 = getelementptr i8, ptr %a, i64 -16
+  call void @llvm.prefetch(ptr %invalid_addr3, i32 0, i32 2, i32 0)
+  ret void
+}
 
 define void @test_prefetch_read_locality_3(ptr %a) nounwind {
 ; RV32I-LABEL: test_prefetch_read_locality_3:
@@ -292,6 +1173,96 @@ define void @test_prefetch_read_locality_3(ptr %a) nounwind {
   ret void
 }
 
+define void @test_prefetch_offsetable_read_locality_3(ptr %a) nounwind {
+; RV32I-LABEL: test_prefetch_offsetable_read_locality_3:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_prefetch_offsetable_read_locality_3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
+;
+; RV32ZICBOP-LABEL: test_prefetch_offsetable_read_locality_3:
+; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    addi a1, a0, -2048
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a2, a0, 2016
+; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, 32
+; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, -32
+; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV32ZICBOP-NEXT:    addi a1, a1, -1
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 2047
+; RV32ZICBOP-NEXT:    addi a1, a1, 1
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 16
+; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV32ZICBOP-NEXT:    addi a0, a0, -16
+; RV32ZICBOP-NEXT:    prefetch.r 0(a0)
+; RV32ZICBOP-NEXT:    ret
+;
+; RV64ZICBOP-LABEL: test_prefetch_offsetable_read_locality_3:
+; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    addi a1, a0, -2048
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a2, a0, 2016
+; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, 32
+; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, -32
+; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOP-NEXT:    addi a1, a1, -1
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 2047
+; RV64ZICBOP-NEXT:    addi a1, a1, 1
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 16
+; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOP-NEXT:    addi a0, a0, -16
+; RV64ZICBOP-NEXT:    prefetch.r 0(a0)
+; RV64ZICBOP-NEXT:    ret
+;
+; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_read_locality_3:
+; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, 1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 16
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a0, a0, -16
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    ret
+  %valid_addr0 = getelementptr i8, ptr %a, i64 -2048
+  call void @llvm.prefetch(ptr %valid_addr0, i32 0, i32 3, i32 1)
+  %valid_addr1 = getelementptr i8, ptr %a, i64 2016
+  call void @llvm.prefetch(ptr %valid_addr1, i32 0, i32 3, i32 1)
+  %valid_addr2 = getelementptr i8, ptr %a, i64 32
+  call void @llvm.prefetch(ptr %valid_addr2, i32 0, i32 3, i32 1)
+  %valid_addr3 = getelementptr i8, ptr %a, i64 -32
+  call void @llvm.prefetch(ptr %valid_addr3, i32 0, i32 3, i32 1)
+  %invalid_addr0 = getelementptr i8, ptr %a, i64 -2049
+  call void @llvm.prefetch(ptr %invalid_addr0, i32 0, i32 3, i32 1)
+  %invalid_addr1 = getelementptr i8, ptr %a, i64 2048
+  call void @llvm.prefetch(ptr %invalid_addr1, i32 0, i32 3, i32 1)
+  %invalid_addr2 = getelementptr i8, ptr %a, i64 16
+  call void @llvm.prefetch(ptr %invalid_addr2, i32 0, i32 3, i32 1)
+  %invalid_addr3 = getelementptr i8, ptr %a, i64 -16
+  call void @llvm.prefetch(ptr %invalid_addr3, i32 0, i32 3, i32 1)
+  ret void
+}
+
 define void @test_prefetch_write_locality_3(ptr %a) nounwind {
 ; RV32I-LABEL: test_prefetch_write_locality_3:
 ; RV32I:       # %bb.0:
@@ -319,6 +1290,96 @@ define void @test_prefetch_write_locality_3(ptr %a) nounwind {
   ret void
 }
 
+define void @test_prefetch_offsetable_write_locality_3(ptr %a) nounwind {
+; RV32I-LABEL: test_prefetch_offsetable_write_locality_3:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_prefetch_offsetable_write_locality_3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
+;
+; RV32ZICBOP-LABEL: test_prefetch_offsetable_write_locality_3:
+; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    addi a1, a0, -2048
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a2, a0, 2016
+; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, 32
+; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, -32
+; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV32ZICBOP-NEXT:    addi a1, a1, -1
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 2047
+; RV32ZICBOP-NEXT:    addi a1, a1, 1
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 16
+; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV32ZICBOP-NEXT:    addi a0, a0, -16
+; RV32ZICBOP-NEXT:    prefetch.w 0(a0)
+; RV32ZICBOP-NEXT:    ret
+;
+; RV64ZICBOP-LABEL: test_prefetch_offsetable_write_locality_3:
+; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    addi a1, a0, -2048
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a2, a0, 2016
+; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, 32
+; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, -32
+; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOP-NEXT:    addi a1, a1, -1
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 2047
+; RV64ZICBOP-NEXT:    addi a1, a1, 1
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 16
+; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOP-NEXT:    addi a0, a0, -16
+; RV64ZICBOP-NEXT:    prefetch.w 0(a0)
+; RV64ZICBOP-NEXT:    ret
+;
+; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_write_locality_3:
+; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, 1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 16
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a0, a0, -16
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    ret
+  %valid_addr0 = getelementptr i8, ptr %a, i64 -2048
+  call void @llvm.prefetch(ptr %valid_addr0, i32 1, i32 3, i32 1)
+  %valid_addr1 = getelementptr i8, ptr %a, i64 2016
+  call void @llvm.prefetch(ptr %valid_addr1, i32 1, i32 3, i32 1)
+  %valid_addr2 = getelementptr i8, ptr %a, i64 32
+  call void @llvm.prefetch(ptr %valid_addr2, i32 1, i32 3, i32 1)
+  %valid_addr3 = getelementptr i8, ptr %a, i64 -32
+  call void @llvm.prefetch(ptr %valid_addr3, i32 1, i32 3, i32 1)
+  %invalid_addr0 = getelementptr i8, ptr %a, i64 -2049
+  call void @llvm.prefetch(ptr %invalid_addr0, i32 1, i32 3, i32 1)
+  %invalid_addr1 = getelementptr i8, ptr %a, i64 2048
+  call void @llvm.prefetch(ptr %invalid_addr1, i32 1, i32 3, i32 1)
+  %invalid_addr2 = getelementptr i8, ptr %a, i64 16
+  call void @llvm.prefetch(ptr %invalid_addr2, i32 1, i32 3, i32 1)
+  %invalid_addr3 = getelementptr i8, ptr %a, i64 -16
+  call void @llvm.prefetch(ptr %invalid_addr3, i32 1, i32 3, i32 1)
+  ret void
+}
+
 define void @test_prefetch_instruction_locality_3(ptr %a) nounwind {
 ; RV32I-LABEL: test_prefetch_instruction_locality_3:
 ; RV32I:       # %bb.0:
@@ -345,3 +1406,93 @@ define void @test_prefetch_instruction_locality_3(ptr %a) nounwind {
   call void @llvm.prefetch(ptr %a, i32 0, i32 3, i32 0)
   ret void
 }
+
+define void @test_prefetch_offsetable_instruction_locality_3(ptr %a) nounwind {
+; RV32I-LABEL: test_prefetch_offsetable_instruction_locality_3:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_prefetch_offsetable_instruction_locality_3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
+;
+; RV32ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_3:
+; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    addi a1, a0, -2048
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a2, a0, 2016
+; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, 32
+; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV32ZICBOP-NEXT:    addi a2, a0, -32
+; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV32ZICBOP-NEXT:    addi a1, a1, -1
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 2047
+; RV32ZICBOP-NEXT:    addi a1, a1, 1
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a1, a0, 16
+; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV32ZICBOP-NEXT:    addi a0, a0, -16
+; RV32ZICBOP-NEXT:    prefetch.i 0(a0)
+; RV32ZICBOP-NEXT:    ret
+;
+; RV64ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_3:
+; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    addi a1, a0, -2048
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a2, a0, 2016
+; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, 32
+; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOP-NEXT:    addi a2, a0, -32
+; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOP-NEXT:    addi a1, a1, -1
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 2047
+; RV64ZICBOP-NEXT:    addi a1, a1, 1
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a1, a0, 16
+; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOP-NEXT:    addi a0, a0, -16
+; RV64ZICBOP-NEXT:    prefetch.i 0(a0)
+; RV64ZICBOP-NEXT:    ret
+;
+; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_instruction_locality_3:
+; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, 1
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 16
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a0, a0, -16
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    ret
+  %valid_addr0 = getelementptr i8, ptr %a, i64 -2048
+  call void @llvm.prefetch(ptr %valid_addr0, i32 0, i32 3, i32 0)
+  %valid_addr1 = getelementptr i8, ptr %a, i64 2016
+  call void @llvm.prefetch(ptr %valid_addr1, i32 0, i32 3, i32 0)
+  %valid_addr2 = getelementptr i8, ptr %a, i64 32
+  call void @llvm.prefetch(ptr %valid_addr2, i32 0, i32 3, i32 0)
+  %valid_addr3 = getelementptr i8, ptr %a, i64 -32
+  call void @llvm.prefetch(ptr %valid_addr3, i32 0, i32 3, i32 0)
+  %invalid_addr0 = getelementptr i8, ptr %a, i64 -2049
+  call void @llvm.prefetch(ptr %invalid_addr0, i32 0, i32 3, i32 0)
+  %invalid_addr1 = getelementptr i8, ptr %a, i64 2048
+  call void @llvm.prefetch(ptr %invalid_addr1, i32 0, i32 3, i32 0)
+  %invalid_addr2 = getelementptr i8, ptr %a, i64 16
+  call void @llvm.prefetch(ptr %invalid_addr2, i32 0, i32 3, i32 0)
+  %invalid_addr3 = getelementptr i8, ptr %a, i64 -16
+  call void @llvm.prefetch(ptr %invalid_addr3, i32 0, i32 3, i32 0)
+  ret void
+}

>From fa8288d3e42b0ab58ef1e32601dbf7fc5b6ea8bb Mon Sep 17 00:00:00 2001
From: wangpc <wangpengcheng.pp at bytedance.com>
Date: Tue, 12 Sep 2023 19:26:41 +0800
Subject: [PATCH 2/3] [RISCV] Match prefetch address with offset

A new ComplexPattern `AddrRegImmLsb00000` is added, which is like
`AddrRegImm` except that if the least significant 5 bits of the offset
aren't all zeros, we will fall back to offset 0.
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp  |  16 +
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h    |   1 +
 llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td |  18 +-
 llvm/test/CodeGen/RISCV/prefetch.ll          | 414 +++++++------------
 4 files changed, 181 insertions(+), 268 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index e6d0346c45e8d54..6ff7aebf13aa573 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2442,6 +2442,22 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
   return true;
 }
 
+bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base,
+                                                 SDValue &Offset) {
+  // Like SelectAddrRegImm, but only keeps offsets whose low 5 bits are zero.
+  bool Found = SelectAddrRegImm(Addr, Base, Offset);
+  assert(Found && "SelectAddrRegImm should always succeed");
+  (void)Found; // Only read by the assert above.
+
+  if (auto *OffsetImm = dyn_cast<ConstantSDNode>(Offset))
+    if (OffsetImm->getAPIntValue().countTrailingZeros() >= 5)
+      return true; // Constant offset with five trailing zero bits: keep it.
+
+  Base = Addr; // Otherwise use the whole address as base with offset 0.
+  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), Addr.getSimpleValueType());
+  return true;
+}
+
 bool RISCVDAGToDAGISel::selectShiftMask(SDValue N, unsigned ShiftWidth,
                                         SDValue &ShAmt) {
   ShAmt = N;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index c220b2d57c2e50f..675ab4e74c8f644 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -54,6 +54,7 @@ class RISCVDAGToDAGISel : public SelectionDAGISel {
   bool SelectAddrRegImmINX(SDValue Addr, SDValue &Base, SDValue &Offset) {
     return SelectAddrRegImm(Addr, Base, Offset, true);
   }
+  bool SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base, SDValue &Offset);
 
   bool SelectAddrRegRegScale(SDValue Addr, unsigned MaxShiftAmount,
                              SDValue &Base, SDValue &Index, SDValue &Scale);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td
index 509d1cfcd874444..de8a96607864ef1 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td
@@ -74,12 +74,16 @@ def PREFETCH_W : Prefetch_ri<0b00011, "prefetch.w">, Sched<[]>;
 // Patterns
 //===----------------------------------------------------------------------===//
 
+def AddrRegImmLsb00000 : ComplexPattern<iPTR, 2, "SelectAddrRegImmLsb00000">;
+
 let Predicates = [HasStdExtZicbop] in {
-  // FIXME: Match address with offset
-  def : Pat<(prefetch GPR:$rs1, imm, imm, (XLenVT 0)),
-            (PREFETCH_I GPR:$rs1, 0)>;
-  def : Pat<(prefetch GPR:$rs1, (XLenVT 0), imm, (XLenVT 1)),
-            (PREFETCH_R GPR:$rs1, 0)>;
-  def : Pat<(prefetch GPR:$rs1, (XLenVT 1), imm, (XLenVT 1)),
-            (PREFETCH_W GPR:$rs1, 0)>;
+  def : Pat<(prefetch (AddrRegImmLsb00000 (XLenVT GPR:$rs1), simm12_lsb00000:$imm12),
+                      imm, imm, (XLenVT 0)),
+            (PREFETCH_I GPR:$rs1, simm12_lsb00000:$imm12)>;
+  def : Pat<(prefetch (AddrRegImmLsb00000 (XLenVT GPR:$rs1), simm12_lsb00000:$imm12),
+                      (XLenVT 0), imm, (XLenVT 1)),
+            (PREFETCH_R GPR:$rs1, simm12_lsb00000:$imm12)>;
+  def : Pat<(prefetch (AddrRegImmLsb00000 (XLenVT GPR:$rs1), simm12_lsb00000:$imm12),
+                      (XLenVT 1), imm, (XLenVT 1)),
+            (PREFETCH_W GPR:$rs1, simm12_lsb00000:$imm12)>;
 }
diff --git a/llvm/test/CodeGen/RISCV/prefetch.ll b/llvm/test/CodeGen/RISCV/prefetch.ll
index 655f84f1bdfe0a4..ae4a903f439b7a8 100644
--- a/llvm/test/CodeGen/RISCV/prefetch.ll
+++ b/llvm/test/CodeGen/RISCV/prefetch.ll
@@ -51,14 +51,11 @@ define void @test_prefetch_offsetable_read_locality_0(ptr %a) nounwind {
 ;
 ; RV32ZICBOP-LABEL: test_prefetch_offsetable_read_locality_0:
 ; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    prefetch.r -2048(a0)
+; RV32ZICBOP-NEXT:    prefetch.r 2016(a0)
+; RV32ZICBOP-NEXT:    prefetch.r 32(a0)
+; RV32ZICBOP-NEXT:    prefetch.r -32(a0)
 ; RV32ZICBOP-NEXT:    addi a1, a0, -2048
-; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
-; RV32ZICBOP-NEXT:    addi a2, a0, 2016
-; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, 32
-; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, -32
-; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
 ; RV32ZICBOP-NEXT:    addi a1, a1, -1
 ; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
 ; RV32ZICBOP-NEXT:    addi a1, a0, 2047
@@ -72,14 +69,11 @@ define void @test_prefetch_offsetable_read_locality_0(ptr %a) nounwind {
 ;
 ; RV64ZICBOP-LABEL: test_prefetch_offsetable_read_locality_0:
 ; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    prefetch.r -2048(a0)
+; RV64ZICBOP-NEXT:    prefetch.r 2016(a0)
+; RV64ZICBOP-NEXT:    prefetch.r 32(a0)
+; RV64ZICBOP-NEXT:    prefetch.r -32(a0)
 ; RV64ZICBOP-NEXT:    addi a1, a0, -2048
-; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
-; RV64ZICBOP-NEXT:    addi a2, a0, 2016
-; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, 32
-; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, -32
-; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
 ; RV64ZICBOP-NEXT:    addi a1, a1, -1
 ; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
 ; RV64ZICBOP-NEXT:    addi a1, a0, 2047
@@ -93,18 +87,15 @@ define void @test_prefetch_offsetable_read_locality_0(ptr %a) nounwind {
 ;
 ; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_read_locality_0:
 ; RV64ZICBOPZIHINTNTL:       # %bb.0:
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r -2048(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 2016(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 32(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r -32(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
 ; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
@@ -177,14 +168,11 @@ define void @test_prefetch_offsetable_write_locality_0(ptr %a) nounwind {
 ;
 ; RV32ZICBOP-LABEL: test_prefetch_offsetable_write_locality_0:
 ; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    prefetch.w -2048(a0)
+; RV32ZICBOP-NEXT:    prefetch.w 2016(a0)
+; RV32ZICBOP-NEXT:    prefetch.w 32(a0)
+; RV32ZICBOP-NEXT:    prefetch.w -32(a0)
 ; RV32ZICBOP-NEXT:    addi a1, a0, -2048
-; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
-; RV32ZICBOP-NEXT:    addi a2, a0, 2016
-; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, 32
-; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, -32
-; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
 ; RV32ZICBOP-NEXT:    addi a1, a1, -1
 ; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
 ; RV32ZICBOP-NEXT:    addi a1, a0, 2047
@@ -198,14 +186,11 @@ define void @test_prefetch_offsetable_write_locality_0(ptr %a) nounwind {
 ;
 ; RV64ZICBOP-LABEL: test_prefetch_offsetable_write_locality_0:
 ; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    prefetch.w -2048(a0)
+; RV64ZICBOP-NEXT:    prefetch.w 2016(a0)
+; RV64ZICBOP-NEXT:    prefetch.w 32(a0)
+; RV64ZICBOP-NEXT:    prefetch.w -32(a0)
 ; RV64ZICBOP-NEXT:    addi a1, a0, -2048
-; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
-; RV64ZICBOP-NEXT:    addi a2, a0, 2016
-; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, 32
-; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, -32
-; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
 ; RV64ZICBOP-NEXT:    addi a1, a1, -1
 ; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
 ; RV64ZICBOP-NEXT:    addi a1, a0, 2047
@@ -219,18 +204,15 @@ define void @test_prefetch_offsetable_write_locality_0(ptr %a) nounwind {
 ;
 ; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_write_locality_0:
 ; RV64ZICBOPZIHINTNTL:       # %bb.0:
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w -2048(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 2016(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 32(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w -32(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
 ; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
@@ -303,14 +285,11 @@ define void @test_prefetch_offsetable_instruction_locality_0(ptr %a) nounwind {
 ;
 ; RV32ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_0:
 ; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    prefetch.i -2048(a0)
+; RV32ZICBOP-NEXT:    prefetch.i 2016(a0)
+; RV32ZICBOP-NEXT:    prefetch.i 32(a0)
+; RV32ZICBOP-NEXT:    prefetch.i -32(a0)
 ; RV32ZICBOP-NEXT:    addi a1, a0, -2048
-; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
-; RV32ZICBOP-NEXT:    addi a2, a0, 2016
-; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, 32
-; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, -32
-; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
 ; RV32ZICBOP-NEXT:    addi a1, a1, -1
 ; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
 ; RV32ZICBOP-NEXT:    addi a1, a0, 2047
@@ -324,14 +303,11 @@ define void @test_prefetch_offsetable_instruction_locality_0(ptr %a) nounwind {
 ;
 ; RV64ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_0:
 ; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    prefetch.i -2048(a0)
+; RV64ZICBOP-NEXT:    prefetch.i 2016(a0)
+; RV64ZICBOP-NEXT:    prefetch.i 32(a0)
+; RV64ZICBOP-NEXT:    prefetch.i -32(a0)
 ; RV64ZICBOP-NEXT:    addi a1, a0, -2048
-; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
-; RV64ZICBOP-NEXT:    addi a2, a0, 2016
-; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, 32
-; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, -32
-; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
 ; RV64ZICBOP-NEXT:    addi a1, a1, -1
 ; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
 ; RV64ZICBOP-NEXT:    addi a1, a0, 2047
@@ -345,18 +321,15 @@ define void @test_prefetch_offsetable_instruction_locality_0(ptr %a) nounwind {
 ;
 ; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_instruction_locality_0:
 ; RV64ZICBOPZIHINTNTL:       # %bb.0:
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i -2048(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 2016(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 32(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i -32(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.all
 ; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
@@ -429,14 +402,11 @@ define void @test_prefetch_offsetable_read_locality_1(ptr %a) nounwind {
 ;
 ; RV32ZICBOP-LABEL: test_prefetch_offsetable_read_locality_1:
 ; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    prefetch.r -2048(a0)
+; RV32ZICBOP-NEXT:    prefetch.r 2016(a0)
+; RV32ZICBOP-NEXT:    prefetch.r 32(a0)
+; RV32ZICBOP-NEXT:    prefetch.r -32(a0)
 ; RV32ZICBOP-NEXT:    addi a1, a0, -2048
-; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
-; RV32ZICBOP-NEXT:    addi a2, a0, 2016
-; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, 32
-; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, -32
-; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
 ; RV32ZICBOP-NEXT:    addi a1, a1, -1
 ; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
 ; RV32ZICBOP-NEXT:    addi a1, a0, 2047
@@ -450,14 +420,11 @@ define void @test_prefetch_offsetable_read_locality_1(ptr %a) nounwind {
 ;
 ; RV64ZICBOP-LABEL: test_prefetch_offsetable_read_locality_1:
 ; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    prefetch.r -2048(a0)
+; RV64ZICBOP-NEXT:    prefetch.r 2016(a0)
+; RV64ZICBOP-NEXT:    prefetch.r 32(a0)
+; RV64ZICBOP-NEXT:    prefetch.r -32(a0)
 ; RV64ZICBOP-NEXT:    addi a1, a0, -2048
-; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
-; RV64ZICBOP-NEXT:    addi a2, a0, 2016
-; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, 32
-; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, -32
-; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
 ; RV64ZICBOP-NEXT:    addi a1, a1, -1
 ; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
 ; RV64ZICBOP-NEXT:    addi a1, a0, 2047
@@ -471,18 +438,15 @@ define void @test_prefetch_offsetable_read_locality_1(ptr %a) nounwind {
 ;
 ; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_read_locality_1:
 ; RV64ZICBOPZIHINTNTL:       # %bb.0:
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r -2048(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 2016(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 32(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r -32(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
 ; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
@@ -555,14 +519,11 @@ define void @test_prefetch_offsetable_write_locality_1(ptr %a) nounwind {
 ;
 ; RV32ZICBOP-LABEL: test_prefetch_offsetable_write_locality_1:
 ; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    prefetch.w -2048(a0)
+; RV32ZICBOP-NEXT:    prefetch.w 2016(a0)
+; RV32ZICBOP-NEXT:    prefetch.w 32(a0)
+; RV32ZICBOP-NEXT:    prefetch.w -32(a0)
 ; RV32ZICBOP-NEXT:    addi a1, a0, -2048
-; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
-; RV32ZICBOP-NEXT:    addi a2, a0, 2016
-; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, 32
-; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, -32
-; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
 ; RV32ZICBOP-NEXT:    addi a1, a1, -1
 ; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
 ; RV32ZICBOP-NEXT:    addi a1, a0, 2047
@@ -576,14 +537,11 @@ define void @test_prefetch_offsetable_write_locality_1(ptr %a) nounwind {
 ;
 ; RV64ZICBOP-LABEL: test_prefetch_offsetable_write_locality_1:
 ; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    prefetch.w -2048(a0)
+; RV64ZICBOP-NEXT:    prefetch.w 2016(a0)
+; RV64ZICBOP-NEXT:    prefetch.w 32(a0)
+; RV64ZICBOP-NEXT:    prefetch.w -32(a0)
 ; RV64ZICBOP-NEXT:    addi a1, a0, -2048
-; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
-; RV64ZICBOP-NEXT:    addi a2, a0, 2016
-; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, 32
-; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, -32
-; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
 ; RV64ZICBOP-NEXT:    addi a1, a1, -1
 ; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
 ; RV64ZICBOP-NEXT:    addi a1, a0, 2047
@@ -597,18 +555,15 @@ define void @test_prefetch_offsetable_write_locality_1(ptr %a) nounwind {
 ;
 ; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_write_locality_1:
 ; RV64ZICBOPZIHINTNTL:       # %bb.0:
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w -2048(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 2016(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 32(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w -32(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
 ; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
@@ -681,14 +636,11 @@ define void @test_prefetch_offsetable_instruction_locality_1(ptr %a) nounwind {
 ;
 ; RV32ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_1:
 ; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    prefetch.i -2048(a0)
+; RV32ZICBOP-NEXT:    prefetch.i 2016(a0)
+; RV32ZICBOP-NEXT:    prefetch.i 32(a0)
+; RV32ZICBOP-NEXT:    prefetch.i -32(a0)
 ; RV32ZICBOP-NEXT:    addi a1, a0, -2048
-; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
-; RV32ZICBOP-NEXT:    addi a2, a0, 2016
-; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, 32
-; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, -32
-; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
 ; RV32ZICBOP-NEXT:    addi a1, a1, -1
 ; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
 ; RV32ZICBOP-NEXT:    addi a1, a0, 2047
@@ -702,14 +654,11 @@ define void @test_prefetch_offsetable_instruction_locality_1(ptr %a) nounwind {
 ;
 ; RV64ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_1:
 ; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    prefetch.i -2048(a0)
+; RV64ZICBOP-NEXT:    prefetch.i 2016(a0)
+; RV64ZICBOP-NEXT:    prefetch.i 32(a0)
+; RV64ZICBOP-NEXT:    prefetch.i -32(a0)
 ; RV64ZICBOP-NEXT:    addi a1, a0, -2048
-; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
-; RV64ZICBOP-NEXT:    addi a2, a0, 2016
-; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, 32
-; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, -32
-; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
 ; RV64ZICBOP-NEXT:    addi a1, a1, -1
 ; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
 ; RV64ZICBOP-NEXT:    addi a1, a0, 2047
@@ -723,18 +672,15 @@ define void @test_prefetch_offsetable_instruction_locality_1(ptr %a) nounwind {
 ;
 ; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_instruction_locality_1:
 ; RV64ZICBOPZIHINTNTL:       # %bb.0:
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i -2048(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 2016(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 32(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i -32(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.pall
 ; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
@@ -807,14 +753,11 @@ define void @test_prefetch_offsetable_read_locality_2(ptr %a) nounwind {
 ;
 ; RV32ZICBOP-LABEL: test_prefetch_offsetable_read_locality_2:
 ; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    prefetch.r -2048(a0)
+; RV32ZICBOP-NEXT:    prefetch.r 2016(a0)
+; RV32ZICBOP-NEXT:    prefetch.r 32(a0)
+; RV32ZICBOP-NEXT:    prefetch.r -32(a0)
 ; RV32ZICBOP-NEXT:    addi a1, a0, -2048
-; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
-; RV32ZICBOP-NEXT:    addi a2, a0, 2016
-; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, 32
-; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, -32
-; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
 ; RV32ZICBOP-NEXT:    addi a1, a1, -1
 ; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
 ; RV32ZICBOP-NEXT:    addi a1, a0, 2047
@@ -828,14 +771,11 @@ define void @test_prefetch_offsetable_read_locality_2(ptr %a) nounwind {
 ;
 ; RV64ZICBOP-LABEL: test_prefetch_offsetable_read_locality_2:
 ; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    prefetch.r -2048(a0)
+; RV64ZICBOP-NEXT:    prefetch.r 2016(a0)
+; RV64ZICBOP-NEXT:    prefetch.r 32(a0)
+; RV64ZICBOP-NEXT:    prefetch.r -32(a0)
 ; RV64ZICBOP-NEXT:    addi a1, a0, -2048
-; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
-; RV64ZICBOP-NEXT:    addi a2, a0, 2016
-; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, 32
-; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, -32
-; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
 ; RV64ZICBOP-NEXT:    addi a1, a1, -1
 ; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
 ; RV64ZICBOP-NEXT:    addi a1, a0, 2047
@@ -849,18 +789,15 @@ define void @test_prefetch_offsetable_read_locality_2(ptr %a) nounwind {
 ;
 ; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_read_locality_2:
 ; RV64ZICBOPZIHINTNTL:       # %bb.0:
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r -2048(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 2016(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 32(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r -32(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
 ; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
@@ -933,14 +870,11 @@ define void @test_prefetch_offsetable_write_locality_2(ptr %a) nounwind {
 ;
 ; RV32ZICBOP-LABEL: test_prefetch_offsetable_write_locality_2:
 ; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    prefetch.w -2048(a0)
+; RV32ZICBOP-NEXT:    prefetch.w 2016(a0)
+; RV32ZICBOP-NEXT:    prefetch.w 32(a0)
+; RV32ZICBOP-NEXT:    prefetch.w -32(a0)
 ; RV32ZICBOP-NEXT:    addi a1, a0, -2048
-; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
-; RV32ZICBOP-NEXT:    addi a2, a0, 2016
-; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, 32
-; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, -32
-; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
 ; RV32ZICBOP-NEXT:    addi a1, a1, -1
 ; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
 ; RV32ZICBOP-NEXT:    addi a1, a0, 2047
@@ -954,14 +888,11 @@ define void @test_prefetch_offsetable_write_locality_2(ptr %a) nounwind {
 ;
 ; RV64ZICBOP-LABEL: test_prefetch_offsetable_write_locality_2:
 ; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    prefetch.w -2048(a0)
+; RV64ZICBOP-NEXT:    prefetch.w 2016(a0)
+; RV64ZICBOP-NEXT:    prefetch.w 32(a0)
+; RV64ZICBOP-NEXT:    prefetch.w -32(a0)
 ; RV64ZICBOP-NEXT:    addi a1, a0, -2048
-; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
-; RV64ZICBOP-NEXT:    addi a2, a0, 2016
-; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, 32
-; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, -32
-; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
 ; RV64ZICBOP-NEXT:    addi a1, a1, -1
 ; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
 ; RV64ZICBOP-NEXT:    addi a1, a0, 2047
@@ -975,18 +906,15 @@ define void @test_prefetch_offsetable_write_locality_2(ptr %a) nounwind {
 ;
 ; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_write_locality_2:
 ; RV64ZICBOPZIHINTNTL:       # %bb.0:
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w -2048(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 2016(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 32(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w -32(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
 ; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
@@ -1059,14 +987,11 @@ define void @test_prefetch_offsetable_instruction_locality_2(ptr %a) nounwind {
 ;
 ; RV32ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_2:
 ; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    prefetch.i -2048(a0)
+; RV32ZICBOP-NEXT:    prefetch.i 2016(a0)
+; RV32ZICBOP-NEXT:    prefetch.i 32(a0)
+; RV32ZICBOP-NEXT:    prefetch.i -32(a0)
 ; RV32ZICBOP-NEXT:    addi a1, a0, -2048
-; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
-; RV32ZICBOP-NEXT:    addi a2, a0, 2016
-; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, 32
-; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, -32
-; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
 ; RV32ZICBOP-NEXT:    addi a1, a1, -1
 ; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
 ; RV32ZICBOP-NEXT:    addi a1, a0, 2047
@@ -1080,14 +1005,11 @@ define void @test_prefetch_offsetable_instruction_locality_2(ptr %a) nounwind {
 ;
 ; RV64ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_2:
 ; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    prefetch.i -2048(a0)
+; RV64ZICBOP-NEXT:    prefetch.i 2016(a0)
+; RV64ZICBOP-NEXT:    prefetch.i 32(a0)
+; RV64ZICBOP-NEXT:    prefetch.i -32(a0)
 ; RV64ZICBOP-NEXT:    addi a1, a0, -2048
-; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
-; RV64ZICBOP-NEXT:    addi a2, a0, 2016
-; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, 32
-; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, -32
-; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
 ; RV64ZICBOP-NEXT:    addi a1, a1, -1
 ; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
 ; RV64ZICBOP-NEXT:    addi a1, a0, 2047
@@ -1101,18 +1023,15 @@ define void @test_prefetch_offsetable_instruction_locality_2(ptr %a) nounwind {
 ;
 ; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_instruction_locality_2:
 ; RV64ZICBOPZIHINTNTL:       # %bb.0:
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i -2048(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 2016(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 32(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i -32(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
 ; RV64ZICBOPZIHINTNTL-NEXT:    ntl.p1
 ; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
@@ -1184,14 +1103,11 @@ define void @test_prefetch_offsetable_read_locality_3(ptr %a) nounwind {
 ;
 ; RV32ZICBOP-LABEL: test_prefetch_offsetable_read_locality_3:
 ; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    prefetch.r -2048(a0)
+; RV32ZICBOP-NEXT:    prefetch.r 2016(a0)
+; RV32ZICBOP-NEXT:    prefetch.r 32(a0)
+; RV32ZICBOP-NEXT:    prefetch.r -32(a0)
 ; RV32ZICBOP-NEXT:    addi a1, a0, -2048
-; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
-; RV32ZICBOP-NEXT:    addi a2, a0, 2016
-; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, 32
-; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, -32
-; RV32ZICBOP-NEXT:    prefetch.r 0(a2)
 ; RV32ZICBOP-NEXT:    addi a1, a1, -1
 ; RV32ZICBOP-NEXT:    prefetch.r 0(a1)
 ; RV32ZICBOP-NEXT:    addi a1, a0, 2047
@@ -1205,14 +1121,11 @@ define void @test_prefetch_offsetable_read_locality_3(ptr %a) nounwind {
 ;
 ; RV64ZICBOP-LABEL: test_prefetch_offsetable_read_locality_3:
 ; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    prefetch.r -2048(a0)
+; RV64ZICBOP-NEXT:    prefetch.r 2016(a0)
+; RV64ZICBOP-NEXT:    prefetch.r 32(a0)
+; RV64ZICBOP-NEXT:    prefetch.r -32(a0)
 ; RV64ZICBOP-NEXT:    addi a1, a0, -2048
-; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
-; RV64ZICBOP-NEXT:    addi a2, a0, 2016
-; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, 32
-; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, -32
-; RV64ZICBOP-NEXT:    prefetch.r 0(a2)
 ; RV64ZICBOP-NEXT:    addi a1, a1, -1
 ; RV64ZICBOP-NEXT:    prefetch.r 0(a1)
 ; RV64ZICBOP-NEXT:    addi a1, a0, 2047
@@ -1226,14 +1139,11 @@ define void @test_prefetch_offsetable_read_locality_3(ptr %a) nounwind {
 ;
 ; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_read_locality_3:
 ; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r -2048(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 2016(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 32(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r -32(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a2)
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
 ; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.r 0(a1)
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047
@@ -1301,14 +1211,11 @@ define void @test_prefetch_offsetable_write_locality_3(ptr %a) nounwind {
 ;
 ; RV32ZICBOP-LABEL: test_prefetch_offsetable_write_locality_3:
 ; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    prefetch.w -2048(a0)
+; RV32ZICBOP-NEXT:    prefetch.w 2016(a0)
+; RV32ZICBOP-NEXT:    prefetch.w 32(a0)
+; RV32ZICBOP-NEXT:    prefetch.w -32(a0)
 ; RV32ZICBOP-NEXT:    addi a1, a0, -2048
-; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
-; RV32ZICBOP-NEXT:    addi a2, a0, 2016
-; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, 32
-; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, -32
-; RV32ZICBOP-NEXT:    prefetch.w 0(a2)
 ; RV32ZICBOP-NEXT:    addi a1, a1, -1
 ; RV32ZICBOP-NEXT:    prefetch.w 0(a1)
 ; RV32ZICBOP-NEXT:    addi a1, a0, 2047
@@ -1322,14 +1229,11 @@ define void @test_prefetch_offsetable_write_locality_3(ptr %a) nounwind {
 ;
 ; RV64ZICBOP-LABEL: test_prefetch_offsetable_write_locality_3:
 ; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    prefetch.w -2048(a0)
+; RV64ZICBOP-NEXT:    prefetch.w 2016(a0)
+; RV64ZICBOP-NEXT:    prefetch.w 32(a0)
+; RV64ZICBOP-NEXT:    prefetch.w -32(a0)
 ; RV64ZICBOP-NEXT:    addi a1, a0, -2048
-; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
-; RV64ZICBOP-NEXT:    addi a2, a0, 2016
-; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, 32
-; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, -32
-; RV64ZICBOP-NEXT:    prefetch.w 0(a2)
 ; RV64ZICBOP-NEXT:    addi a1, a1, -1
 ; RV64ZICBOP-NEXT:    prefetch.w 0(a1)
 ; RV64ZICBOP-NEXT:    addi a1, a0, 2047
@@ -1343,14 +1247,11 @@ define void @test_prefetch_offsetable_write_locality_3(ptr %a) nounwind {
 ;
 ; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_write_locality_3:
 ; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w -2048(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 2016(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 32(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w -32(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a2)
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
 ; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.w 0(a1)
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047
@@ -1418,14 +1319,11 @@ define void @test_prefetch_offsetable_instruction_locality_3(ptr %a) nounwind {
 ;
 ; RV32ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_3:
 ; RV32ZICBOP:       # %bb.0:
+; RV32ZICBOP-NEXT:    prefetch.i -2048(a0)
+; RV32ZICBOP-NEXT:    prefetch.i 2016(a0)
+; RV32ZICBOP-NEXT:    prefetch.i 32(a0)
+; RV32ZICBOP-NEXT:    prefetch.i -32(a0)
 ; RV32ZICBOP-NEXT:    addi a1, a0, -2048
-; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
-; RV32ZICBOP-NEXT:    addi a2, a0, 2016
-; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, 32
-; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV32ZICBOP-NEXT:    addi a2, a0, -32
-; RV32ZICBOP-NEXT:    prefetch.i 0(a2)
 ; RV32ZICBOP-NEXT:    addi a1, a1, -1
 ; RV32ZICBOP-NEXT:    prefetch.i 0(a1)
 ; RV32ZICBOP-NEXT:    addi a1, a0, 2047
@@ -1439,14 +1337,11 @@ define void @test_prefetch_offsetable_instruction_locality_3(ptr %a) nounwind {
 ;
 ; RV64ZICBOP-LABEL: test_prefetch_offsetable_instruction_locality_3:
 ; RV64ZICBOP:       # %bb.0:
+; RV64ZICBOP-NEXT:    prefetch.i -2048(a0)
+; RV64ZICBOP-NEXT:    prefetch.i 2016(a0)
+; RV64ZICBOP-NEXT:    prefetch.i 32(a0)
+; RV64ZICBOP-NEXT:    prefetch.i -32(a0)
 ; RV64ZICBOP-NEXT:    addi a1, a0, -2048
-; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
-; RV64ZICBOP-NEXT:    addi a2, a0, 2016
-; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, 32
-; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOP-NEXT:    addi a2, a0, -32
-; RV64ZICBOP-NEXT:    prefetch.i 0(a2)
 ; RV64ZICBOP-NEXT:    addi a1, a1, -1
 ; RV64ZICBOP-NEXT:    prefetch.i 0(a1)
 ; RV64ZICBOP-NEXT:    addi a1, a0, 2047
@@ -1460,14 +1355,11 @@ define void @test_prefetch_offsetable_instruction_locality_3(ptr %a) nounwind {
 ;
 ; RV64ZICBOPZIHINTNTL-LABEL: test_prefetch_offsetable_instruction_locality_3:
 ; RV64ZICBOPZIHINTNTL:       # %bb.0:
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i -2048(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 2016(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 32(a0)
+; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i -32(a0)
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, -2048
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 2016
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, 32
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
-; RV64ZICBOPZIHINTNTL-NEXT:    addi a2, a0, -32
-; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a2)
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a1, -1
 ; RV64ZICBOPZIHINTNTL-NEXT:    prefetch.i 0(a1)
 ; RV64ZICBOPZIHINTNTL-NEXT:    addi a1, a0, 2047

>From 603677593e1210b66ae75c7d112deb2486acd8f2 Mon Sep 17 00:00:00 2001
From: wangpc <wangpengcheng.pp at bytedance.com>
Date: Mon, 18 Sep 2023 17:56:57 +0800
Subject: [PATCH 3/3] fixup! [RISCV] Match prefetch address with offset

Don't post-process the result of `SelectAddrRegImm` in
`SelectAddrRegImmLsb00000`. Instead, add an `IsPrefetch` argument so
that `SelectAddrRegImm` handles the prefetch cases directly.
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 59 +++++++++++----------
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h   |  7 ++-
 2 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 6ff7aebf13aa573..303cdcdb088abbc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2228,7 +2228,8 @@ bool RISCVDAGToDAGISel::SelectFrameAddrRegImm(SDValue Addr, SDValue &Base,
 // Fold constant addresses.
 static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL,
                                const MVT VT, const RISCVSubtarget *Subtarget,
-                               SDValue Addr, SDValue &Base, SDValue &Offset) {
+                               SDValue Addr, SDValue &Base, SDValue &Offset,
+                               bool IsPrefetch = false) {
   if (!isa<ConstantSDNode>(Addr))
     return false;
 
@@ -2240,6 +2241,9 @@ static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL,
   int64_t Lo12 = SignExtend64<12>(CVal);
   int64_t Hi = (uint64_t)CVal - (uint64_t)Lo12;
   if (!Subtarget->is64Bit() || isInt<32>(Hi)) {
+    if (IsPrefetch && (Lo12 & 0b11111) != 0)
+      return false;
+
     if (Hi) {
       int64_t Hi20 = (Hi >> 12) & 0xfffff;
       Base = SDValue(
@@ -2262,6 +2266,8 @@ static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL,
   if (Seq.back().getOpcode() != RISCV::ADDI)
     return false;
   Lo12 = Seq.back().getImm();
+  if (IsPrefetch && (Lo12 & 0b11111) != 0)
+    return false;
 
   // Drop the last instruction.
   Seq.pop_back();
@@ -2351,7 +2357,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr,
 }
 
 bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
-                                         SDValue &Offset, bool IsINX) {
+                                         SDValue &Offset, bool IsINX,
+                                         bool IsPrefetch) {
   if (SelectAddrFrameIndex(Addr, Base, Offset))
     return true;
 
@@ -2380,8 +2387,9 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
           const DataLayout &DL = CurDAG->getDataLayout();
           Align Alignment = commonAlignment(
               GA->getGlobal()->getPointerAlignment(DL), GA->getOffset());
-          if (CVal == 0 || Alignment > CVal) {
-            int64_t CombinedOffset = CVal + GA->getOffset();
+          int64_t CombinedOffset = CVal + GA->getOffset();
+          if ((!IsPrefetch || (CombinedOffset & 0b11111) == 0) &&
+              (CVal == 0 || Alignment > CVal)) {
             Base = Base.getOperand(0);
             Offset = CurDAG->getTargetGlobalAddress(
                 GA->getGlobal(), SDLoc(LoOperand), LoOperand.getValueType(),
@@ -2391,6 +2399,13 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
         }
       }
 
+      // Early-out if not a valid offset for Zicbop.
+      if (IsPrefetch && (CVal & 0b11111) != 0) {
+        Base = Addr;
+        Offset = CurDAG->getTargetConstant(0, DL, VT);
+        return true;
+      }
+
       if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base))
         Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT);
       Offset = CurDAG->getTargetConstant(CVal, DL, VT);
@@ -2409,12 +2424,15 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
     // This mirrors the AddiPair PatFrag in RISCVInstrInfo.td.
     if (isInt<12>(CVal / 2) && isInt<12>(CVal - CVal / 2)) {
       int64_t Adj = CVal < 0 ? -2048 : 2047;
-      Base = SDValue(
-          CurDAG->getMachineNode(RISCV::ADDI, DL, VT, Addr.getOperand(0),
-                                 CurDAG->getTargetConstant(Adj, DL, VT)),
-          0);
-      Offset = CurDAG->getTargetConstant(CVal - Adj, DL, VT);
-      return true;
+      int64_t AdjustedOffset = CVal - Adj;
+      if (!IsPrefetch || (AdjustedOffset & 0b11111) == 0) {
+        Base = SDValue(
+            CurDAG->getMachineNode(RISCV::ADDI, DL, VT, Addr.getOperand(0),
+                                   CurDAG->getTargetConstant(Adj, DL, VT)),
+            0);
+        Offset = CurDAG->getTargetConstant(AdjustedOffset, DL, VT);
+        return true;
+      }
     }
 
     // For larger immediates, we might be able to save one instruction from
@@ -2425,7 +2443,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
     // instructions.
     if (isWorthFoldingAdd(Addr) &&
         selectConstantAddr(CurDAG, DL, VT, Subtarget, Addr.getOperand(1), Base,
-                           Offset)) {
+                           Offset, IsPrefetch)) {
       // Insert an ADD instruction with the materialized Hi52 bits.
       Base = SDValue(
           CurDAG->getMachineNode(RISCV::ADD, DL, VT, Addr.getOperand(0), Base),
@@ -2434,7 +2452,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
     }
   }
 
-  if (selectConstantAddr(CurDAG, DL, VT, Subtarget, Addr, Base, Offset))
+  if (selectConstantAddr(CurDAG, DL, VT, Subtarget, Addr, Base, Offset,
+                         IsPrefetch))
     return true;
 
   Base = Addr;
@@ -2442,22 +2461,6 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
   return true;
 }
 
-bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base,
-                                                 SDValue &Offset) {
-
-  bool Found = SelectAddrRegImm(Addr, Base, Offset);
-  assert(Found && "SelectAddrRegImm should always succeed");
-  (void)Found;
-
-  if (auto *OffsetImm = dyn_cast<ConstantSDNode>(Offset))
-    if (OffsetImm->getAPIntValue().countTrailingZeros() >= 5)
-      return true;
-
-  Base = Addr;
-  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), Addr.getSimpleValueType());
-  return true;
-}
-
 bool RISCVDAGToDAGISel::selectShiftMask(SDValue N, unsigned ShiftWidth,
                                         SDValue &ShAmt) {
   ShAmt = N;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index 675ab4e74c8f644..38f33d0786bde48 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -50,11 +50,14 @@ class RISCVDAGToDAGISel : public SelectionDAGISel {
   bool SelectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset);
   bool SelectFrameAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset);
   bool SelectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset,
-                        bool IsINX = false);
+                        bool IsINX = false, bool IsPrefetch = false);
   bool SelectAddrRegImmINX(SDValue Addr, SDValue &Base, SDValue &Offset) {
     return SelectAddrRegImm(Addr, Base, Offset, true);
   }
-  bool SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base, SDValue &Offset);
+  bool SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base, SDValue &Offset) {
+    return SelectAddrRegImm(Addr, Base, Offset, /*IsINX=*/false,
+                            /*IsPrefetch=*/true);
+  }
 
   bool SelectAddrRegRegScale(SDValue Addr, unsigned MaxShiftAmount,
                              SDValue &Base, SDValue &Index, SDValue &Scale);



More information about the llvm-commits mailing list