<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/113010>113010</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
clang-cl retrieves TLS base address twice when __tls_guard has been set to true
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
mcfi
</td>
</tr>
</table>
<pre>
Consider this example https://godbolt.org/z/nGK9cn6TW
```
struct A {
int a;
};
extern __declspec(thread) struct A *a;
struct A*
getA(void)
{
return a;
}
```
clang-cl 19.1.0 generates the following when targeting x64
```
getA: # @getA
sub rsp, 40
.seh_stackalloc 40
.seh_endprologue
mov eax, dword ptr [rip + _tls_index]
mov ecx, eax
mov rax, qword ptr gs:[88]
mov rax, qword ptr [rax + 8*rcx]
mov al, byte ptr [rax + __tls_guard@SECREL32]
cmp al, 0
jne .LBB0_2
# %bb.1:
call __dyn_tls_on_demand_init
.LBB0_2:
mov eax, dword ptr [rip + _tls_index]
mov ecx, eax
mov rax, qword ptr gs:[88]
mov rax, qword ptr [rax + 8*rcx]
lea rax, [rax + "?a@@3PEAUA@@EA"@SECREL32]
mov rax, qword ptr [rax]
add rsp, 40
ret
```
and below when targeting arm64
```
getA: // @getA
.seh_proc getA
// %bb.0:
str x30, [sp, #-16]! // 8-byte Folded Spill
.seh_save_reg_x x30, 16
.seh_endprologue
ldr x8, [x18, #88]
adrp x9, _tls_index
ldr w9, [x9, :lo12:_tls_index]
// kill: def $x9 killed $w9
ldr x8, [x8, x9, lsl #3]
add x8, x8, :secrel_hi12:__tls_guard
ldrb w8, [x8, :secrel_lo12:__tls_guard]
cbnz w8, .LBB0_2
b .LBB0_1
.LBB0_1:
bl __dyn_tls_on_demand_init
b .LBB0_2
.LBB0_2:
ldr x8, [x18, #88]
adrp x9, _tls_index
ldr w9, [x9, :lo12:_tls_index]
// kill: def $x9 killed $w9
ldr x8, [x8, x9, lsl #3]
add x8, x8, :secrel_hi12:"?a@@3PEAUA@@EA"
ldr x0, [x8, :secrel_lo12:"?a@@3PEAUA@@EA"]
.seh_startepilogue
ldr x30, [sp], #16 // 8-byte Folded Reload
.seh_save_reg_x x30, 16
.seh_endepilogue
ret
```
In both cases, the TLS base is retrieved a second time even when __tls_guard is already set to true, which is the common case. Ideally, the x64 code gen would look like the listing below, and the Arm64 code gen would be optimized similarly.
```
getA: # @getA
sub rsp, 40
.seh_stackalloc 40
.seh_endprologue
.LBB0_1:
mov eax, dword ptr [rip + _tls_index]
mov ecx, eax
mov rax, qword ptr gs:[88]
mov rax, qword ptr [rax + 8*rcx]
cmp byte ptr [rax + __tls_guard@SECREL32], 0 # test the guard in memory so rax keeps the TLS base
je .LBB0_2
lea rax, [rax + "?a@@3PEAUA@@EA"@SECREL32]
mov rax, qword ptr [rax]
add rsp, 40
ret
.LBB0_2:
call __dyn_tls_on_demand_init
jmp .LBB0_1
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzkV02PozgT_jXOpdSRMSSBQw7JdOfV6J3DamZWe0QGV8DTxmZt00nPr1-ZkIR89PRqpdXuaFstgj_qqSpTz0PBnZOVRlyS2ZrMHie887Wxy6bcyklhxOvyg9FOCrTga-kA97xpFULtfetIvCJsQ9imMqIwyk-NrQjbfCdso__3_6zU86-_EfpI6IrM6fDfD523XelhBWSxPswAAEjtgZN4mCGLx_N9f8W9R6shzwWWyrVYEpb62iIXhGVwxmQrfmV5XCNsmKjQrwhLX4wMtkeHo1gs-s7qq3DupVIqrquHUkGUTaMphQo1Wu7Rga8RtkYps5O6gl2NGjy3Ffow3M-Tu3B9YHFIIgaS0H44JJK5riA0s64l7AMk9DQ_dVjnzvPymStlypsl1KK1Rpmqw9NCY14IzZDvA5bYGSug9RbIbG1lC4StIffK5VIL3JPZ47Vd2dsF88sFewD8_QRY9UUyW6fpLcrN5uCd73vvKWErW95xzVWwKV49XpvkfcRVx60gCf3y9OHz06eYjRHKpj0hnM_om8ZwUp_Wa5qzYTacPpsVxTQK4Z_suVKEZnkuXnXvzOhcYMO1yKWW_rDvCDSy-wkPWyE_G422EsZIvOEkoSSh8S9Pq19Xh_unVVi6f-4_jGC8kQtxt8It-rts4VpAgcrsrunFbfMuwYJwXXKsZ0trTQkj3g37-mqg46fqvCU028d0OKJD1ITFD9E8JMUiuPobsNKHvnw3RgkU8KWVoaouuMxfMLdY5fuzg2j-PqmV6CNKh4D2UTpEdFkQXNjAg30WVkeVdwmzy44wh5t4pUwUyvpOrR4zew6pxCsQuAXCkn3Wz6AIg132Zpz9zcGNcioEHN9WxbArHYJxWFpUeS0PMY24P_IS9HJ36eZsekxnZDrWikJ_P1lfigPNipNgRGPSX4hF8b5UXCCxN-Xjv_tY31GbK8f0xw_6HbBRZMcXqvXYyrsUG5M-UL1_HNH8mvBvUP4zKsPFX-L8TUBvSeNHDYXxNZTcoQtgoR0puEMwW_j66QtIF5ocK_EFBfCKSw1yO36JhiFXob16BYcevAFvOwxYu1qWdUAIoKVpGqN7R1P4KJAr9Xp0SFgaOh2WQWkEhuYIdqZTApQxz6DkMw76HZR8FUT7emOB4GQjFbfqFUzrZSO_o5j-c83THar_hO_3v62ZQrhppv693UR_vRXcP9XnHRn-rWnh4l1wKsqJWMYiizM-wWW0YBnL5tliMamXIpotaMKzlOM8mkViPhc0mReCxrNFgQs2kUtGWRLRKKM0mkeL6QxTmizidLtNynhRhCPChks1VeqlCZ9dE-lch8soimlEJ4oXqFz_SceYxh30qweVm9hlMHoousqRhCrpvDvDeOkVLk8fNUeFcL1k9PLBhbDoHPidLPHQd41Fo-YOCkQ9loxJZ9Xy6otR-rorpqVpCNsE78PPQ2vNNyw9YZs-ZkfYZkjqZcn-CAAA__8fTRuP">