; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=HSA %s

; External callees, all declared hidden and all carrying attribute group #0
; (defined outside this section). The kernels visible in this part of the file
; call the i1/i8/i16 variants; the remaining declarations are presumably
; exercised by kernels further down -- TODO confirm.

; i1 argument: plain, signext, and zeroext.
declare hidden void @external_void_func_i1(i1) #0
declare hidden void @external_void_func_i1_signext(i1 signext) #0
declare hidden void @external_void_func_i1_zeroext(i1 zeroext) #0

; i8 argument: plain, signext, and zeroext.
declare hidden void @external_void_func_i8(i8) #0
declare hidden void @external_void_func_i8_signext(i8 signext) #0
declare hidden void @external_void_func_i8_zeroext(i8 zeroext) #0

; i16 argument: plain, signext, and zeroext.
declare hidden void @external_void_func_i16(i16) #0
declare hidden void @external_void_func_i16_signext(i16 signext) #0
declare hidden void @external_void_func_i16_zeroext(i16 zeroext) #0

; 32-bit and 64-bit integer scalars, and vectors of i64.
declare hidden void @external_void_func_i32(i32) #0
declare hidden void @external_void_func_i64(i64) #0
declare hidden void @external_void_func_v2i64(<2 x i64>) #0
declare hidden void @external_void_func_v3i64(<3 x i64>) #0
declare hidden void @external_void_func_v4i64(<4 x i64>) #0

; Floating-point scalars (half, float, double) and float/double vectors.
declare hidden void @external_void_func_f16(half) #0
declare hidden void @external_void_func_f32(float) #0
declare hidden void @external_void_func_f64(double) #0
declare hidden void @external_void_func_v2f32(<2 x float>) #0
declare hidden void @external_void_func_v2f64(<2 x double>) #0
declare hidden void @external_void_func_v3f32(<3 x float>) #0
declare hidden void @external_void_func_v3f64(<3 x double>) #0
declare hidden void @external_void_func_v5f32(<5 x float>) #0

; Vectors of 16-bit elements (i16 and half), 2 to 4 elements wide.
declare hidden void @external_void_func_v2i16(<2 x i16>) #0
declare hidden void @external_void_func_v2f16(<2 x half>) #0
declare hidden void @external_void_func_v3i16(<3 x i16>) #0
declare hidden void @external_void_func_v3f16(<3 x half>) #0
declare hidden void @external_void_func_v4i16(<4 x i16>) #0
declare hidden void @external_void_func_v4f16(<4 x half>) #0

; Vectors of i32, from 2 up to 32 elements; the *_i32 variants take an extra
; trailing i32 argument after the vector.
declare hidden void @external_void_func_v2i32(<2 x i32>) #0
declare hidden void @external_void_func_v3i32(<3 x i32>) #0
declare hidden void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
declare hidden void @external_void_func_v4i32(<4 x i32>) #0
declare hidden void @external_void_func_v5i32(<5 x i32>) #0
declare hidden void @external_void_func_v8i32(<8 x i32>) #0
declare hidden void @external_void_func_v16i32(<16 x i32>) #0
declare hidden void @external_void_func_v32i32(<32 x i32>) #0
declare hidden void @external_void_func_v32i32_i32(<32 x i32>, i32) #0

; return value and argument
declare hidden i32 @external_i32_func_i32(i32) #0

; Structs: { i8, i32 } passed directly, passed byval through an addrspace(5)
; pointer, and combined with an sret result pointer.
declare hidden void @external_void_func_struct_i8_i32({ i8, i32 }) #0
declare hidden void @external_void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 })) #0
declare hidden void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }), ptr addrspace(5) byval({ i8, i32 })) #0

; Vector of sixteen i8 elements.
declare hidden void @external_void_func_v16i8(<16 x i8>) #0

; Kernel that calls @external_void_func_i1 with the constant `i1 true`.
; The expected output for every run line materializes the argument as 1 in v0
; ahead of the s_swappc_b64 call.
; FIXME: Should be passing -1
define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; VI-LABEL: test_call_external_void_func_i1_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 1
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_i1_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 1
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_i1_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i1_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v0, 1
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i1_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 1
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_i1(i1 true)
  ret void
}

; Kernel that loads a volatile i1 from a poison addrspace(1) pointer and passes
; it as a `signext` argument to @external_void_func_i1_signext. The kernel's
; own i32 parameter is unnamed and never referenced in the body.
; The expected output loads a byte, waits on vmcnt, and applies
; `v_bfe_i32 v0, v0, 0, 1` to the loaded value ahead of the s_swappc_b64 call.
define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; VI-LABEL: test_call_external_void_func_i1_signext:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s5
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 1
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_i1_signext:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s5
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    v_bfe_i32 v0, v0, 0, 1
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_i1_signext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s5
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i1_signext:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_signext@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_signext@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i1_signext:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
; HSA-NEXT:    s_mov_b32 s6, -1
; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT:    s_waitcnt vmcnt(0)
; HSA-NEXT:    s_add_i32 s8, s8, s11
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT:    s_add_u32 s0, s0, s11
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    v_bfe_i32 v0, v0, 0, 1
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %var = load volatile i1, ptr addrspace(1) poison
  call void @external_void_func_i1_signext(i1 signext %var)
  ret void
}

; Kernel that loads a volatile i1 from a poison addrspace(1) pointer and passes
; it as a `zeroext` argument to @external_void_func_i1_zeroext. The kernel's
; own i32 parameter is unnamed and never referenced in the body.
; The expected output loads a byte, waits on vmcnt, and masks the value with
; `v_and_b32_e32 v0, 1, v0` ahead of the s_swappc_b64 call.
; NOTE(review): in the expected output below the buffer load already appears
; ahead of s_getpc_b64 for every run line, so the FIXME that follows may be
; stale -- confirm before relying on it.
; FIXME: load should be scheduled before getpc
define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; VI-LABEL: test_call_external_void_func_i1_zeroext:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s5
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    v_and_b32_e32 v0, 1, v0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_i1_zeroext:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s5
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    v_and_b32_e32 v0, 1, v0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_i1_zeroext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s5
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i1_zeroext:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_zeroext@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_zeroext@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i1_zeroext:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
; HSA-NEXT:    s_mov_b32 s6, -1
; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT:    s_waitcnt vmcnt(0)
; HSA-NEXT:    s_add_i32 s8, s8, s11
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT:    s_add_u32 s0, s0, s11
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i1_zeroext@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i1_zeroext@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    v_and_b32_e32 v0, 1, v0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %var = load volatile i1, ptr addrspace(1) poison
  call void @external_void_func_i1_zeroext(i1 zeroext %var)
  ret void
}

; Kernel that calls @external_void_func_i8 with the constant `i8 123`.
; The kernel's own i32 parameter is unnamed and never referenced in the body.
; The expected output for every run line materializes the argument as 0x7b in
; v0 ahead of the s_swappc_b64 call.
define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; VI-LABEL: test_call_external_void_func_i8_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s5
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 0x7b
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_i8_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s5
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_i8_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s5
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i8_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i8_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s8, s8, s11
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT:    s_add_u32 s0, s0, s11
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i8@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i8@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_i8(i8 123)
  ret void
}

; Kernel that loads a volatile i8 from a poison addrspace(1) pointer and passes
; it as a `signext` argument to @external_void_func_i8_signext. The kernel's
; own i32 parameter is unnamed and never referenced in the body.
; The expected output uses a signed byte load (buffer_load_sbyte, or
; buffer_load_i8 on the gfx1100 run lines) with no separate extension
; instruction, and an `s_waitcnt vmcnt(0)` sits between the load and the call.
; FIXME: don't wait before call
define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; VI-LABEL: test_call_external_void_func_i8_signext:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s5
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_i8_signext:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s5
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_i8_signext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s5
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i8_signext:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    buffer_load_i8 v0, off, s[0:3], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_signext@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_signext@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i8_signext:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
; HSA-NEXT:    s_mov_b32 s6, -1
; HSA-NEXT:    buffer_load_sbyte v0, off, s[4:7], 0 glc
; HSA-NEXT:    s_waitcnt vmcnt(0)
; HSA-NEXT:    s_add_i32 s8, s8, s11
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT:    s_add_u32 s0, s0, s11
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i8_signext@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i8_signext@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %var = load volatile i8, ptr addrspace(1) poison
  call void @external_void_func_i8_signext(i8 signext %var)
  ret void
}

; Kernel that loads a volatile i8 from a poison addrspace(1) pointer and passes
; it as a `zeroext` argument to @external_void_func_i8_zeroext. The kernel's
; own i32 parameter is unnamed and never referenced in the body.
; The expected output uses an unsigned byte load (buffer_load_ubyte, or
; buffer_load_u8 on the gfx1100 run lines) with no separate extension
; instruction ahead of the s_swappc_b64 call.
define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; VI-LABEL: test_call_external_void_func_i8_zeroext:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s5
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_i8_zeroext:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s5
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_i8_zeroext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s5
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i8_zeroext:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_zeroext@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_zeroext@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i8_zeroext:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
; HSA-NEXT:    s_mov_b32 s6, -1
; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT:    s_waitcnt vmcnt(0)
; HSA-NEXT:    s_add_i32 s8, s8, s11
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT:    s_add_u32 s0, s0, s11
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i8_zeroext@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i8_zeroext@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %var = load volatile i8, ptr addrspace(1) poison
  call void @external_void_func_i8_zeroext(i8 zeroext %var)
  ret void
}

; Kernel that calls @external_void_func_i16 with the constant `i16 123`.
; The expected output for every run line materializes the argument as 0x7b in
; v0 ahead of the s_swappc_b64 call.
define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; VI-LABEL: test_call_external_void_func_i16_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 0x7b
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_i16_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_i16_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i16_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i16_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i16@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i16@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_i16(i16 123)
  ret void
}

; Passes a volatile-loaded i16 to a callee whose parameter is marked signext.
; The load reads from a poison global (addrspace(1)) pointer; the unnamed i32
; kernel argument is not referenced by the body. For every run line the checks
; expect a sign-extending 16-bit buffer load (buffer_load_sshort, or
; buffer_load_i16 on gfx11) straight into v0, with no separate extension
; instruction before the s_swappc_b64 call.
define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; VI-LABEL: test_call_external_void_func_i16_signext:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_sshort v0, off, s[0:3], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s5
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_i16_signext:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_sshort v0, off, s[0:3], 0 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s5
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_i16_signext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_sshort v0, off, s[0:3], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s5
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i16_signext:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    buffer_load_i16 v0, off, s[0:3], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_signext@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_signext@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i16_signext:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
; HSA-NEXT:    s_mov_b32 s6, -1
; HSA-NEXT:    buffer_load_sshort v0, off, s[4:7], 0 glc
; HSA-NEXT:    s_waitcnt vmcnt(0)
; HSA-NEXT:    s_add_i32 s8, s8, s11
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT:    s_add_u32 s0, s0, s11
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i16_signext@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i16_signext@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %var = load volatile i16, ptr addrspace(1) poison
  call void @external_void_func_i16_signext(i16 signext %var)
  ret void
}

; Passes a volatile-loaded i16 to a callee whose parameter is marked zeroext.
; The load reads from a poison global (addrspace(1)) pointer; the unnamed i32
; kernel argument is not referenced by the body. For every run line the checks
; expect a zero-extending 16-bit buffer load (buffer_load_ushort, or
; buffer_load_u16 on gfx11) straight into v0, with no separate masking
; instruction before the s_swappc_b64 call.
define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; VI-LABEL: test_call_external_void_func_i16_zeroext:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s5
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_i16_zeroext:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 glc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s5
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_i16_zeroext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s5
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i16_zeroext:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    buffer_load_u16 v0, off, s[0:3], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_zeroext@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_zeroext@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i16_zeroext:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
; HSA-NEXT:    s_mov_b32 s6, -1
; HSA-NEXT:    buffer_load_ushort v0, off, s[4:7], 0 glc
; HSA-NEXT:    s_waitcnt vmcnt(0)
; HSA-NEXT:    s_add_i32 s8, s8, s11
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT:    s_add_u32 s0, s0, s11
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i16_zeroext@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i16_zeroext@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %var = load volatile i16, ptr addrspace(1) poison
  call void @external_void_func_i16_zeroext(i16 zeroext %var)
  ret void
}

; Calls @external_void_func_i32 with the constant 42. The checks expect the
; immediate to be materialized in v0 by a single v_mov_b32 ahead of the
; s_swappc_b64 call. The unnamed i32 kernel argument is not referenced by the
; body.
define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
; VI-LABEL: test_call_external_void_func_i32_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s5
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 42
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_i32_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s5
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 42
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_i32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s5
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 42
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v0, 42
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i32@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i32_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s8, s8, s11
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT:    s_add_u32 s0, s0, s11
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 42
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_i32(i32 42)
  ret void
}

; Calls @external_void_func_i64 with the constant 123. The checks expect the
; 64-bit immediate to be split across the v0/v1 pair before the s_swappc_b64
; call, with v0 = 0x7b (the low 32 bits of 123) and v1 = 0 (the high 32 bits).
; On gfx11 both moves are expected to be fused into one v_dual_mov_b32.
define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
; VI-LABEL: test_call_external_void_func_i64_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 0x7b
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_i64_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_i64_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i64_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i64@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i64@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i64_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i64@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i64@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
; HSA-NEXT:    v_mov_b32_e32 v1, 0
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_i64(i64 123)
  ret void
}

; Loads a <2 x i64> from the null global (addrspace(1)) pointer and passes it
; by value to @external_void_func_v2i64. The checks expect one 128-bit buffer
; load (buffer_load_dwordx4, or buffer_load_b128 on gfx11) whose destination is
; v[0:3], and no further vector moves before the s_swappc_b64 call.
define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; VI-LABEL: test_call_external_void_func_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], 0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v2i64:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], 0
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b64 s[4:5], 0
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i64@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i64@rel32@hi+12
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v2i64:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_mov_b64 s[8:9], 0
; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
; HSA-NEXT:    s_mov_b32 s10, -1
; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %val = load <2 x i64>, ptr addrspace(1) null
  call void @external_void_func_v2i64(<2 x i64> %val)
  ret void
}

; Calls @external_void_func_v2i64 with a constant vector. The two elements are
; 8589934593 (0x2_00000001) and 17179869187 (0x4_00000003), so their 32-bit
; halves in low/high order are 1, 2, 3, 4. The checks expect exactly those
; values to be moved into v0, v1, v2 and v3 before the s_swappc_b64 call; on
; gfx11 the four moves are expected as two v_dual_mov_b32 instructions.
define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
; VI-LABEL: test_call_external_void_func_v2i64_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 1
; VI-NEXT:    v_mov_b32_e32 v1, 2
; VI-NEXT:    v_mov_b32_e32 v2, 3
; VI-NEXT:    v_mov_b32_e32 v3, 4
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v2i64_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 1
; CI-NEXT:    v_mov_b32_e32 v1, 2
; CI-NEXT:    v_mov_b32_e32 v2, 3
; CI-NEXT:    v_mov_b32_e32 v3, 4
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v2i64_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, 2
; GFX9-NEXT:    v_mov_b32_e32 v2, 3
; GFX9-NEXT:    v_mov_b32_e32 v3, 4
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2i64_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i64@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i64@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v2i64_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 1
; HSA-NEXT:    v_mov_b32_e32 v1, 2
; HSA-NEXT:    v_mov_b32_e32 v2, 3
; HSA-NEXT:    v_mov_b32_e32 v3, 4
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>)
  ret void
}

; Builds a <3 x i64> argument from a loaded <2 x i64> plus one constant element
; and passes it to @external_void_func_v3i64. The shuffle mask <0, 1, 2> takes
; both loaded elements followed by the first element of the constant operand,
; 8589934593 (0x2_00000001); that operand's second element is poison and is not
; selected. The checks expect the load to land in v[0:3] via one 128-bit buffer
; load and the constant's halves to be moved into v4 = 1 and v5 = 2.
define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; VI-LABEL: test_call_external_void_func_v3i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], 0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v4, 1
; VI-NEXT:    v_mov_b32_e32 v5, 2
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v3i64:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], 0
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v4, 1
; CI-NEXT:    v_mov_b32_e32 v5, 2
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v3i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v4, 1
; GFX9-NEXT:    v_mov_b32_e32 v5, 2
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b64 s[4:5], 0
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i64@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i64@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v3i64:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_mov_b64 s[8:9], 0
; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
; HSA-NEXT:    s_mov_b32 s10, -1
; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i64@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i64@rel32@hi+12
; HSA-NEXT:    v_mov_b32_e32 v4, 1
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v5, 2
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %load = load <2 x i64>, ptr addrspace(1) null
  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 poison>, <3 x i32> <i32 0, i32 1, i32 2>

  call void @external_void_func_v3i64(<3 x i64> %val)
  ret void
}

; Builds a <4 x i64> argument by concatenating a loaded <2 x i64> with a
; constant <2 x i64> (shuffle mask <0, 1, 2, 3>) and passes it to
; @external_void_func_v4i64. The constant elements are 8589934593
; (0x2_00000001) and 17179869187 (0x4_00000003). The checks expect the load to
; land in v[0:3] via one 128-bit buffer load and the constant halves to be
; moved into v4 = 1, v5 = 2, v6 = 3 and v7 = 4 before the s_swappc_b64 call.
define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; VI-LABEL: test_call_external_void_func_v4i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], 0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v4, 1
; VI-NEXT:    v_mov_b32_e32 v5, 2
; VI-NEXT:    v_mov_b32_e32 v6, 3
; VI-NEXT:    v_mov_b32_e32 v7, 4
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v4i64:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], 0
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v4, 1
; CI-NEXT:    v_mov_b32_e32 v5, 2
; CI-NEXT:    v_mov_b32_e32 v6, 3
; CI-NEXT:    v_mov_b32_e32 v7, 4
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v4i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v4, 1
; GFX9-NEXT:    v_mov_b32_e32 v5, 2
; GFX9-NEXT:    v_mov_b32_e32 v6, 3
; GFX9-NEXT:    v_mov_b32_e32 v7, 4
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v4i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b64 s[4:5], 0
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT:    v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i64@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i64@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v4i64:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_mov_b64 s[8:9], 0
; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
; HSA-NEXT:    s_mov_b32 s10, -1
; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i64@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i64@rel32@hi+12
; HSA-NEXT:    v_mov_b32_e32 v4, 1
; HSA-NEXT:    v_mov_b32_e32 v5, 2
; HSA-NEXT:    v_mov_b32_e32 v6, 3
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v7, 4
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %load = load <2 x i64>, ptr addrspace(1) null
  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @external_void_func_v4i64(<4 x i64> %val)
  ret void
}

; Kernel that passes the constant half 4.0 to external_void_func_f16.
; Expected output: the argument is written to v0 right before the
; s_swappc_b64 call. The hawaii run expects the operand 4.0; every other run
; expects 0x4400, the IEEE half-precision bit pattern of 4.0.
; NOTE(review): the hawaii difference is presumably f16 being promoted to f32
; on that target -- confirm before relying on it.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing by hand.
define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
; VI-LABEL: test_call_external_void_func_f16_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_f16_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 4.0
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_f16_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4400
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_f16_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x4400
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f16@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f16@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_f16_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_f16@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_f16@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 0x4400
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_f16(half 4.0)
  ret void
}

; Kernel that passes the constant float 4.0 to external_void_func_f32.
; Expected output: every run loads the argument into v0 with a single
; v_mov_b32 of the operand 4.0 before the s_swappc_b64 call.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing by hand.
define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
; VI-LABEL: test_call_external_void_func_f32_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 4.0
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_f32_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 4.0
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_f32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 4.0
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_f32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v0, 4.0
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f32@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_f32_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_f32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_f32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 4.0
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_f32(float 4.0)
  ret void
}

; Kernel that passes the constant vector <float 1.0, float 2.0> to
; external_void_func_v2f32.
; Expected output: element 0 goes to v0 and element 1 to v1 before the
; s_swappc_b64 call. The gfx1100 runs pair both moves in one v_dual_mov_b32;
; the other runs use two separate v_mov_b32 instructions.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing by hand.
define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
; VI-LABEL: test_call_external_void_func_v2f32_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v2f32_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 1.0
; CI-NEXT:    v_mov_b32_e32 v1, 2.0
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v2f32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2f32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f32@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v2f32_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2f32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2f32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>)
  ret void
}

; Kernel that passes the constant vector <float 1.0, float 2.0, float 4.0> to
; external_void_func_v3f32.
; Expected output: the three elements go to v0, v1 and v2 in order before the
; s_swappc_b64 call. The gfx1100 runs use one v_dual_mov_b32 for v0/v1 plus a
; v_mov_b32 for v2; the other runs use three separate v_mov_b32 instructions.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing by hand.
define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
; VI-LABEL: test_call_external_void_func_v3f32_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-NEXT:    v_mov_b32_e32 v2, 4.0
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v3f32_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 1.0
; CI-NEXT:    v_mov_b32_e32 v1, 2.0
; CI-NEXT:    v_mov_b32_e32 v2, 4.0
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v3f32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v2, 4.0
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3f32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT:    v_mov_b32_e32 v2, 4.0
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f32@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v3f32_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3f32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
; HSA-NEXT:    v_mov_b32_e32 v2, 4.0
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>)
  ret void
}

; Kernel that passes the constant vector
; <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5> to
; external_void_func_v5f32.
; Expected output: the five elements go to v0 through v4 in order before the
; s_swappc_b64 call. The gfx1100 runs use two v_dual_mov_b32 instructions
; (v0/v1 and v2/v3) plus a v_mov_b32 for v4; the other runs use five separate
; v_mov_b32 instructions.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing by hand.
define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; VI-LABEL: test_call_external_void_func_v5f32_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 1.0
; VI-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-NEXT:    v_mov_b32_e32 v2, 4.0
; VI-NEXT:    v_mov_b32_e32 v3, -1.0
; VI-NEXT:    v_mov_b32_e32 v4, 0.5
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v5f32_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 1.0
; CI-NEXT:    v_mov_b32_e32 v1, 2.0
; CI-NEXT:    v_mov_b32_e32 v2, 4.0
; CI-NEXT:    v_mov_b32_e32 v3, -1.0
; CI-NEXT:    v_mov_b32_e32 v4, 0.5
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v5f32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v2, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v3, -1.0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0.5
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v5f32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0
; GFX11-NEXT:    v_mov_b32_e32 v4, 0.5
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v5f32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v5f32@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v5f32_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v5f32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v5f32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
; HSA-NEXT:    v_mov_b32_e32 v2, 4.0
; HSA-NEXT:    v_mov_b32_e32 v3, -1.0
; HSA-NEXT:    v_mov_b32_e32 v4, 0.5
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>)
  ret void
}

; Kernel that passes the constant double 4.0 to external_void_func_f64.
; Expected output: the 64-bit value 0x4010000000000000 is split over two
; 32-bit registers before the s_swappc_b64 call, with v0 holding the low dword
; (0) and v1 the high dword (0x40100000). The gfx1100 runs pair both moves in
; one v_dual_mov_b32; the other runs use two separate v_mov_b32 instructions.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing by hand.
define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
; VI-LABEL: test_call_external_void_func_f64_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0x40100000
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_f64_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    v_mov_b32_e32 v1, 0x40100000
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_f64_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40100000
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_f64_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f64@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f64@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_f64_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_f64@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_f64@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 0
; HSA-NEXT:    v_mov_b32_e32 v1, 0x40100000
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_f64(double 4.0)
  ret void
}

; Kernel that passes the constant vector <double 2.0, double 4.0> to
; external_void_func_v2f64.
; Expected output: each double occupies two consecutive 32-bit registers, low
; dword first, so element 0 is in v0/v1 and element 1 in v2/v3. Both low
; dwords are 0; the high dwords are written as the operand 2.0 (v1) and
; 0x40100000 (v3).
; NOTE(review): the high dword of double 2.0 is 0x40000000, the same bits as
; float 2.0, which is presumably why it is printed as 2.0 -- confirm.
; The gfx1100 runs pair the moves with v_dual_mov_b32; the other runs use
; separate v_mov_b32 instructions.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing by hand.
define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
; VI-LABEL: test_call_external_void_func_v2f64_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x40100000
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v2f64_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    v_mov_b32_e32 v1, 2.0
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_mov_b32_e32 v3, 0x40100000
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v2f64_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x40100000
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2f64_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f64@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f64@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v2f64_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2f64@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2f64@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 0
; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
; HSA-NEXT:    v_mov_b32_e32 v2, 0
; HSA-NEXT:    v_mov_b32_e32 v3, 0x40100000
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>)
  ret void
}

; Kernel that passes the constant vector <double 2.0, double 4.0, double 8.0>
; to external_void_func_v3f64.
; Expected output: each double occupies two consecutive 32-bit registers, low
; dword first, so the elements land in v0/v1, v2/v3 and v4/v5. All low dwords
; are 0; the high dwords are written as the operand 2.0 (v1), 0x40100000 (v3)
; and 0x40200000 (v5).
; NOTE(review): the high dword of double 2.0 is 0x40000000, the same bits as
; float 2.0, which is presumably why it is printed as 2.0 -- confirm.
; The gfx1100 runs pair the moves with v_dual_mov_b32; the other runs use
; separate v_mov_b32 instructions.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing by hand.
define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; VI-LABEL: test_call_external_void_func_v3f64_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 2.0
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x40100000
; VI-NEXT:    v_mov_b32_e32 v4, 0
; VI-NEXT:    v_mov_b32_e32 v5, 0x40200000
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v3f64_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    v_mov_b32_e32 v1, 2.0
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_mov_b32_e32 v3, 0x40100000
; CI-NEXT:    v_mov_b32_e32 v4, 0
; CI-NEXT:    v_mov_b32_e32 v5, 0x40200000
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v3f64_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x40100000
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    v_mov_b32_e32 v5, 0x40200000
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3f64_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f64@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f64@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v3f64_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3f64@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f64@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 0
; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
; HSA-NEXT:    v_mov_b32_e32 v2, 0
; HSA-NEXT:    v_mov_b32_e32 v3, 0x40100000
; HSA-NEXT:    v_mov_b32_e32 v4, 0
; HSA-NEXT:    v_mov_b32_e32 v5, 0x40200000
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>)
  ret void
}

; Kernel that loads a <2 x i16> from a poison global (addrspace(1)) pointer and
; passes it by value to @external_void_func_v2i16.
; In the expected output below, the hawaii (CI) run waits for the load and then
; shifts the upper 16 bits of v0 into v1 before the call, while every other run
; leaves the loaded dword in v0 with no extra VALU instruction before s_swappc.
define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
; VI-LABEL: test_call_external_void_func_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i16@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i16@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v2i16:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
; HSA-NEXT:    s_mov_b32 s6, -1
; HSA-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i16@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i16@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %val = load <2 x i16>, ptr addrspace(1) poison
  call void @external_void_func_v2i16(<2 x i16> %val)
  ret void
}

; Kernel that loads a <3 x i16> from a poison global (addrspace(1)) pointer and
; passes it by value to @external_void_func_v3i16.
; In the expected output below, the hawaii (CI) run loads into v[2:3] and then
; fills v0, v1 and v2 with a v_alignbit_b32 plus two moves before the call,
; while every other run leaves the loaded 64 bits in v[0:1] untouched.
define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
; VI-LABEL: test_call_external_void_func_v3i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v3i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_alignbit_b32 v1, v3, v2, 16
; CI-NEXT:    v_mov_b32_e32 v0, v2
; CI-NEXT:    v_mov_b32_e32 v2, v3
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v3i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i16@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i16@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v3i16:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
; HSA-NEXT:    s_mov_b32 s6, -1
; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i16@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i16@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %val = load <3 x i16>, ptr addrspace(1) poison
  call void @external_void_func_v3i16(<3 x i16> %val)
  ret void
}

; Kernel that loads a <3 x half> from a poison global (addrspace(1)) pointer and
; passes it by value to @external_void_func_v3f16.
; In the expected output below, the hawaii (CI) run loads into v[1:2] and emits
; three v_cvt_f32_f16 instructions (one after a 16-bit right shift) to produce
; v0, v1 and v2 before the call, while every other run leaves the loaded
; 64 bits in v[0:1] untouched.
define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
; VI-LABEL: test_call_external_void_func_v3f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v3f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_dwordx2 v[1:2], off, s[0:3], 0
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v1
; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v3f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f16@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f16@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v3f16:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
; HSA-NEXT:    s_mov_b32 s6, -1
; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3f16@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f16@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %val = load <3 x half>, ptr addrspace(1) poison
  call void @external_void_func_v3f16(<3 x half> %val)
  ret void
}

; Kernel that calls @external_void_func_v3i16 with the constant vector
; <i16 1, i16 2, i16 3>; there is no load.
; In the expected output below, the hawaii (CI) run materializes one element per
; register (v0 = 1, v1 = 2, v2 = 3), while every other run materializes the
; constants as v0 = 0x20001 and v1 = 3.
define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
; VI-LABEL: test_call_external_void_func_v3i16_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 0x20001
; VI-NEXT:    v_mov_b32_e32 v1, 3
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v3i16_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 1
; CI-NEXT:    v_mov_b32_e32 v1, 2
; CI-NEXT:    v_mov_b32_e32 v2, 3
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v3i16_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT:    v_mov_b32_e32 v1, 3
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3i16_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i16@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i16@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v3i16_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i16@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i16@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 0x20001
; HSA-NEXT:    v_mov_b32_e32 v1, 3
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>)
  ret void
}

; Kernel that calls @external_void_func_v3f16 with the constant vector
; <half 1.0, half 2.0, half 4.0>; there is no load.
; In the expected output below, the hawaii (CI) run materializes one register
; per element using the inline constants 1.0, 2.0 and 4.0 in v0, v1 and v2,
; while every other run materializes v0 = 0x40003c00 and v1 = 0x4400
; (0x3c00, 0x4000 and 0x4400 are the IEEE half encodings of 1.0, 2.0 and 4.0).
define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
; VI-LABEL: test_call_external_void_func_v3f16_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 0x40003c00
; VI-NEXT:    v_mov_b32_e32 v1, 0x4400
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v3f16_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 1.0
; CI-NEXT:    v_mov_b32_e32 v1, 2.0
; CI-NEXT:    v_mov_b32_e32 v2, 4.0
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v3f16_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x40003c00
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4400
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3f16_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40003c00
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x4400
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f16@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f16@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v3f16_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3f16@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f16@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 0x40003c00
; HSA-NEXT:    v_mov_b32_e32 v1, 0x4400
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>)
  ret void
}

; Kernel that loads a <4 x i16> from a poison global (addrspace(1)) pointer and
; passes it by value to @external_void_func_v4i16.
; In the expected output below, the hawaii (CI) run uses two 16-bit right
; shifts and two moves to fill v0 through v3 from the loaded v[0:1] before the
; call, while every other run leaves the loaded 64 bits in v[0:1] untouched.
define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
; VI-LABEL: test_call_external_void_func_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v4i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
; CI-NEXT:    v_mov_b32_e32 v2, v1
; CI-NEXT:    v_mov_b32_e32 v1, v4
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v4i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i16@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i16@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v4i16:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
; HSA-NEXT:    s_mov_b32 s6, -1
; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i16@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i16@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %val = load <4 x i16>, ptr addrspace(1) poison
  call void @external_void_func_v4i16(<4 x i16> %val)
  ret void
}

; Kernel that calls @external_void_func_v4i16 with the constant vector
; <i16 1, i16 2, i16 3, i16 4>; there is no load.
; In the expected output below, the hawaii (CI) run materializes one element per
; register (v0 through v3 = 1, 2, 3, 4), while every other run materializes the
; constants as v0 = 0x20001 and v1 = 0x40003.
define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
; VI-LABEL: test_call_external_void_func_v4i16_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 0x20001
; VI-NEXT:    v_mov_b32_e32 v1, 0x40003
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v4i16_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 1
; CI-NEXT:    v_mov_b32_e32 v1, 2
; CI-NEXT:    v_mov_b32_e32 v2, 3
; CI-NEXT:    v_mov_b32_e32 v3, 4
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v4i16_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40003
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v4i16_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x40003
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i16@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i16@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v4i16_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i16@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i16@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 0x20001
; HSA-NEXT:    v_mov_b32_e32 v1, 0x40003
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>)
  ret void
}

; Kernel that loads a <2 x half> from a poison global (addrspace(1)) pointer and
; passes it by value to @external_void_func_v2f16.
; In the expected output below, the hawaii (CI) run loads into v1 and emits two
; v_cvt_f32_f16 instructions (one after a 16-bit right shift) to produce v0 and
; v1 before the call, while every other run leaves the loaded dword in v0.
define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
; VI-LABEL: test_call_external_void_func_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v1
; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f16@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f16@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v2f16:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
; HSA-NEXT:    s_mov_b32 s6, -1
; HSA-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2f16@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2f16@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %val = load <2 x half>, ptr addrspace(1) poison
  call void @external_void_func_v2f16(<2 x half> %val)
  ret void
}

; Kernel that loads a <2 x i32> from a poison addrspace(1) pointer and passes
; it by value to @external_void_func_v2i32. The expected output fetches both
; elements with one 64-bit buffer load into v[0:1] before the s_swappc_b64
; call.
define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
; VI-LABEL: test_call_external_void_func_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v2i32:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i32@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v2i32:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
; HSA-NEXT:    s_mov_b32 s6, -1
; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %val = load <2 x i32>, ptr addrspace(1) poison
  call void @external_void_func_v2i32(<2 x i32> %val)
  ret void
}

; Kernel that calls @external_void_func_v2i32 with the constant vector
; <i32 1, i32 2>. The expected output materializes the two elements directly
; into v0 and v1 (as one v_dual_mov_b32 on the gfx1100 runs); no memory load
; is emitted for the argument.
define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
; VI-LABEL: test_call_external_void_func_v2i32_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 1
; VI-NEXT:    v_mov_b32_e32 v1, 2
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v2i32_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 1
; CI-NEXT:    v_mov_b32_e32 v1, 2
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v2i32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, 2
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2i32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i32@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v2i32_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 1
; HSA-NEXT:    v_mov_b32_e32 v1, 2
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>)
  ret void
}

; Kernel that calls @external_void_func_v3i32 with the constant vector
; <i32 3, i32 4, i32 5>; the expected output places the elements in v0..v2.
; The kernel declares one unnamed, unused i32 parameter. In the expected
; output the SGPR added into the SCRATCH_RSRC-initialised s36 is s5 here,
; where the zero-parameter kernels next to it use s3 (s11 vs s9 on the amdhsa
; run). NOTE(review): presumably the extra parameter is there to shift the
; incoming SGPR layout -- confirm against the backend's kernel ABI.
define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
; VI-LABEL: test_call_external_void_func_v3i32_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s5
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 3
; VI-NEXT:    v_mov_b32_e32 v1, 4
; VI-NEXT:    v_mov_b32_e32 v2, 5
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v3i32_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s5
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 3
; CI-NEXT:    v_mov_b32_e32 v1, 4
; CI-NEXT:    v_mov_b32_e32 v2, 5
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v3i32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s5
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 3
; GFX9-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-NEXT:    v_mov_b32_e32 v2, 5
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3i32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
; GFX11-NEXT:    v_mov_b32_e32 v2, 5
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i32@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v3i32_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s8, s8, s11
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT:    s_add_u32 s0, s0, s11
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 3
; HSA-NEXT:    v_mov_b32_e32 v1, 4
; HSA-NEXT:    v_mov_b32_e32 v2, 5
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>)
  ret void
}

; Kernel that calls @external_void_func_v3i32_i32 with the constant vector
; <i32 3, i32 4, i32 5> followed by the scalar i32 6. The expected output
; places the three vector elements in v0..v2 and the trailing scalar in v3,
; i.e. the 3-element vector is not padded to four registers.
; Like the v3i32_imm kernel, this one declares an unnamed, unused i32
; parameter. NOTE(review): presumably there to shift the incoming SGPR
; layout (s5 / s11 appear where zero-parameter kernels use s3 / s9) -- confirm.
define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
; VI-LABEL: test_call_external_void_func_v3i32_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s5
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 3
; VI-NEXT:    v_mov_b32_e32 v1, 4
; VI-NEXT:    v_mov_b32_e32 v2, 5
; VI-NEXT:    v_mov_b32_e32 v3, 6
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v3i32_i32:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s5
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 3
; CI-NEXT:    v_mov_b32_e32 v1, 4
; CI-NEXT:    v_mov_b32_e32 v2, 5
; CI-NEXT:    v_mov_b32_e32 v3, 6
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v3i32_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s5
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 3
; GFX9-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-NEXT:    v_mov_b32_e32 v2, 5
; GFX9-NEXT:    v_mov_b32_e32 v3, 6
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3i32_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
; GFX11-NEXT:    v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i32_i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i32_i32@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v3i32_i32:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s8, s8, s11
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT:    s_add_u32 s0, s0, s11
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32_i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32_i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 3
; HSA-NEXT:    v_mov_b32_e32 v1, 4
; HSA-NEXT:    v_mov_b32_e32 v2, 5
; HSA-NEXT:    v_mov_b32_e32 v3, 6
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6)
  ret void
}

; Kernel that loads a <4 x i32> from a poison addrspace(1) pointer and passes
; it by value to @external_void_func_v4i32. The expected output fetches all
; four elements with one 128-bit buffer load into v[0:3] before the
; s_swappc_b64 call.
define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
; VI-LABEL: test_call_external_void_func_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v4i32:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v4i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i32@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v4i32:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
; HSA-NEXT:    s_mov_b32 s6, -1
; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %val = load <4 x i32>, ptr addrspace(1) poison
  call void @external_void_func_v4i32(<4 x i32> %val)
  ret void
}

; Kernel that calls @external_void_func_v4i32 with the constant vector
; <i32 1, i32 2, i32 3, i32 4>. The expected output materializes the elements
; directly into v0..v3 (as two v_dual_mov_b32 pairs on the gfx1100 runs); no
; memory load is emitted for the argument.
define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
; VI-LABEL: test_call_external_void_func_v4i32_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 1
; VI-NEXT:    v_mov_b32_e32 v1, 2
; VI-NEXT:    v_mov_b32_e32 v2, 3
; VI-NEXT:    v_mov_b32_e32 v3, 4
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v4i32_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 1
; CI-NEXT:    v_mov_b32_e32 v1, 2
; CI-NEXT:    v_mov_b32_e32 v2, 3
; CI-NEXT:    v_mov_b32_e32 v3, 4
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v4i32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, 2
; GFX9-NEXT:    v_mov_b32_e32 v2, 3
; GFX9-NEXT:    v_mov_b32_e32 v3, 4
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v4i32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i32@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v4i32_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 1
; HSA-NEXT:    v_mov_b32_e32 v1, 2
; HSA-NEXT:    v_mov_b32_e32 v2, 3
; HSA-NEXT:    v_mov_b32_e32 v3, 4
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>)
  ret void
}

; Kernel that calls @external_void_func_v5i32 with the constant vector
; <i32 1, i32 2, i32 3, i32 4, i32 5>. The expected output materializes the
; five elements directly into v0..v4; on the gfx1100 runs this is two
; v_dual_mov_b32 pairs plus a single v_mov_b32 for the odd fifth element.
define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; VI-LABEL: test_call_external_void_func_v5i32_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 1
; VI-NEXT:    v_mov_b32_e32 v1, 2
; VI-NEXT:    v_mov_b32_e32 v2, 3
; VI-NEXT:    v_mov_b32_e32 v3, 4
; VI-NEXT:    v_mov_b32_e32 v4, 5
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v5i32_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 1
; CI-NEXT:    v_mov_b32_e32 v1, 2
; CI-NEXT:    v_mov_b32_e32 v2, 3
; CI-NEXT:    v_mov_b32_e32 v3, 4
; CI-NEXT:    v_mov_b32_e32 v4, 5
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v5i32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, 2
; GFX9-NEXT:    v_mov_b32_e32 v2, 3
; GFX9-NEXT:    v_mov_b32_e32 v3, 4
; GFX9-NEXT:    v_mov_b32_e32 v4, 5
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v5i32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
; GFX11-NEXT:    v_mov_b32_e32 v4, 5
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v5i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v5i32@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v5i32_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v5i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v5i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 1
; HSA-NEXT:    v_mov_b32_e32 v1, 2
; HSA-NEXT:    v_mov_b32_e32 v2, 3
; HSA-NEXT:    v_mov_b32_e32 v3, 4
; HSA-NEXT:    v_mov_b32_e32 v4, 5
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>)
  ret void
}

; Kernel that first loads an addrspace(1) pointer from a poison addrspace(4)
; pointer, then loads a <8 x i32> through it and passes the vector by value
; to @external_void_func_v8i32. The expected output reads the pointer with a
; 64-bit scalar load, waits on lgkmcnt(0), and then fetches the vector as two
; 128-bit buffer loads (offsets 0 and 16) into v[0:3] and v[4:7] before the
; s_swappc_b64 call. On the gfx1100 runs the two loads are grouped under
; s_clause 0x1.
define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
; VI-LABEL: test_call_external_void_func_v8i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v8i32:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v8i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v8i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v8i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v8i32@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v8i32:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
; HSA-NEXT:    s_mov_b32 s10, -1
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_waitcnt lgkmcnt(0)
; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v8i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v8i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  %ptr = load ptr addrspace(1), ptr addrspace(4) poison
  %val = load <8 x i32>, ptr addrspace(1) %ptr
  call void @external_void_func_v8i32(<8 x i32> %val)
  ret void
}

; Kernel that calls @external_void_func_v8i32 with the constant vector
; <1, 2, 3, 4, 5, 6, 7, 8>. The expected output for every run line places the
; eight constants in v0 through v7 (paired v_dual_mov_b32 on the GFX11 runs),
; builds the callee address with s_getpc_b64 plus the rel32 lo/hi fixups, and
; calls it with s_swappc_b64. No memory loads are expected for the argument.
define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; VI-LABEL: test_call_external_void_func_v8i32_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    v_mov_b32_e32 v0, 1
; VI-NEXT:    v_mov_b32_e32 v1, 2
; VI-NEXT:    v_mov_b32_e32 v2, 3
; VI-NEXT:    v_mov_b32_e32 v3, 4
; VI-NEXT:    v_mov_b32_e32 v4, 5
; VI-NEXT:    v_mov_b32_e32 v5, 6
; VI-NEXT:    v_mov_b32_e32 v6, 7
; VI-NEXT:    v_mov_b32_e32 v7, 8
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v8i32_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    v_mov_b32_e32 v0, 1
; CI-NEXT:    v_mov_b32_e32 v1, 2
; CI-NEXT:    v_mov_b32_e32 v2, 3
; CI-NEXT:    v_mov_b32_e32 v3, 4
; CI-NEXT:    v_mov_b32_e32 v4, 5
; CI-NEXT:    v_mov_b32_e32 v5, 6
; CI-NEXT:    v_mov_b32_e32 v6, 7
; CI-NEXT:    v_mov_b32_e32 v7, 8
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v8i32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, 2
; GFX9-NEXT:    v_mov_b32_e32 v2, 3
; GFX9-NEXT:    v_mov_b32_e32 v3, 4
; GFX9-NEXT:    v_mov_b32_e32 v4, 5
; GFX9-NEXT:    v_mov_b32_e32 v5, 6
; GFX9-NEXT:    v_mov_b32_e32 v6, 7
; GFX9-NEXT:    v_mov_b32_e32 v7, 8
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v8i32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
; GFX11-NEXT:    v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6
; GFX11-NEXT:    v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v8i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v8i32@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v8i32_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v8i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v8i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 1
; HSA-NEXT:    v_mov_b32_e32 v1, 2
; HSA-NEXT:    v_mov_b32_e32 v2, 3
; HSA-NEXT:    v_mov_b32_e32 v3, 4
; HSA-NEXT:    v_mov_b32_e32 v4, 5
; HSA-NEXT:    v_mov_b32_e32 v5, 6
; HSA-NEXT:    v_mov_b32_e32 v6, 7
; HSA-NEXT:    v_mov_b32_e32 v7, 8
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  call void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
  ret void
}

; Kernel that loads a <16 x i32> value from global memory and passes it to
; @external_void_func_v16i32. The expected output for every run line fetches
; the vector with four 128-bit buffer loads (offsets 0, 16, 32 and 48) that
; land in v[0:15], then calls the callee with s_swappc_b64. No stack store of
; any argument is expected.
define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; VI-LABEL: test_call_external_void_func_v16i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v16i32:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; CI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; CI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v16i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GFX9-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GFX9-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v16i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v16i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v16i32@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16
; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32
; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v16i32:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
; HSA-NEXT:    s_mov_b32 s10, -1
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_waitcnt lgkmcnt(0)
; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; HSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; HSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v16i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v16i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  ; The global pointer is itself loaded through a poison addrspace(4) pointer;
  ; the test only pins the instruction sequence, not any runtime value.
  %ptr = load ptr addrspace(1), ptr addrspace(4) poison
  %val = load <16 x i32>, ptr addrspace(1) %ptr
  call void @external_void_func_v16i32(<16 x i32> %val)
  ret void
}

; Kernel that loads a <32 x i32> value from global memory and passes it to
; @external_void_func_v32i32. The expected output for every run line fetches
; the vector with eight 128-bit buffer loads into v[0:31] and then, before the
; call, writes v31 to scratch at the s32 offset (buffer_store_dword, or
; scratch_store_b32 on the GFX11 runs).
; NOTE(review): the v31 store presumably reflects the calling convention
; passing the last vector element on the stack, with s32 as the stack
; pointer - confirm against the AMDGPU calling-convention documentation.
define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; VI-LABEL: test_call_external_void_func_v32i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_getpc_b64 s[8:9]
; VI-NEXT:    s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_waitcnt vmcnt(7)
; VI-NEXT:    buffer_store_dword v31, off, s[36:39], s32
; VI-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v32i32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; CI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; CI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; CI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; CI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; CI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_getpc_b64 s[8:9]
; CI-NEXT:    s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_waitcnt vmcnt(7)
; CI-NEXT:    buffer_store_dword v31, off, s[36:39], s32
; CI-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v32i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GFX9-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; GFX9-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; GFX9-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; GFX9-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; GFX9-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_getpc_b64 s[8:9]
; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_waitcnt vmcnt(7)
; GFX9-NEXT:    buffer_store_dword v31, off, s[36:39], s32
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v32i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v32i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v32i32@rel32@hi+12
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x7
; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16
; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32
; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48
; GFX11-NEXT:    buffer_load_b128 v[16:19], off, s[4:7], 0 offset:64
; GFX11-NEXT:    buffer_load_b128 v[20:23], off, s[4:7], 0 offset:80
; GFX11-NEXT:    buffer_load_b128 v[24:27], off, s[4:7], 0 offset:96
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(7)
; GFX11-NEXT:    scratch_store_b32 off, v31, s32
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v32i32:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
; HSA-NEXT:    s_mov_b32 s10, -1
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_waitcnt lgkmcnt(0)
; HSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112
; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; HSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; HSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
; HSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64
; HSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80
; HSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[12:13]
; HSA-NEXT:    s_add_u32 s12, s12, external_void_func_v32i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s13, s13, external_void_func_v32i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_waitcnt vmcnt(7)
; HSA-NEXT:    buffer_store_dword v31, off, s[0:3], s32
; HSA-NEXT:    s_swappc_b64 s[30:31], s[12:13]
; HSA-NEXT:    s_endpgm
  ; The global pointer is itself loaded through a poison addrspace(4) pointer;
  ; the test only pins the instruction sequence, not any runtime value.
  %ptr = load ptr addrspace(1), ptr addrspace(4) poison
  %val = load <32 x i32>, ptr addrspace(1) %ptr
  call void @external_void_func_v32i32(<32 x i32> %val)
  ret void
}

; Kernel that passes a <32 x i32> value followed by a separate i32 to
; @external_void_func_v32i32_i32. The kernel's own unnamed i32 parameter is
; not used in the body. The expected output for every run line loads the
; vector into v[0:31] and the scalar into v32, then issues two scratch stores
; before the call: v31 at the s32 offset and v32 four bytes above it
; ("offset:4" on the buffer stores; on the GFX11 runs the address s32 + 4 is
; first formed in s4).
; NOTE(review): the two stores presumably hold the arguments that do not fit
; in the argument VGPRs, with s32 as the stack pointer - confirm against the
; AMDGPU calling-convention documentation.
define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; VI-LABEL: test_call_external_void_func_v32i32_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s5
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_load_dword v32, off, s[4:7], 0
; VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_waitcnt vmcnt(8)
; VI-NEXT:    buffer_store_dword v32, off, s[36:39], s32 offset:4
; VI-NEXT:    s_waitcnt vmcnt(8)
; VI-NEXT:    buffer_store_dword v31, off, s[36:39], s32
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v32i32_i32:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s5
; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dword v32, off, s[4:7], 0
; CI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; CI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; CI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; CI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; CI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; CI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_waitcnt vmcnt(8)
; CI-NEXT:    buffer_store_dword v32, off, s[36:39], s32 offset:4
; CI-NEXT:    s_waitcnt vmcnt(8)
; CI-NEXT:    buffer_store_dword v31, off, s[36:39], s32
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v32i32_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s5
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_load_dword v32, off, s[4:7], 0
; GFX9-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GFX9-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; GFX9-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; GFX9-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; GFX9-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; GFX9-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_waitcnt vmcnt(8)
; GFX9-NEXT:    buffer_store_dword v32, off, s[36:39], s32 offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(8)
; GFX9-NEXT:    buffer_store_dword v31, off, s[36:39], s32
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v32i32_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v32i32_i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v32i32_i32@rel32@hi+12
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x8
; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112
; GFX11-NEXT:    buffer_load_b32 v32, off, s[4:7], 0
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16
; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32
; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48
; GFX11-NEXT:    buffer_load_b128 v[16:19], off, s[4:7], 0 offset:64
; GFX11-NEXT:    buffer_load_b128 v[20:23], off, s[4:7], 0 offset:80
; GFX11-NEXT:    buffer_load_b128 v[24:27], off, s[4:7], 0 offset:96
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_add_i32 s4, s32, 4
; GFX11-NEXT:    s_waitcnt vmcnt(8)
; GFX11-NEXT:    scratch_store_b32 off, v31, s32
; GFX11-NEXT:    s_waitcnt vmcnt(7)
; GFX11-NEXT:    scratch_store_b32 off, v32, s4
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v32i32_i32:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s8, s8, s11
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
; HSA-NEXT:    s_add_u32 s0, s0, s11
; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
; HSA-NEXT:    s_mov_b32 s10, -1
; HSA-NEXT:    s_waitcnt lgkmcnt(0)
; HSA-NEXT:    buffer_load_dword v32, off, s[8:11], 0
; HSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112
; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; HSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; HSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
; HSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64
; HSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80
; HSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v32i32_i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v32i32_i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_waitcnt vmcnt(8)
; HSA-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
; HSA-NEXT:    s_waitcnt vmcnt(8)
; HSA-NEXT:    buffer_store_dword v31, off, s[0:3], s32
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  ; Both source pointers are poison-derived: %ptr0 is loaded through a poison
  ; addrspace(4) pointer and %val1 is loaded directly from a poison
  ; addrspace(1) pointer. Only the instruction sequence is being pinned.
  %ptr0 = load ptr addrspace(1), ptr addrspace(4) poison
  %val0 = load <32 x i32>, ptr addrspace(1) %ptr0
  %val1 = load i32, ptr addrspace(1) poison
  call void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1)
  ret void
}

; Kernel that calls @external_i32_func_i32 with the constant 42 and stores the
; returned i32 to %out with a volatile store. The expected output for every
; run line puts 42 in v0 before s_swappc_b64 and stores v0 to the output
; buffer right after the call returns. The %out pointer is loaded into
; s[36:37] before the call, and the remaining descriptor words (s38, s39) are
; also set up before it, so the store can directly follow the call.
define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %out) #0 {
; VI-LABEL: test_call_external_i32_func_i32_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s50, -1
; VI-NEXT:    s_mov_b32 s51, 0xe80000
; VI-NEXT:    s_add_u32 s48, s48, s5
; VI-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x24
; VI-NEXT:    s_addc_u32 s49, s49, 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[48:49]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[50:51]
; VI-NEXT:    v_mov_b32_e32 v0, 42
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_mov_b32 s39, 0xf000
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    buffer_store_dword v0, off, s[36:39], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_i32_func_i32_imm:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s50, -1
; CI-NEXT:    s_mov_b32 s51, 0xe8f000
; CI-NEXT:    s_add_u32 s48, s48, s5
; CI-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
; CI-NEXT:    s_addc_u32 s49, s49, 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[48:49]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[50:51]
; CI-NEXT:    v_mov_b32_e32 v0, 42
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_mov_b32 s39, 0xf000
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    buffer_store_dword v0, off, s[36:39], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_i32_func_i32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s50, -1
; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
; GFX9-NEXT:    s_add_u32 s48, s48, s5
; GFX9-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x24
; GFX9-NEXT:    s_addc_u32 s49, s49, 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT:    v_mov_b32_e32 v0, 42
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_mov_b32 s39, 0xf000
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    buffer_store_dword v0, off, s[36:39], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_i32_func_i32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[36:37], s[2:3], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 42
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_i32_func_i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_i32_func_i32@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_mov_b32 s39, 0x31016000
; GFX11-NEXT:    s_mov_b32 s38, -1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    buffer_store_b32 v0, off, s[36:39], 0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_i32_func_i32_imm:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s8, s8, s11
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT:    s_load_dwordx2 s[36:37], s[6:7], 0x0
; HSA-NEXT:    s_add_u32 s0, s0, s11
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_i32_func_i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_i32_func_i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, 42
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_mov_b32 s39, 0x1100f000
; HSA-NEXT:    s_mov_b32 s38, -1
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    buffer_store_dword v0, off, s[36:39], 0
; HSA-NEXT:    s_waitcnt vmcnt(0)
; HSA-NEXT:    s_endpgm
  %val = call i32 @external_i32_func_i32(i32 42)
  ; The store is volatile, so the use of the call result must remain in the
  ; generated code.
  store volatile i32 %val, ptr addrspace(1) %out
  ret void
}

; Passes a { i8, i32 } struct by value to external_void_func_struct_i8_i32.
; The struct is loaded through an addrspace(1) pointer that is itself loaded
; from a poison addrspace(4) pointer. The assertions below expect the i8 field
; to arrive in v0 (byte load at offset 0) and the i32 field in v1 (dword load
; at offset 4) before the s_swappc_b64 call, with s32 set to 0.
define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
; VI-LABEL: test_call_external_void_func_struct_i8_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_struct_i8_i32:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_struct_i8_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_struct_i8_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_struct_i8_i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_struct_i8_i32@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    buffer_load_u8 v0, off, s[4:7], 0
; GFX11-NEXT:    buffer_load_b32 v1, off, s[4:7], 0 offset:4
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_struct_i8_i32:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
; HSA-NEXT:    s_mov_b32 s10, -1
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_waitcnt lgkmcnt(0)
; HSA-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
; HSA-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:4
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_struct_i8_i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_struct_i8_i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  ; The struct's address is obtained by loading through a poison pointer.
  %ptr0 = load ptr addrspace(1), ptr addrspace(4) poison
  %val = load { i8, i32 }, ptr addrspace(1) %ptr0
  ; Whole aggregate is passed as a first-class value (no byval/sret here).
  call void @external_void_func_struct_i8_i32({ i8, i32 } %val)
  ret void
}

; Passes a { i8, i32 } struct to external_void_func_byval_struct_i8_i32 through
; a byval pointer. The struct lives in an 8-byte-aligned addrspace(5) alloca
; and is initialised to { 3, 8 } before the call. The assertions below expect
; the two fields to be written at offsets 0 and 4, read back, and then written
; a second time at s32-relative offsets 0 and 4 ahead of the s_swappc_b64 call
; (presumably the outgoing byval copy, judging by the s32-relative addressing).
define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; VI-LABEL: test_call_external_void_func_byval_struct_i8_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    v_mov_b32_e32 v0, 3
; VI-NEXT:    buffer_store_byte v0, off, s[36:39], 0
; VI-NEXT:    v_mov_b32_e32 v0, 8
; VI-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:4
; VI-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:4
; VI-NEXT:    buffer_load_dword v1, off, s[36:39], 0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_movk_i32 s32, 0x400
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_dword v0, off, s[36:39], s32 offset:4
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_dword v1, off, s[36:39], s32
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_byval_struct_i8_i32:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    v_mov_b32_e32 v0, 3
; CI-NEXT:    buffer_store_byte v0, off, s[36:39], 0
; CI-NEXT:    v_mov_b32_e32 v0, 8
; CI-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:4
; CI-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:4
; CI-NEXT:    buffer_load_dword v1, off, s[36:39], 0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_movk_i32 s32, 0x400
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    buffer_store_dword v0, off, s[36:39], s32 offset:4
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    buffer_store_dword v1, off, s[36:39], s32
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 3
; GFX9-NEXT:    buffer_store_byte v0, off, s[36:39], 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 8
; GFX9-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:4
; GFX9-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:4
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_load_dword v1, off, s[36:39], 0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_movk_i32 s32, 0x400
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    buffer_store_dword v0, off, s[36:39], s32 offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    buffer_store_dword v1, off, s[36:39], s32
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: test_call_external_void_func_byval_struct_i8_i32:
; GFX11-TRUE16:       ; %bb.0:
; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 3
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 8
; GFX11-TRUE16-NEXT:    s_mov_b32 s32, 16
; GFX11-TRUE16-NEXT:    s_getpc_b64 s[2:3]
; GFX11-TRUE16-NEXT:    s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32@rel32@lo+4
; GFX11-TRUE16-NEXT:    s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32@rel32@hi+12
; GFX11-TRUE16-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-TRUE16-NEXT:    s_clause 0x1
; GFX11-TRUE16-NEXT:    scratch_store_b8 off, v0, off
; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v1, off offset:4
; GFX11-TRUE16-NEXT:    scratch_load_b64 v[0:1], off, off
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    scratch_store_b64 off, v[0:1], s32
; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: test_call_external_void_func_byval_struct_i8_i32:
; GFX11-FAKE16:       ; %bb.0:
; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
; GFX11-FAKE16-NEXT:    s_mov_b32 s32, 16
; GFX11-FAKE16-NEXT:    s_getpc_b64 s[2:3]
; GFX11-FAKE16-NEXT:    s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32@rel32@lo+4
; GFX11-FAKE16-NEXT:    s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32@rel32@hi+12
; GFX11-FAKE16-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-FAKE16-NEXT:    s_clause 0x1
; GFX11-FAKE16-NEXT:    scratch_store_b8 off, v0, off
; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v1, off offset:4
; GFX11-FAKE16-NEXT:    scratch_load_b64 v[0:1], off, off
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    scratch_store_b64 off, v[0:1], s32
; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-FAKE16-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    v_mov_b32_e32 v0, 3
; HSA-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; HSA-NEXT:    v_mov_b32_e32 v0, 8
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; HSA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4
; HSA-NEXT:    buffer_load_dword v1, off, s[0:3], 0
; HSA-NEXT:    s_movk_i32 s32, 0x400
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_byval_struct_i8_i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_byval_struct_i8_i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_waitcnt vmcnt(1)
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; HSA-NEXT:    s_waitcnt vmcnt(1)
; HSA-NEXT:    buffer_store_dword v1, off, s[0:3], s32
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  ; Local struct that will be handed to the callee via byval.
  %val = alloca { i8, i32 }, align 8, addrspace(5)
  %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 0
  %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 1
  ; Initialise the fields to the constants the assertions look for (3 and 8).
  store i8 3, ptr addrspace(5) %gep0
  store i32 8, ptr addrspace(5) %gep1
  call void @external_void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %val)
  ret void
}

; Calls external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 with two
; addrspace(5) struct pointers. %out.val is the first argument and %in.val is
; the second, passed byval after being initialised to { 3, 8 }. After the call
; both fields of %out.val are loaded and written out with volatile stores to
; poison addrspace(1) pointers. The kernel's unnamed i32 parameter is not
; referenced in the body.
;
; The assertions below expect the input struct to be copied to s32-relative
; offsets 0 and 4, v0 to hold the constant 8 at the call, and the result fields
; to be read back from offsets 8 and 12 afterwards (presumably %out.val sits at
; offset 8 and v0 carries its address, TODO confirm).
;
; NOTE(review) - the call site carries byval on %in.val but no sret attribute
; on %out.val. The callee declaration is outside this part of the file, so
; whether sret is supplied there should be confirmed.
define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 {
; VI-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s5
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    v_mov_b32_e32 v0, 3
; VI-NEXT:    buffer_store_byte v0, off, s[36:39], 0
; VI-NEXT:    v_mov_b32_e32 v0, 8
; VI-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:4
; VI-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:4
; VI-NEXT:    buffer_load_dword v1, off, s[36:39], 0
; VI-NEXT:    s_movk_i32 s32, 0x800
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_dword v0, off, s[36:39], s32 offset:4
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_dword v1, off, s[36:39], s32
; VI-NEXT:    v_mov_b32_e32 v0, 8
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    buffer_load_ubyte v0, off, s[36:39], 0 offset:8
; VI-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:12
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s5
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    v_mov_b32_e32 v0, 3
; CI-NEXT:    buffer_store_byte v0, off, s[36:39], 0
; CI-NEXT:    v_mov_b32_e32 v0, 8
; CI-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:4
; CI-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:4
; CI-NEXT:    buffer_load_dword v1, off, s[36:39], 0
; CI-NEXT:    s_movk_i32 s32, 0x800
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    buffer_store_dword v0, off, s[36:39], s32 offset:4
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    buffer_store_dword v1, off, s[36:39], s32
; CI-NEXT:    v_mov_b32_e32 v0, 8
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    buffer_load_ubyte v0, off, s[36:39], 0 offset:8
; CI-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:12
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s5
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 3
; GFX9-NEXT:    buffer_store_byte v0, off, s[36:39], 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 8
; GFX9-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:4
; GFX9-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:4
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_load_dword v1, off, s[36:39], 0
; GFX9-NEXT:    s_movk_i32 s32, 0x800
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    buffer_store_dword v0, off, s[36:39], s32 offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    buffer_store_dword v1, off, s[36:39], s32
; GFX9-NEXT:    v_mov_b32_e32 v0, 8
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    buffer_load_ubyte v0, off, s[36:39], 0 offset:8
; GFX9-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:12
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; GFX11-TRUE16:       ; %bb.0:
; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 3
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 8
; GFX11-TRUE16-NEXT:    s_mov_b32 s32, 32
; GFX11-TRUE16-NEXT:    s_getpc_b64 s[2:3]
; GFX11-TRUE16-NEXT:    s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
; GFX11-TRUE16-NEXT:    s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
; GFX11-TRUE16-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-TRUE16-NEXT:    s_clause 0x1
; GFX11-TRUE16-NEXT:    scratch_store_b8 off, v0, off
; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v1, off offset:4
; GFX11-TRUE16-NEXT:    scratch_load_b64 v[0:1], off, off
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    scratch_store_b64 off, v[0:1], s32
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 8
; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-TRUE16-NEXT:    s_clause 0x1
; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, off offset:8
; GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, off offset:12
; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0 dlc
; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT:    s_nop 0
; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; GFX11-FAKE16:       ; %bb.0:
; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
; GFX11-FAKE16-NEXT:    s_mov_b32 s32, 32
; GFX11-FAKE16-NEXT:    s_getpc_b64 s[2:3]
; GFX11-FAKE16-NEXT:    s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
; GFX11-FAKE16-NEXT:    s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
; GFX11-FAKE16-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-FAKE16-NEXT:    s_clause 0x1
; GFX11-FAKE16-NEXT:    scratch_store_b8 off, v0, off
; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v1, off offset:4
; GFX11-FAKE16-NEXT:    scratch_load_b64 v[0:1], off, off
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    scratch_store_b64 off, v[0:1], s32
; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 8
; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-FAKE16-NEXT:    s_clause 0x1
; GFX11-FAKE16-NEXT:    scratch_load_u8 v0, off, off offset:8
; GFX11-FAKE16-NEXT:    scratch_load_b32 v1, off, off offset:12
; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0 dlc
; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT:    s_nop 0
; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s8, s8, s11
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT:    s_add_u32 s0, s0, s11
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    v_mov_b32_e32 v0, 3
; HSA-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; HSA-NEXT:    v_mov_b32_e32 v0, 8
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; HSA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4
; HSA-NEXT:    buffer_load_dword v1, off, s[0:3], 0
; HSA-NEXT:    s_movk_i32 s32, 0x800
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_waitcnt vmcnt(1)
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; HSA-NEXT:    s_waitcnt vmcnt(1)
; HSA-NEXT:    buffer_store_dword v1, off, s[0:3], s32
; HSA-NEXT:    v_mov_b32_e32 v0, 8
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:8
; HSA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:12
; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
; HSA-NEXT:    s_mov_b32 s6, -1
; HSA-NEXT:    s_waitcnt vmcnt(1)
; HSA-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; HSA-NEXT:    s_waitcnt vmcnt(0)
; HSA-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; HSA-NEXT:    s_waitcnt vmcnt(0)
; HSA-NEXT:    s_endpgm
  ; Separate stack objects for the byval input and for the result struct.
  %in.val = alloca { i8, i32 }, align 8, addrspace(5)
  %out.val = alloca { i8, i32 }, align 8, addrspace(5)
  %in.gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %in.val, i32 0, i32 0
  %in.gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %in.val, i32 0, i32 1
  ; Initialise the input struct to { 3, 8 }.
  store i8 3, ptr addrspace(5) %in.gep0
  store i32 8, ptr addrspace(5) %in.gep1
  call void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(ptr addrspace(5) %out.val, ptr addrspace(5) byval({ i8, i32 }) %in.val)
  ; Read both fields of the result struct back after the call.
  %out.gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %out.val, i32 0, i32 0
  %out.gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %out.val, i32 0, i32 1
  %out.val0 = load i8, ptr addrspace(5) %out.gep0
  %out.val1 = load i32, ptr addrspace(5) %out.gep1

  ; Volatile stores keep the loaded results live in the generated code.
  store volatile i8 %out.val0, ptr addrspace(1) poison
  store volatile i32 %out.val1, ptr addrspace(1) poison
  ret void
}

; Passes a <16 x i8> vector by value to external_void_func_v16i8. The vector is
; loaded through an addrspace(1) pointer that is itself loaded from a poison
; addrspace(4) pointer. The assertions below expect one 128-bit load into
; v[0:3], followed by right shifts of 8, 16 and 24 plus register moves that
; spread the data across v0 through v15 before the s_swappc_b64 call.
define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
; VI-LABEL: test_call_external_void_func_v16i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s38, -1
; VI-NEXT:    s_mov_b32 s39, 0xe80000
; VI-NEXT:    s_add_u32 s36, s36, s3
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_addc_u32 s37, s37, 0
; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
; VI-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; VI-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; VI-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; VI-NEXT:    v_mov_b32_e32 v4, v1
; VI-NEXT:    v_mov_b32_e32 v8, v2
; VI-NEXT:    v_mov_b32_e32 v12, v3
; VI-NEXT:    v_mov_b32_e32 v1, v16
; VI-NEXT:    v_mov_b32_e32 v2, v17
; VI-NEXT:    v_mov_b32_e32 v3, v18
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: test_call_external_void_func_v16i8:
; CI:       ; %bb.0:
; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s38, -1
; CI-NEXT:    s_mov_b32 s39, 0xe8f000
; CI-NEXT:    s_add_u32 s36, s36, s3
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_addc_u32 s37, s37, 0
; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
; CI-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
; CI-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
; CI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; CI-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; CI-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; CI-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; CI-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; CI-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; CI-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; CI-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; CI-NEXT:    v_mov_b32_e32 v4, v1
; CI-NEXT:    v_mov_b32_e32 v8, v2
; CI-NEXT:    v_mov_b32_e32 v12, v3
; CI-NEXT:    v_mov_b32_e32 v1, v16
; CI-NEXT:    v_mov_b32_e32 v2, v17
; CI-NEXT:    v_mov_b32_e32 v3, v18
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_v16i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s38, -1
; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
; GFX9-NEXT:    s_add_u32 s36, s36, s3
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT:    s_addc_u32 s37, s37, 0
; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mov_b32_e32 v8, v2
; GFX9-NEXT:    v_mov_b32_e32 v12, v3
; GFX9-NEXT:    v_mov_b32_e32 v1, v16
; GFX9-NEXT:    v_mov_b32_e32 v2, v17
; GFX9-NEXT:    v_mov_b32_e32 v3, v18
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v16i8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v16i8@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v16i8@rel32@hi+12
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v16
; GFX11-NEXT:    v_mov_b32_e32 v8, v2
; GFX11-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18
; GFX11-NEXT:    v_mov_b32_e32 v2, v17
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v16i8:
; HSA:       ; %bb.0:
; HSA-NEXT:    s_add_i32 s6, s6, s9
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT:    s_add_u32 s0, s0, s9
; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
; HSA-NEXT:    s_mov_b32 s10, -1
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT:    s_waitcnt lgkmcnt(0)
; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; HSA-NEXT:    s_getpc_b64 s[8:9]
; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v16i8@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v16i8@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_waitcnt vmcnt(0)
; HSA-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
; HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
; HSA-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
; HSA-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; HSA-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; HSA-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; HSA-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; HSA-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; HSA-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; HSA-NEXT:    v_mov_b32_e32 v4, v1
; HSA-NEXT:    v_mov_b32_e32 v8, v2
; HSA-NEXT:    v_mov_b32_e32 v12, v3
; HSA-NEXT:    v_mov_b32_e32 v1, v16
; HSA-NEXT:    v_mov_b32_e32 v2, v17
; HSA-NEXT:    v_mov_b32_e32 v3, v18
; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT:    s_endpgm
  ; The vector's address is obtained by loading through a poison pointer.
  %ptr = load ptr addrspace(1), ptr addrspace(4) poison
  %val = load <16 x i8>, ptr addrspace(1) %ptr
  ; Whole vector is passed as a first-class value.
  call void @external_void_func_v16i8(<16 x i8> %val)
  ret void
}

; Kernel-side check of how a double is placed on the stack when it follows a
; <32 x i32> argument in a call to @stack_passed_f64_arg. In the expected
; output the first 31 vector elements are moved into v0-v30, the last vector
; element is stored at s32 offset 0, and the two halves of the double are
; stored at s32 offsets 4 and 8 (gfx1100 uses scratch stores to s32, s32+4
; and s32+8 instead of buffer stores).
define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
; VI-LABEL: stack_passed_arg_alignment_v32i32_f64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
; VI-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
; VI-NEXT:    s_mov_b32 s54, -1
; VI-NEXT:    s_mov_b32 s55, 0xe80000
; VI-NEXT:    s_add_u32 s52, s52, s5
; VI-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x64
; VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xa4
; VI-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x24
; VI-NEXT:    s_mov_b32 s32, 0
; VI-NEXT:    s_addc_u32 s53, s53, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s23
; VI-NEXT:    buffer_store_dword v0, off, s[52:55], s32
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:4
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; VI-NEXT:    s_mov_b64 s[0:1], s[52:53]
; VI-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:8
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
; VI-NEXT:    s_mov_b64 s[2:3], s[54:55]
; VI-NEXT:    v_mov_b32_e32 v0, s36
; VI-NEXT:    v_mov_b32_e32 v1, s37
; VI-NEXT:    v_mov_b32_e32 v2, s38
; VI-NEXT:    v_mov_b32_e32 v3, s39
; VI-NEXT:    v_mov_b32_e32 v4, s40
; VI-NEXT:    v_mov_b32_e32 v5, s41
; VI-NEXT:    v_mov_b32_e32 v6, s42
; VI-NEXT:    v_mov_b32_e32 v7, s43
; VI-NEXT:    v_mov_b32_e32 v8, s44
; VI-NEXT:    v_mov_b32_e32 v9, s45
; VI-NEXT:    v_mov_b32_e32 v10, s46
; VI-NEXT:    v_mov_b32_e32 v11, s47
; VI-NEXT:    v_mov_b32_e32 v12, s48
; VI-NEXT:    v_mov_b32_e32 v13, s49
; VI-NEXT:    v_mov_b32_e32 v14, s50
; VI-NEXT:    v_mov_b32_e32 v15, s51
; VI-NEXT:    v_mov_b32_e32 v16, s8
; VI-NEXT:    v_mov_b32_e32 v17, s9
; VI-NEXT:    v_mov_b32_e32 v18, s10
; VI-NEXT:    v_mov_b32_e32 v19, s11
; VI-NEXT:    v_mov_b32_e32 v20, s12
; VI-NEXT:    v_mov_b32_e32 v21, s13
; VI-NEXT:    v_mov_b32_e32 v22, s14
; VI-NEXT:    v_mov_b32_e32 v23, s15
; VI-NEXT:    v_mov_b32_e32 v24, s16
; VI-NEXT:    v_mov_b32_e32 v25, s17
; VI-NEXT:    v_mov_b32_e32 v26, s18
; VI-NEXT:    v_mov_b32_e32 v27, s19
; VI-NEXT:    v_mov_b32_e32 v28, s20
; VI-NEXT:    v_mov_b32_e32 v29, s21
; VI-NEXT:    v_mov_b32_e32 v30, s22
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: stack_passed_arg_alignment_v32i32_f64:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
; CI-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
; CI-NEXT:    s_mov_b32 s54, -1
; CI-NEXT:    s_mov_b32 s55, 0xe8f000
; CI-NEXT:    s_add_u32 s52, s52, s5
; CI-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x19
; CI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x29
; CI-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x9
; CI-NEXT:    s_mov_b32 s32, 0
; CI-NEXT:    s_addc_u32 s53, s53, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s23
; CI-NEXT:    buffer_store_dword v0, off, s[52:55], s32
; CI-NEXT:    v_mov_b32_e32 v0, s4
; CI-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:4
; CI-NEXT:    v_mov_b32_e32 v0, s5
; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
; CI-NEXT:    s_mov_b64 s[0:1], s[52:53]
; CI-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:8
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
; CI-NEXT:    s_mov_b64 s[2:3], s[54:55]
; CI-NEXT:    v_mov_b32_e32 v0, s36
; CI-NEXT:    v_mov_b32_e32 v1, s37
; CI-NEXT:    v_mov_b32_e32 v2, s38
; CI-NEXT:    v_mov_b32_e32 v3, s39
; CI-NEXT:    v_mov_b32_e32 v4, s40
; CI-NEXT:    v_mov_b32_e32 v5, s41
; CI-NEXT:    v_mov_b32_e32 v6, s42
; CI-NEXT:    v_mov_b32_e32 v7, s43
; CI-NEXT:    v_mov_b32_e32 v8, s44
; CI-NEXT:    v_mov_b32_e32 v9, s45
; CI-NEXT:    v_mov_b32_e32 v10, s46
; CI-NEXT:    v_mov_b32_e32 v11, s47
; CI-NEXT:    v_mov_b32_e32 v12, s48
; CI-NEXT:    v_mov_b32_e32 v13, s49
; CI-NEXT:    v_mov_b32_e32 v14, s50
; CI-NEXT:    v_mov_b32_e32 v15, s51
; CI-NEXT:    v_mov_b32_e32 v16, s8
; CI-NEXT:    v_mov_b32_e32 v17, s9
; CI-NEXT:    v_mov_b32_e32 v18, s10
; CI-NEXT:    v_mov_b32_e32 v19, s11
; CI-NEXT:    v_mov_b32_e32 v20, s12
; CI-NEXT:    v_mov_b32_e32 v21, s13
; CI-NEXT:    v_mov_b32_e32 v22, s14
; CI-NEXT:    v_mov_b32_e32 v23, s15
; CI-NEXT:    v_mov_b32_e32 v24, s16
; CI-NEXT:    v_mov_b32_e32 v25, s17
; CI-NEXT:    v_mov_b32_e32 v26, s18
; CI-NEXT:    v_mov_b32_e32 v27, s19
; CI-NEXT:    v_mov_b32_e32 v28, s20
; CI-NEXT:    v_mov_b32_e32 v29, s21
; CI-NEXT:    v_mov_b32_e32 v30, s22
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s54, -1
; GFX9-NEXT:    s_mov_b32 s55, 0xe00000
; GFX9-NEXT:    s_add_u32 s52, s52, s5
; GFX9-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x64
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xa4
; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x24
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_addc_u32 s53, s53, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s23
; GFX9-NEXT:    buffer_store_dword v0, off, s[52:55], s32
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, s5
; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[52:53]
; GFX9-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:8
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
; GFX9-NEXT:    s_mov_b64 s[2:3], s[54:55]
; GFX9-NEXT:    v_mov_b32_e32 v0, s36
; GFX9-NEXT:    v_mov_b32_e32 v1, s37
; GFX9-NEXT:    v_mov_b32_e32 v2, s38
; GFX9-NEXT:    v_mov_b32_e32 v3, s39
; GFX9-NEXT:    v_mov_b32_e32 v4, s40
; GFX9-NEXT:    v_mov_b32_e32 v5, s41
; GFX9-NEXT:    v_mov_b32_e32 v6, s42
; GFX9-NEXT:    v_mov_b32_e32 v7, s43
; GFX9-NEXT:    v_mov_b32_e32 v8, s44
; GFX9-NEXT:    v_mov_b32_e32 v9, s45
; GFX9-NEXT:    v_mov_b32_e32 v10, s46
; GFX9-NEXT:    v_mov_b32_e32 v11, s47
; GFX9-NEXT:    v_mov_b32_e32 v12, s48
; GFX9-NEXT:    v_mov_b32_e32 v13, s49
; GFX9-NEXT:    v_mov_b32_e32 v14, s50
; GFX9-NEXT:    v_mov_b32_e32 v15, s51
; GFX9-NEXT:    v_mov_b32_e32 v16, s8
; GFX9-NEXT:    v_mov_b32_e32 v17, s9
; GFX9-NEXT:    v_mov_b32_e32 v18, s10
; GFX9-NEXT:    v_mov_b32_e32 v19, s11
; GFX9-NEXT:    v_mov_b32_e32 v20, s12
; GFX9-NEXT:    v_mov_b32_e32 v21, s13
; GFX9-NEXT:    v_mov_b32_e32 v22, s14
; GFX9-NEXT:    v_mov_b32_e32 v23, s15
; GFX9-NEXT:    v_mov_b32_e32 v24, s16
; GFX9-NEXT:    v_mov_b32_e32 v25, s17
; GFX9-NEXT:    v_mov_b32_e32 v26, s18
; GFX9-NEXT:    v_mov_b32_e32 v27, s19
; GFX9-NEXT:    v_mov_b32_e32 v28, s20
; GFX9-NEXT:    v_mov_b32_e32 v29, s21
; GFX9-NEXT:    v_mov_b32_e32 v30, s22
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: stack_passed_arg_alignment_v32i32_f64:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    s_load_b64 s[20:21], s[2:3], 0xa4
; GFX11-NEXT:    s_load_b512 s[4:19], s[2:3], 0x64
; GFX11-NEXT:    s_load_b512 s[36:51], s[2:3], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_add_i32 s22, s32, 8
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v0, s21 :: v_dual_mov_b32 v1, s20
; GFX11-NEXT:    v_mov_b32_e32 v2, s19
; GFX11-NEXT:    s_add_i32 s19, s32, 4
; GFX11-NEXT:    v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s43
; GFX11-NEXT:    scratch_store_b32 off, v0, s22
; GFX11-NEXT:    scratch_store_b32 off, v1, s19
; GFX11-NEXT:    scratch_store_b32 off, v2, s32
; GFX11-NEXT:    v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s39
; GFX11-NEXT:    v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v2, s38
; GFX11-NEXT:    v_dual_mov_b32 v5, s41 :: v_dual_mov_b32 v6, s42
; GFX11-NEXT:    v_dual_mov_b32 v9, s45 :: v_dual_mov_b32 v8, s44
; GFX11-NEXT:    v_dual_mov_b32 v11, s47 :: v_dual_mov_b32 v10, s46
; GFX11-NEXT:    v_dual_mov_b32 v13, s49 :: v_dual_mov_b32 v12, s48
; GFX11-NEXT:    v_dual_mov_b32 v15, s51 :: v_dual_mov_b32 v14, s50
; GFX11-NEXT:    v_dual_mov_b32 v17, s5 :: v_dual_mov_b32 v16, s4
; GFX11-NEXT:    v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v18, s6
; GFX11-NEXT:    v_dual_mov_b32 v21, s9 :: v_dual_mov_b32 v20, s8
; GFX11-NEXT:    v_dual_mov_b32 v23, s11 :: v_dual_mov_b32 v22, s10
; GFX11-NEXT:    v_dual_mov_b32 v25, s13 :: v_dual_mov_b32 v24, s12
; GFX11-NEXT:    v_dual_mov_b32 v27, s15 :: v_dual_mov_b32 v26, s14
; GFX11-NEXT:    v_dual_mov_b32 v29, s17 :: v_dual_mov_b32 v28, s16
; GFX11-NEXT:    v_mov_b32_e32 v30, s18
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, stack_passed_f64_arg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, stack_passed_f64_arg@rel32@hi+12
; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; HSA-LABEL: stack_passed_arg_alignment_v32i32_f64:
; HSA:       ; %bb.0: ; %entry
; HSA-NEXT:    s_add_i32 s8, s8, s11
; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT:    s_add_u32 s0, s0, s11
; HSA-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x40
; HSA-NEXT:    s_load_dwordx2 s[24:25], s[6:7], 0x80
; HSA-NEXT:    s_load_dwordx16 s[36:51], s[6:7], 0x0
; HSA-NEXT:    s_mov_b32 s32, 0
; HSA-NEXT:    s_addc_u32 s1, s1, 0
; HSA-NEXT:    s_waitcnt lgkmcnt(0)
; HSA-NEXT:    v_mov_b32_e32 v0, s23
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; HSA-NEXT:    v_mov_b32_e32 v0, s24
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; HSA-NEXT:    v_mov_b32_e32 v0, s25
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; HSA-NEXT:    s_getpc_b64 s[24:25]
; HSA-NEXT:    s_add_u32 s24, s24, stack_passed_f64_arg@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s25, s25, stack_passed_f64_arg@rel32@hi+12
; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT:    v_mov_b32_e32 v0, s36
; HSA-NEXT:    v_mov_b32_e32 v1, s37
; HSA-NEXT:    v_mov_b32_e32 v2, s38
; HSA-NEXT:    v_mov_b32_e32 v3, s39
; HSA-NEXT:    v_mov_b32_e32 v4, s40
; HSA-NEXT:    v_mov_b32_e32 v5, s41
; HSA-NEXT:    v_mov_b32_e32 v6, s42
; HSA-NEXT:    v_mov_b32_e32 v7, s43
; HSA-NEXT:    v_mov_b32_e32 v8, s44
; HSA-NEXT:    v_mov_b32_e32 v9, s45
; HSA-NEXT:    v_mov_b32_e32 v10, s46
; HSA-NEXT:    v_mov_b32_e32 v11, s47
; HSA-NEXT:    v_mov_b32_e32 v12, s48
; HSA-NEXT:    v_mov_b32_e32 v13, s49
; HSA-NEXT:    v_mov_b32_e32 v14, s50
; HSA-NEXT:    v_mov_b32_e32 v15, s51
; HSA-NEXT:    v_mov_b32_e32 v16, s8
; HSA-NEXT:    v_mov_b32_e32 v17, s9
; HSA-NEXT:    v_mov_b32_e32 v18, s10
; HSA-NEXT:    v_mov_b32_e32 v19, s11
; HSA-NEXT:    v_mov_b32_e32 v20, s12
; HSA-NEXT:    v_mov_b32_e32 v21, s13
; HSA-NEXT:    v_mov_b32_e32 v22, s14
; HSA-NEXT:    v_mov_b32_e32 v23, s15
; HSA-NEXT:    v_mov_b32_e32 v24, s16
; HSA-NEXT:    v_mov_b32_e32 v25, s17
; HSA-NEXT:    v_mov_b32_e32 v26, s18
; HSA-NEXT:    v_mov_b32_e32 v27, s19
; HSA-NEXT:    v_mov_b32_e32 v28, s20
; HSA-NEXT:    v_mov_b32_e32 v29, s21
; HSA-NEXT:    v_mov_b32_e32 v30, s22
; HSA-NEXT:    s_swappc_b64 s[30:31], s[24:25]
; HSA-NEXT:    s_endpgm
entry:
  ; Ordinary (non-tail) call, so the expected output reaches the callee with
  ; s_swappc_b64 and the kernel finishes with s_endpgm.
  call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp)
  ret void
}

; Tail call that passes a stack object as a byval(double) argument with a
; requested alignment of 16, following a <32 x i32> argument. %tmp is not
; used by the body. In the expected output two dwords are written at s32
; offsets 16 and 20 (one 64-bit scratch store at offset 16 on gfx1100),
; presumably the outgoing byval copy, and control reaches the callee through
; s_setpc_b64 rather than s_swappc_b64.
define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; VI-LABEL: tail_call_byval_align16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:20
; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:16
; VI-NEXT:    s_setpc_b64 s[4:5]
;
; CI-LABEL: tail_call_byval_align16:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:20
; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
; CI-NEXT:    s_waitcnt vmcnt(2)
; CI-NEXT:    buffer_store_dword v32, off, s[0:3], s32
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:16
; CI-NEXT:    s_setpc_b64 s[4:5]
;
; GFX9-LABEL: tail_call_byval_align16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:20
; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:16
; GFX9-NEXT:    s_setpc_b64 s[4:5]
;
; GFX11-LABEL: tail_call_byval_align16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    scratch_load_b32 v31, off, s32
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    scratch_store_b32 off, v31, s32
; GFX11-NEXT:    scratch_load_b64 v[31:32], off, s32 offset:24
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    scratch_store_b64 off, v[31:32], s32 offset:16
; GFX11-NEXT:    s_setpc_b64 s[0:1]
;
; HSA-LABEL: tail_call_byval_align16:
; HSA:       ; %bb.0: ; %entry
; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HSA-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
; HSA-NEXT:    buffer_load_dword v32, off, s[0:3], s32
; HSA-NEXT:    s_getpc_b64 s[4:5]
; HSA-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
; HSA-NEXT:    s_waitcnt vmcnt(1)
; HSA-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:20
; HSA-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
; HSA-NEXT:    s_waitcnt vmcnt(2)
; HSA-NEXT:    buffer_store_dword v32, off, s[0:3], s32
; HSA-NEXT:    s_waitcnt vmcnt(1)
; HSA-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:16
; HSA-NEXT:    s_setpc_b64 s[4:5]
entry:
  ; The alloca itself is only 8-byte aligned and is never initialized; the
  ; 16-byte alignment is requested on the byval argument of the call.
  %alloca = alloca double, align 8, addrspace(5)
  tail call void @byval_align16_f64_arg(<32 x i32> %val, ptr addrspace(5) byval(double) align 16 %alloca)
  ret void
}

; Tail-call variant of the stack-passed double test. Both incoming arguments
; are forwarded unchanged to @stack_passed_f64_arg. The expected output
; reloads the 12 bytes at s32 offsets 0 through 11 and stores them back to the
; same offsets, then jumps to the callee with s_setpc_b64.
define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
; VI-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT:    s_setpc_b64 s[4:5]
;
; CI-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
; CI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
; CI-NEXT:    s_waitcnt vmcnt(2)
; CI-NEXT:    buffer_store_dword v31, off, s[0:3], s32
; CI-NEXT:    s_waitcnt vmcnt(2)
; CI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
; CI-NEXT:    s_waitcnt vmcnt(2)
; CI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
; CI-NEXT:    s_setpc_b64 s[4:5]
;
; GFX9-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT:    s_setpc_b64 s[4:5]
;
; GFX11-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_load_b32 v33, off, s32
; GFX11-NEXT:    scratch_load_b64 v[31:32], off, s32 offset:4
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    scratch_store_b32 off, v33, s32
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    scratch_store_b64 off, v[31:32], s32 offset:4
; GFX11-NEXT:    s_setpc_b64 s[0:1]
;
; HSA-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
; HSA:       ; %bb.0: ; %entry
; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HSA-NEXT:    buffer_load_dword v31, off, s[0:3], s32
; HSA-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
; HSA-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
; HSA-NEXT:    s_getpc_b64 s[4:5]
; HSA-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
; HSA-NEXT:    s_waitcnt vmcnt(2)
; HSA-NEXT:    buffer_store_dword v31, off, s[0:3], s32
; HSA-NEXT:    s_waitcnt vmcnt(2)
; HSA-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
; HSA-NEXT:    s_waitcnt vmcnt(2)
; HSA-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
; HSA-NEXT:    s_setpc_b64 s[4:5]
entry:
  ; Same argument list as the caller, passed straight through as a tail call.
  tail call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp)
  ret void
}

; Calls @external_void_func_12xv3i32 with twelve constant <3 x i32> vectors,
; 36 i32 values in total. In the expected output the first 31 values (0 up to
; the leading 10 of the eleventh vector) are materialized in v0-v30, and the
; remaining five values (11 through 15) are stored to the stack at s32
; offsets 0, 4, 8, 12 and 16. v40 is spilled and used via writelane/readlane
; to preserve s30, s31 and the previous s33 across the call.
define void @stack_12xv3i32() #0 {
; VI-LABEL: stack_12xv3i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s33
; VI-NEXT:    s_mov_b32 s33, s32
; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; VI-NEXT:    s_mov_b64 exec, s[8:9]
; VI-NEXT:    s_addk_i32 s32, 0x400
; VI-NEXT:    v_mov_b32_e32 v0, 11
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; VI-NEXT:    v_mov_b32_e32 v0, 12
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; VI-NEXT:    v_mov_b32_e32 v0, 13
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; VI-NEXT:    v_mov_b32_e32 v0, 14
; VI-NEXT:    v_writelane_b32 v40, s4, 2
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; VI-NEXT:    v_mov_b32_e32 v0, 15
; VI-NEXT:    v_writelane_b32 v40, s30, 0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    v_mov_b32_e32 v3, 1
; VI-NEXT:    v_mov_b32_e32 v4, 1
; VI-NEXT:    v_mov_b32_e32 v5, 1
; VI-NEXT:    v_mov_b32_e32 v6, 2
; VI-NEXT:    v_mov_b32_e32 v7, 2
; VI-NEXT:    v_mov_b32_e32 v8, 2
; VI-NEXT:    v_mov_b32_e32 v9, 3
; VI-NEXT:    v_mov_b32_e32 v10, 3
; VI-NEXT:    v_mov_b32_e32 v11, 3
; VI-NEXT:    v_mov_b32_e32 v12, 4
; VI-NEXT:    v_mov_b32_e32 v13, 4
; VI-NEXT:    v_mov_b32_e32 v14, 4
; VI-NEXT:    v_mov_b32_e32 v15, 5
; VI-NEXT:    v_mov_b32_e32 v16, 5
; VI-NEXT:    v_mov_b32_e32 v17, 5
; VI-NEXT:    v_mov_b32_e32 v18, 6
; VI-NEXT:    v_mov_b32_e32 v19, 6
; VI-NEXT:    v_mov_b32_e32 v20, 6
; VI-NEXT:    v_mov_b32_e32 v21, 7
; VI-NEXT:    v_mov_b32_e32 v22, 7
; VI-NEXT:    v_mov_b32_e32 v23, 7
; VI-NEXT:    v_mov_b32_e32 v24, 8
; VI-NEXT:    v_mov_b32_e32 v25, 8
; VI-NEXT:    v_mov_b32_e32 v26, 8
; VI-NEXT:    v_mov_b32_e32 v27, 9
; VI-NEXT:    v_mov_b32_e32 v28, 9
; VI-NEXT:    v_mov_b32_e32 v29, 9
; VI-NEXT:    v_mov_b32_e32 v30, 10
; VI-NEXT:    v_writelane_b32 v40, s31, 1
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    v_readlane_b32 s31, v40, 1
; VI-NEXT:    v_readlane_b32 s30, v40, 0
; VI-NEXT:    s_mov_b32 s32, s33
; VI-NEXT:    v_readlane_b32 s4, v40, 2
; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; VI-NEXT:    s_mov_b64 exec, s[6:7]
; VI-NEXT:    s_mov_b32 s33, s4
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; CI-LABEL: stack_12xv3i32:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s4, s33
; CI-NEXT:    s_mov_b32 s33, s32
; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CI-NEXT:    s_mov_b64 exec, s[8:9]
; CI-NEXT:    s_addk_i32 s32, 0x400
; CI-NEXT:    v_mov_b32_e32 v0, 11
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; CI-NEXT:    v_mov_b32_e32 v0, 12
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; CI-NEXT:    v_mov_b32_e32 v0, 13
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; CI-NEXT:    v_mov_b32_e32 v0, 14
; CI-NEXT:    v_writelane_b32 v40, s4, 2
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; CI-NEXT:    v_mov_b32_e32 v0, 15
; CI-NEXT:    v_writelane_b32 v40, s30, 0
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_mov_b32_e32 v3, 1
; CI-NEXT:    v_mov_b32_e32 v4, 1
; CI-NEXT:    v_mov_b32_e32 v5, 1
; CI-NEXT:    v_mov_b32_e32 v6, 2
; CI-NEXT:    v_mov_b32_e32 v7, 2
; CI-NEXT:    v_mov_b32_e32 v8, 2
; CI-NEXT:    v_mov_b32_e32 v9, 3
; CI-NEXT:    v_mov_b32_e32 v10, 3
; CI-NEXT:    v_mov_b32_e32 v11, 3
; CI-NEXT:    v_mov_b32_e32 v12, 4
; CI-NEXT:    v_mov_b32_e32 v13, 4
; CI-NEXT:    v_mov_b32_e32 v14, 4
; CI-NEXT:    v_mov_b32_e32 v15, 5
; CI-NEXT:    v_mov_b32_e32 v16, 5
; CI-NEXT:    v_mov_b32_e32 v17, 5
; CI-NEXT:    v_mov_b32_e32 v18, 6
; CI-NEXT:    v_mov_b32_e32 v19, 6
; CI-NEXT:    v_mov_b32_e32 v20, 6
; CI-NEXT:    v_mov_b32_e32 v21, 7
; CI-NEXT:    v_mov_b32_e32 v22, 7
; CI-NEXT:    v_mov_b32_e32 v23, 7
; CI-NEXT:    v_mov_b32_e32 v24, 8
; CI-NEXT:    v_mov_b32_e32 v25, 8
; CI-NEXT:    v_mov_b32_e32 v26, 8
; CI-NEXT:    v_mov_b32_e32 v27, 9
; CI-NEXT:    v_mov_b32_e32 v28, 9
; CI-NEXT:    v_mov_b32_e32 v29, 9
; CI-NEXT:    v_mov_b32_e32 v30, 10
; CI-NEXT:    v_writelane_b32 v40, s31, 1
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    v_readlane_b32 s31, v40, 1
; CI-NEXT:    v_readlane_b32 s30, v40, 0
; CI-NEXT:    s_mov_b32 s32, s33
; CI-NEXT:    v_readlane_b32 s4, v40, 2
; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CI-NEXT:    s_mov_b64 exec, s[6:7]
; CI-NEXT:    s_mov_b32 s33, s4
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: stack_12xv3i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_mov_b32_e32 v0, 11
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT:    v_mov_b32_e32 v0, 12
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; GFX9-NEXT:    v_mov_b32_e32 v0, 14
; GFX9-NEXT:    v_writelane_b32 v40, s4, 2
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 1
; GFX9-NEXT:    v_mov_b32_e32 v4, 1
; GFX9-NEXT:    v_mov_b32_e32 v5, 1
; GFX9-NEXT:    v_mov_b32_e32 v6, 2
; GFX9-NEXT:    v_mov_b32_e32 v7, 2
; GFX9-NEXT:    v_mov_b32_e32 v8, 2
; GFX9-NEXT:    v_mov_b32_e32 v9, 3
; GFX9-NEXT:    v_mov_b32_e32 v10, 3
; GFX9-NEXT:    v_mov_b32_e32 v11, 3
; GFX9-NEXT:    v_mov_b32_e32 v12, 4
; GFX9-NEXT:    v_mov_b32_e32 v13, 4
; GFX9-NEXT:    v_mov_b32_e32 v14, 4
; GFX9-NEXT:    v_mov_b32_e32 v15, 5
; GFX9-NEXT:    v_mov_b32_e32 v16, 5
; GFX9-NEXT:    v_mov_b32_e32 v17, 5
; GFX9-NEXT:    v_mov_b32_e32 v18, 6
; GFX9-NEXT:    v_mov_b32_e32 v19, 6
; GFX9-NEXT:    v_mov_b32_e32 v20, 6
; GFX9-NEXT:    v_mov_b32_e32 v21, 7
; GFX9-NEXT:    v_mov_b32_e32 v22, 7
; GFX9-NEXT:    v_mov_b32_e32 v23, 7
; GFX9-NEXT:    v_mov_b32_e32 v24, 8
; GFX9-NEXT:    v_mov_b32_e32 v25, 8
; GFX9-NEXT:    v_mov_b32_e32 v26, 8
; GFX9-NEXT:    v_mov_b32_e32 v27, 9
; GFX9-NEXT:    v_mov_b32_e32 v28, 9
; GFX9-NEXT:    v_mov_b32_e32 v29, 9
; GFX9-NEXT:    v_mov_b32_e32 v30, 10
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    s_mov_b32 s32, s33
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: stack_12xv3i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 11 :: v_dual_mov_b32 v1, 12
; GFX11-NEXT:    v_dual_mov_b32 v2, 13 :: v_dual_mov_b32 v3, 14
; GFX11-NEXT:    v_mov_b32_e32 v4, 15
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_add_i32 s0, s32, 16
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT:    scratch_store_b32 off, v4, s0
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 1
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT:    v_dual_mov_b32 v5, 1 :: v_dual_mov_b32 v4, 1
; GFX11-NEXT:    v_dual_mov_b32 v7, 2 :: v_dual_mov_b32 v6, 2
; GFX11-NEXT:    v_dual_mov_b32 v9, 3 :: v_dual_mov_b32 v8, 2
; GFX11-NEXT:    v_dual_mov_b32 v11, 3 :: v_dual_mov_b32 v10, 3
; GFX11-NEXT:    v_dual_mov_b32 v13, 4 :: v_dual_mov_b32 v12, 4
; GFX11-NEXT:    v_dual_mov_b32 v15, 5 :: v_dual_mov_b32 v14, 4
; GFX11-NEXT:    v_dual_mov_b32 v17, 5 :: v_dual_mov_b32 v16, 5
; GFX11-NEXT:    v_dual_mov_b32 v19, 6 :: v_dual_mov_b32 v18, 6
; GFX11-NEXT:    v_dual_mov_b32 v21, 7 :: v_dual_mov_b32 v20, 6
; GFX11-NEXT:    v_dual_mov_b32 v23, 7 :: v_dual_mov_b32 v22, 7
; GFX11-NEXT:    v_dual_mov_b32 v25, 8 :: v_dual_mov_b32 v24, 8
; GFX11-NEXT:    v_dual_mov_b32 v27, 9 :: v_dual_mov_b32 v26, 8
; GFX11-NEXT:    v_dual_mov_b32 v29, 9 :: v_dual_mov_b32 v28, 9
; GFX11-NEXT:    v_mov_b32_e32 v30, 10
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_12xv3i32@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    s_mov_b32 s32, s33
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; HSA-LABEL: stack_12xv3i32:
; HSA:       ; %bb.0: ; %entry
; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HSA-NEXT:    s_mov_b32 s4, s33
; HSA-NEXT:    s_mov_b32 s33, s32
; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; HSA-NEXT:    s_mov_b64 exec, s[8:9]
; HSA-NEXT:    s_addk_i32 s32, 0x400
; HSA-NEXT:    v_mov_b32_e32 v0, 11
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; HSA-NEXT:    v_mov_b32_e32 v0, 12
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; HSA-NEXT:    v_mov_b32_e32 v0, 13
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; HSA-NEXT:    v_mov_b32_e32 v0, 14
; HSA-NEXT:    v_writelane_b32 v40, s4, 2
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; HSA-NEXT:    v_mov_b32_e32 v0, 15
; HSA-NEXT:    v_writelane_b32 v40, s30, 0
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; HSA-NEXT:    s_getpc_b64 s[4:5]
; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12
; HSA-NEXT:    v_mov_b32_e32 v0, 0
; HSA-NEXT:    v_mov_b32_e32 v1, 0
; HSA-NEXT:    v_mov_b32_e32 v2, 0
; HSA-NEXT:    v_mov_b32_e32 v3, 1
; HSA-NEXT:    v_mov_b32_e32 v4, 1
; HSA-NEXT:    v_mov_b32_e32 v5, 1
; HSA-NEXT:    v_mov_b32_e32 v6, 2
; HSA-NEXT:    v_mov_b32_e32 v7, 2
; HSA-NEXT:    v_mov_b32_e32 v8, 2
; HSA-NEXT:    v_mov_b32_e32 v9, 3
; HSA-NEXT:    v_mov_b32_e32 v10, 3
; HSA-NEXT:    v_mov_b32_e32 v11, 3
; HSA-NEXT:    v_mov_b32_e32 v12, 4
; HSA-NEXT:    v_mov_b32_e32 v13, 4
; HSA-NEXT:    v_mov_b32_e32 v14, 4
; HSA-NEXT:    v_mov_b32_e32 v15, 5
; HSA-NEXT:    v_mov_b32_e32 v16, 5
; HSA-NEXT:    v_mov_b32_e32 v17, 5
; HSA-NEXT:    v_mov_b32_e32 v18, 6
; HSA-NEXT:    v_mov_b32_e32 v19, 6
; HSA-NEXT:    v_mov_b32_e32 v20, 6
; HSA-NEXT:    v_mov_b32_e32 v21, 7
; HSA-NEXT:    v_mov_b32_e32 v22, 7
; HSA-NEXT:    v_mov_b32_e32 v23, 7
; HSA-NEXT:    v_mov_b32_e32 v24, 8
; HSA-NEXT:    v_mov_b32_e32 v25, 8
; HSA-NEXT:    v_mov_b32_e32 v26, 8
; HSA-NEXT:    v_mov_b32_e32 v27, 9
; HSA-NEXT:    v_mov_b32_e32 v28, 9
; HSA-NEXT:    v_mov_b32_e32 v29, 9
; HSA-NEXT:    v_mov_b32_e32 v30, 10
; HSA-NEXT:    v_writelane_b32 v40, s31, 1
; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; HSA-NEXT:    v_readlane_b32 s31, v40, 1
; HSA-NEXT:    v_readlane_b32 s30, v40, 0
; HSA-NEXT:    s_mov_b32 s32, s33
; HSA-NEXT:    v_readlane_b32 s4, v40, 2
; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
; HSA-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; HSA-NEXT:    s_mov_b64 exec, s[6:7]
; HSA-NEXT:    s_mov_b32 s33, s4
; HSA-NEXT:    s_waitcnt vmcnt(0)
; HSA-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The first ten vectors are splats of 0 through 9; the last two carry the
  ; distinct values 10 through 15 so each outgoing slot can be told apart.
  call void @external_void_func_12xv3i32(
      <3 x i32><i32 0, i32 0, i32 0>,
      <3 x i32><i32 1, i32 1, i32 1>,
      <3 x i32><i32 2, i32 2, i32 2>,
      <3 x i32><i32 3, i32 3, i32 3>,
      <3 x i32><i32 4, i32 4, i32 4>,
      <3 x i32><i32 5, i32 5, i32 5>,
      <3 x i32><i32 6, i32 6, i32 6>,
      <3 x i32><i32 7, i32 7, i32 7>,
      <3 x i32><i32 8, i32 8, i32 8>,
      <3 x i32><i32 9, i32 9, i32 9>,
      <3 x i32><i32 10, i32 11, i32 12>,
      <3 x i32><i32 13, i32 14, i32 15>)
  ret void
}

; Calls @external_void_func_12xv3f32 with twelve constant <3 x float>
; arguments (36 scalars in total). In the expected output below, the first
; 31 scalars (0.0 up to 10.0) are materialized in v0-v30, and the last five
; (11.0-15.0, i.e. 0x41300000-0x41700000) are written to the outgoing stack
; area at s32 offsets 0 through 16. The gfx1100 output does the same with one
; scratch_store_b128 at s32 plus one scratch_store_b32 at s32+16.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
define void @stack_12xv3f32() #0 {
; VI-LABEL: stack_12xv3f32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s33
; VI-NEXT:    s_mov_b32 s33, s32
; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; VI-NEXT:    s_mov_b64 exec, s[8:9]
; VI-NEXT:    s_addk_i32 s32, 0x400
; VI-NEXT:    v_mov_b32_e32 v0, 0x41300000
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; VI-NEXT:    v_mov_b32_e32 v0, 0x41400000
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; VI-NEXT:    v_mov_b32_e32 v0, 0x41500000
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; VI-NEXT:    v_mov_b32_e32 v0, 0x41600000
; VI-NEXT:    v_writelane_b32 v40, s4, 2
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; VI-NEXT:    v_mov_b32_e32 v0, 0x41700000
; VI-NEXT:    v_writelane_b32 v40, s30, 0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    v_mov_b32_e32 v3, 1.0
; VI-NEXT:    v_mov_b32_e32 v4, 1.0
; VI-NEXT:    v_mov_b32_e32 v5, 1.0
; VI-NEXT:    v_mov_b32_e32 v6, 2.0
; VI-NEXT:    v_mov_b32_e32 v7, 2.0
; VI-NEXT:    v_mov_b32_e32 v8, 2.0
; VI-NEXT:    v_mov_b32_e32 v9, 0x40400000
; VI-NEXT:    v_mov_b32_e32 v10, 0x40400000
; VI-NEXT:    v_mov_b32_e32 v11, 0x40400000
; VI-NEXT:    v_mov_b32_e32 v12, 4.0
; VI-NEXT:    v_mov_b32_e32 v13, 4.0
; VI-NEXT:    v_mov_b32_e32 v14, 4.0
; VI-NEXT:    v_mov_b32_e32 v15, 0x40a00000
; VI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
; VI-NEXT:    v_mov_b32_e32 v17, 0x40a00000
; VI-NEXT:    v_mov_b32_e32 v18, 0x40c00000
; VI-NEXT:    v_mov_b32_e32 v19, 0x40c00000
; VI-NEXT:    v_mov_b32_e32 v20, 0x40c00000
; VI-NEXT:    v_mov_b32_e32 v21, 0x40e00000
; VI-NEXT:    v_mov_b32_e32 v22, 0x40e00000
; VI-NEXT:    v_mov_b32_e32 v23, 0x40e00000
; VI-NEXT:    v_mov_b32_e32 v24, 0x41000000
; VI-NEXT:    v_mov_b32_e32 v25, 0x41000000
; VI-NEXT:    v_mov_b32_e32 v26, 0x41000000
; VI-NEXT:    v_mov_b32_e32 v27, 0x41100000
; VI-NEXT:    v_mov_b32_e32 v28, 0x41100000
; VI-NEXT:    v_mov_b32_e32 v29, 0x41100000
; VI-NEXT:    v_mov_b32_e32 v30, 0x41200000
; VI-NEXT:    v_writelane_b32 v40, s31, 1
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    v_readlane_b32 s31, v40, 1
; VI-NEXT:    v_readlane_b32 s30, v40, 0
; VI-NEXT:    s_mov_b32 s32, s33
; VI-NEXT:    v_readlane_b32 s4, v40, 2
; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; VI-NEXT:    s_mov_b64 exec, s[6:7]
; VI-NEXT:    s_mov_b32 s33, s4
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; CI-LABEL: stack_12xv3f32:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s4, s33
; CI-NEXT:    s_mov_b32 s33, s32
; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CI-NEXT:    s_mov_b64 exec, s[8:9]
; CI-NEXT:    s_addk_i32 s32, 0x400
; CI-NEXT:    v_mov_b32_e32 v0, 0x41300000
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; CI-NEXT:    v_mov_b32_e32 v0, 0x41400000
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; CI-NEXT:    v_mov_b32_e32 v0, 0x41500000
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; CI-NEXT:    v_mov_b32_e32 v0, 0x41600000
; CI-NEXT:    v_writelane_b32 v40, s4, 2
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; CI-NEXT:    v_mov_b32_e32 v0, 0x41700000
; CI-NEXT:    v_writelane_b32 v40, s30, 0
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_mov_b32_e32 v3, 1.0
; CI-NEXT:    v_mov_b32_e32 v4, 1.0
; CI-NEXT:    v_mov_b32_e32 v5, 1.0
; CI-NEXT:    v_mov_b32_e32 v6, 2.0
; CI-NEXT:    v_mov_b32_e32 v7, 2.0
; CI-NEXT:    v_mov_b32_e32 v8, 2.0
; CI-NEXT:    v_mov_b32_e32 v9, 0x40400000
; CI-NEXT:    v_mov_b32_e32 v10, 0x40400000
; CI-NEXT:    v_mov_b32_e32 v11, 0x40400000
; CI-NEXT:    v_mov_b32_e32 v12, 4.0
; CI-NEXT:    v_mov_b32_e32 v13, 4.0
; CI-NEXT:    v_mov_b32_e32 v14, 4.0
; CI-NEXT:    v_mov_b32_e32 v15, 0x40a00000
; CI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
; CI-NEXT:    v_mov_b32_e32 v17, 0x40a00000
; CI-NEXT:    v_mov_b32_e32 v18, 0x40c00000
; CI-NEXT:    v_mov_b32_e32 v19, 0x40c00000
; CI-NEXT:    v_mov_b32_e32 v20, 0x40c00000
; CI-NEXT:    v_mov_b32_e32 v21, 0x40e00000
; CI-NEXT:    v_mov_b32_e32 v22, 0x40e00000
; CI-NEXT:    v_mov_b32_e32 v23, 0x40e00000
; CI-NEXT:    v_mov_b32_e32 v24, 0x41000000
; CI-NEXT:    v_mov_b32_e32 v25, 0x41000000
; CI-NEXT:    v_mov_b32_e32 v26, 0x41000000
; CI-NEXT:    v_mov_b32_e32 v27, 0x41100000
; CI-NEXT:    v_mov_b32_e32 v28, 0x41100000
; CI-NEXT:    v_mov_b32_e32 v29, 0x41100000
; CI-NEXT:    v_mov_b32_e32 v30, 0x41200000
; CI-NEXT:    v_writelane_b32 v40, s31, 1
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    v_readlane_b32 s31, v40, 1
; CI-NEXT:    v_readlane_b32 s30, v40, 0
; CI-NEXT:    s_mov_b32 s32, s33
; CI-NEXT:    v_readlane_b32 s4, v40, 2
; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CI-NEXT:    s_mov_b64 exec, s[6:7]
; CI-NEXT:    s_mov_b32 s33, s4
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: stack_12xv3f32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41300000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41400000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41500000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41600000
; GFX9-NEXT:    v_writelane_b32 v40, s4, 2
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41700000
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v4, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v5, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v6, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v7, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v8, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v9, 0x40400000
; GFX9-NEXT:    v_mov_b32_e32 v10, 0x40400000
; GFX9-NEXT:    v_mov_b32_e32 v11, 0x40400000
; GFX9-NEXT:    v_mov_b32_e32 v12, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v13, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v14, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v15, 0x40a00000
; GFX9-NEXT:    v_mov_b32_e32 v16, 0x40a00000
; GFX9-NEXT:    v_mov_b32_e32 v17, 0x40a00000
; GFX9-NEXT:    v_mov_b32_e32 v18, 0x40c00000
; GFX9-NEXT:    v_mov_b32_e32 v19, 0x40c00000
; GFX9-NEXT:    v_mov_b32_e32 v20, 0x40c00000
; GFX9-NEXT:    v_mov_b32_e32 v21, 0x40e00000
; GFX9-NEXT:    v_mov_b32_e32 v22, 0x40e00000
; GFX9-NEXT:    v_mov_b32_e32 v23, 0x40e00000
; GFX9-NEXT:    v_mov_b32_e32 v24, 0x41000000
; GFX9-NEXT:    v_mov_b32_e32 v25, 0x41000000
; GFX9-NEXT:    v_mov_b32_e32 v26, 0x41000000
; GFX9-NEXT:    v_mov_b32_e32 v27, 0x41100000
; GFX9-NEXT:    v_mov_b32_e32 v28, 0x41100000
; GFX9-NEXT:    v_mov_b32_e32 v29, 0x41100000
; GFX9-NEXT:    v_mov_b32_e32 v30, 0x41200000
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    s_mov_b32 s32, s33
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: stack_12xv3f32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x41300000
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41400000
; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41500000
; GFX11-NEXT:    v_mov_b32_e32 v3, 0x41600000
; GFX11-NEXT:    v_dual_mov_b32 v4, 0x41700000 :: v_dual_mov_b32 v5, 1.0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_add_i32 s0, s32, 16
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT:    scratch_store_b32 off, v4, s0
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1.0
; GFX11-NEXT:    v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v6, 2.0 :: v_dual_mov_b32 v9, 0x40400000
; GFX11-NEXT:    v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v11, 0x40400000
; GFX11-NEXT:    v_dual_mov_b32 v10, 0x40400000 :: v_dual_mov_b32 v13, 4.0
; GFX11-NEXT:    v_dual_mov_b32 v12, 4.0 :: v_dual_mov_b32 v15, 0x40a00000
; GFX11-NEXT:    v_dual_mov_b32 v14, 4.0 :: v_dual_mov_b32 v17, 0x40a00000
; GFX11-NEXT:    v_mov_b32_e32 v16, 0x40a00000
; GFX11-NEXT:    v_dual_mov_b32 v18, 0x40c00000 :: v_dual_mov_b32 v19, 0x40c00000
; GFX11-NEXT:    v_mov_b32_e32 v20, 0x40c00000
; GFX11-NEXT:    v_dual_mov_b32 v21, 0x40e00000 :: v_dual_mov_b32 v22, 0x40e00000
; GFX11-NEXT:    v_mov_b32_e32 v23, 0x40e00000
; GFX11-NEXT:    v_dual_mov_b32 v24, 0x41000000 :: v_dual_mov_b32 v25, 0x41000000
; GFX11-NEXT:    v_mov_b32_e32 v26, 0x41000000
; GFX11-NEXT:    v_dual_mov_b32 v27, 0x41100000 :: v_dual_mov_b32 v28, 0x41100000
; GFX11-NEXT:    v_mov_b32_e32 v29, 0x41100000
; GFX11-NEXT:    v_mov_b32_e32 v30, 0x41200000
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_12xv3f32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_12xv3f32@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    s_mov_b32 s32, s33
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; HSA-LABEL: stack_12xv3f32:
; HSA:       ; %bb.0: ; %entry
; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HSA-NEXT:    s_mov_b32 s4, s33
; HSA-NEXT:    s_mov_b32 s33, s32
; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; HSA-NEXT:    s_mov_b64 exec, s[8:9]
; HSA-NEXT:    s_addk_i32 s32, 0x400
; HSA-NEXT:    v_mov_b32_e32 v0, 0x41300000
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; HSA-NEXT:    v_mov_b32_e32 v0, 0x41400000
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; HSA-NEXT:    v_mov_b32_e32 v0, 0x41500000
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; HSA-NEXT:    v_mov_b32_e32 v0, 0x41600000
; HSA-NEXT:    v_writelane_b32 v40, s4, 2
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; HSA-NEXT:    v_mov_b32_e32 v0, 0x41700000
; HSA-NEXT:    v_writelane_b32 v40, s30, 0
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; HSA-NEXT:    s_getpc_b64 s[4:5]
; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12
; HSA-NEXT:    v_mov_b32_e32 v0, 0
; HSA-NEXT:    v_mov_b32_e32 v1, 0
; HSA-NEXT:    v_mov_b32_e32 v2, 0
; HSA-NEXT:    v_mov_b32_e32 v3, 1.0
; HSA-NEXT:    v_mov_b32_e32 v4, 1.0
; HSA-NEXT:    v_mov_b32_e32 v5, 1.0
; HSA-NEXT:    v_mov_b32_e32 v6, 2.0
; HSA-NEXT:    v_mov_b32_e32 v7, 2.0
; HSA-NEXT:    v_mov_b32_e32 v8, 2.0
; HSA-NEXT:    v_mov_b32_e32 v9, 0x40400000
; HSA-NEXT:    v_mov_b32_e32 v10, 0x40400000
; HSA-NEXT:    v_mov_b32_e32 v11, 0x40400000
; HSA-NEXT:    v_mov_b32_e32 v12, 4.0
; HSA-NEXT:    v_mov_b32_e32 v13, 4.0
; HSA-NEXT:    v_mov_b32_e32 v14, 4.0
; HSA-NEXT:    v_mov_b32_e32 v15, 0x40a00000
; HSA-NEXT:    v_mov_b32_e32 v16, 0x40a00000
; HSA-NEXT:    v_mov_b32_e32 v17, 0x40a00000
; HSA-NEXT:    v_mov_b32_e32 v18, 0x40c00000
; HSA-NEXT:    v_mov_b32_e32 v19, 0x40c00000
; HSA-NEXT:    v_mov_b32_e32 v20, 0x40c00000
; HSA-NEXT:    v_mov_b32_e32 v21, 0x40e00000
; HSA-NEXT:    v_mov_b32_e32 v22, 0x40e00000
; HSA-NEXT:    v_mov_b32_e32 v23, 0x40e00000
; HSA-NEXT:    v_mov_b32_e32 v24, 0x41000000
; HSA-NEXT:    v_mov_b32_e32 v25, 0x41000000
; HSA-NEXT:    v_mov_b32_e32 v26, 0x41000000
; HSA-NEXT:    v_mov_b32_e32 v27, 0x41100000
; HSA-NEXT:    v_mov_b32_e32 v28, 0x41100000
; HSA-NEXT:    v_mov_b32_e32 v29, 0x41100000
; HSA-NEXT:    v_mov_b32_e32 v30, 0x41200000
; HSA-NEXT:    v_writelane_b32 v40, s31, 1
; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; HSA-NEXT:    v_readlane_b32 s31, v40, 1
; HSA-NEXT:    v_readlane_b32 s30, v40, 0
; HSA-NEXT:    s_mov_b32 s32, s33
; HSA-NEXT:    v_readlane_b32 s4, v40, 2
; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
; HSA-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; HSA-NEXT:    s_mov_b64 exec, s[6:7]
; HSA-NEXT:    s_mov_b32 s33, s4
; HSA-NEXT:    s_waitcnt vmcnt(0)
; HSA-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The first ten vectors are splats of 0.0 through 9.0; the last two hold the
  ; distinct values 10.0 through 15.0, so the scalar kept in v30 and each of
  ; the five stack slots show a different constant in the checks.
  call void @external_void_func_12xv3f32(
      <3 x float><float 0.0, float 0.0, float 0.0>,
      <3 x float><float 1.0, float 1.0, float 1.0>,
      <3 x float><float 2.0, float 2.0, float 2.0>,
      <3 x float><float 3.0, float 3.0, float 3.0>,
      <3 x float><float 4.0, float 4.0, float 4.0>,
      <3 x float><float 5.0, float 5.0, float 5.0>,
      <3 x float><float 6.0, float 6.0, float 6.0>,
      <3 x float><float 7.0, float 7.0, float 7.0>,
      <3 x float><float 8.0, float 8.0, float 8.0>,
      <3 x float><float 9.0, float 9.0, float 9.0>,
      <3 x float><float 10.0, float 11.0, float 12.0>,
      <3 x float><float 13.0, float 14.0, float 15.0>)
  ret void
}

; Calls @external_void_func_8xv5i32 with eight constant <5 x i32> arguments
; (40 scalars in total). In the expected output below, the first 31 scalars
; (values 0 up to 6) are materialized in v0-v30, and the last nine (7-15) are
; written to the outgoing stack area at s32 offsets 0 through 32. The gfx1100
; output does the same with scratch_store_b128 at s32 and s32+16 plus one
; scratch_store_b32 at s32+32.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
define void @stack_8xv5i32() #0 {
; VI-LABEL: stack_8xv5i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s33
; VI-NEXT:    s_mov_b32 s33, s32
; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; VI-NEXT:    s_mov_b64 exec, s[8:9]
; VI-NEXT:    s_addk_i32 s32, 0x400
; VI-NEXT:    v_mov_b32_e32 v0, 7
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; VI-NEXT:    v_mov_b32_e32 v0, 8
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; VI-NEXT:    v_mov_b32_e32 v0, 9
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; VI-NEXT:    v_mov_b32_e32 v0, 10
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; VI-NEXT:    v_mov_b32_e32 v0, 11
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; VI-NEXT:    v_mov_b32_e32 v0, 12
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
; VI-NEXT:    v_mov_b32_e32 v0, 13
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
; VI-NEXT:    v_mov_b32_e32 v0, 14
; VI-NEXT:    v_writelane_b32 v40, s4, 2
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
; VI-NEXT:    v_mov_b32_e32 v0, 15
; VI-NEXT:    v_writelane_b32 v40, s30, 0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    v_mov_b32_e32 v3, 0
; VI-NEXT:    v_mov_b32_e32 v4, 0
; VI-NEXT:    v_mov_b32_e32 v5, 1
; VI-NEXT:    v_mov_b32_e32 v6, 1
; VI-NEXT:    v_mov_b32_e32 v7, 1
; VI-NEXT:    v_mov_b32_e32 v8, 1
; VI-NEXT:    v_mov_b32_e32 v9, 1
; VI-NEXT:    v_mov_b32_e32 v10, 2
; VI-NEXT:    v_mov_b32_e32 v11, 2
; VI-NEXT:    v_mov_b32_e32 v12, 2
; VI-NEXT:    v_mov_b32_e32 v13, 2
; VI-NEXT:    v_mov_b32_e32 v14, 2
; VI-NEXT:    v_mov_b32_e32 v15, 3
; VI-NEXT:    v_mov_b32_e32 v16, 3
; VI-NEXT:    v_mov_b32_e32 v17, 3
; VI-NEXT:    v_mov_b32_e32 v18, 3
; VI-NEXT:    v_mov_b32_e32 v19, 3
; VI-NEXT:    v_mov_b32_e32 v20, 4
; VI-NEXT:    v_mov_b32_e32 v21, 4
; VI-NEXT:    v_mov_b32_e32 v22, 4
; VI-NEXT:    v_mov_b32_e32 v23, 4
; VI-NEXT:    v_mov_b32_e32 v24, 4
; VI-NEXT:    v_mov_b32_e32 v25, 5
; VI-NEXT:    v_mov_b32_e32 v26, 5
; VI-NEXT:    v_mov_b32_e32 v27, 5
; VI-NEXT:    v_mov_b32_e32 v28, 5
; VI-NEXT:    v_mov_b32_e32 v29, 5
; VI-NEXT:    v_mov_b32_e32 v30, 6
; VI-NEXT:    v_writelane_b32 v40, s31, 1
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    v_readlane_b32 s31, v40, 1
; VI-NEXT:    v_readlane_b32 s30, v40, 0
; VI-NEXT:    s_mov_b32 s32, s33
; VI-NEXT:    v_readlane_b32 s4, v40, 2
; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; VI-NEXT:    s_mov_b64 exec, s[6:7]
; VI-NEXT:    s_mov_b32 s33, s4
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; CI-LABEL: stack_8xv5i32:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s4, s33
; CI-NEXT:    s_mov_b32 s33, s32
; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CI-NEXT:    s_mov_b64 exec, s[8:9]
; CI-NEXT:    s_addk_i32 s32, 0x400
; CI-NEXT:    v_mov_b32_e32 v0, 7
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; CI-NEXT:    v_mov_b32_e32 v0, 8
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; CI-NEXT:    v_mov_b32_e32 v0, 9
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; CI-NEXT:    v_mov_b32_e32 v0, 10
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; CI-NEXT:    v_mov_b32_e32 v0, 11
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; CI-NEXT:    v_mov_b32_e32 v0, 12
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
; CI-NEXT:    v_mov_b32_e32 v0, 13
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
; CI-NEXT:    v_mov_b32_e32 v0, 14
; CI-NEXT:    v_writelane_b32 v40, s4, 2
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
; CI-NEXT:    v_mov_b32_e32 v0, 15
; CI-NEXT:    v_writelane_b32 v40, s30, 0
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_mov_b32_e32 v3, 0
; CI-NEXT:    v_mov_b32_e32 v4, 0
; CI-NEXT:    v_mov_b32_e32 v5, 1
; CI-NEXT:    v_mov_b32_e32 v6, 1
; CI-NEXT:    v_mov_b32_e32 v7, 1
; CI-NEXT:    v_mov_b32_e32 v8, 1
; CI-NEXT:    v_mov_b32_e32 v9, 1
; CI-NEXT:    v_mov_b32_e32 v10, 2
; CI-NEXT:    v_mov_b32_e32 v11, 2
; CI-NEXT:    v_mov_b32_e32 v12, 2
; CI-NEXT:    v_mov_b32_e32 v13, 2
; CI-NEXT:    v_mov_b32_e32 v14, 2
; CI-NEXT:    v_mov_b32_e32 v15, 3
; CI-NEXT:    v_mov_b32_e32 v16, 3
; CI-NEXT:    v_mov_b32_e32 v17, 3
; CI-NEXT:    v_mov_b32_e32 v18, 3
; CI-NEXT:    v_mov_b32_e32 v19, 3
; CI-NEXT:    v_mov_b32_e32 v20, 4
; CI-NEXT:    v_mov_b32_e32 v21, 4
; CI-NEXT:    v_mov_b32_e32 v22, 4
; CI-NEXT:    v_mov_b32_e32 v23, 4
; CI-NEXT:    v_mov_b32_e32 v24, 4
; CI-NEXT:    v_mov_b32_e32 v25, 5
; CI-NEXT:    v_mov_b32_e32 v26, 5
; CI-NEXT:    v_mov_b32_e32 v27, 5
; CI-NEXT:    v_mov_b32_e32 v28, 5
; CI-NEXT:    v_mov_b32_e32 v29, 5
; CI-NEXT:    v_mov_b32_e32 v30, 6
; CI-NEXT:    v_writelane_b32 v40, s31, 1
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    v_readlane_b32 s31, v40, 1
; CI-NEXT:    v_readlane_b32 s30, v40, 0
; CI-NEXT:    s_mov_b32 s32, s33
; CI-NEXT:    v_readlane_b32 s4, v40, 2
; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CI-NEXT:    s_mov_b64 exec, s[6:7]
; CI-NEXT:    s_mov_b32 s33, s4
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: stack_8xv5i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_mov_b32_e32 v0, 7
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT:    v_mov_b32_e32 v0, 8
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, 9
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; GFX9-NEXT:    v_mov_b32_e32 v0, 10
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX9-NEXT:    v_mov_b32_e32 v0, 11
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; GFX9-NEXT:    v_mov_b32_e32 v0, 12
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
; GFX9-NEXT:    v_mov_b32_e32 v0, 14
; GFX9-NEXT:    v_writelane_b32 v40, s4, 2
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    v_mov_b32_e32 v5, 1
; GFX9-NEXT:    v_mov_b32_e32 v6, 1
; GFX9-NEXT:    v_mov_b32_e32 v7, 1
; GFX9-NEXT:    v_mov_b32_e32 v8, 1
; GFX9-NEXT:    v_mov_b32_e32 v9, 1
; GFX9-NEXT:    v_mov_b32_e32 v10, 2
; GFX9-NEXT:    v_mov_b32_e32 v11, 2
; GFX9-NEXT:    v_mov_b32_e32 v12, 2
; GFX9-NEXT:    v_mov_b32_e32 v13, 2
; GFX9-NEXT:    v_mov_b32_e32 v14, 2
; GFX9-NEXT:    v_mov_b32_e32 v15, 3
; GFX9-NEXT:    v_mov_b32_e32 v16, 3
; GFX9-NEXT:    v_mov_b32_e32 v17, 3
; GFX9-NEXT:    v_mov_b32_e32 v18, 3
; GFX9-NEXT:    v_mov_b32_e32 v19, 3
; GFX9-NEXT:    v_mov_b32_e32 v20, 4
; GFX9-NEXT:    v_mov_b32_e32 v21, 4
; GFX9-NEXT:    v_mov_b32_e32 v22, 4
; GFX9-NEXT:    v_mov_b32_e32 v23, 4
; GFX9-NEXT:    v_mov_b32_e32 v24, 4
; GFX9-NEXT:    v_mov_b32_e32 v25, 5
; GFX9-NEXT:    v_mov_b32_e32 v26, 5
; GFX9-NEXT:    v_mov_b32_e32 v27, 5
; GFX9-NEXT:    v_mov_b32_e32 v28, 5
; GFX9-NEXT:    v_mov_b32_e32 v29, 5
; GFX9-NEXT:    v_mov_b32_e32 v30, 6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    s_mov_b32 s32, s33
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: stack_8xv5i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, 8
; GFX11-NEXT:    v_dual_mov_b32 v2, 9 :: v_dual_mov_b32 v3, 10
; GFX11-NEXT:    v_dual_mov_b32 v8, 15 :: v_dual_mov_b32 v5, 12
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_dual_mov_b32 v4, 11 :: v_dual_mov_b32 v7, 14
; GFX11-NEXT:    v_mov_b32_e32 v6, 13
; GFX11-NEXT:    s_add_i32 s0, s32, 32
; GFX11-NEXT:    s_add_i32 s1, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    scratch_store_b32 off, v8, s0
; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s1
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, 1
; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 1
; GFX11-NEXT:    v_dual_mov_b32 v6, 1 :: v_dual_mov_b32 v9, 1
; GFX11-NEXT:    v_dual_mov_b32 v8, 1 :: v_dual_mov_b32 v11, 2
; GFX11-NEXT:    v_dual_mov_b32 v10, 2 :: v_dual_mov_b32 v13, 2
; GFX11-NEXT:    v_dual_mov_b32 v12, 2 :: v_dual_mov_b32 v15, 3
; GFX11-NEXT:    v_dual_mov_b32 v14, 2 :: v_dual_mov_b32 v17, 3
; GFX11-NEXT:    v_dual_mov_b32 v16, 3 :: v_dual_mov_b32 v19, 3
; GFX11-NEXT:    v_dual_mov_b32 v18, 3 :: v_dual_mov_b32 v21, 4
; GFX11-NEXT:    v_dual_mov_b32 v20, 4 :: v_dual_mov_b32 v23, 4
; GFX11-NEXT:    v_dual_mov_b32 v22, 4 :: v_dual_mov_b32 v25, 5
; GFX11-NEXT:    v_dual_mov_b32 v24, 4 :: v_dual_mov_b32 v27, 5
; GFX11-NEXT:    v_dual_mov_b32 v26, 5 :: v_dual_mov_b32 v29, 5
; GFX11-NEXT:    v_mov_b32_e32 v28, 5
; GFX11-NEXT:    v_mov_b32_e32 v30, 6
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5i32@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    s_mov_b32 s32, s33
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; HSA-LABEL: stack_8xv5i32:
; HSA:       ; %bb.0: ; %entry
; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HSA-NEXT:    s_mov_b32 s4, s33
; HSA-NEXT:    s_mov_b32 s33, s32
; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; HSA-NEXT:    s_mov_b64 exec, s[8:9]
; HSA-NEXT:    s_addk_i32 s32, 0x400
; HSA-NEXT:    v_mov_b32_e32 v0, 7
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; HSA-NEXT:    v_mov_b32_e32 v0, 8
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; HSA-NEXT:    v_mov_b32_e32 v0, 9
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; HSA-NEXT:    v_mov_b32_e32 v0, 10
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; HSA-NEXT:    v_mov_b32_e32 v0, 11
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; HSA-NEXT:    v_mov_b32_e32 v0, 12
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
; HSA-NEXT:    v_mov_b32_e32 v0, 13
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
; HSA-NEXT:    v_mov_b32_e32 v0, 14
; HSA-NEXT:    v_writelane_b32 v40, s4, 2
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
; HSA-NEXT:    v_mov_b32_e32 v0, 15
; HSA-NEXT:    v_writelane_b32 v40, s30, 0
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
; HSA-NEXT:    s_getpc_b64 s[4:5]
; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12
; HSA-NEXT:    v_mov_b32_e32 v0, 0
; HSA-NEXT:    v_mov_b32_e32 v1, 0
; HSA-NEXT:    v_mov_b32_e32 v2, 0
; HSA-NEXT:    v_mov_b32_e32 v3, 0
; HSA-NEXT:    v_mov_b32_e32 v4, 0
; HSA-NEXT:    v_mov_b32_e32 v5, 1
; HSA-NEXT:    v_mov_b32_e32 v6, 1
; HSA-NEXT:    v_mov_b32_e32 v7, 1
; HSA-NEXT:    v_mov_b32_e32 v8, 1
; HSA-NEXT:    v_mov_b32_e32 v9, 1
; HSA-NEXT:    v_mov_b32_e32 v10, 2
; HSA-NEXT:    v_mov_b32_e32 v11, 2
; HSA-NEXT:    v_mov_b32_e32 v12, 2
; HSA-NEXT:    v_mov_b32_e32 v13, 2
; HSA-NEXT:    v_mov_b32_e32 v14, 2
; HSA-NEXT:    v_mov_b32_e32 v15, 3
; HSA-NEXT:    v_mov_b32_e32 v16, 3
; HSA-NEXT:    v_mov_b32_e32 v17, 3
; HSA-NEXT:    v_mov_b32_e32 v18, 3
; HSA-NEXT:    v_mov_b32_e32 v19, 3
; HSA-NEXT:    v_mov_b32_e32 v20, 4
; HSA-NEXT:    v_mov_b32_e32 v21, 4
; HSA-NEXT:    v_mov_b32_e32 v22, 4
; HSA-NEXT:    v_mov_b32_e32 v23, 4
; HSA-NEXT:    v_mov_b32_e32 v24, 4
; HSA-NEXT:    v_mov_b32_e32 v25, 5
; HSA-NEXT:    v_mov_b32_e32 v26, 5
; HSA-NEXT:    v_mov_b32_e32 v27, 5
; HSA-NEXT:    v_mov_b32_e32 v28, 5
; HSA-NEXT:    v_mov_b32_e32 v29, 5
; HSA-NEXT:    v_mov_b32_e32 v30, 6
; HSA-NEXT:    v_writelane_b32 v40, s31, 1
; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; HSA-NEXT:    v_readlane_b32 s31, v40, 1
; HSA-NEXT:    v_readlane_b32 s30, v40, 0
; HSA-NEXT:    s_mov_b32 s32, s33
; HSA-NEXT:    v_readlane_b32 s4, v40, 2
; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
; HSA-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; HSA-NEXT:    s_mov_b64 exec, s[6:7]
; HSA-NEXT:    s_mov_b32 s33, s4
; HSA-NEXT:    s_waitcnt vmcnt(0)
; HSA-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The first six vectors are splats of 0 through 5; the last two hold the
  ; distinct values 6 through 15, so the scalar kept in v30 and each of the
  ; nine stack slots show a different constant in the checks.
  call void @external_void_func_8xv5i32(
      <5 x i32><i32 0, i32 0, i32 0, i32 0, i32 0>,
      <5 x i32><i32 1, i32 1, i32 1, i32 1, i32 1>,
      <5 x i32><i32 2, i32 2, i32 2, i32 2, i32 2>,
      <5 x i32><i32 3, i32 3, i32 3, i32 3, i32 3>,
      <5 x i32><i32 4, i32 4, i32 4, i32 4, i32 4>,
      <5 x i32><i32 5, i32 5, i32 5, i32 5, i32 5>,
      <5 x i32><i32 6, i32 7, i32 8, i32 9, i32 10>,
      <5 x i32><i32 11, i32 12, i32 13, i32 14, i32 15>)
  ret void
}

; Passes eight <5 x float> constant vectors (40 floats in total) to
; external_void_func_8xv5f32. In the expected output below, the first 31
; elements are materialized in v0 through v30 (0.0, 1.0, 2.0, 3.0, 4.0, 5.0
; five times each, then 6.0), and the remaining nine elements (7.0 through
; 15.0) are stored to the outgoing argument area addressed through s32 at
; byte offsets 0 through 32. The targets checked with buffer instructions
; emit nine buffer_store_dword; the GFX11 output instead uses two
; scratch_store_b128 and one scratch_store_b32.
define void @stack_8xv5f32() #0 {
; VI-LABEL: stack_8xv5f32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s33
; VI-NEXT:    s_mov_b32 s33, s32
; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; VI-NEXT:    s_mov_b64 exec, s[8:9]
; VI-NEXT:    s_addk_i32 s32, 0x400
; VI-NEXT:    v_mov_b32_e32 v0, 0x40e00000
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; VI-NEXT:    v_mov_b32_e32 v0, 0x41000000
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; VI-NEXT:    v_mov_b32_e32 v0, 0x41100000
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; VI-NEXT:    v_mov_b32_e32 v0, 0x41200000
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; VI-NEXT:    v_mov_b32_e32 v0, 0x41300000
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; VI-NEXT:    v_mov_b32_e32 v0, 0x41400000
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
; VI-NEXT:    v_mov_b32_e32 v0, 0x41500000
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
; VI-NEXT:    v_mov_b32_e32 v0, 0x41600000
; VI-NEXT:    v_writelane_b32 v40, s4, 2
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
; VI-NEXT:    v_mov_b32_e32 v0, 0x41700000
; VI-NEXT:    v_writelane_b32 v40, s30, 0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
; VI-NEXT:    s_getpc_b64 s[4:5]
; VI-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4
; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    v_mov_b32_e32 v3, 0
; VI-NEXT:    v_mov_b32_e32 v4, 0
; VI-NEXT:    v_mov_b32_e32 v5, 1.0
; VI-NEXT:    v_mov_b32_e32 v6, 1.0
; VI-NEXT:    v_mov_b32_e32 v7, 1.0
; VI-NEXT:    v_mov_b32_e32 v8, 1.0
; VI-NEXT:    v_mov_b32_e32 v9, 1.0
; VI-NEXT:    v_mov_b32_e32 v10, 2.0
; VI-NEXT:    v_mov_b32_e32 v11, 2.0
; VI-NEXT:    v_mov_b32_e32 v12, 2.0
; VI-NEXT:    v_mov_b32_e32 v13, 2.0
; VI-NEXT:    v_mov_b32_e32 v14, 2.0
; VI-NEXT:    v_mov_b32_e32 v15, 0x40400000
; VI-NEXT:    v_mov_b32_e32 v16, 0x40400000
; VI-NEXT:    v_mov_b32_e32 v17, 0x40400000
; VI-NEXT:    v_mov_b32_e32 v18, 0x40400000
; VI-NEXT:    v_mov_b32_e32 v19, 0x40400000
; VI-NEXT:    v_mov_b32_e32 v20, 4.0
; VI-NEXT:    v_mov_b32_e32 v21, 4.0
; VI-NEXT:    v_mov_b32_e32 v22, 4.0
; VI-NEXT:    v_mov_b32_e32 v23, 4.0
; VI-NEXT:    v_mov_b32_e32 v24, 4.0
; VI-NEXT:    v_mov_b32_e32 v25, 0x40a00000
; VI-NEXT:    v_mov_b32_e32 v26, 0x40a00000
; VI-NEXT:    v_mov_b32_e32 v27, 0x40a00000
; VI-NEXT:    v_mov_b32_e32 v28, 0x40a00000
; VI-NEXT:    v_mov_b32_e32 v29, 0x40a00000
; VI-NEXT:    v_mov_b32_e32 v30, 0x40c00000
; VI-NEXT:    v_writelane_b32 v40, s31, 1
; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT:    v_readlane_b32 s31, v40, 1
; VI-NEXT:    v_readlane_b32 s30, v40, 0
; VI-NEXT:    s_mov_b32 s32, s33
; VI-NEXT:    v_readlane_b32 s4, v40, 2
; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; VI-NEXT:    s_mov_b64 exec, s[6:7]
; VI-NEXT:    s_mov_b32 s33, s4
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; CI-LABEL: stack_8xv5f32:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s4, s33
; CI-NEXT:    s_mov_b32 s33, s32
; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CI-NEXT:    s_mov_b64 exec, s[8:9]
; CI-NEXT:    s_addk_i32 s32, 0x400
; CI-NEXT:    v_mov_b32_e32 v0, 0x40e00000
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; CI-NEXT:    v_mov_b32_e32 v0, 0x41000000
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; CI-NEXT:    v_mov_b32_e32 v0, 0x41100000
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; CI-NEXT:    v_mov_b32_e32 v0, 0x41200000
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; CI-NEXT:    v_mov_b32_e32 v0, 0x41300000
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; CI-NEXT:    v_mov_b32_e32 v0, 0x41400000
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
; CI-NEXT:    v_mov_b32_e32 v0, 0x41500000
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
; CI-NEXT:    v_mov_b32_e32 v0, 0x41600000
; CI-NEXT:    v_writelane_b32 v40, s4, 2
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
; CI-NEXT:    v_mov_b32_e32 v0, 0x41700000
; CI-NEXT:    v_writelane_b32 v40, s30, 0
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
; CI-NEXT:    s_getpc_b64 s[4:5]
; CI-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4
; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_mov_b32_e32 v3, 0
; CI-NEXT:    v_mov_b32_e32 v4, 0
; CI-NEXT:    v_mov_b32_e32 v5, 1.0
; CI-NEXT:    v_mov_b32_e32 v6, 1.0
; CI-NEXT:    v_mov_b32_e32 v7, 1.0
; CI-NEXT:    v_mov_b32_e32 v8, 1.0
; CI-NEXT:    v_mov_b32_e32 v9, 1.0
; CI-NEXT:    v_mov_b32_e32 v10, 2.0
; CI-NEXT:    v_mov_b32_e32 v11, 2.0
; CI-NEXT:    v_mov_b32_e32 v12, 2.0
; CI-NEXT:    v_mov_b32_e32 v13, 2.0
; CI-NEXT:    v_mov_b32_e32 v14, 2.0
; CI-NEXT:    v_mov_b32_e32 v15, 0x40400000
; CI-NEXT:    v_mov_b32_e32 v16, 0x40400000
; CI-NEXT:    v_mov_b32_e32 v17, 0x40400000
; CI-NEXT:    v_mov_b32_e32 v18, 0x40400000
; CI-NEXT:    v_mov_b32_e32 v19, 0x40400000
; CI-NEXT:    v_mov_b32_e32 v20, 4.0
; CI-NEXT:    v_mov_b32_e32 v21, 4.0
; CI-NEXT:    v_mov_b32_e32 v22, 4.0
; CI-NEXT:    v_mov_b32_e32 v23, 4.0
; CI-NEXT:    v_mov_b32_e32 v24, 4.0
; CI-NEXT:    v_mov_b32_e32 v25, 0x40a00000
; CI-NEXT:    v_mov_b32_e32 v26, 0x40a00000
; CI-NEXT:    v_mov_b32_e32 v27, 0x40a00000
; CI-NEXT:    v_mov_b32_e32 v28, 0x40a00000
; CI-NEXT:    v_mov_b32_e32 v29, 0x40a00000
; CI-NEXT:    v_mov_b32_e32 v30, 0x40c00000
; CI-NEXT:    v_writelane_b32 v40, s31, 1
; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT:    v_readlane_b32 s31, v40, 1
; CI-NEXT:    v_readlane_b32 s30, v40, 0
; CI-NEXT:    s_mov_b32 s32, s33
; CI-NEXT:    v_readlane_b32 s4, v40, 2
; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CI-NEXT:    s_mov_b64 exec, s[6:7]
; CI-NEXT:    s_mov_b32 s33, s4
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: stack_8xv5f32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x40e00000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41000000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41100000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41200000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41300000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41400000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41500000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41600000
; GFX9-NEXT:    v_writelane_b32 v40, s4, 2
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41700000
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    v_mov_b32_e32 v5, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v6, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v7, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v8, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v9, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v10, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v11, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v12, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v13, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v14, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v15, 0x40400000
; GFX9-NEXT:    v_mov_b32_e32 v16, 0x40400000
; GFX9-NEXT:    v_mov_b32_e32 v17, 0x40400000
; GFX9-NEXT:    v_mov_b32_e32 v18, 0x40400000
; GFX9-NEXT:    v_mov_b32_e32 v19, 0x40400000
; GFX9-NEXT:    v_mov_b32_e32 v20, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v21, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v22, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v23, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v24, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v25, 0x40a00000
; GFX9-NEXT:    v_mov_b32_e32 v26, 0x40a00000
; GFX9-NEXT:    v_mov_b32_e32 v27, 0x40a00000
; GFX9-NEXT:    v_mov_b32_e32 v28, 0x40a00000
; GFX9-NEXT:    v_mov_b32_e32 v29, 0x40a00000
; GFX9-NEXT:    v_mov_b32_e32 v30, 0x40c00000
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    s_mov_b32 s32, s33
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: stack_8xv5f32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40e00000
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41000000
; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41100000
; GFX11-NEXT:    v_mov_b32_e32 v3, 0x41200000
; GFX11-NEXT:    v_mov_b32_e32 v8, 0x41700000
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_mov_b32_e32 v4, 0x41300000
; GFX11-NEXT:    v_mov_b32_e32 v5, 0x41400000
; GFX11-NEXT:    v_dual_mov_b32 v6, 0x41500000 :: v_dual_mov_b32 v9, 1.0
; GFX11-NEXT:    v_mov_b32_e32 v7, 0x41600000
; GFX11-NEXT:    s_add_i32 s0, s32, 32
; GFX11-NEXT:    s_add_i32 s1, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT:    scratch_store_b32 off, v8, s0
; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s1
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1.0
; GFX11-NEXT:    v_dual_mov_b32 v6, 1.0 :: v_dual_mov_b32 v7, 1.0
; GFX11-NEXT:    v_dual_mov_b32 v8, 1.0 :: v_dual_mov_b32 v11, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v10, 2.0 :: v_dual_mov_b32 v13, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v12, 2.0 :: v_dual_mov_b32 v15, 0x40400000
; GFX11-NEXT:    v_dual_mov_b32 v14, 2.0 :: v_dual_mov_b32 v17, 0x40400000
; GFX11-NEXT:    v_dual_mov_b32 v16, 0x40400000 :: v_dual_mov_b32 v19, 0x40400000
; GFX11-NEXT:    v_dual_mov_b32 v18, 0x40400000 :: v_dual_mov_b32 v21, 4.0
; GFX11-NEXT:    v_dual_mov_b32 v20, 4.0 :: v_dual_mov_b32 v23, 4.0
; GFX11-NEXT:    v_dual_mov_b32 v22, 4.0 :: v_dual_mov_b32 v25, 0x40a00000
; GFX11-NEXT:    v_dual_mov_b32 v24, 4.0 :: v_dual_mov_b32 v27, 0x40a00000
; GFX11-NEXT:    v_dual_mov_b32 v26, 0x40a00000 :: v_dual_mov_b32 v29, 0x40a00000
; GFX11-NEXT:    v_mov_b32_e32 v28, 0x40a00000
; GFX11-NEXT:    v_mov_b32_e32 v30, 0x40c00000
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5f32@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    s_mov_b32 s32, s33
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; HSA-LABEL: stack_8xv5f32:
; HSA:       ; %bb.0: ; %entry
; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HSA-NEXT:    s_mov_b32 s4, s33
; HSA-NEXT:    s_mov_b32 s33, s32
; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; HSA-NEXT:    s_mov_b64 exec, s[8:9]
; HSA-NEXT:    s_addk_i32 s32, 0x400
; HSA-NEXT:    v_mov_b32_e32 v0, 0x40e00000
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; HSA-NEXT:    v_mov_b32_e32 v0, 0x41000000
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; HSA-NEXT:    v_mov_b32_e32 v0, 0x41100000
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; HSA-NEXT:    v_mov_b32_e32 v0, 0x41200000
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; HSA-NEXT:    v_mov_b32_e32 v0, 0x41300000
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; HSA-NEXT:    v_mov_b32_e32 v0, 0x41400000
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
; HSA-NEXT:    v_mov_b32_e32 v0, 0x41500000
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
; HSA-NEXT:    v_mov_b32_e32 v0, 0x41600000
; HSA-NEXT:    v_writelane_b32 v40, s4, 2
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
; HSA-NEXT:    v_mov_b32_e32 v0, 0x41700000
; HSA-NEXT:    v_writelane_b32 v40, s30, 0
; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
; HSA-NEXT:    s_getpc_b64 s[4:5]
; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4
; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12
; HSA-NEXT:    v_mov_b32_e32 v0, 0
; HSA-NEXT:    v_mov_b32_e32 v1, 0
; HSA-NEXT:    v_mov_b32_e32 v2, 0
; HSA-NEXT:    v_mov_b32_e32 v3, 0
; HSA-NEXT:    v_mov_b32_e32 v4, 0
; HSA-NEXT:    v_mov_b32_e32 v5, 1.0
; HSA-NEXT:    v_mov_b32_e32 v6, 1.0
; HSA-NEXT:    v_mov_b32_e32 v7, 1.0
; HSA-NEXT:    v_mov_b32_e32 v8, 1.0
; HSA-NEXT:    v_mov_b32_e32 v9, 1.0
; HSA-NEXT:    v_mov_b32_e32 v10, 2.0
; HSA-NEXT:    v_mov_b32_e32 v11, 2.0
; HSA-NEXT:    v_mov_b32_e32 v12, 2.0
; HSA-NEXT:    v_mov_b32_e32 v13, 2.0
; HSA-NEXT:    v_mov_b32_e32 v14, 2.0
; HSA-NEXT:    v_mov_b32_e32 v15, 0x40400000
; HSA-NEXT:    v_mov_b32_e32 v16, 0x40400000
; HSA-NEXT:    v_mov_b32_e32 v17, 0x40400000
; HSA-NEXT:    v_mov_b32_e32 v18, 0x40400000
; HSA-NEXT:    v_mov_b32_e32 v19, 0x40400000
; HSA-NEXT:    v_mov_b32_e32 v20, 4.0
; HSA-NEXT:    v_mov_b32_e32 v21, 4.0
; HSA-NEXT:    v_mov_b32_e32 v22, 4.0
; HSA-NEXT:    v_mov_b32_e32 v23, 4.0
; HSA-NEXT:    v_mov_b32_e32 v24, 4.0
; HSA-NEXT:    v_mov_b32_e32 v25, 0x40a00000
; HSA-NEXT:    v_mov_b32_e32 v26, 0x40a00000
; HSA-NEXT:    v_mov_b32_e32 v27, 0x40a00000
; HSA-NEXT:    v_mov_b32_e32 v28, 0x40a00000
; HSA-NEXT:    v_mov_b32_e32 v29, 0x40a00000
; HSA-NEXT:    v_mov_b32_e32 v30, 0x40c00000
; HSA-NEXT:    v_writelane_b32 v40, s31, 1
; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; HSA-NEXT:    v_readlane_b32 s31, v40, 1
; HSA-NEXT:    v_readlane_b32 s30, v40, 0
; HSA-NEXT:    s_mov_b32 s32, s33
; HSA-NEXT:    v_readlane_b32 s4, v40, 2
; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
; HSA-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; HSA-NEXT:    s_mov_b64 exec, s[6:7]
; HSA-NEXT:    s_mov_b32 s33, s4
; HSA-NEXT:    s_waitcnt vmcnt(0)
; HSA-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The first six vectors are splats of 0.0 through 5.0; the last two hold the
  ; distinct values 6.0 through 15.0, so each expected register move and stack
  ; store for those elements carries a different constant.
  call void @external_void_func_8xv5f32(
      <5 x float><float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>,
      <5 x float><float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>,
      <5 x float><float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>,
      <5 x float><float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>,
      <5 x float><float 4.0, float 4.0, float 4.0, float 4.0, float 4.0>,
      <5 x float><float 5.0, float 5.0, float 5.0, float 5.0, float 5.0>,
      <5 x float><float 6.0, float 7.0, float 8.0, float 9.0, float 10.0>,
      <5 x float><float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>)
  ret void
}

; External callee prototypes. Each is a declaration only (no body) with hidden
; visibility and attribute group #0. The two declarations taking <32 x i32>
; as their first parameter add either a byval(double) align 16 pointer in
; addrspace(5) or a plain double as the second parameter. The remaining four
; take twelve 3-element or eight 5-element vectors of i32 or float.
declare hidden void @byval_align16_f64_arg(<32 x i32>, ptr addrspace(5) byval(double) align 16) #0
declare hidden void @stack_passed_f64_arg(<32 x i32>, double) #0
declare hidden void @external_void_func_12xv3i32(<3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>,
    <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>) #0
declare hidden void @external_void_func_8xv5i32(<5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>,
    <5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>) #0
declare hidden void @external_void_func_12xv3f32(<3 x float>, <3 x float>, <3 x float>, <3 x float>,
    <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>) #0
declare hidden void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x float>, <5 x float>,
    <5 x float>, <5 x float>, <5 x float>, <5 x float>) #0

; Attribute groups. Group #0 is the one attached to the definitions and
; declarations in this file; it combines nounwind with a set of "amdgpu-no-*"
; string attributes (dispatch id/ptr, implicitarg ptr, workgroup ids and
; workitem ids in x, y and z).
; Review note - groups #1 and #2 have no visible uses in this part of the
; file; confirm they are still referenced elsewhere before removing them.
attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind noinline }

