Performance Tips 
Use Caching Memory Allocator 
Julia relies on garbage collection (GC) for memory management; because of that, ROCArrays are not freed immediately once they are no longer used.
In tight loops with lots of GPU allocations this will put a lot of pressure on the allocator and may interfere with other programs that use that GPU.
Using the caching allocator gives users precise control over GPU memory, even if the allocations happen in code that is not controlled by the user.
You should use `GPUArrays.@cached` around parts of the code with repeating memory allocation patterns, for example, in training loops.
The example below requires 8 MiB of VRAM per iteration. In the worst case (if GC does not kick in) it would quickly fill up to 8 GiB of VRAM, but with the caching allocator it uses exactly 8 MiB, which you can then immediately free to reclaim the memory.
julia> cache = GPUArrays.AllocCache()
AllocCache(n_free=0, n_busy=0, sizeof=0 bytes)
julia> for i in 1:1000
           GPUArrays.@cached cache begin
               sin.(AMDGPU.rand(Float32, 1024^2))
           end
       end
julia> cache
AllocCache(n_free=2, n_busy=0, sizeof=8.000 MiB)
julia> GPUArrays.unsafe_free!(cache)
For a more sophisticated real-world example, see how GaussianSplatting.jl handles it.
Using SIMD 
Using vectorized load/store instructions can improve the performance of a kernel. Let's see how to use them on a simple vector addition example.
We define two helper functions:
- `vload` that will load a SIMD tile given a pointer into the array;
- `vstore!` that will write a SIMD tile into an array given its pointer.
using AMDGPU, SIMD
"""
    vload(::Type{SIMD.Vec{N, T}}, ptr::Core.LLVMPtr{T, AS})

Read `N` consecutive elements of type `T`, starting at `ptr`, as one
`SIMD.Vec{N, T}`.

The element pointer is reinterpreted as a pointer to the vector type (keeping the
address space `AS`) and loaded with an alignment of `sizeof(T) * N` bytes, so
`ptr` is expected to be aligned to that many bytes.
"""
@inline function vload(::Type{SIMD.Vec{N, T}}, ptr::Core.LLVMPtr{T, AS}) where {N, T, AS}
    # View the same address as a pointer to a whole SIMD vector.
    typed_ptr = Base.bitcast(Core.LLVMPtr{SIMD.Vec{N, T}, AS}, ptr)
    # Alignment is passed as `Val` so it is a compile-time constant of the load.
    return unsafe_load(typed_ptr, 1, Val(N * sizeof(T)))
end
"""
    vstore!(ptr::Core.LLVMPtr{T, AS}, x::SIMD.Vec{N, T})

Write the `N` lanes of `x` to the `N` consecutive elements of type `T` starting at
`ptr`. Returns `nothing`.

The element pointer is reinterpreted as a pointer to the vector type (keeping the
address space `AS`) and stored with an alignment of `sizeof(T) * N` bytes, so
`ptr` is expected to be aligned to that many bytes.
"""
@inline function vstore!(ptr::Core.LLVMPtr{T, AS}, x::SIMD.Vec{N, T}) where {N, T, AS}
    # View the same address as a pointer to a whole SIMD vector.
    typed_ptr = Base.bitcast(Core.LLVMPtr{SIMD.Vec{N, T}, AS}, ptr)
    # Alignment is passed as `Val` so it is a compile-time constant of the store.
    unsafe_store!(typed_ptr, x, 1, Val(N * sizeof(T)))
    return nothing
end
"""
    vadd_simd!(c, a, b, ::Val{tile_size})

GPU kernel computing `c = a + b`, where every work-item handles one tile of
`tile_size` consecutive elements using vectorized loads and a vectorized store.

The kernel performs no bounds checking: each launched work-item reads and writes a
full tile, so the launch must not contain more work-items than there are complete
tiles in the arrays.
"""
function vadd_simd!(c::AbstractVector{T}, a, b, ::Val{tile_size}) where {T, tile_size}
    # 1-based global index of this work-item across all workgroups.
    gid = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
    # 1-based index of the first element of the tile owned by this work-item.
    first_elem = tile_size * (gid - 1) + 1

    src_a = pointer(a, first_elem)
    src_b = pointer(b, first_elem)
    dst = pointer(c, first_elem)

    lhs = vload(SIMD.Vec{tile_size, T}, src_a)
    rhs = vload(SIMD.Vec{tile_size, T}, src_b)
    vstore!(dst, lhs + rhs)
    return nothing
end
# Problem size and the number of elements processed by each work-item.
n = 1024
tile_size = 4

# Two inputs filled with ones and a zero-initialized output, all on the GPU.
a, b = ROCArray(ones(Int, n)), ROCArray(ones(Int, n))
c = ROCArray(zeros(Int, n))

# One work-item per tile; the number of workgroups is rounded up.
groupsize = 256
gridsize = cld(div(n, tile_size), groupsize)
@roc groupsize=groupsize gridsize=gridsize vadd_simd!(c, a, b, Val(tile_size))
@assert c == (a .+ b)
Examining the LLVM IR, we can see vectorized `load <4 x i64>`, `add <4 x i64>` and `store <4 x i64>` instructions:
AMDGPU.@device_code_llvm @roc launch=false vadd_simd!(c, a, b, Val(tile_size));
; GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.HIPCompilerParams}(MethodInstance for Main.var"Main".vadd_simd!(::AMDGPU.Device.ROCDeviceVector{Int64, 1}, ::AMDGPU.Device.ROCDeviceVector{Int64, 1}, ::AMDGPU.Device.ROCDeviceVector{Int64, 1}, ::Val{4}), CompilerConfig for GPUCompiler.GCNCompilerTarget, 0x0000000000007ba5)
;  @ perf.md:73 within `vadd_simd!`
define amdgpu_kernel void @_Z10vadd_simd_14ROCDeviceArrayI5Int64Li1ELi1EES1_S1_3ValILi4EE([5 x i64] %state, { [1 x i64], i8 addrspace(1)*, i64 } %0, { [1 x i64], i8 addrspace(1)*, i64 } %1, { [1 x i64], i8 addrspace(1)*, i64 } %2) local_unnamed_addr #1 {
conversion:
  %.fca.1.extract11 = extractvalue { [1 x i64], i8 addrspace(1)*, i64 } %0, 1
  %.fca.1.extract5 = extractvalue { [1 x i64], i8 addrspace(1)*, i64 } %1, 1
  %.fca.1.extract = extractvalue { [1 x i64], i8 addrspace(1)*, i64 } %2, 1
;  @ perf.md:74 within `vadd_simd!`
; ┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/indexing.jl:122 within `workitemIdx`
; │┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/indexing.jl:87 within `workitemIdx_x`
; ││┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/indexing.jl:3 within `_index`
; │││┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/indexing.jl:3 within `macro expansion` @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/base.jl:39
      %3 = call i32 @llvm.amdgcn.workitem.id.x()
; ││└└
; ││┌ @ int.jl:1074 within `+` @ int.jl:87
     %4 = add nuw nsw i32 %3, 1
; └└└
; ┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/indexing.jl:130 within `workgroupIdx`
; │┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/indexing.jl:93 within `workgroupIdx_x`
; ││┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/indexing.jl:3 within `_index`
; │││┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/indexing.jl:3 within `macro expansion` @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/base.jl:39
      %5 = call i32 @llvm.amdgcn.workgroup.id.x()
; └└└└
; ┌ @ int.jl:1074 within `-` @ int.jl:86
   %6 = zext i32 %5 to i64
; └
; ┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/indexing.jl:138 within `workgroupDim`
; │┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/indexing.jl:101 within `workgroupDim_x`
; ││┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/indexing.jl:32 within `_dim`
; │││┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/indexing.jl:32 within `macro expansion` @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/base.jl:39
      %7 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
      %8 = getelementptr inbounds i8, i8 addrspace(4)* %7, i64 4
      %9 = bitcast i8 addrspace(4)* %8 to i16 addrspace(4)*
      %10 = load i16, i16 addrspace(4)* %9, align 4
; └└└└
; ┌ @ int.jl:1072 within `*`
; │┌ @ int.jl:557 within `rem`
; ││┌ @ number.jl:7 within `convert`
; │││┌ @ boot.jl:784 within `Int64`
; ││││┌ @ boot.jl:708 within `toInt64`
       %11 = zext i16 %10 to i64
; │└└└└
; │ @ int.jl:1074 within `*` @ int.jl:88
   %12 = mul nuw nsw i64 %11, %6
; └
; ┌ @ int.jl:1072 within `+`
; │┌ @ int.jl:557 within `rem`
; ││┌ @ number.jl:7 within `convert`
; │││┌ @ boot.jl:784 within `Int64`
; ││││┌ @ boot.jl:708 within `toInt64`
       %13 = zext i32 %4 to i64
; │└└└└
; │ @ int.jl:1074 within `+` @ int.jl:87
   %14 = add nuw nsw i64 %12, %13
; └
;  @ perf.md:75 within `vadd_simd!`
; ┌ @ int.jl:88 within `*`
   %15 = shl nuw nsw i64 %14, 5
; └
;  @ perf.md:77 within `vadd_simd!`
; ┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/array.jl:61 within `pointer`
; │┌ @ abstractarray.jl:1243 within `_memory_offset`
; ││┌ @ int.jl:88 within `*`
     %16 = add nsw i64 %15, -32
; │└└
; │┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/pointer.jl:147 within `+`
; ││┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/pointer.jl:114 within `add_ptr`
; │││┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/pointer.jl:114 within `macro expansion` @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/base.jl:39
      %17 = getelementptr i8, i8 addrspace(1)* %.fca.1.extract5, i64 %16
; └└└└
;  @ perf.md:78 within `vadd_simd!`
; ┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/array.jl:61 within `pointer`
; │┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/pointer.jl:147 within `+`
; ││┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/pointer.jl:114 within `add_ptr`
; │││┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/pointer.jl:114 within `macro expansion` @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/base.jl:39
      %18 = getelementptr i8, i8 addrspace(1)* %.fca.1.extract, i64 %16
; └└└└
;  @ perf.md:79 within `vadd_simd!`
; ┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/AMDGPU/TqRG0/src/device/gcn/array.jl:61 within `pointer`
; │┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/pointer.jl:147 within `+`
; ││┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/pointer.jl:114 within `add_ptr`
; │││┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/pointer.jl:114 within `macro expansion` @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/base.jl:39
      %19 = getelementptr i8, i8 addrspace(1)* %.fca.1.extract11, i64 %16
; └└└└
;  @ perf.md:81 within `vadd_simd!`
; ┌ @ perf.md:63 within `vload`
; │┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/pointer.jl:85 within `unsafe_load`
; ││┌ @ none within `pointerref`
; │││┌ @ none within `macro expansion` @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/base.jl:39
      %20 = bitcast i8 addrspace(1)* %17 to <4 x i64> addrspace(1)*
      %.unpack = load <4 x i64>, <4 x i64> addrspace(1)* %20, align 32
; └└└└
;  @ perf.md:82 within `vadd_simd!`
; ┌ @ perf.md:63 within `vload`
; │┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/pointer.jl:85 within `unsafe_load`
; ││┌ @ none within `pointerref`
; │││┌ @ none within `macro expansion` @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/base.jl:39
      %21 = bitcast i8 addrspace(1)* %18 to <4 x i64> addrspace(1)*
      %.unpack16 = load <4 x i64>, <4 x i64> addrspace(1)* %21, align 32
; └└└└
;  @ perf.md:83 within `vadd_simd!`
; ┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/SIMD/UiGbs/src/simdvec.jl:264 within `+`
; │┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/SIMD/UiGbs/src/LLVM_intrinsics.jl:242 within `add` @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/SIMD/UiGbs/src/LLVM_intrinsics.jl:242
; ││┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/SIMD/UiGbs/src/LLVM_intrinsics.jl:250 within `macro expansion`
     %22 = add <4 x i64> %.unpack16, %.unpack
; └└└
; ┌ @ perf.md:69 within `vstore!`
; │┌ @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/pointer.jl:88 within `unsafe_store!`
; ││┌ @ none within `pointerset`
; │││┌ @ none within `macro expansion` @ /root/.cache/julia-buildkite-plugin/depots/687d1932-34cc-406b-8aac-ab7952bcde26/packages/LLVM/iza6e/src/interop/base.jl:39
      %23 = bitcast i8 addrspace(1)* %19 to <4 x i64> addrspace(1)*
      store <4 x i64> %22, <4 x i64> addrspace(1)* %23, align 32
; └└└└
;  @ perf.md:84 within `vadd_simd!`
  ret void
}