Add support for the Metal backend #48

Merged · 13 commits · Oct 5, 2024
37 changes: 35 additions & 2 deletions .buildkite/run_tests.yml
@@ -18,13 +18,15 @@ steps:

julia -e 'println("+++ :julia: Running tests")
using Pkg
Pkg.test("Chmy"; test_args=["--backend=CUDA"], coverage=true)'
Pkg.test("Chmy"; test_args=["--backends=CUDA"], coverage=true)'
agents:
queue: "juliagpu"
cuda: "*"
timeout_in_minutes: 120
soft_fail:
- exit_status: 3
env:
JULIA_NUM_THREADS: 4

- label: "AMDGPU Julia {{matrix.version}}"
matrix:
@@ -44,7 +46,7 @@ steps:

julia -e 'println("+++ :julia: Running tests")
using Pkg
Pkg.test("Chmy"; test_args=["--backend=AMDGPU"], coverage=true)'
Pkg.test("Chmy"; test_args=["--backends=AMDGPU"], coverage=true)'
agents:
queue: "juliagpu"
rocm: "*"
@@ -54,5 +56,36 @@
- exit_status: 3
env:
JULIA_NUM_THREADS: 4

# We cannot submit coverage for Metal right now, as doing so would require a setup that is not enabled here.
- label: "Metal Julia {{matrix.version}}"
matrix:
setup:
version:
- "1.10"
- "1.11"
plugins:
- JuliaCI/julia#v1:
version: "{{matrix.version}}"
# - JuliaCI/julia-coverage#v1:
# codecov: false
command: |
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop(; path=pwd())' || exit 3

julia -e 'println("+++ :julia: Running tests")
using Pkg
Pkg.test("Chmy"; test_args=["--backends=Metal"], coverage=false)'
agents:
queue: "juliaecosystem"
os: "macos"
arch: "aarch64"
timeout_in_minutes: 60
soft_fail:
- exit_status: 3
env:
JULIA_NUM_THREADS: 4

env:
SECRET_CODECOV_TOKEN: "D2H/GglFTcK7SKyfuO/Fy34xrVWHzXbtGTGQXAA3wpEPNAATGhHO/mIm0ILLzhMZSI1LplJBxJ7nV5WVsky0e/01nbSnW5iB0QqFHK8rD+lXUr4ls4zMlyUa0Lvsl/HixFyhwBtFhy8ruwUsqN8AbJNSJSiF9x4jXhzTgIvlO25/HqQObcfJa6qwcw0m9uMa3K26w1xrPhdE7F4mdUUREjB1W8dzfkKF+vZUeMqYFKgit21uQ9QsRjDJl0ExOEw0SC910rtGHtDO0bpIe+D1nEGQsQr8VEN3o0hOCgTJrya8MFitBqkKeVBV/NUImu4UtxlNb7r0ZrjTawiFle2tfg==;U2FsdGVkX1+sdgrm8OBTX9elIdJMwLMpOvXFFtHrG9lj5J8qDBdbjJDva3XMXkbF6I4PCh9G9NW0pEcF9ghb7g=="
5 changes: 4 additions & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "Chmy"
uuid = "33a72cf0-4690-46d7-b987-06506c2248b9"
authors = ["Ivan Utkin <[email protected]>, Ludovic Raess <[email protected]>, and contributors"]
version = "0.1.19"
version = "0.1.20"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -13,10 +13,12 @@ MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"

[extensions]
ChmyAMDGPUExt = "AMDGPU"
ChmyCUDAExt = "CUDA"
ChmyMetalExt = "Metal"

[compat]
AMDGPU = "0.8, 0.9, 1"
@@ -25,4 +27,5 @@ CUDA = "5"
KernelAbstractions = "0.9"
MPI = "0.20"
MacroTools = "0.5"
Metal = "1"
julia = "1.9"
8 changes: 7 additions & 1 deletion docs/src/concepts/architectures.md
@@ -2,7 +2,7 @@

## Backend Selection & Architecture Initialization

Chmy.jl supports CPUs, as well as CUDA and ROC backends for Nvidia and AMD GPUs through a thin wrapper around the [`KernelAbstractions.jl`](https:/JuliaGPU/KernelAbstractions.jl) for users to select desirable backends.
Chmy.jl supports CPUs, as well as CUDA, ROC and Metal backends for Nvidia, AMD and Apple M-series GPUs, through a thin wrapper around [`KernelAbstractions.jl`](https:/JuliaGPU/KernelAbstractions.jl) that lets users select the desired backend.

```julia
# Default with CPU
@@ -21,6 +21,12 @@ using AMDGPU
arch = Arch(ROCBackend())
```

```julia
using Metal

arch = Arch(MetalBackend())
```

At the beginning of the program, one may specify the backend and initialize the desired architecture. The initialized `arch` variable is then required explicitly when creating certain objects, such as grids and kernel launchers.
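As a minimal sketch (parameter values are illustrative, following the example files in this PR), the initialised `arch` is consumed like this:

```julia
using Chmy, Chmy.Architectures, Chmy.Grids, Chmy.KernelLaunch
using KernelAbstractions

# initialise the architecture once...
arch = Arch(CPU())

# ...then pass `arch` explicitly when creating grids and kernel launchers
grid   = UniformGrid(arch; origin=(-1.0,), extent=(2.0,), dims=(64,))
launch = Launcher(arch, grid)
```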

## Specifying the device ID and stream priority
3 changes: 3 additions & 0 deletions docs/src/concepts/grids.md
@@ -48,6 +48,9 @@ grid = UniformGrid(arch;
dims=(nx, ny, nz))
```

!!! warning "Metal backend"
    If using the Metal backend, make sure to use `Float32` (`f0`) element types in the `origin` and `extent` tuples when initialising the grid.

!!! info "Interactive Grid Visualization"
- [grids_2d.jl](https:/PTsolvers/Chmy.jl/blob/main/examples/grids_2d.jl): Visualization of a 2D `StructuredGrid`
- [grids_3d.jl](https:/PTsolvers/Chmy.jl/blob/main/examples/grids_3d.jl): Visualization of a 3D `StructuredGrid`
19 changes: 10 additions & 9 deletions docs/src/examples/overview.md
@@ -1,15 +1,16 @@
# Examples Overview

This page provides an overview of [Chmy.jl](https:/PTsolvers/Chmy.jl) examples. These selected examples demonstrate how [Chmy.jl](https:/PTsolvers/Chmy.jl) can be used to solve various numerical problems using architecture-agnostic kernels both on a single-device and in a distributed way.
This page provides an overview of [Chmy.jl](https:/PTsolvers/Chmy.jl) examples. These selected examples demonstrate how Chmy.jl can be used to solve various numerical problems using architecture-agnostic kernels both on a single-device and in a distributed way.

## Table of Contents

| Example | Description |
| Example | Description |
|:------------|:------------|
| [Diffusion 2D](https:/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d.jl) | Solving the 2D diffusion equation on an uniform grid. |
| [Diffusion 2D with MPI](https:/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d_mpi.jl) | Solving the 2D diffusion equation on an uniform grid distributedly using MPI. |
| [Single-Device Performance Optimization](https:/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d_perf.jl) | Revisiting the 2D diffusion problem with focus on performance optimization techniques on a single-device architecture |
| [Stokes 2D with MPI](https:/PTsolvers/Chmy.jl/blob/main/examples/stokes_2d_inc_ve_T_mpi.jl) | Solving the 2D Stokes equation with thermal coupling on an uniform grid. |
| [Stokes 3D with MPI](https:/PTsolvers/Chmy.jl/blob/main/examples/stokes_3d_inc_ve_T_mpi.jl) | Solving the 3D Stokes equation with thermal coupling on an uniform grid distributedly using MPI. |
| [2D Grid Visualization](https:/PTsolvers/Chmy.jl/blob/main/examples/grids_2d.jl) | Visualization of a 2D `StructuredGrid`. |
| [3D Grid Visualization](https:/PTsolvers/Chmy.jl/blob/main/examples/grids_3d.jl) | Visualization of a 3D `StructuredGrid` |
| [Diffusion 2D](https:/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d.jl) | Solving the 2D diffusion equation on a uniform grid. |
| [Diffusion 2D with MPI](https:/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d_mpi.jl) | Solving the 2D diffusion equation on a uniform grid and distributed parallelisation using MPI. |
| [Single-Device Performance Optimisation](https:/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d_perf.jl) | Revisiting the 2D diffusion problem with focus on performance optimisation techniques on a single-device architecture. |
| [Stokes 2D with MPI](https:/PTsolvers/Chmy.jl/blob/main/examples/stokes_2d_inc_ve_T_mpi.jl) | Solving the 2D Stokes equation with thermal coupling on a uniform grid. |
| [Stokes 3D with MPI](https:/PTsolvers/Chmy.jl/blob/main/examples/stokes_3d_inc_ve_T_mpi.jl) | Solving the 3D Stokes equation with thermal coupling on a uniform grid and distributed parallelisation using MPI. |
| [Diffusion 1D with Metal](https:/PTsolvers/Chmy.jl/blob/main/examples/diffusion_1d_mtl.jl) | Solving the 1D diffusion equation using the Metal backend and single precision (`Float32`) on a uniform grid. |
| [2D Grid Visualization](https:/PTsolvers/Chmy.jl/blob/main/examples/grids_2d.jl) | Visualization of a 2D `StructuredGrid`. |
| [3D Grid Visualization](https:/PTsolvers/Chmy.jl/blob/main/examples/grids_3d.jl) | Visualization of a 3D `StructuredGrid`. |
6 changes: 5 additions & 1 deletion docs/src/getting_started.md
@@ -47,6 +47,7 @@ using KernelAbstractions # for backend-agnostic kernels
using Printf, CairoMakie # for I/O and plotting
# using CUDA
# using AMDGPU
# using Metal
```

In this introductory tutorial, we will use the CPU backend for simplicity:
@@ -56,7 +57,10 @@ backend = CPU()
arch = Arch(backend)
```

If a different backend is desired, one needs to load the relevant package accordingly. For example, if Nvidia or AMD GPUs are available, one can comment out `using CUDA` or `using AMDGPU` and make sure to use `arch = Arch(CUDABackend())` or `arch = Arch(ROCBackend())`, respectively, when selecting the architecture. For further information about executing on a single-device or multi-device architecture, see the documentation section for [Architectures](./concepts/architectures.md)
If a different backend is desired, one needs to load the relevant package accordingly. For example, if Nvidia, AMD, or Apple GPUs are available, one can uncomment `using CUDA`, `using AMDGPU` or `using Metal` and make sure to use `arch = Arch(CUDABackend())`, `arch = Arch(ROCBackend())` or `arch = Arch(MetalBackend())`, respectively, when selecting the architecture. For further information about executing on a single-device or multi-device architecture, see the documentation section for [Architectures](./concepts/architectures.md).

!!! warning "Metal backend"
    The Metal backend restricts the floating-point precision of computations to `Float32` or lower. In Chmy, this can be achieved by initialising the grid object using `Float32` (`f0`) elements in the `origin` and `extent` tuples.
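As a sketch (the grid parameters are illustrative), a Metal-compatible grid initialisation using `Float32` literals, i.e. the `f0` suffix, could look like:

```julia
using Chmy, Chmy.Architectures, Chmy.Grids
using Metal

arch = Arch(MetalBackend())
# note the `f0` suffix: 2f0 === Float32(2.0), keeping all grid data in Float32
grid = UniformGrid(arch; origin=(-1f0, -1f0), extent=(2f0, 2f0), dims=(126, 126))
```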

## Writing & Launch Compute Kernels

3 changes: 2 additions & 1 deletion docs/src/index.md
@@ -31,8 +31,9 @@ Chmy.jl provides a comprehensive framework for handling complex computational tasks

A general list of the features is:

- Backend-agnostic capabilities leveraging [KernelAbstractions.jl](https:/JuliaGPU/KernelAbstractions.jl)
- Distributed computing support with [MPI.jl](https:/JuliaParallel/MPI.jl)
- Multi-dimensional, parameterizable discrete and continuous fields on structured grids
- Multi-dimensional, parametrisable discrete and continuous fields on structured grids
- High-level interface for specifying boundary conditions with automatic batching for performance
- Finite difference and interpolation operators on discrete fields
- Extensibility: the package is written in pure Julia, so adding new functions, simplification rules, and model transformations has no barrier
56 changes: 56 additions & 0 deletions examples/diffusion_1d_mtl.jl
@@ -0,0 +1,56 @@
using Chmy, Chmy.Architectures, Chmy.Grids, Chmy.Fields, Chmy.BoundaryConditions, Chmy.GridOperators, Chmy.KernelLaunch
using KernelAbstractions
using Printf
using CairoMakie

using Metal

@kernel inbounds = true function compute_q!(q, C, χ, g::StructuredGrid, O)
I = @index(Global, NTuple)
I = I + O
q.x[I...] = -χ * ∂x(C, g, I...)
end

@kernel inbounds = true function update_C!(C, q, Δt, g::StructuredGrid, O)
I = @index(Global, NTuple)
I = I + O
C[I...] -= Δt * divg(q, g, I...)
end

@views function main(backend=CPU(); nx=(32, ))
arch = Arch(backend)
# geometry
grid = UniformGrid(arch; origin=(-1f0, ), extent=(2f0, ), dims=nx)
launch = Launcher(arch, grid; outer_width=(4, ))
# physics
χ = 1.0f0
# numerics
Δt = minimum(spacing(grid))^2 / χ / ndims(grid) / 2.1f0
nt = 100
# allocate fields
C = Field(backend, grid, Center())
q = VectorField(backend, grid)
# initial conditions
set!(C, rand(Float32, size(C)))
bc!(arch, grid, C => Neumann())
# visualisation
fig = Figure(; size=(400, 320))
ax = Axis(fig[1, 1]; xlabel="x", ylabel="y", title="it = 0")
plt = lines!(ax, centers(grid)..., interior(C) |> Array)
display(fig)
# action
for it in 1:nt
@printf("it = %d/%d \n", it, nt)
launch(arch, grid, compute_q! => (q, C, χ, grid))
launch(arch, grid, update_C! => (C, q, Δt, grid); bc=batch(grid, C => Neumann()))
end
KernelAbstractions.synchronize(backend)
plt[2] = interior(C) |> Array
ax.title = "it = $nt"
display(fig)
return
end

n = 64

main(MetalBackend(); nx=(n, ) .- 2)
2 changes: 1 addition & 1 deletion ext/ChmyAMDGPUExt/ChmyAMDGPUExt.jl
@@ -1,6 +1,6 @@
module ChmyAMDGPUExt

using AMDGPU, KernelAbstractions, Chmy
using AMDGPU, KernelAbstractions

import Chmy.Architectures: heuristic_groupsize, set_device!, get_device, pointertype

19 changes: 19 additions & 0 deletions ext/ChmyMetalExt/ChmyMetalExt.jl
@@ -0,0 +1,19 @@
module ChmyMetalExt

using Metal, KernelAbstractions

import Chmy.Architectures: heuristic_groupsize, set_device!, get_device, pointertype

Base.unsafe_wrap(::MetalBackend, ptr::Metal.MtlPtr, dims) = unsafe_wrap(MtlArray, ptr, dims)

pointertype(::MetalBackend, T::DataType) = Metal.MtlPtr{T}

set_device!(dev::Metal.MTL.MTLDeviceInstance) = Metal.device!(dev)

get_device(::MetalBackend, id::Integer) = Metal.MTL.MTLDevice(id)

heuristic_groupsize(::MetalBackend, ::Val{1}) = (256,)
heuristic_groupsize(::MetalBackend, ::Val{2}) = (32, 8)
heuristic_groupsize(::MetalBackend, ::Val{3}) = (32, 8, 1)

end
2 changes: 2 additions & 0 deletions test/Project.toml
@@ -8,9 +8,11 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"

[compat]
AMDGPU = "0.8, 0.9, 1"
CUDA = "5"
KernelAbstractions = "0.9"
MPI = "0.20"
Metal = "1"
46 changes: 39 additions & 7 deletions test/common.jl
@@ -3,13 +3,45 @@ using Chmy

using KernelAbstractions

# add KA backends
backends = KernelAbstractions.Backend[CPU()]
compatible(::Backend, ::DataType) = true

if get(ENV, "JULIA_CHMY_BACKEND", "") == "AMDGPU"
using AMDGPU
AMDGPU.functional() && push!(backends, ROCBackend())
elseif get(ENV, "JULIA_CHMY_BACKEND", "") == "CUDA"
# number types to test
TEST_TYPES = [Float32, Float64]

# add backends
TEST_BACKENDS = []

if haskey(ENV, "JULIA_CHMY_BACKEND_CPU")
push!(TEST_BACKENDS, CPU())
end

if haskey(ENV, "JULIA_CHMY_BACKEND_CUDA")
using CUDA
CUDA.functional() && push!(backends, CUDABackend())
if CUDA.functional()
push!(TEST_BACKENDS, CUDABackend())
end
end

if haskey(ENV, "JULIA_CHMY_BACKEND_AMDGPU")
using AMDGPU
if AMDGPU.functional()
push!(TEST_BACKENDS, ROCBackend())
end
end

if haskey(ENV, "JULIA_CHMY_BACKEND_Metal")
using Metal

function compatible(::MetalBackend, T::DataType)
try
Metal.check_eltype(T)
return true
catch
return false
end
end

if Metal.functional()
push!(TEST_BACKENDS, MetalBackend())
end
end
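The `compatible` predicate defined above presumably lets the test runner skip type/backend combinations the hardware cannot handle, such as `Float64` on Metal. A sketch of how it could be consumed (the loop is illustrative, not the actual runtests code):

```julia
for backend in TEST_BACKENDS, T in TEST_TYPES
    # skip combinations the backend cannot represent, e.g. Float64 on Metal
    compatible(backend, T) || continue
    # run the test suite for this (backend, T) combination...
end
```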