Skip to content

Commit

Permalink
Feature: System resources were not exposed
Browse files Browse the repository at this point in the history
The scheduling of persistent VMs requires external services to fetch
the available system resources of the host.

Solution: Add a new HTTP endpoint on `/about/usage/system`
that exposes system resources and system properties of
the host machine.
  • Loading branch information
hoh committed Sep 13, 2022
1 parent a46af2e commit 32e89dc
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 2 deletions.
1 change: 1 addition & 0 deletions .github/workflows/test-on-droplet.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ jobs:
export DROPLET_IPV4="$(doctl compute droplet get aleph-vm-ci --output json | ./.github/scripts/extract_droplet_ipv4.py)"
sleep 3
curl --retry 5 "http://${DROPLET_IPV4}:4020/about/usage/system"
curl --retry 5 "http://${DROPLET_IPV4}:4020/status/check/fastapi"
- name: Cleanup
Expand Down
2 changes: 1 addition & 1 deletion docker/vm_supervisor-dev.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ FROM debian:bullseye
RUN apt-get update && apt-get -y upgrade && apt-get install -y \
sudo acl curl squashfs-tools git \
python3 python3-aiohttp python3-msgpack python3-pip python3-aiodns python3-aioredis \
python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging \
python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging python3-cpuinfo \
&& rm -rf /var/lib/apt/lists/*

RUN useradd jailman
Expand Down
2 changes: 1 addition & 1 deletion packaging/aleph-vm/DEBIAN/control
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ Version: 0.1.8
Architecture: all
Maintainer: Aleph.im
Description: Aleph.im VM execution engine
Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging
Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo
Section: aleph-im
Priority: Extra
132 changes: 132 additions & 0 deletions vm_supervisor/resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
from datetime import datetime, timezone
from functools import lru_cache
from typing import Optional, Tuple

import cpuinfo
import psutil
from aiohttp import web
from aleph_message.models.program import CpuProperties
from pydantic import BaseModel

from .conf import settings


class Period(BaseModel):
    """Model with a single ``datetime`` field."""

    # The field intentionally shares its name with the imported ``datetime`` type.
    datetime: datetime


class LoadAverage(BaseModel):
    """Three load-average samples, named ``load1``, ``load5`` and ``load15``."""

    load1: float
    load5: float
    load15: float

    @classmethod
    def from_psutil(cls, psutil_loadavg: Tuple[float, float, float]):
        """Build an instance from a 3-item sequence of load averages.

        The items are mapped positionally to ``load1``, ``load5`` and
        ``load15`` (the shape of value ``psutil.getloadavg()`` is annotated
        to provide).
        """
        first = psutil_loadavg[0]
        second = psutil_loadavg[1]
        third = psutil_loadavg[2]
        return cls(load1=first, load5=second, load15=third)


class CoreFrequencies(BaseModel):
    """Lowest and highest CPU core frequency reported by psutil."""

    min: float
    max: float

    @classmethod
    def from_psutil(cls, psutil_freq: Optional[psutil._common.scpufreq]):
        """Build an instance from the result of ``psutil.cpu_freq()``.

        psutil reports ``0.0`` for a bound it cannot determine; in that case
        the current frequency is used for that bound instead.

        ``psutil.cpu_freq()`` may also return ``None`` when the platform
        exposes no frequency information at all. Both bounds are then set to
        ``0.0`` (psutil's own "undetermined" value) rather than raising
        ``AttributeError`` and failing the whole request.
        """
        if psutil_freq is None:
            return cls(min=0.0, max=0.0)

        # Local names avoid shadowing the ``min``/``max`` builtins.
        lowest = psutil_freq.min or psutil_freq.current
        highest = psutil_freq.max or psutil_freq.current
        return cls(min=lowest, max=highest)


class CpuUsage(BaseModel):
    """CPU section of the usage report: core count, load and frequencies."""

    # Number of CPUs.
    count: int
    # Load-average samples (see ``LoadAverage``).
    load_average: LoadAverage
    # Minimum/maximum core frequency (see ``CoreFrequencies``).
    core_frequencies: CoreFrequencies


class MemoryUsage(BaseModel):
    """Total and available memory, expressed in kB."""

    total_kB: int
    available_kB: int

    @property
    def available_MB(self) -> float:
        """Available memory converted from kB to MB (1 MB = 1000 kB)."""
        kilobytes = self.available_kB
        return kilobytes / 1000


class DiskUsage(BaseModel):
    """Total and available disk space, expressed in kB."""

    total_kB: int
    available_kB: int

    @property
    def available_MB(self) -> float:
        """Available disk space converted from kB to MB (1 MB = 1000 kB)."""
        kilobytes = self.available_kB
        return kilobytes / 1000


class UsagePeriod(BaseModel):
    """Time window a usage report refers to: a start and a length in seconds."""

    start_timestamp: datetime
    duration_seconds: float

    class Config:
        # Serialize ``datetime`` values as ISO 8601 strings in JSON output.
        json_encoders = {datetime: lambda moment: moment.isoformat()}


class MachineProperties(BaseModel):
    """Static properties of the host machine (currently only the CPU)."""

    # CPU description, using the ``CpuProperties`` model from ``aleph_message``.
    cpu: CpuProperties


class MachineUsage(BaseModel):
    """Aggregate report combining CPU, memory and disk usage with the
    reporting period and the machine properties.
    """

    cpu: CpuUsage
    mem: MemoryUsage
    disk: DiskUsage
    period: UsagePeriod
    properties: MachineProperties
    # Defaults to True; NOTE(review): exact meaning of "active" is not
    # visible in this file - confirm with consumers of the endpoint.
    active: bool = True


def _first_available(mapping, *keys):
    """Return the value of the first of ``keys`` present in ``mapping``.

    Raises ``KeyError`` (naming the preferred key) when none is present.
    """
    for key in keys:
        if key in mapping:
            return mapping[key]
    raise KeyError(keys[0])


@lru_cache
def get_machine_properties() -> MachineProperties:
    """Fetch machine properties such as architecture, CPU vendor, ...

    These should not change while the supervisor is running, so the result
    is cached after the first (slow) call.
    In the future, some properties may have to be fetched from within a VM.

    Raises:
        KeyError: if py-cpuinfo reports neither the legacy nor the renamed
            key for a required field.
    """
    cpu_info = cpuinfo.get_cpu_info()  # Slow
    # py-cpuinfo renamed these fields in newer releases
    # ('raw_arch_string' -> 'arch_string_raw', 'vendor_id' -> 'vendor_id_raw').
    # Accept both so the endpoint works regardless of the installed version.
    architecture = _first_available(cpu_info, "raw_arch_string", "arch_string_raw")
    vendor = _first_available(cpu_info, "vendor_id", "vendor_id_raw")
    return MachineProperties(
        cpu=CpuProperties(
            architecture=architecture,
            vendor=vendor,
        ),
    )


async def about_system_usage(request: web.Request):
    """HTTP handler returning the host's resource usage as JSON.

    The report contains the CPU count, load averages and core frequencies,
    memory and disk figures in kB (bytes integer-divided by 1000), the
    reporting period and the cached machine properties. Disk figures are
    those of ``settings.PERSISTENT_VOLUMES_DIR``.

    The period starts at the current UTC time truncated to the minute and
    is declared as 60 seconds long.
    """
    period_start = datetime.now(timezone.utc).replace(second=0, microsecond=0)

    # Sample each source once so that "total" and "available" come from the
    # same snapshot.
    memory = psutil.virtual_memory()
    disk = psutil.disk_usage(settings.PERSISTENT_VOLUMES_DIR)

    usage = MachineUsage(
        cpu=CpuUsage(
            count=psutil.cpu_count(),
            load_average=LoadAverage.from_psutil(psutil.getloadavg()),
            core_frequencies=CoreFrequencies.from_psutil(psutil.cpu_freq()),
        ),
        mem=MemoryUsage(
            # Integer division, consistent with the disk figures: the model
            # fields are ints and should not rely on float-to-int coercion.
            total_kB=memory.total // 1000,
            available_kB=memory.available // 1000,
        ),
        disk=DiskUsage(
            total_kB=disk.total // 1000,
            available_kB=disk.free // 1000,
        ),
        period=UsagePeriod(
            start_timestamp=period_start,
            duration_seconds=60,
        ),
        properties=get_machine_properties(),
    )
    return web.json_response(
        text=usage.json(exclude_none=True),
    )
3 changes: 3 additions & 0 deletions vm_supervisor/supervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
from typing import Awaitable, Callable

from aiohttp import web
from aiohttp.web_exceptions import HTTPBadRequest

from . import __version__
from . import metrics
from .conf import settings
from .resources import about_system_usage
from .run import pool
from .tasks import start_watch_for_messages_task, stop_watch_for_messages_task
from .views import (
Expand Down Expand Up @@ -49,6 +51,7 @@ async def server_version_middleware(
web.get("/about/login", about_login),
web.get("/about/executions", about_executions),
web.get("/about/executions/records", about_execution_records),
web.get("/about/usage/system", about_system_usage),
web.get("/about/config", about_config),
web.get("/status/check/fastapi", status_check_fastapi),
web.get("/status/check/version", status_check_version),
Expand Down

0 comments on commit 32e89dc

Please sign in to comment.