diff --git a/AGRARIAN_DEVELOPMENT_ROADMAP.md b/AGRARIAN_DEVELOPMENT_ROADMAP.md index 891ba94..29acc7e 100644 --- a/AGRARIAN_DEVELOPMENT_ROADMAP.md +++ b/AGRARIAN_DEVELOPMENT_ROADMAP.md @@ -174,7 +174,7 @@ Remaining version 0.01 cleanup before moving deeper into new gameplay: - [x] Define GitHub/LFS free-tier storage guardrails. - [x] Define backup expectations for NAS and repository. - [x] Implement Linastorage incremental project backup job. -- [ ] Implement quiesced VM backup job for Windows-Builder and Ubuntu-Codex. +- [x] Implement quiesced VM backup job for Windows-Builder and Ubuntu-Codex. - [ ] Create repeatable dedicated server build instructions. - [~] Finish required plugin documentation. - [ ] Confirm the project opens cleanly from a fresh checkout, not just the current working share. @@ -313,7 +313,7 @@ Current tooling decisions: - [x] Decide and document VM snapshot cadence before major engine/tool changes. - [x] Define Unraid share backup policy. - [x] Implement Linastorage incremental project backup job with deleted-file retention. -- [ ] Implement quiesced VM backup job for Windows-Builder and Ubuntu-Codex. +- [x] Implement quiesced VM backup job for Windows-Builder and Ubuntu-Codex. - [ ] Add recurring restore-test log for project and VM backups. - [ ] Define GitHub branch protection and review rules. - [ ] Add a build log retention policy. @@ -1415,7 +1415,7 @@ Earliest incomplete foundation items: - [x] Define GitHub/LFS free-tier storage guardrails. - [x] Define backup expectations for NAS and repository. - [x] Implement Linastorage incremental project backup job. -- [ ] Implement quiesced VM backup job for Windows-Builder and Ubuntu-Codex. +- [x] Implement quiesced VM backup job for Windows-Builder and Ubuntu-Codex. - [ ] Create repeatable dedicated server build instructions. - [~] Finish required plugin documentation. - [ ] Confirm project opens cleanly from a fresh checkout. 
@@ -1433,4 +1433,4 @@ Earliest incomplete foundation items: Immediate next item: -- [ ] Implement quiesced VM backup job for Windows-Builder and Ubuntu-Codex. +- [ ] Create repeatable dedicated server build instructions. diff --git a/Docs/Ops/AgrarianVmBackupRunbook.md b/Docs/Ops/AgrarianVmBackupRunbook.md new file mode 100644 index 0000000..aa9da42 --- /dev/null +++ b/Docs/Ops/AgrarianVmBackupRunbook.md @@ -0,0 +1,91 @@ +# Agrarian VM Backup Runbook + +Agrarian VM backups run on Unraid `DevBox` and protect the VM definitions, +NVRAM, and virtual disks for the development/build machines. + +## VMs + +- `Windows-Builder` +- `Ubuntu-Codex` + +## Paths + +- Persistent script: `/boot/config/custom/agrarian-vm-backup.sh` +- Runtime helper path: `/usr/local/sbin/agrarian-vm-backup` +- Backup root: `/mnt/user/backups/agrarian-game/vms` +- Snapshots: `/mnt/user/backups/agrarian-game/vms/snapshots` +- Cron definition: `/boot/config/plugins/dynamix/agrarian-vm-backup.cron` + +## Safety Model + +The scheduled job is safe by default. It skips VMs that are currently running, +because copying a live VM disk image is not considered a reliable backup. + +For a real VM backup during a maintenance window, run: + +```bash +/boot/config/custom/agrarian-vm-backup.sh --shutdown-running +``` + +If running directly from the persistent flash path fails because `/boot` is +mounted noexec, invoke it through bash: + +```bash +/bin/bash /boot/config/custom/agrarian-vm-backup.sh --shutdown-running +``` + +That command gracefully shuts down any running target VM, backs up its XML, +NVRAM, Unraid VM/share config, and compressed `qcow2` disk image, then starts +the VM again after the backup completes. 
+ +Preview what would happen: + +```bash +/bin/bash /boot/config/custom/agrarian-vm-backup.sh --dry-run +``` + +Back up one VM during maintenance: + +```bash +/bin/bash /boot/config/custom/agrarian-vm-backup.sh --shutdown-running --vm Ubuntu-Codex +``` + +## Schedule + +The installed cron job runs weekly. Without `--shutdown-running`, it only backs +up VMs that are already off. This avoids an unexpected Windows build machine or +Ubuntu-Codex outage during active work. + +## Retention + +Snapshots older than 120 days are pruned after a successful backup. Pruning +goes by directory age, not name, so copy manual pre-change backups out of the +`snapshots` directory if they need to be kept longer. + +## Restore Notes + +Each snapshot contains: + +- `MANIFEST.txt` +- `SHA256SUMS` +- VM XML under `vms/<vm-name>/xml/` +- NVRAM files under `vms/<vm-name>/nvram/` when present +- compressed `qcow2` disk images under `vms/<vm-name>/disks/` +- selected Unraid config under `unraid-config/` + +Verify a snapshot: + +```bash +cd /mnt/user/backups/agrarian-game/vms/snapshots/<timestamp> +sha256sum -c SHA256SUMS +``` + +Inspect a backed-up disk: + +```bash +qemu-img info vms/Ubuntu-Codex/disks/hdc-vdisk1.img.qcow2 +``` + +Restore should be done to a new VM or a test path first. Do not overwrite a +working VM disk until the backup has been verified and the original disk has +been preserved. diff --git a/Operations/unraid/agrarian-vm-backup.cron b/Operations/unraid/agrarian-vm-backup.cron new file mode 100644 index 0000000..8f21011 --- /dev/null +++ b/Operations/unraid/agrarian-vm-backup.cron @@ -0,0 +1,3 @@ +# Weekly safe Agrarian VM backup check. +# Running VMs are skipped unless the script is invoked manually with --shutdown-running. 
#!/usr/bin/env bash
#
# agrarian_vm_backup_unraid.sh - offline backup of the Agrarian development VMs.
#
# For every target VM the script saves the libvirt XML definition, the NVRAM
# file (when the definition references one), and a compressed qcow2 copy of
# each file-backed disk into a timestamped snapshot directory, together with
# MANIFEST.txt, SHA256SUMS, and selected files from /boot/config.
#
# Scheduled entry (Operations/unraid/agrarian-vm-backup.cron):
#   15 3 * * 0 /bin/bash /boot/config/custom/agrarian-vm-backup.sh >> /var/log/agrarian-vm-backup.log 2>&1
#
# Environment overrides:
#   AGRARIAN_VM_BACKUP_ROOT               backup root directory
#   AGRARIAN_VM_BACKUP_LOCK               lock file path
#   AGRARIAN_VM_SHUTDOWN_TIMEOUT_SECONDS  max wait for a graceful shutdown
#   AGRARIAN_VM_BACKUP_RETENTION_DAYS     snapshot age (days) before pruning
set -Eeuo pipefail

umask 077

VM_NAMES_DEFAULT=("Windows-Builder" "Ubuntu-Codex")
BACKUP_ROOT="${AGRARIAN_VM_BACKUP_ROOT:-/mnt/user/backups/agrarian-game/vms}"
SNAPSHOT_ROOT="$BACKUP_ROOT/snapshots"
LOCK_FILE="${AGRARIAN_VM_BACKUP_LOCK:-/var/lock/agrarian-vm-backup.lock}"
SHUTDOWN_TIMEOUT_SECONDS="${AGRARIAN_VM_SHUTDOWN_TIMEOUT_SECONDS:-900}"
RETENTION_DAYS="${AGRARIAN_VM_BACKUP_RETENTION_DAYS:-120}"

ALLOW_SHUTDOWN=0
DRY_RUN=0
START_AFTER=1
VM_NAMES=("${VM_NAMES_DEFAULT[@]}")
# VMs this run asked to shut down; the EXIT handler starts them again.
STOPPED_BY_SCRIPT=()
BACKED_UP_COUNT=0
# Unpublished snapshot directory of the current run ("" when there is none).
INCOMPLETE_DIR=""

# Print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date -Is)" "$*"
}

# Print a timestamped error on stderr and exit 1.
die() {
  log "ERROR: $*" >&2
  exit 1
}

usage() {
  cat <<'USAGE'
Usage: agrarian_vm_backup_unraid.sh [--shutdown-running] [--no-start-after] [--dry-run] [--vm NAME ...]

Backs up Agrarian development VMs on Unraid to /mnt/user/backups.

Default behavior is safe for scheduled runs: running VMs are skipped. Use
--shutdown-running during a maintenance window to gracefully stop running VMs,
back them up, and start them again afterwards (unless --no-start-after is
given).
USAGE
}

# Parse command-line flags into the ALLOW_SHUTDOWN / START_AFTER / DRY_RUN /
# VM_NAMES globals. One or more --vm NAME options replace the default VM list.
parse_args() {
  local custom_vms=()

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --shutdown-running)
        ALLOW_SHUTDOWN=1
        ;;
      --no-start-after)
        START_AFTER=0
        ;;
      --dry-run)
        DRY_RUN=1
        ;;
      --vm)
        shift
        [[ $# -gt 0 ]] || die "--vm requires a VM name"
        custom_vms+=("$1")
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        die "Unknown argument: $1"
        ;;
    esac
    shift
  done

  if [[ "${#custom_vms[@]}" -gt 0 ]]; then
    VM_NAMES=("${custom_vms[@]}")
  fi
}

# Reject non-numeric overrides before they reach arithmetic or find -mtime.
validate_settings() {
  [[ "$SHUTDOWN_TIMEOUT_SECONDS" =~ ^[0-9]+$ ]] \
    || die "AGRARIAN_VM_SHUTDOWN_TIMEOUT_SECONDS must be a non-negative integer: $SHUTDOWN_TIMEOUT_SECONDS"
  [[ "$RETENTION_DAYS" =~ ^[0-9]+$ ]] \
    || die "AGRARIAN_VM_BACKUP_RETENTION_DAYS must be a non-negative integer: $RETENTION_DAYS"

  # Force base 10 so a value such as "0900" is not parsed as octal.
  SHUTDOWN_TIMEOUT_SECONDS=$((10#$SHUTDOWN_TIMEOUT_SECONDS))
  RETENTION_DAYS=$((10#$RETENTION_DAYS))
}

# Abort early when an external command used by this script is missing.
require_tools() {
  local tool
  for tool in virsh qemu-img sha256sum find flock awk sort xargs; do
    command -v "$tool" >/dev/null || die "$tool is required"
  done
}

# Print the first line of `virsh domstate` for VM $1, or nothing when virsh
# fails. Always returns 0 so callers can test for an empty result themselves
# instead of being killed by errexit/pipefail inside a command substitution.
vm_state() {
  local raw
  raw="$(virsh domstate "$1" 2>/dev/null)" || return 0
  printf '%s\n' "${raw%%$'\n'*}"
}

# Poll every 5 seconds until VM $1 reports "shut off"; exits via die once
# SHUTDOWN_TIMEOUT_SECONDS has elapsed.
wait_for_shutdown() {
  local vm="$1"
  local waited=0

  while [[ "$(vm_state "$vm")" != "shut off" ]]; do
    if (( waited >= SHUTDOWN_TIMEOUT_SECONDS )); then
      die "$vm did not shut down within ${SHUTDOWN_TIMEOUT_SECONDS}s"
    fi
    sleep 5
    waited=$((waited + 5))
  done
}

# Make VM $1 (current state $2) safe to copy.
# Returns 0 when the backup may proceed, 1 when the VM must be skipped.
# NOTE: the caller runs this inside `if ! ...`, where errexit is suppressed,
# so every command whose failure matters is checked explicitly here.
shutdown_vm_for_backup() {
  local vm="$1"
  local state="$2"

  if [[ "$state" == "shut off" ]]; then
    return 0
  fi

  # Paused, crashed, or in-shutdown domains still hold their disks open, and a
  # graceful shutdown request is only meaningful for a running guest.
  if [[ "$state" != "running" ]]; then
    log "Skipping VM in state '$state' (neither running nor shut off): $vm"
    return 1
  fi

  if [[ "$ALLOW_SHUTDOWN" != "1" ]]; then
    log "Skipping running VM without --shutdown-running: $vm"
    return 1
  fi

  log "Gracefully shutting down $vm for backup"

  if [[ "$DRY_RUN" == "1" ]]; then
    log "Dry run: would shut down $vm"
    return 0
  fi

  # Record the VM before asking it to stop, so the EXIT handler can start it
  # again even when the wait below ends in die.
  STOPPED_BY_SCRIPT+=("$vm")
  virsh shutdown "$vm" || die "virsh shutdown failed for $vm"
  wait_for_shutdown "$vm"
}

# Start every VM this run shut down that is currently "shut off".
# Best effort: one failed start must not prevent the remaining starts.
restart_vms_started_by_script() {
  [[ "$START_AFTER" == "1" ]] || return 0

  local vm
  # The ${arr[@]+...} form keeps `set -u` quiet for an empty array on old bash.
  for vm in ${STOPPED_BY_SCRIPT[@]+"${STOPPED_BY_SCRIPT[@]}"}; do
    if [[ "$(vm_state "$vm")" == "shut off" ]]; then
      log "Starting $vm after backup"
      virsh start "$vm" || log "WARNING: could not start $vm; start it manually"
    fi
  done
}

# EXIT handler: restart stopped VMs, drop an unpublished snapshot directory,
# and preserve the script's original exit status.
on_exit() {
  local status=$?
  set +e

  restart_vms_started_by_script

  if [[ -n "$INCOMPLETE_DIR" && -d "$INCOMPLETE_DIR" ]]; then
    log "Removing unpublished snapshot directory: $INCOMPLETE_DIR"
    rm -rf -- "$INCOMPLETE_DIR"
  fi

  exit "$status"
}

# Print "<target>TAB<source path>" for each file-backed disk of VM $1.
# Returns non-zero when virsh fails. The source is rebuilt from the raw line
# rather than taken from awk's $4, so paths containing spaces survive.
disk_sources_for_vm() {
  local vm="$1"
  local listing

  listing="$(virsh domblklist "$vm" --details)" || return 1

  awk '
    $1 == "file" && $2 == "disk" && $4 != "-" {
      source = $0
      sub(/^[ \t]*[^ \t]+[ \t]+[^ \t]+[ \t]+[^ \t]+[ \t]+/, "", source)
      sub(/[ \t]+$/, "", source)
      print $3 "\t" source
    }
  ' <<<"$listing"
}

# Copy the NVRAM file named by the <nvram> element of VM $1's XML into
# $2/nvram/. Does nothing when the element is absent or the file is missing.
copy_nvram_if_present() {
  local vm="$1"
  local dest="$2"
  local xml nvram
  local nvram_re='<nvram[^>]*>([^<]*)</nvram>'

  xml="$(virsh dumpxml "$vm")"
  [[ "$xml" =~ $nvram_re ]] || return 0
  nvram="${BASH_REMATCH[1]}"
  [[ -n "$nvram" && -f "$nvram" ]] || return 0

  mkdir -p "$dest/nvram"
  cp -a -- "$nvram" "$dest/nvram/"
}

# Back up VM $1 into snapshot directory $2 (XML, NVRAM, compressed disks).
# Skipped VMs return 0 without touching BACKED_UP_COUNT.
backup_vm() {
  local vm="$1"
  local snapshot_dir="$2"
  local state

  state="$(vm_state "$vm")"
  [[ -n "$state" ]] || die "VM not found: $vm"

  log "Preparing VM backup: $vm ($state)"

  if ! shutdown_vm_for_backup "$vm" "$state"; then
    return 0
  fi

  local vm_dir="$snapshot_dir/vms/$vm"
  mkdir -p "$vm_dir/disks" "$vm_dir/xml"

  log "Saving VM XML: $vm"
  virsh dumpxml "$vm" > "$vm_dir/xml/$vm.xml"
  copy_nvram_if_present "$vm" "$vm_dir"

  # Capture the list first: a failure inside a process substitution would go
  # unnoticed and silently produce a snapshot without any disk.
  local disk_list
  disk_list="$(disk_sources_for_vm "$vm")" || die "Could not list disks for $vm"
  if [[ -z "$disk_list" ]]; then
    log "WARNING: no file-backed disks found for $vm; only XML/NVRAM saved"
  fi

  local disk_target disk_source disk_dest disk_base
  while IFS=$'\t' read -r disk_target disk_source; do
    [[ -n "$disk_source" ]] || continue
    [[ -f "$disk_source" ]] || die "Disk source not found for $vm: $disk_source"

    disk_base="$(basename -- "$disk_source")"
    disk_dest="$vm_dir/disks/${disk_target}-${disk_base}.qcow2"

    log "Converting disk for $vm: $disk_source -> $disk_dest"

    if [[ "$DRY_RUN" == "1" ]]; then
      log "Dry run: would qemu-img convert $disk_source"
      continue
    fi

    qemu-img convert -p -O qcow2 -c "$disk_source" "$disk_dest"
    qemu-img info "$disk_dest" > "$disk_dest.info.txt"
  done <<<"$disk_list"

  BACKED_UP_COUNT=$((BACKED_UP_COUNT + 1))
}

# Write MANIFEST.txt, copy selected Unraid config, then write SHA256SUMS
# covering every file in snapshot directory $1. $2 is the run timestamp.
write_manifest() {
  local snapshot_dir="$1"
  local timestamp="$2"

  {
    echo "backup_timestamp=$timestamp"
    echo "host=$(hostname)"
    echo "backup_root=$BACKUP_ROOT"
    echo "allow_shutdown=$ALLOW_SHUTDOWN"
    echo "start_after=$START_AFTER"
    echo
    echo "[vms]"
    printf '%s\n' "${VM_NAMES[@]}"
    echo
    echo "[virsh_list]"
    virsh list --all
    echo
    echo "[disk_usage]"
    df -h /mnt/user /mnt/cache 2>/dev/null || true
  } > "$snapshot_dir/MANIFEST.txt"

  # Best effort on purpose: these files do not exist on every Unraid install.
  mkdir -p "$snapshot_dir/unraid-config"
  cp -a /boot/config/domain.cfg "$snapshot_dir/unraid-config/" 2>/dev/null || true
  cp -a /boot/config/vfio-pci.cfg "$snapshot_dir/unraid-config/" 2>/dev/null || true
  cp -a /boot/config/shares "$snapshot_dir/unraid-config/" 2>/dev/null || true

  (
    cd "$snapshot_dir" || exit 1
    find . -type f ! -name SHA256SUMS -print0 | LC_ALL=C sort -z | xargs -0 sha256sum
  ) > "$snapshot_dir/SHA256SUMS"
}

# Re-read every file in snapshot directory $1 against SHA256SUMS.
# No-op in dry-run mode; exits via die on any mismatch.
verify_snapshot() {
  local snapshot_dir="$1"

  if [[ "$DRY_RUN" == "1" ]]; then
    return 0
  fi

  (
    cd "$snapshot_dir" && sha256sum --quiet -c SHA256SUMS
  ) || die "Checksum verification failed for $snapshot_dir"
}

# Delete first-level directories under SNAPSHOT_ROOT whose mtime is older than
# RETENTION_DAYS days. No-op in dry-run mode.
prune_old_snapshots() {
  if [[ "$DRY_RUN" == "1" || ! -d "$SNAPSHOT_ROOT" ]]; then
    return 0
  fi

  local old
  while IFS= read -r -d '' old; do
    log "Pruning old VM backup: $old"
    rm -rf -- "$old"
  done < <(find "$SNAPSHOT_ROOT" -mindepth 1 -maxdepth 1 -type d \
    -mtime "+$RETENTION_DAYS" -print0)
}

main() {
  parse_args "$@"
  validate_settings
  require_tools

  # Single-instance guard; fd 9 stays open (and locked) for the whole run.
  exec 9>"$LOCK_FILE"
  flock -n 9 || {
    log "Another Agrarian VM backup is already running; skipping"
    exit 0
  }

  trap on_exit EXIT
  # Route signals through `exit` so the EXIT handler restarts stopped VMs.
  trap 'exit 130' INT
  trap 'exit 143' TERM

  local timestamp incomplete_dir final_dir
  timestamp="$(date +'%Y%m%d-%H%M%S')"
  incomplete_dir="$SNAPSHOT_ROOT/.incomplete-$timestamp"
  final_dir="$SNAPSHOT_ROOT/$timestamp"

  mkdir -p "$SNAPSHOT_ROOT"
  # Never destroy an already published snapshot.
  [[ ! -e "$final_dir" ]] || die "Snapshot already exists: $final_dir"
  rm -rf -- "$incomplete_dir"
  mkdir -p "$incomplete_dir"
  INCOMPLETE_DIR="$incomplete_dir"

  log "Starting Agrarian VM backup: $timestamp"

  local vm
  for vm in "${VM_NAMES[@]}"; do
    backup_vm "$vm" "$incomplete_dir"
  done

  if [[ "$BACKED_UP_COUNT" == "0" ]]; then
    log "No VMs were eligible for backup; no snapshot published"
    rm -rf -- "$incomplete_dir"
    INCOMPLETE_DIR=""
    return 0
  fi

  write_manifest "$incomplete_dir" "$timestamp"
  verify_snapshot "$incomplete_dir"

  if [[ "$DRY_RUN" == "1" ]]; then
    log "Dry run complete; no snapshot published"
    rm -rf -- "$incomplete_dir"
    INCOMPLETE_DIR=""
    return 0
  fi

  mv -- "$incomplete_dir" "$final_dir"
  INCOMPLETE_DIR=""
  printf '%s\n' "$timestamp" > "$BACKUP_ROOT/LATEST.txt"
  prune_old_snapshots

  log "Agrarian VM backup completed: $final_dir"
}

main "$@"