Implement Unraid VM backup job
This commit is contained in:
@@ -174,7 +174,7 @@ Remaining version 0.01 cleanup before moving deeper into new gameplay:
|
||||
- [x] Define GitHub/LFS free-tier storage guardrails.
|
||||
- [x] Define backup expectations for NAS and repository.
|
||||
- [x] Implement Linastorage incremental project backup job.
|
||||
- [ ] Implement quiesced VM backup job for Windows-Builder and Ubuntu-Codex.
|
||||
- [x] Implement quiesced VM backup job for Windows-Builder and Ubuntu-Codex.
|
||||
- [ ] Create repeatable dedicated server build instructions.
|
||||
- [~] Finish required plugin documentation.
|
||||
- [ ] Confirm the project opens cleanly from a fresh checkout, not just the current working share.
|
||||
@@ -313,7 +313,7 @@ Current tooling decisions:
|
||||
- [x] Decide and document VM snapshot cadence before major engine/tool changes.
|
||||
- [x] Define Unraid share backup policy.
|
||||
- [x] Implement Linastorage incremental project backup job with deleted-file retention.
|
||||
- [ ] Implement quiesced VM backup job for Windows-Builder and Ubuntu-Codex.
|
||||
- [x] Implement quiesced VM backup job for Windows-Builder and Ubuntu-Codex.
|
||||
- [ ] Add recurring restore-test log for project and VM backups.
|
||||
- [ ] Define GitHub branch protection and review rules.
|
||||
- [ ] Add a build log retention policy.
|
||||
@@ -1415,7 +1415,7 @@ Earliest incomplete foundation items:
|
||||
- [x] Define GitHub/LFS free-tier storage guardrails.
|
||||
- [x] Define backup expectations for NAS and repository.
|
||||
- [x] Implement Linastorage incremental project backup job.
|
||||
- [ ] Implement quiesced VM backup job for Windows-Builder and Ubuntu-Codex.
|
||||
- [x] Implement quiesced VM backup job for Windows-Builder and Ubuntu-Codex.
|
||||
- [ ] Create repeatable dedicated server build instructions.
|
||||
- [~] Finish required plugin documentation.
|
||||
- [ ] Confirm project opens cleanly from a fresh checkout.
|
||||
@@ -1433,4 +1433,4 @@ Earliest incomplete foundation items:
|
||||
|
||||
Immediate next item:
|
||||
|
||||
- [ ] Implement quiesced VM backup job for Windows-Builder and Ubuntu-Codex.
|
||||
- [ ] Create repeatable dedicated server build instructions.
|
||||
|
||||
@@ -0,0 +1,91 @@
|
||||
# Agrarian VM Backup Runbook
|
||||
|
||||
Agrarian VM backups run on Unraid `DevBox` and protect the VM definitions,
|
||||
NVRAM, and virtual disks for the development/build machines.
|
||||
|
||||
## VMs
|
||||
|
||||
- `Windows-Builder`
|
||||
- `Ubuntu-Codex`
|
||||
|
||||
## Paths
|
||||
|
||||
- Persistent script: `/boot/config/custom/agrarian-vm-backup.sh`
|
||||
- Runtime helper path: `/usr/local/sbin/agrarian-vm-backup`
|
||||
- Backup root: `/mnt/user/backups/agrarian-game/vms`
|
||||
- Snapshots: `/mnt/user/backups/agrarian-game/vms/snapshots`
|
||||
- Cron definition: `/boot/config/plugins/dynamix/agrarian-vm-backup.cron`
|
||||
|
||||
## Safety Model
|
||||
|
||||
The scheduled job is safe by default. It skips VMs that are currently running,
|
||||
because copying a live VM disk image is not considered a reliable backup.
|
||||
|
||||
For a real VM backup during a maintenance window, run:
|
||||
|
||||
```bash
|
||||
/boot/config/custom/agrarian-vm-backup.sh --shutdown-running
|
||||
```
|
||||
|
||||
If running directly from the persistent flash path fails because `/boot` is
|
||||
mounted noexec, invoke it through bash:
|
||||
|
||||
```bash
|
||||
/bin/bash /boot/config/custom/agrarian-vm-backup.sh --shutdown-running
|
||||
```
|
||||
|
||||
That command gracefully shuts down any running target VM, backs up its XML,
|
||||
NVRAM, Unraid VM/share config, and compressed `qcow2` disk image, then starts
|
||||
the VM again once the whole backup run has finished.
|
||||
|
||||
Preview what would happen:
|
||||
|
||||
```bash
|
||||
/bin/bash /boot/config/custom/agrarian-vm-backup.sh --dry-run
|
||||
```
|
||||
|
||||
Back up one VM during maintenance:
|
||||
|
||||
```bash
|
||||
/bin/bash /boot/config/custom/agrarian-vm-backup.sh --shutdown-running --vm Ubuntu-Codex
|
||||
```
|
||||
|
||||
## Schedule
|
||||
|
||||
The installed cron job runs weekly. Without `--shutdown-running`, it only backs
|
||||
up VMs that are already off. This avoids unexpectedly taking the Windows build
|
||||
machine or Ubuntu-Codex offline during active work.
|
||||
|
||||
## Retention
|
||||
|
||||
Snapshots older than 120 days are pruned after a successful backup. Manual
|
||||
pre-change backups can be copied or renamed before pruning if they need to be
|
||||
kept longer.
|
||||
|
||||
## Restore Notes
|
||||
|
||||
Each snapshot contains:
|
||||
|
||||
- `MANIFEST.txt`
|
||||
- `SHA256SUMS`
|
||||
- VM XML under `vms/<VM>/xml/`
|
||||
- NVRAM files under `vms/<VM>/nvram/` when present
|
||||
- compressed `qcow2` disk images under `vms/<VM>/disks/`
|
||||
- selected Unraid config under `unraid-config/`
|
||||
|
||||
Verify a snapshot:
|
||||
|
||||
```bash
|
||||
cd /mnt/user/backups/agrarian-game/vms/snapshots/<timestamp>
|
||||
sha256sum -c SHA256SUMS
|
||||
```
|
||||
|
||||
Inspect a backed-up disk:
|
||||
|
||||
```bash
|
||||
qemu-img info vms/Ubuntu-Codex/disks/hdc-vdisk1.img.qcow2
|
||||
```
|
||||
|
||||
Restore should be done to a new VM or a test path first. Do not overwrite a
|
||||
working VM disk until the backup has been verified and the original disk has
|
||||
been preserved.
|
||||
@@ -0,0 +1,3 @@
|
||||
# Weekly safe Agrarian VM backup check.
|
||||
# Running VMs are skipped unless the script is invoked manually with --shutdown-running.
|
||||
15 3 * * 0 /bin/bash /boot/config/custom/agrarian-vm-backup.sh >> /var/log/agrarian-vm-backup.log 2>&1
|
||||
Executable
+300
@@ -0,0 +1,300 @@
|
||||
#!/usr/bin/env bash
#
# Settings and run state shared by every function in this script.

# Strict mode: stop on command errors, unset variables and failed pipeline
# stages; errtrace lets functions and subshells inherit an ERR trap.
set -o errtrace -o errexit -o nounset -o pipefail

# Everything created from here on is accessible to the owner only.
umask 0077

# VMs processed when no --vm option is given.
VM_NAMES_DEFAULT=("Windows-Builder" "Ubuntu-Codex")
VM_NAMES=("${VM_NAMES_DEFAULT[@]}")

# Locations; each default can be overridden through the environment.
BACKUP_ROOT="${AGRARIAN_VM_BACKUP_ROOT:-/mnt/user/backups/agrarian-game/vms}"
SNAPSHOT_ROOT="${BACKUP_ROOT}/snapshots"
LOCK_FILE="${AGRARIAN_VM_BACKUP_LOCK:-/var/lock/agrarian-vm-backup.lock}"

# Tunables; each default can be overridden through the environment.
SHUTDOWN_TIMEOUT_SECONDS="${AGRARIAN_VM_SHUTDOWN_TIMEOUT_SECONDS:-900}"
RETENTION_DAYS="${AGRARIAN_VM_BACKUP_RETENTION_DAYS:-120}"

# Command-line switches (1 = enabled); parse_args updates them.
ALLOW_SHUTDOWN=0
START_AFTER=1
DRY_RUN=0

# Bookkeeping for the current run: VMs this script shut down (and therefore
# has to start again) and the number of VMs that were backed up.
declare -a STARTED_BY_SCRIPT=()
BACKED_UP_COUNT=0
|
||||
|
||||
# Write one log line to stdout: "[<ISO-8601 timestamp>] <all arguments>".
log() {
  printf '[%s] ' "$(date -Is)"
  printf '%s\n' "$*"
}
|
||||
|
||||
# Report a fatal error and terminate the script with exit status 1.
#
# Arguments: the error message (all arguments are joined into one line).
# Outputs:   a timestamped "ERROR: ..." line on stderr, so the diagnostic
#            stays separate from regular progress output on stdout.
die() {
  log "ERROR: $*" >&2
  exit 1
}
|
||||
|
||||
# Print the command-line help text to stdout.
usage() {
  printf '%s\n' \
    'Usage: agrarian_vm_backup_unraid.sh [--shutdown-running] [--no-start-after] [--dry-run] [--vm NAME ...]' \
    '' \
    'Backs up Agrarian development VMs on Unraid to /mnt/user/backups.' \
    '' \
    'Default behavior is safe for scheduled runs: running VMs are skipped. Use' \
    '--shutdown-running during a maintenance window to gracefully stop running VMs,' \
    'back them up, and start them again after a successful backup.'
}
|
||||
|
||||
# Translate command-line options into the global switches.
#
# Globals written: ALLOW_SHUTDOWN, START_AFTER, DRY_RUN, and VM_NAMES (only
#                  when at least one --vm NAME was given).
# Arguments:       the script's command-line arguments.
# Exits:           0 after printing the help text for -h/--help; through die
#                  for an unknown option or a --vm without a name.
parse_args() {
  local -a requested=()
  local arg

  while (( $# > 0 )); do
    arg="$1"
    shift

    case "$arg" in
      --shutdown-running) ALLOW_SHUTDOWN=1 ;;
      --no-start-after) START_AFTER=0 ;;
      --dry-run) DRY_RUN=1 ;;
      --vm)
        if (( $# == 0 )); then
          die "--vm requires a VM name"
        fi
        requested+=("$1")
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *) die "Unknown argument: $arg" ;;
    esac
  done

  # Explicit --vm selections replace the default VM list entirely.
  if (( ${#requested[@]} > 0 )); then
    VM_NAMES=("${requested[@]}")
  fi
}
|
||||
|
||||
# Abort early (through die) when an external program the script relies on is
# not on PATH.
#
# The list covers every non-trivial tool invoked later in the script, so a
# missing one is reported before any VM is shut down instead of failing in the
# middle of a backup.
require_tools() {
  local tool
  for tool in virsh qemu-img sha256sum find awk sed sort xargs flock; do
    command -v "$tool" >/dev/null || die "$tool is required"
  done
}
|
||||
|
||||
# Print the libvirt state of a VM (first line of `virsh domstate`), for
# example "running" or "shut off".
#
# Arguments: $1 - VM (domain) name.
# Outputs:   the state on stdout; nothing when virsh cannot report one.
# Returns:   always 0. A failed lookup must not return non-zero: with
#            `set -e -o pipefail` an assignment such as state="$(vm_state x)"
#            would otherwise terminate the script before the caller can
#            inspect the empty result and report the missing VM.
vm_state() {
  local output
  output="$(virsh domstate "$1" 2>/dev/null)" || return 0
  # Keep only the first line of virsh's output.
  printf '%s\n' "${output%%$'\n'*}"
}
|
||||
|
||||
# Block until the VM reports the state "shut off".
#
# Globals read: SHUTDOWN_TIMEOUT_SECONDS - maximum time to wait.
# Arguments:    $1 - VM name.
# Exits:        through die once the timeout has been reached.
wait_for_shutdown() {
  local vm="$1"
  local -r poll_interval=5
  local elapsed=0

  until [[ "$(vm_state "$vm")" == "shut off" ]]; do
    (( elapsed < SHUTDOWN_TIMEOUT_SECONDS )) \
      || die "$vm did not shut down within ${SHUTDOWN_TIMEOUT_SECONDS}s"

    sleep "$poll_interval"
    elapsed=$(( elapsed + poll_interval ))
  done
}
|
||||
|
||||
# Make sure a VM is powered off before its disks are copied.
#
# Globals read:    ALLOW_SHUTDOWN, DRY_RUN.
# Globals written: STARTED_BY_SCRIPT - gains the VM name when this function
#                  requests a shutdown, so the VM is started again at exit.
# Arguments:       $1 - VM name, $2 - current state as printed by vm_state.
# Returns:         0 when the VM is off (or would be shut down in a dry run)
#                  and the backup may proceed; 1 when the VM must be skipped.
# Exits:           through die when the shutdown cannot be requested or does
#                  not finish in time.
shutdown_vm_for_backup() {
  local vm="$1"
  local state="$2"

  case "$state" in
    "shut off")
      return 0
      ;;
    running)
      ;;
    *)
      # Paused, shutting-down, crashed, ... VMs are neither off nor able to
      # perform a clean guest shutdown, so their disks are not safe to copy.
      log "Skipping $vm: state '$state' is neither 'shut off' nor 'running'"
      return 1
      ;;
  esac

  if [[ "$ALLOW_SHUTDOWN" != "1" ]]; then
    log "Skipping running VM without --shutdown-running: $vm"
    return 1
  fi

  log "Gracefully shutting down $vm for backup"

  if [[ "$DRY_RUN" == "1" ]]; then
    log "Dry run: would shut down $vm"
    return 0
  fi

  # Register the VM before touching it: if the wait below ends in die, the
  # restart step at exit still knows this VM was taken down by the script.
  STARTED_BY_SCRIPT+=("$vm")

  # Checked explicitly because callers run this function inside `if !`, where
  # `set -e` is suspended; an unnoticed failure would otherwise be followed
  # by a pointless wait for the full shutdown timeout.
  virsh shutdown "$vm" || die "Failed to request shutdown of $vm"
  wait_for_shutdown "$vm"
}
|
||||
|
||||
# Start every VM that this script shut down and that is still powered off.
#
# Globals read: START_AFTER (nothing happens unless it is 1),
#               STARTED_BY_SCRIPT (names of the VMs shut down by the script).
# Returns:      0 when every required start succeeded, 1 otherwise.
#
# The script installs this function as its EXIT handler while `set -e` is
# active, so a failed `virsh start` must not end the function early: each VM
# gets its own attempt and failures are only reported afterwards.
restart_vms_started_by_script() {
  [[ "$START_AFTER" == "1" ]] || return 0
  (( ${#STARTED_BY_SCRIPT[@]} > 0 )) || return 0

  local vm
  local failed=0
  for vm in "${STARTED_BY_SCRIPT[@]}"; do
    if [[ "$(vm_state "$vm")" == "shut off" ]]; then
      log "Starting $vm after backup"
      if ! virsh start "$vm"; then
        log "ERROR: failed to start $vm after backup"
        failed=1
      fi
    fi
  done

  return "$failed"
}
|
||||
|
||||
# List the file-backed disks of a VM.
#
# Arguments: $1 - VM name.
# Outputs:   one line per disk on stdout: "<target>\t<source path>". Only rows
#            of `virsh domblklist --details` with type "file", device "disk"
#            and a source other than "-" are reported.
#
# The source is rebuilt from everything after the third column rather than
# taken from awk's $4, so image paths that contain spaces are not truncated.
disk_sources_for_vm() {
  local vm="$1"

  virsh domblklist "$vm" --details \
    | awk '
        $1 == "file" && $2 == "disk" && $4 != "-" {
          target = $3
          path = $0
          # Drop the Type, Device and Target columns and their padding.
          sub(/^[ \t]*[^ \t]+[ \t]+[^ \t]+[ \t]+[^ \t]+[ \t]+/, "", path)
          # Drop any trailing padding after the source column.
          sub(/[ \t]+$/, "", path)
          print target "\t" path
        }
      '
}
|
||||
|
||||
# Copy the VM's NVRAM file into "<dest>/nvram/" when the domain XML names one
# and that file exists; otherwise do nothing.
#
# Arguments: $1 - VM name, $2 - destination directory for this VM's backup.
copy_nvram_if_present() {
  local vm="$1"
  local dest="$2"
  local nvram_path

  # Take the text of the first <nvram ...>...</nvram> element in the XML.
  nvram_path="$(
    virsh dumpxml "$vm" \
      | sed -n "s:.*<nvram[^>]*>\\(.*\\)</nvram>.*:\\1:p" \
      | head -n 1
  )"

  if [[ -z "$nvram_path" || ! -f "$nvram_path" ]]; then
    return 0
  fi

  mkdir -p "$dest/nvram"
  cp -a "$nvram_path" "$dest/nvram/"
}
|
||||
|
||||
# Back up one VM into the snapshot directory: domain XML, NVRAM (if any) and
# a compressed qcow2 copy of every file-backed disk.
#
# Globals read:    DRY_RUN.
# Globals written: BACKED_UP_COUNT - incremented when the VM was processed.
# Arguments:       $1 - VM name, $2 - snapshot directory being assembled.
# Returns:         0, also when the VM is skipped because
#                  shutdown_vm_for_backup refused it.
# Exits:           through die when the VM is unknown, its disks cannot be
#                  listed, or a listed disk image does not exist.
backup_vm() {
  local vm="$1"
  local snapshot_dir="$2"
  local state

  # `|| state=""` keeps `set -e` from ending the script on a failed lookup,
  # so the explicit "VM not found" error below is what the operator sees.
  state="$(vm_state "$vm")" || state=""
  [[ -n "$state" ]] || die "VM not found: $vm"

  log "Preparing VM backup: $vm ($state)"

  if ! shutdown_vm_for_backup "$vm" "$state"; then
    return 0
  fi

  local vm_dir="$snapshot_dir/vms/$vm"
  mkdir -p "$vm_dir/disks" "$vm_dir/xml"

  log "Saving VM XML: $vm"
  virsh dumpxml "$vm" > "$vm_dir/xml/$vm.xml"
  copy_nvram_if_present "$vm" "$vm_dir"

  # Collect the disk list up front. Reading it straight from a process
  # substitution would hide a failing listing and let a snapshot without any
  # disk image be counted as a successful backup.
  local disk_list
  disk_list="$(disk_sources_for_vm "$vm")" \
    || die "Could not list disks for $vm"

  local disk_target disk_source disk_dest disk_base
  local disk_count=0
  while IFS=$'\t' read -r disk_target disk_source; do
    [[ -n "$disk_source" ]] || continue
    [[ -f "$disk_source" ]] || die "Disk source not found for $vm: $disk_source"

    disk_count=$((disk_count + 1))
    disk_base="$(basename "$disk_source")"
    disk_dest="$vm_dir/disks/${disk_target}-${disk_base}.qcow2"

    log "Converting disk for $vm: $disk_source -> $disk_dest"

    if [[ "$DRY_RUN" == "1" ]]; then
      log "Dry run: would qemu-img convert $disk_source"
      continue
    fi

    qemu-img convert -p -O qcow2 -c "$disk_source" "$disk_dest"
    qemu-img info "$disk_dest" > "$disk_dest.info.txt"
  done <<< "$disk_list"

  if (( disk_count == 0 )); then
    log "WARNING: no file-backed disks found for $vm; only XML/NVRAM were saved"
  fi

  BACKED_UP_COUNT=$((BACKED_UP_COUNT + 1))
}
|
||||
|
||||
# Finish a snapshot directory: write MANIFEST.txt, copy selected Unraid
# configuration and record SHA-256 checksums of every file in the snapshot.
#
# Globals read: BACKUP_ROOT, ALLOW_SHUTDOWN, START_AFTER, VM_NAMES.
# Arguments:    $1 - snapshot directory, $2 - timestamp of this run.
write_manifest() {
  local snapshot_dir="$1"
  local timestamp="$2"
  local manifest="$snapshot_dir/MANIFEST.txt"
  local config_dir="$snapshot_dir/unraid-config"
  local item

  {
    printf 'backup_timestamp=%s\n' "$timestamp"
    printf 'host=%s\n' "$(hostname)"
    printf 'backup_root=%s\n' "$BACKUP_ROOT"
    printf 'allow_shutdown=%s\n' "$ALLOW_SHUTDOWN"
    printf 'start_after=%s\n' "$START_AFTER"
    printf '\n[vms]\n'
    printf '%s\n' "${VM_NAMES[@]}"
    printf '\n[virsh_list]\n'
    virsh list --all
    printf '\n[disk_usage]\n'
    # Best effort: the report is informational and a mount may be absent.
    df -h /mnt/user /mnt/cache 2>/dev/null || true
  } > "$manifest"

  # Best effort: any of these configuration items may not exist.
  mkdir -p "$config_dir"
  for item in domain.cfg vfio-pci.cfg shares; do
    cp -a "/boot/config/$item" "$config_dir/" 2>/dev/null || true
  done

  # Checksums use paths relative to the snapshot directory; the checksum file
  # itself is left out of the list.
  (
    cd "$snapshot_dir"
    find . -type f ! -name SHA256SUMS -print0 | sort -z | xargs -0 sha256sum
  ) > "$snapshot_dir/SHA256SUMS"
}
|
||||
|
||||
# Re-check every file of a snapshot against its SHA256SUMS list.
#
# Globals read: DRY_RUN - the check is skipped in a dry run.
# Arguments:    $1 - snapshot directory.
# Returns:      the status of the checksum verification (0 in a dry run).
verify_snapshot() {
  local snapshot_dir="$1"

  if [[ "$DRY_RUN" == "1" ]]; then
    return 0
  fi

  # Subshell so the directory change does not leak into the caller.
  (
    cd "$snapshot_dir"
    sha256sum --check SHA256SUMS > /dev/null
  )
}
|
||||
|
||||
# Delete snapshot directories whose modification time is older than the
# retention period.
#
# Globals read: DRY_RUN (nothing is deleted in a dry run), SNAPSHOT_ROOT,
#               RETENTION_DAYS.
prune_old_snapshots() {
  if [[ "$DRY_RUN" == "1" || ! -d "$SNAPSHOT_ROOT" ]]; then
    return 0
  fi

  local expired
  # Only direct child directories of the snapshot root are candidates.
  find "$SNAPSHOT_ROOT" -mindepth 1 -maxdepth 1 -type d -mtime "+$RETENTION_DAYS" -print0 \
    | while IFS= read -r -d '' expired; do
        log "Pruning old VM backup: $expired"
        rm -rf -- "$expired"
      done
}
|
||||
|
||||
# EXIT handler installed by main.
#
# Restarts the VMs the script shut down and then removes a snapshot directory
# that was never published, so a failed run does not leave partial disk images
# behind on the backup share. The exit status of the run is preserved; a
# failed restart turns a successful status into 1.
_agrarian_vm_backup_on_exit() {
  local status=$?

  # Restart first so that a cleanup problem can never keep VMs offline.
  if ! restart_vms_started_by_script; then
    (( status != 0 )) || status=1
  fi

  if [[ -n "${_AGRARIAN_INCOMPLETE_DIR:-}" && -d "$_AGRARIAN_INCOMPLETE_DIR" ]]; then
    log "Removing incomplete VM backup: $_AGRARIAN_INCOMPLETE_DIR"
    rm -rf -- "$_AGRARIAN_INCOMPLETE_DIR" || true
  fi

  exit "$status"
}

# Run one backup: parse options, take the lock, back up every selected VM
# into a hidden ".incomplete-<timestamp>" directory, then write the manifest,
# verify the checksums and publish the directory as "<timestamp>".
#
# Globals read:    SNAPSHOT_ROOT, BACKUP_ROOT, LOCK_FILE, VM_NAMES, DRY_RUN,
#                  BACKED_UP_COUNT.
# Globals written: _AGRARIAN_INCOMPLETE_DIR (read by the EXIT handler).
# Arguments:       the script's command-line arguments.
# Returns:         0 on success, including "nothing eligible" and dry runs;
#                  exits 0 without doing anything when another run holds the
#                  lock.
main() {
  parse_args "$@"
  require_tools

  # File descriptor 9 stays open for the lifetime of the script, which keeps
  # the lock held until exit.
  exec 9>"$LOCK_FILE"
  flock -n 9 || {
    log "Another Agrarian VM backup is already running; skipping"
    exit 0
  }

  _AGRARIAN_INCOMPLETE_DIR=""
  trap _agrarian_vm_backup_on_exit EXIT

  local timestamp incomplete_dir final_dir
  timestamp="$(date +'%Y%m%d-%H%M%S')"
  incomplete_dir="$SNAPSHOT_ROOT/.incomplete-$timestamp"
  final_dir="$SNAPSHOT_ROOT/$timestamp"

  mkdir -p "$SNAPSHOT_ROOT"
  rm -rf -- "$incomplete_dir" "$final_dir"
  mkdir -p "$incomplete_dir"
  _AGRARIAN_INCOMPLETE_DIR="$incomplete_dir"

  log "Starting Agrarian VM backup: $timestamp"

  local vm
  for vm in "${VM_NAMES[@]}"; do
    backup_vm "$vm" "$incomplete_dir"
  done

  if [[ "$BACKED_UP_COUNT" == "0" ]]; then
    log "No VMs were eligible for backup; no snapshot published"
    rm -rf -- "$incomplete_dir"
    return 0
  fi

  write_manifest "$incomplete_dir" "$timestamp"
  verify_snapshot "$incomplete_dir"

  if [[ "$DRY_RUN" == "1" ]]; then
    log "Dry run complete; no snapshot published"
    rm -rf -- "$incomplete_dir"
    return 0
  fi

  # Publish only after verification: the rename makes the snapshot visible
  # under its final name in one step.
  mv "$incomplete_dir" "$final_dir"
  printf '%s\n' "$timestamp" > "$BACKUP_ROOT/LATEST.txt"
  prune_old_snapshots

  log "Agrarian VM backup completed: $final_dir"
}
|
||||
|
||||
# Script entry point: hand every command-line argument to main.
main "$@"
|
||||
Reference in New Issue
Block a user