nixpkgs/nixos/modules/system/boot/stage-1-init.sh

639 lines
18 KiB
Bash
Raw Normal View History

#! @shell@
targetRoot=/mnt-root
console=tty1
2021-01-03 09:09:30 +00:00
verbose="@verbose@"
info() {
if [[ -n "$verbose" ]]; then
echo "$@"
fi
}
2016-07-06 19:57:14 +00:00
extraUtils="@extraUtils@"
export LD_LIBRARY_PATH=@extraUtils@/lib
export PATH=@extraUtils@/bin
ln -s @extraUtils@/bin /bin
# Copy the secrets to their needed location
if [ -d "@extraUtils@/secrets" ]; then
for secret in $(cd "@extraUtils@/secrets"; find . -type f); do
mkdir -p $(dirname "/$secret")
ln -s "@extraUtils@/secrets/$secret" "$secret"
done
fi
# Stop LVM complaining about fd3
export LVM_SUPPRESS_FD_WARNINGS=true
fail() {
if [ -n "$panicOnFail" ]; then exit 1; fi
@preFailCommands@
# If starting stage 2 failed, allow the user to repair the problem
# in an interactive shell.
cat <<EOF
2014-03-07 18:39:55 +00:00
An error occurred in stage 1 of the boot process, which must mount the
root filesystem on \`$targetRoot' and then start stage 2. Press one
of the following keys:
EOF
if [ -n "$allowShell" ]; then cat <<EOF
i) to launch an interactive shell
f) to start an interactive shell having pid 1 (needed if you want to
start stage 2's init manually)
EOF
fi
cat <<EOF
r) to reboot immediately
*) to ignore the error and continue
EOF
read -n 1 reply
if [ -n "$allowShell" -a "$reply" = f ]; then
exec setsid @shell@ -c "exec @shell@ < /dev/$console >/dev/$console 2>/dev/$console"
elif [ -n "$allowShell" -a "$reply" = i ]; then
echo "Starting interactive shell..."
setsid @shell@ -c "exec @shell@ < /dev/$console >/dev/$console 2>/dev/$console" || fail
elif [ "$reply" = r ]; then
echo "Rebooting..."
reboot -f
else
2021-01-03 09:09:30 +00:00
info "Continuing..."
fi
}
trap 'fail' 0
# Print a greeting.
2021-01-03 09:09:30 +00:00
info
info "<<< NixOS Stage 1 >>>"
info
# Make several required directories.
mkdir -p /etc/udev
touch /etc/fstab # to shut up mount
ln -s /proc/mounts /etc/mtab # to shut up mke2fs
2014-12-19 13:38:33 +00:00
touch /etc/udev/hwdb.bin # to shut up udev
touch /etc/initrd-release
# Function for waiting a device to appear.
waitDevice() {
local device="$1"
# USB storage devices tend to appear with some delay. It would be
# great if we had a way to synchronously wait for them, but
# alas... So just wait for a few seconds for the device to
# appear.
if test ! -e $device; then
echo -n "waiting for device $device to appear..."
try=20
while [ $try -gt 0 ]; do
sleep 1
# also re-try lvm activation now that new block devices might have appeared
lvm vgchange -ay
# and tell udev to create nodes for the new LVs
udevadm trigger --action=add
if test -e $device; then break; fi
echo -n "."
try=$((try - 1))
done
echo
[ $try -ne 0 ]
fi
}
# Mount special file systems.
specialMount() {
local device="$1"
local mountPoint="$2"
local options="$3"
local fsType="$4"
mkdir -m 0755 -p "$mountPoint"
mount -n -t "$fsType" -o "$options" "$device" "$mountPoint"
}
source @earlyMountScript@
# Log the script output to /dev/kmsg or /run/log/stage-1-init.log.
mkdir -p /tmp
mkfifo /tmp/stage-1-init.log.fifo
logOutFd=8 && logErrFd=9
eval "exec $logOutFd>&1 $logErrFd>&2"
if test -w /dev/kmsg; then
tee -i < /tmp/stage-1-init.log.fifo /proc/self/fd/"$logOutFd" | while read -r line; do
if test -n "$line"; then
echo "<7>stage-1-init: [$(date)] $line" > /dev/kmsg
fi
done &
else
mkdir -p /run/log
tee -i < /tmp/stage-1-init.log.fifo /run/log/stage-1-init.log &
fi
exec > /tmp/stage-1-init.log.fifo 2>&1
# Process the kernel command line.
export stage2Init=/init
for o in $(cat /proc/cmdline); do
case $o in
console=*)
set -- $(IFS==; echo $o)
params=$2
set -- $(IFS=,; echo $params)
console=$1
;;
init=*)
set -- $(IFS==; echo $o)
stage2Init=$2
;;
boot.persistence=*)
set -- $(IFS==; echo $o)
persistence=$2
;;
boot.persistence.opt=*)
set -- $(IFS==; echo $o)
persistence_opt=$2
;;
boot.trace|debugtrace)
# Show each command.
set -x
;;
boot.shell_on_fail)
allowShell=1
;;
boot.debug1|debug1) # stop right away
allowShell=1
fail
;;
boot.debug1devices) # stop after loading modules and creating device nodes
allowShell=1
debug1devices=1
;;
boot.debug1mounts) # stop after mounting file systems
allowShell=1
debug1mounts=1
;;
boot.panic_on_fail|stage1panic=1)
panicOnFail=1
;;
root=*)
# If a root device is specified on the kernel command
# line, make it available through the symlink /dev/root.
# Recognise LABEL= and UUID= to support UNetbootin.
set -- $(IFS==; echo $o)
if [ $2 = "LABEL" ]; then
root="/dev/disk/by-label/$3"
elif [ $2 = "UUID" ]; then
root="/dev/disk/by-uuid/$3"
else
root=$2
fi
ln -s "$root" /dev/root
;;
2017-04-18 11:45:30 +00:00
copytoram)
copytoram=1
;;
findiso=*)
# if an iso name is supplied, try to find the device where
# the iso resides on
set -- $(IFS==; echo $o)
isoPath=$2
;;
esac
done
# Set hostid before modules are loaded.
# This is needed by the spl/zfs modules.
@setHostId@
# Load the required kernel modules.
mkdir -p /lib
ln -s @modulesClosure@/lib/modules /lib/modules
ln -s @modulesClosure@/lib/firmware /lib/firmware
echo @extraUtils@/bin/modprobe > /proc/sys/kernel/modprobe
for i in @kernelModules@; do
2021-01-03 09:09:30 +00:00
info "loading module $(basename $i)..."
modprobe $i
done
# Create device nodes in /dev.
@preDeviceCommands@
2021-01-03 09:09:30 +00:00
info "running udev..."
ln -sfn /proc/self/fd /dev/fd
ln -sfn /proc/self/fd/0 /dev/stdin
ln -sfn /proc/self/fd/1 /dev/stdout
ln -sfn /proc/self/fd/2 /dev/stderr
mkdir -p /etc/systemd
ln -sfn @linkUnits@ /etc/systemd/network
mkdir -p /etc/udev
ln -sfn @udevRules@ /etc/udev/rules.d
mkdir -p /dev/.mdadm
systemd-udevd --daemon
udevadm trigger --action=add
udevadm settle
# XXX: Use case usb->lvm will still fail, usb->luks->lvm is covered
@preLVMCommands@
2021-01-03 09:09:30 +00:00
info "starting device mapper and LVM..."
lvm vgchange -ay
if test -n "$debug1devices"; then fail; fi
@postDeviceCommands@
# Return true if the machine is on AC power, or if we can't determine
# whether it's on AC power.
onACPower() {
! test -d "/proc/acpi/battery" ||
! ls /proc/acpi/battery/BAT[0-9]* > /dev/null 2>&1 ||
! cat /proc/acpi/battery/BAT*/state | grep "^charging state" | grep -q "discharg"
}
# Check the specified file system, if appropriate.
checkFS() {
local device="$1"
local fsType="$2"
# Only check block devices.
if [ ! -b "$device" ]; then return 0; fi
# Don't check ROM filesystems.
if [ "$fsType" = iso9660 -o "$fsType" = udf ]; then return 0; fi
# Don't check resilient COWs as they validate the fs structures at mount time
if [ "$fsType" = btrfs -o "$fsType" = zfs -o "$fsType" = bcachefs ]; then return 0; fi
2017-08-31 10:24:48 +00:00
# Skip fsck for nilfs2 - not needed by design and no fsck tool for this filesystem.
if [ "$fsType" = nilfs2 ]; then return 0; fi
# Skip fsck for inherently readonly filesystems.
if [ "$fsType" = squashfs ]; then return 0; fi
# If we couldn't figure out the FS type, then skip fsck.
if [ "$fsType" = auto ]; then
echo 'cannot check filesystem with type "auto"!'
return 0
fi
# Device might be already mounted manually
# e.g. NBD-device or the host filesystem of the file which contains encrypted root fs
if mount | grep -q "^$device on "; then
echo "skip checking already mounted $device"
return 0
fi
# Optionally, skip fsck on journaling filesystems. This option is
# a hack - it's mostly because e2fsck on ext3 takes much longer to
# recover the journal than the ext3 implementation in the kernel
# does (minutes versus seconds).
if test -z "@checkJournalingFS@" -a \
\( "$fsType" = ext3 -o "$fsType" = ext4 -o "$fsType" = reiserfs \
-o "$fsType" = xfs -o "$fsType" = jfs -o "$fsType" = f2fs \)
then
return 0
fi
# Don't run `fsck' if the machine is on battery power. !!! Is
# this a good idea?
if ! onACPower; then
echo "on battery power, so no \`fsck' will be performed on \`$device'"
return 0
fi
echo "checking $device..."
fsckFlags=
if test "$fsType" != "btrfs"; then
fsckFlags="-V -a"
fi
fsck $fsckFlags "$device"
fsckResult=$?
if test $(($fsckResult | 2)) = $fsckResult; then
echo "fsck finished, rebooting..."
sleep 3
reboot -f
fi
if test $(($fsckResult | 4)) = $fsckResult; then
echo "$device has unrepaired errors, please fix them manually."
fail
fi
if test $fsckResult -ge 8; then
echo "fsck on $device failed."
fail
fi
return 0
}
# Function for mounting a file system.
mountFS() {
local device="$1"
local mountPoint="$2"
local options="$3"
local fsType="$4"
if [ "$fsType" = auto ]; then
fsType=$(blkid -o value -s TYPE "$device")
if [ -z "$fsType" ]; then fsType=auto; fi
fi
# Filter out x- options, which busybox doesn't do yet.
local optionsFiltered="$(IFS=,; for i in $options; do if [ "${i:0:2}" != "x-" ]; then echo -n $i,; fi; done)"
# Prefix (lower|upper|work)dir with /mnt-root (overlayfs)
local optionsPrefixed="$( echo "$optionsFiltered" | sed -E 's#\<(lowerdir|upperdir|workdir)=#\1=/mnt-root#g' )"
echo "$device /mnt-root$mountPoint $fsType $optionsPrefixed" >> /etc/fstab
checkFS "$device" "$fsType"
# Optionally resize the filesystem.
case $options in
*x-nixos.autoresize*)
if [ "$fsType" = ext2 -o "$fsType" = ext3 -o "$fsType" = ext4 ]; then
stage-1: modprobe ext{2,3,4} before resizing I noticed booting a system with an ext4 root which expanded to 5T took quite a long time (12 minutes in some cases, 43(!) in others.) I changed stage-1 to run `resize2fs -d 62` for extra debug output and timing information. It revealed the adjust_superblock step taking almost all of the time: [Fri Oct 30 11:10:15 UTC 2020] zero_high_bits_in_metadata: Memory used: 132k/0k (63k/70k), time: 0.00/ 0.00/ 0.00 [Fri Oct 30 11:21:09 UTC 2020] adjust_superblock: Memory used: 396k/4556k (295k/102k), time: 654.21/ 0.59/ 5.13 but when I ran resize2fs on a disk with the identical content growing to the identical target size, it would only take about 30 seconds. I looked at what happened between those two steps in the fast case with strace and found: ``` 235 getrusage(RUSAGE_SELF, {ru_utime={tv_sec=0, tv_usec=1795}, ru_stime={tv_sec=0, tv_usec=3590}, ...}) = 0 236 write(1, "zero_high_bits_in_metadata: Memo"..., 84zero_high_bits_in_metadata: Memory used: 132k/0k (72k/61k), time: 0.00/ 0.00/ 0.00 237 ) = 84 238 gettimeofday({tv_sec=1604061278, tv_usec=480147}, NULL) = 0 239 getrusage(RUSAGE_SELF, {ru_utime={tv_sec=0, tv_usec=1802}, ru_stime={tv_sec=0, tv_usec=3603}, ...}) = 0 240 gettimeofday({tv_sec=1604061278, tv_usec=480192}, NULL) = 0 241 mmap(NULL, 2564096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa3c7355000 242 access("/sys/fs/ext4/features/lazy_itable_init", F_OK) = 0 243 brk(0xf85000) = 0xf85000 244 brk(0xfa6000) = 0xfa6000 245 gettimeofday({tv_sec=1604061278, tv_usec=538828}, NULL) = 0 246 getrusage(RUSAGE_SELF, {ru_utime={tv_sec=0, tv_usec=58720}, ru_stime={tv_sec=0, tv_usec=3603}, ...}) = 0 247 write(1, "adjust_superblock: Memory used: "..., 79adjust_superblock: Memory used: 396k/2504k (305k/92k), time: 0.06/ 0.06/ 0.00 248 ) = 79 249 gettimeofday({tv_sec=1604061278, tv_usec=539119}, NULL) = 0 250 getrusage(RUSAGE_SELF, {ru_utime={tv_sec=0, tv_usec=58812}, ru_stime={tv_sec=0, tv_usec=3603}, ...}) = 0 251 gettimeofday({tv_sec=1604061279, tv_usec=939}, NULL) = 0 252 getrusage(RUSAGE_SELF, {ru_utime={tv_sec=0, tv_usec=520411}, ru_stime={tv_sec=0, tv_usec=3603}, ...}) = 0 253 write(1, "fix_uninit_block_bitmaps 2: Memo"..., 88fix_uninit_block_bitmaps 2: Memory used: 396k/2504k (305k/92k), time: 0.46/ 0.46/ 0.00 254 ) = 88 ``` In particular the access to /sys/fs seemed interesting. Looking at the source of resize2fs: ``` [root@ip-172-31-22-182:~/e2fsprogs-1.45.5]# rg -B2 -A1 /sys/fs/ext4/features/lazy_itable_init . ./resize/resize2fs.c 923- if (getenv("RESIZE2FS_FORCE_LAZY_ITABLE_INIT") || 924- (!getenv("RESIZE2FS_FORCE_ITABLE_INIT") && 925: access("/sys/fs/ext4/features/lazy_itable_init", F_OK) == 0)) 926- lazy_itable_init = 1; ``` I confirmed /sys is mounted, and then found a bug suggesting the ext4 module is maybe not loaded: https://bugzilla.redhat.com/show_bug.cgi?id=1071909 My home server doesn't have ext4 loaded and had 3T to play with, so I tried (and succeeded with) replicating the issue locally: ``` [root@kif:/scratch]# lsmod | grep -i ext [root@kif:/scratch]# zfs create -V 3G rpool/scratch/ext4 [root@kif:/scratch]# time mkfs.ext4 /dev/zvol/rpool/scratch/ext4 mke2fs 1.45.5 (07-Jan-2020) Discarding device blocks: done Creating filesystem with 786432 4k blocks and 196608 inodes Filesystem UUID: 560a4a8f-93dc-40cc-97a5-f10049bf801f Superblock backups stored on blocks: 32768, 98304, 163840, 229376, 294912 Allocating group tables: done Writing inode tables: done Creating journal (16384 blocks): done Writing superblocks and filesystem accounting information: done real 0m2.261s user 0m0.000s sys 0m0.025s [root@kif:/scratch]# zfs set volsize=3T rpool/scratch/ext4 [root@kif:/scratch]# time resize2fs -d 62 /dev/zvol/rpool/scratch/ext4 resize2fs 1.45.5 (07-Jan-2020) fs has 11 inodes, 1 groups required. fs requires 16390 data blocks. With 1 group(s), we have 22234 blocks available. Last group's overhead is 10534 Need 16390 data blocks in last group Final size of last group is 26924 Estimated blocks needed: 26924 Extents safety margin: 49 Resizing the filesystem on /dev/zvol/rpool/scratch/ext4 to 805306368 (4k) blocks. read_bitmaps: Memory used: 132k/0k (63k/70k), time: 0.00/ 0.00/ 0.00 read_bitmaps: I/O read: 1MB, write: 0MB, rate: 3802.28MB/s fix_uninit_block_bitmaps 1: Memory used: 132k/0k (63k/70k), time: 0.00/ 0.00/ 0.00 resize_group_descriptors: Memory used: 132k/0k (68k/65k), time: 0.00/ 0.00/ 0.00 move_bg_metadata: Memory used: 132k/0k (68k/65k), time: 0.00/ 0.00/ 0.00 zero_high_bits_in_metadata: Memory used: 132k/0k (68k/65k), time: 0.00/ 0.00/ 0.00 ``` here it got stuck for quite some time ... straceing this 20 minutes in revealed this in a tight loop: ``` getuid() = 0 geteuid() = 0 getgid() = 0 getegid() = 0 prctl(PR_GET_DUMPABLE) = 1 (SUID_DUMP_USER) fallocate(3, FALLOC_FL_ZERO_RANGE, 2222649901056, 2097152) = 0 fsync(3) = 0 ``` it finally ended 43(!) minutes later: ``` adjust_superblock: Memory used: 264k/3592k (210k/55k), time: 2554.03/ 0.16/15.07 fix_uninit_block_bitmaps 2: Memory used: 264k/3592k (210k/55k), time: 0.16/ 0.16/ 0.00 blocks_to_move: Memory used: 264k/3592k (211k/54k), time: 0.00/ 0.00/ 0.00 Number of free blocks: 755396/780023556, Needed: 0 block_mover: Memory used: 264k/3592k (216k/49k), time: 0.05/ 0.01/ 0.00 block_mover: I/O read: 1MB, write: 0MB, rate: 18.68MB/s inode_scan_and_fix: Memory used: 264k/3592k (216k/49k), time: 0.00/ 0.00/ 0.00 inode_ref_fix: Memory used: 264k/3592k (216k/49k), time: 0.00/ 0.00/ 0.00 move_itables: Memory used: 264k/3592k (216k/49k), time: 0.00/ 0.00/ 0.00 calculate_summary_stats: Memory used: 264k/3592k (216k/49k), time: 16.35/16.35/ 0.00 fix_resize_inode: Memory used: 264k/3592k (222k/43k), time: 0.04/ 0.00/ 0.00 fix_resize_inode: I/O read: 1MB, write: 0MB, rate: 22.80MB/s fix_sb_journal_backup: Memory used: 264k/3592k (222k/43k), time: 0.00/ 0.00/ 0.00 overall resize2fs: Memory used: 264k/3592k (222k/43k), time: 2570.90/16.68/15.07 overall resize2fs: I/O read: 1MB, write: 1MB, rate: 0.00MB/s The filesystem on /dev/zvol/rpool/scratch/ext4 is now 805306368 (4k) blocks long. real 43m1.943s user 0m16.761s sys 0m15.069s ``` I then cleaned up and recreated the zvol, loaded the ext4 module, created the ext4 fs, resized the volume, and resize2fs'd and it went quite quickly: ``` [root@kif:/scratch]# zfs destroy rpool/scratch/ext4 [root@kif:/scratch]# zfs create -V 3G rpool/scratch/ext4 [root@kif:/scratch]# modprobe ext4 [root@kif:/scratch]# time resize2fs -d 62 /dev/zvol/rpool/scratch/ext4 [root@kif:/scratch]# time mkfs.ext4 /dev/zvol/rpool/scratch/ext4 mke2fs 1.45.5 (07-Jan-2020) Discarding device blocks: done Creating filesystem with 786432 4k blocks and 196608 inodes Filesystem UUID: 5b415f2f-a8c4-4ba0-ac1d-78860de77610 Superblock backups stored on blocks: 32768, 98304, 163840, 229376, 294912 Allocating group tables: done Writing inode tables: done Creating journal (16384 blocks): done Writing superblocks and filesystem accounting information: done real 0m1.013s user 0m0.001s sys 0m0.023s [root@kif:/scratch]# zfs set volsize=3T rpool/scratch/ext4 [root@kif:/scratch]# time resize2fs -d 62 /dev/zvol/rpool/scratch/ext4 resize2fs 1.45.5 (07-Jan-2020) fs has 11 inodes, 1 groups required. fs requires 16390 data blocks. With 1 group(s), we have 22234 blocks available. Last group's overhead is 10534 Need 16390 data blocks in last group Final size of last group is 26924 Estimated blocks needed: 26924 Extents safety margin: 49 Resizing the filesystem on /dev/zvol/rpool/scratch/ext4 to 805306368 (4k) blocks. read_bitmaps: Memory used: 132k/0k (63k/70k), time: 0.00/ 0.00/ 0.00 read_bitmaps: I/O read: 1MB, write: 0MB, rate: 3389.83MB/s fix_uninit_block_bitmaps 1: Memory used: 132k/0k (63k/70k), time: 0.00/ 0.00/ 0.00 resize_group_descriptors: Memory used: 132k/0k (68k/65k), time: 0.00/ 0.00/ 0.00 move_bg_metadata: Memory used: 132k/0k (68k/65k), time: 0.00/ 0.00/ 0.00 zero_high_bits_in_metadata: Memory used: 132k/0k (68k/65k), time: 0.00/ 0.00/ 0.00 adjust_superblock: Memory used: 264k/1540k (210k/55k), time: 0.02/ 0.02/ 0.00 fix_uninit_block_bitmaps 2: Memory used: 264k/1540k (210k/55k), time: 0.15/ 0.15/ 0.00 blocks_to_move: Memory used: 264k/1540k (211k/54k), time: 0.00/ 0.00/ 0.00 Number of free blocks: 755396/780023556, Needed: 0 block_mover: Memory used: 264k/3592k (216k/49k), time: 0.01/ 0.01/ 0.00 block_mover: I/O read: 1MB, write: 0MB, rate: 157.11MB/s inode_scan_and_fix: Memory used: 264k/3592k (216k/49k), time: 0.00/ 0.00/ 0.00 inode_ref_fix: Memory used: 264k/3592k (216k/49k), time: 0.00/ 0.00/ 0.00 move_itables: Memory used: 264k/3592k (216k/49k), time: 0.00/ 0.00/ 0.00 calculate_summary_stats: Memory used: 264k/3592k (216k/49k), time: 16.20/16.20/ 0.00 fix_resize_inode: Memory used: 264k/3592k (222k/43k), time: 0.00/ 0.00/ 0.00 fix_resize_inode: I/O read: 1MB, write: 0MB, rate: 5319.15MB/s fix_sb_journal_backup: Memory used: 264k/3592k (222k/43k), time: 0.00/ 0.00/ 0.00 overall resize2fs: Memory used: 264k/3592k (222k/43k), time: 16.45/16.38/ 0.00 overall resize2fs: I/O read: 1MB, write: 1MB, rate: 0.06MB/s The filesystem on /dev/zvol/rpool/scratch/ext4 is now 805306368 (4k) blocks long. real 0m17.908s user 0m16.386s sys 0m0.079s ``` Success!
2020-10-30 15:20:00 +00:00
modprobe "$fsType"
echo "resizing $device..."
e2fsck -fp "$device"
resize2fs "$device"
2018-06-18 22:59:08 +00:00
elif [ "$fsType" = f2fs ]; then
echo "resizing $device..."
fsck.f2fs -fp "$device"
resize.f2fs "$device"
fi
;;
esac
# Create backing directories for overlayfs
if [ "$fsType" = overlay ]; then
for i in upper work; do
dir="$( echo "$optionsPrefixed" | grep -o "${i}dir=[^,]*" )"
mkdir -m 0700 -p "${dir##*=}"
done
fi
2021-01-03 09:09:30 +00:00
info "mounting $device on $mountPoint..."
mkdir -p "/mnt-root$mountPoint"
# For ZFS and CIFS mounts, retry a few times before giving up.
# We do this for ZFS as a workaround for issue NixOS/nixpkgs#25383.
local n=0
while true; do
mount "/mnt-root$mountPoint" && break
if [ \( "$fsType" != cifs -a "$fsType" != zfs \) -o "$n" -ge 10 ]; then fail; break; fi
echo "retrying..."
sleep 1
n=$((n + 1))
done
2016-08-22 00:15:13 +00:00
[ "$mountPoint" == "/" ] &&
[ -f "/mnt-root/etc/NIXOS_LUSTRATE" ] &&
lustrateRoot "/mnt-root"
true
}
2016-08-22 00:15:13 +00:00
lustrateRoot () {
local root="$1"
echo
echo -e "\e[1;33m<<< NixOS is now lustrating the root filesystem (cruft goes to /old-root) >>>\e[0m"
echo
mkdir -m 0755 -p "$root/old-root.tmp"
echo
echo "Moving impurities out of the way:"
for d in "$root"/*
do
[ "$d" == "$root/nix" ] && continue
[ "$d" == "$root/boot" ] && continue # Don't render the system unbootable
[ "$d" == "$root/old-root.tmp" ] && continue
mv -v "$d" "$root/old-root.tmp"
done
# Use .tmp to make sure subsequent invokations don't clash
mv -v "$root/old-root.tmp" "$root/old-root"
mkdir -m 0755 -p "$root/etc"
touch "$root/etc/NIXOS"
exec 4< "$root/old-root/etc/NIXOS_LUSTRATE"
echo
echo "Restoring selected impurities:"
while read -u 4 keeper; do
dirname="$(dirname "$keeper")"
mkdir -m 0755 -p "$root/$dirname"
cp -av "$root/old-root/$keeper" "$root/$keeper"
done
exec 4>&-
}
if test -e /sys/power/resume -a -e /sys/power/disk; then
if test -n "@resumeDevice@" && waitDevice "@resumeDevice@"; then
resumeDev="@resumeDevice@"
resumeInfo="$(udevadm info -q property "$resumeDev" )"
else
for sd in @resumeDevices@; do
# Try to detect resume device. According to Ubuntu bug:
# https://bugs.launchpad.net/ubuntu/+source/pm-utils/+bug/923326/comments/1
# when there are multiple swap devices, we can't know where the hibernate
# image will reside. We can check all of them for swsuspend blkid.
if waitDevice "$sd"; then
resumeInfo="$(udevadm info -q property "$sd")"
if [ "$(echo "$resumeInfo" | sed -n 's/^ID_FS_TYPE=//p')" = "swsuspend" ]; then
resumeDev="$sd"
break
fi
fi
done
fi
if test -n "$resumeDev"; then
resumeMajor="$(echo "$resumeInfo" | sed -n 's/^MAJOR=//p')"
resumeMinor="$(echo "$resumeInfo" | sed -n 's/^MINOR=//p')"
echo "$resumeMajor:$resumeMinor" > /sys/power/resume 2> /dev/null || echo "failed to resume..."
fi
fi
# If we have a path to an iso file, find the iso and link it to /dev/root
if [ -n "$isoPath" ]; then
mkdir -p /findiso
for delay in 5 10; do
blkid | while read -r line; do
device=$(echo "$line" | sed 's/:.*//')
type=$(echo "$line" | sed 's/.*TYPE="\([^"]*\)".*/\1/')
mount -t "$type" "$device" /findiso
if [ -e "/findiso$isoPath" ]; then
ln -sf "/findiso$isoPath" /dev/root
break 2
else
umount /findiso
fi
done
sleep "$delay"
done
fi
# Try to find and mount the root device.
mkdir -p $targetRoot
exec 3< @fsInfo@
while read -u 3 mountPoint; do
read -u 3 device
read -u 3 fsType
read -u 3 options
# !!! Really quick hack to support bind mounts, i.e., where the
# "device" should be taken relative to /mnt-root, not /. Assume
# that every device that starts with / but doesn't start with /dev
# is a bind mount.
pseudoDevice=
case $device in
/dev/*)
;;
//*)
# Don't touch SMB/CIFS paths.
pseudoDevice=1
;;
/*)
device=/mnt-root$device
;;
*)
# Not an absolute path; assume that it's a pseudo-device
# like an NFS path (e.g. "server:/path").
pseudoDevice=1
;;
esac
if test -z "$pseudoDevice" && ! waitDevice "$device"; then
# If it doesn't appear, try to mount it anyway (and
# probably fail). This is a fallback for non-device "devices"
# that we don't properly recognise.
echo "Timed out waiting for device $device, trying to mount anyway."
fi
# Wait once more for the udev queue to empty, just in case it's
# doing something with $device right now.
udevadm settle
2017-04-18 11:45:30 +00:00
# If copytoram is enabled: skip mounting the ISO and copy its content to a tmpfs.
if [ -n "$copytoram" ] && [ "$device" = /dev/root ] && [ "$mountPoint" = /iso ]; then
fsType=$(blkid -o value -s TYPE "$device")
fsSize=$(blockdev --getsize64 "$device" || stat -Lc '%s' "$device")
2017-04-18 11:45:30 +00:00
mkdir -p /tmp-iso
mount -t "$fsType" /dev/root /tmp-iso
mountFS tmpfs /iso size="$fsSize" tmpfs
cp -r /tmp-iso/* /mnt-root/iso/
umount /tmp-iso
rmdir /tmp-iso
continue
fi
if [ "$mountPoint" = / ] && [ "$device" = tmpfs ] && [ ! -z "$persistence" ]; then
echo persistence...
waitDevice "$persistence"
echo enabling persistence...
mountFS "$persistence" "$mountPoint" "$persistence_opt" "auto"
continue
fi
mountFS "$device" "$mountPoint" "$options" "$fsType"
done
exec 3>&-
@postMountCommands@
# Emit a udev rule for /dev/root to prevent systemd from complaining.
2014-11-11 22:48:31 +00:00
if [ -e /mnt-root/iso ]; then
eval $(udevadm info --export --export-prefix=ROOT_ --device-id-of-file=/mnt-root/iso)
2014-11-11 22:48:31 +00:00
else
eval $(udevadm info --export --export-prefix=ROOT_ --device-id-of-file=$targetRoot)
2014-11-11 22:48:31 +00:00
fi
if [ "$ROOT_MAJOR" -a "$ROOT_MINOR" -a "$ROOT_MAJOR" != 0 ]; then
mkdir -p /run/udev/rules.d
echo 'ACTION=="add|change", SUBSYSTEM=="block", ENV{MAJOR}=="'$ROOT_MAJOR'", ENV{MINOR}=="'$ROOT_MINOR'", SYMLINK+="root"' > /run/udev/rules.d/61-dev-root-link.rules
fi
# Stop udevd.
udevadm control --exit
# Reset the logging file descriptors.
# Do this just before pkill, which will kill the tee process.
exec 1>&$logOutFd 2>&$logErrFd
eval "exec $logOutFd>&- $logErrFd>&-"
# Kill any remaining processes, just to be sure we're not taking any
# with us into stage 2. But keep storage daemons like unionfs-fuse.
nixos/stage-1: Don't kill kernel threads Unfortunately, pkill doesn't distinguish between kernel and user space processes, so we need to make sure we don't accidentally kill kernel threads. Normally, a kernel thread ignores all signals, but there are a few that do. A quick grep on the kernel source tree (as of kernel 4.6.0) shows the following source files which use allow_signal(): drivers/isdn/mISDN/l1oip_core.c drivers/md/md.c drivers/misc/mic/cosm/cosm_scif_server.c drivers/misc/mic/cosm_client/cosm_scif_client.c drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c drivers/staging/rtl8188eu/core/rtw_cmd.c drivers/staging/rtl8712/rtl8712_cmd.c drivers/target/iscsi/iscsi_target.c drivers/target/iscsi/iscsi_target_login.c drivers/target/iscsi/iscsi_target_nego.c drivers/usb/atm/usbatm.c drivers/usb/gadget/function/f_mass_storage.c fs/jffs2/background.c fs/lockd/clntlock.c fs/lockd/svc.c fs/nfs/nfs4state.c fs/nfsd/nfssvc.c While not all of these are necessarily kthreads and some functionality may still be unimpeded, it's still quite harmful and can cause unexpected side-effects, especially because some of these kthreads are storage-related (which we obviously don't want to kill during bootup). During discussion at #15226, @dezgeg suggested the following implementation: for pid in $(pgrep -v -f '@'); do if [ "$(cat /proc/$pid/cmdline)" != "" ]; then kill -9 "$pid" fi done This has a few downsides: * User space processes which use an empty string in their command line won't be killed. * It results in errors during bootup because some shell-related processes are already terminated (maybe it's pgrep itself, haven't checked). * The @ is searched within the full command line, not just at the beginning of the string. Of course, we already had this until now, so it's not a problem of his implementation. I posted an alternative implementation which doesn't suffer from the first point, but even that one wasn't sufficient: for pid in $(pgrep -v -f '^@'); do readlink "/proc/$pid/exe" &> /dev/null || continue echo "$pid" done | xargs kill -9 This one spawns a subshell, which would be included in the processes to kill and actually kills itself during the process. So what we have now is even checking whether the shell process itself is in the list to kill and avoids killing it just to be sure. Also, we don't spawn a subshell anymore and use /proc/$pid/exe to distinguish between user space and kernel processes like in the comments of the following StackOverflow answer: http://stackoverflow.com/a/12231039 We don't need to take care of terminating processes, because what we actually want IS to terminate the processes. The only point where this (and any previous) approach falls short if we have processes that act like fork bombs, because they might spawn additional processes between the pgrep and the killing. We can only address this with process/control groups and this still won't save us because the root user can escape from that as well. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Fixes: #15226
2016-05-06 10:36:58 +00:00
#
# Storage daemons are distinguished by an @ in front of their command line:
# https://www.freedesktop.org/wiki/Software/systemd/RootStorageDaemons/
for pid in $(pgrep -v -f '^@'); do
nixos/stage-1: Don't kill kernel threads Unfortunately, pkill doesn't distinguish between kernel and user space processes, so we need to make sure we don't accidentally kill kernel threads. Normally, a kernel thread ignores all signals, but there are a few that do. A quick grep on the kernel source tree (as of kernel 4.6.0) shows the following source files which use allow_signal(): drivers/isdn/mISDN/l1oip_core.c drivers/md/md.c drivers/misc/mic/cosm/cosm_scif_server.c drivers/misc/mic/cosm_client/cosm_scif_client.c drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c drivers/staging/rtl8188eu/core/rtw_cmd.c drivers/staging/rtl8712/rtl8712_cmd.c drivers/target/iscsi/iscsi_target.c drivers/target/iscsi/iscsi_target_login.c drivers/target/iscsi/iscsi_target_nego.c drivers/usb/atm/usbatm.c drivers/usb/gadget/function/f_mass_storage.c fs/jffs2/background.c fs/lockd/clntlock.c fs/lockd/svc.c fs/nfs/nfs4state.c fs/nfsd/nfssvc.c While not all of these are necessarily kthreads and some functionality may still be unimpeded, it's still quite harmful and can cause unexpected side-effects, especially because some of these kthreads are storage-related (which we obviously don't want to kill during bootup). During discussion at #15226, @dezgeg suggested the following implementation: for pid in $(pgrep -v -f '@'); do if [ "$(cat /proc/$pid/cmdline)" != "" ]; then kill -9 "$pid" fi done This has a few downsides: * User space processes which use an empty string in their command line won't be killed. * It results in errors during bootup because some shell-related processes are already terminated (maybe it's pgrep itself, haven't checked). * The @ is searched within the full command line, not just at the beginning of the string. Of course, we already had this until now, so it's not a problem of his implementation. I posted an alternative implementation which doesn't suffer from the first point, but even that one wasn't sufficient: for pid in $(pgrep -v -f '^@'); do readlink "/proc/$pid/exe" &> /dev/null || continue echo "$pid" done | xargs kill -9 This one spawns a subshell, which would be included in the processes to kill and actually kills itself during the process. So what we have now is even checking whether the shell process itself is in the list to kill and avoids killing it just to be sure. Also, we don't spawn a subshell anymore and use /proc/$pid/exe to distinguish between user space and kernel processes like in the comments of the following StackOverflow answer: http://stackoverflow.com/a/12231039 We don't need to take care of terminating processes, because what we actually want IS to terminate the processes. The only point where this (and any previous) approach falls short if we have processes that act like fork bombs, because they might spawn additional processes between the pgrep and the killing. We can only address this with process/control groups and this still won't save us because the root user can escape from that as well. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Fixes: #15226
2016-05-06 10:36:58 +00:00
# Make sure we don't kill kernel processes, see #15226 and:
# http://stackoverflow.com/questions/12213445/identifying-kernel-threads
readlink "/proc/$pid/exe" &> /dev/null || continue
# Try to avoid killing ourselves.
[ $pid -eq $$ ] && continue
kill -9 "$pid"
done
if test -n "$debug1mounts"; then fail; fi
# Restore /proc/sys/kernel/modprobe to its original value.
echo /sbin/modprobe > /proc/sys/kernel/modprobe
# Start stage 2. `switch_root' deletes all files in the ramfs on the
# current root. The path has to be valid in the chroot not outside.
if [ ! -e "$targetRoot/$stage2Init" ]; then
stage2Check=${stage2Init}
while [ "$stage2Check" != "${stage2Check%/*}" ] && [ ! -L "$targetRoot/$stage2Check" ]; do
stage2Check=${stage2Check%/*}
done
if [ ! -L "$targetRoot/$stage2Check" ]; then
echo "stage 2 init script ($targetRoot/$stage2Init) not found"
fail
fi
fi
mkdir -m 0755 -p $targetRoot/proc $targetRoot/sys $targetRoot/dev $targetRoot/run
mount --move /proc $targetRoot/proc
mount --move /sys $targetRoot/sys
mount --move /dev $targetRoot/dev
mount --move /run $targetRoot/run
exec env -i $(type -P switch_root) "$targetRoot" "$stage2Init"
fail # should never be reached