From 09996e1762a7a14ce12de5297491d375ea1eb786 Mon Sep 17 00:00:00 2001
From: tristan
Date: Tue, 19 Nov 2024 18:16:05 +0000
Subject: [PATCH] Add ZFS_SETUP.md

---
 ZFS_SETUP.md | 282 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 282 insertions(+)
 create mode 100644 ZFS_SETUP.md

diff --git a/ZFS_SETUP.md b/ZFS_SETUP.md
new file mode 100644
index 0000000..79050c6
--- /dev/null
+++ b/ZFS_SETUP.md
@@ -0,0 +1,282 @@
# HE4

## Hardware

- 10x 16 TB HDDs (14.6 TiB each, Seagate Exos X18)
- 2x 960 GB SSDs (894.3 GiB each; 1x Samsung, 1x Seagate Nytro)

## Filesystem

### Diagram

```mermaid
---
config:
  theme: dark
---

graph LR

subgraph Devices[Devices]
    direction TB
    subgraph HDDs[HDDs]
        HDD0 & HDD1 & HDD2 & HDD3 & HDD4 & HDD5 & HDD6 & HDD7 & HDD8 & HDD9
    end
    subgraph NVMEs[NVMe Drives]
        NVME0 --> nvme0n1["ns1"]
        NVME1 --> nvme1n1["ns1"] & nvme1n2["ns2"] & nvme1n3["ns3"]
    end
end

subgraph Partitions[Partitions]
    direction TB
    subgraph HDD_Partitions[ ]
        space_hdd_0_p & space_hdd_1_p & space_hdd_2_p & space_hdd_3_p & space_hdd_4_p & space_hdd_5_p & space_hdd_6_p & space_hdd_7_p & space_hdd_8_p & space_hdd_9_p
    end
    subgraph NVME_Partitions[ ]
        subgraph NVME0_Partitions["nvme0n1 Partitions"]
            p1["/boot"] & p2["crypted_nvme0_cache"] & p3["crypted_nvme0_special"] & p4["crypted_nvme0_log"]
        end
        nvme0n1 --> NVME0_Partitions
        subgraph NVME1_Partitions[ ]
            space_nvme1p1 & space_nvme1p2 & space_nvme1p3
        end
    end
end

subgraph LUKS[LUKS]
    space_boot[ ]
    crypt_hdd0 & crypt_hdd1 & crypt_hdd2 & crypt_hdd3 & crypt_hdd4 & crypt_hdd5 & crypt_hdd6 & crypt_hdd7 & crypt_hdd8 & crypt_hdd9
    subgraph NVME0_LUKS[ ]
        crypt_nvme0_cache & crypt_nvme0_special & crypt_nvme0_log
    end
    subgraph NVME1_LUKS[ ]
        crypt_nvme1_cache & crypt_nvme1_special & crypt_nvme1_log
    end
end

subgraph ZFS_Pools[ZFS Pools]
    subgraph vault[vault]
        zdata["data (raidz2)"] & zcache["cache"] & zspecial["special (mirror)"] & zlog["log (mirror)"]
    end
end

subgraph Filesystem[Filesystem]
    fs_root["/"] & fs_docker["/docker"] & fs_home["/home"] & fs_nix["/nix"] & fs_var["/var"] & fs_boot["/boot"]
    p1 ---x space_boot --> fs_boot
    vault --> fs_root & fs_docker & fs_home & fs_nix & fs_var
end

HDD0 ---x space_hdd_0_p[ ] --> crypt_hdd0
HDD1 ---x space_hdd_1_p[ ] --> crypt_hdd1
HDD2 ---x space_hdd_2_p[ ] --> crypt_hdd2
HDD3 ---x space_hdd_3_p[ ] --> crypt_hdd3
HDD4 ---x space_hdd_4_p[ ] --> crypt_hdd4
HDD5 ---x space_hdd_5_p[ ] --> crypt_hdd5
HDD6 ---x space_hdd_6_p[ ] --> crypt_hdd6
HDD7 ---x space_hdd_7_p[ ] --> crypt_hdd7
HDD8 ---x space_hdd_8_p[ ] --> crypt_hdd8
HDD9 ---x space_hdd_9_p[ ] --> crypt_hdd9

p2 ---> crypt_nvme0_cache
p3 ---> crypt_nvme0_special
p4 ---> crypt_nvme0_log
nvme1n1 --x space_nvme1p1[ ] --> crypt_nvme1_cache
nvme1n2 --x space_nvme1p2[ ] --> crypt_nvme1_special
nvme1n3 --x space_nvme1p3[ ] --> crypt_nvme1_log

crypt_hdd0 & crypt_hdd1 & crypt_hdd2 & crypt_hdd3 & crypt_hdd4 & crypt_hdd5 & crypt_hdd6 & crypt_hdd7 & crypt_hdd8 & crypt_hdd9 ---o merge_data[ ] --> zdata
crypt_nvme0_cache & crypt_nvme1_cache ---o merge_cache[ ] --> zcache
crypt_nvme0_special & crypt_nvme1_special ---o merge_special[ ] --> zspecial
crypt_nvme0_log & crypt_nvme1_log ---o merge_log[ ] --> zlog

classDef header fill:#2C3E50,color:#ECF0F1,stroke:none
classDef device fill:#34495E,color:#ECF0F1,stroke:none
classDef nvme fill:#2980B9,color:#ECF0F1,stroke:none
classDef partition fill:#3498DB,color:#ECF0F1,stroke:none
classDef luks fill:#1ABC9C,color:#ECF0F1,stroke:none
classDef zfs fill:#16A085,color:#ECF0F1
classDef pool fill:#303030,color:#ECF0F1
classDef filesystem fill:#2E8B57,color:#ECF0F1,stroke:none
classDef spacer fill:none,stroke:none
classDef space fill:none,stroke:none,height:0

%%classDef veryhigh height:4600
%%class zdata,fs_root veryhigh

class vault pool
class zdata,zcache,zspecial,zlog zfs
class crypt_hdd0,crypt_hdd1,crypt_hdd2,crypt_hdd3,crypt_hdd4,crypt_hdd5,crypt_hdd6,crypt_hdd7,crypt_hdd8,crypt_hdd9,crypt_nvme0_cache,crypt_nvme0_special,crypt_nvme0_log,crypt_nvme1_cache,crypt_nvme1_special,crypt_nvme1_log luks
class p1,p2,p3,p4 partition
class nvme0n1,nvme1n1,nvme1n2,nvme1n3 nvme
class HDD0,HDD1,HDD2,HDD3,HDD4,HDD5,HDD6,HDD7,HDD8,HDD9,NVME0,NVME1 device
class fs_boot,fs_docker,fs_home,fs_nix,fs_root,fs_var filesystem
class HDD_Partitions,NVME_Partitions,NVME1_Partitions space
class space_boot,space_hdd_0_p,space_hdd_1_p,space_hdd_2_p,space_hdd_3_p,space_hdd_4_p,space_hdd_5_p,space_hdd_6_p,space_hdd_7_p,space_hdd_8_p,space_hdd_9_p,space_nvme1p1,space_nvme1p2,space_nvme1p3 space
class merge_data,merge_cache,merge_special,merge_log,merge_pool space
```

### Partitions

#### HDDs

No partitions on the HDDs; ZFS gets the full disks.

#### SSD0

| part | size     | type | use                               |
|------|----------|------|-----------------------------------|
| 1    | 1 GiB    | efi  | boot                              |
| 2    | 512 GiB  | luks | ZFS cache (L2ARC)                 |
| 3    | 256 GiB  | luks | ZFS special                       |
| 4    | 32 GiB   | luks | ZFS SLOG (ZIL)                    |
|      | 93.3 GiB | free | over-provisioning for performance |

#### SSD1

Instead of partitions, use NVMe namespaces with the same sizes as the ZFS partitions on SSD0 (no boot partition needed here); see the sketch below.
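The namespaces could be carved with nvme-cli along these lines. This is an untested sketch, not part of the original setup: the block counts assume a 4096-byte LBA format, the `--flbas` index must be checked against this drive with `nvme id-ns`, and namespace ids are assumed to be handed out sequentially.

``` fish
# Hedged sketch: three namespaces on nvme1 matching the SSD0 ZFS partitions
# (512 GiB cache, 256 GiB special, 32 GiB log), sized in 4 KiB LBA blocks.
# --flbas=0 is a placeholder; pick the 4K format index from `nvme id-ns`.
set ctrl (nvme id-ctrl /dev/nvme1 | string match -r 'cntlid\s*:\s*(\d+)')[2]

for tuple in "1 134217728" "2 67108864" "3 8388608"
    set split (string split " " $tuple)
    # assumes the new namespace gets nsid $split[1] (1, 2, 3 in order)
    nvme create-ns /dev/nvme1 --nsze=$split[2] --ncap=$split[2] --flbas=0
    nvme attach-ns /dev/nvme1 --namespace-id=$split[1] --controllers=$ctrl
end
```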
## Cryptsetup

All ZFS-backing devices get the same LUKS2 format, so one loop covers the HDDs (whole disks), the ZFS partitions on nvme0, and the namespaces on nvme1:

``` fish
echo -n 'passphrase' > /tmp/passphrase.key

# HDDs (sda..sdj, whole devices), nvme0 ZFS partitions, nvme1 namespaces
for dev in /dev/sd? /dev/nvme0n1p2 /dev/nvme0n1p3 /dev/nvme0n1p4 /dev/nvme1n?
    echo $dev
    cryptsetup luksFormat \
        --sector-size 4096 \
        --type luks2 \
        --hash sha512 \
        --cipher aes-xts-plain64 \
        --key-size 512 \
        --key-file /tmp/passphrase.key \
        --batch-mode \
        $dev
end
# remember to delete /tmp/passphrase.key once setup is done
```

## LuksOpen

``` fish
# SSDs: bypass the dm-crypt workqueues and allow discards (both sensible on
# flash); --persistent stores these flags in the LUKS2 header so they apply
# on every later open
for tuple in \
        "/dev/nvme0n1p2 crypt_ssd0_cache" \
        "/dev/nvme0n1p3 crypt_ssd0_special" \
        "/dev/nvme0n1p4 crypt_ssd0_log" \
        "/dev/nvme1n1 crypt_ssd1_cache" \
        "/dev/nvme1n2 crypt_ssd1_special" \
        "/dev/nvme1n3 crypt_ssd1_log"
    set split (string split " " $tuple)
    set disk $split[1]
    set name $split[2]
    echo luksOpen $disk @ $name
    cryptsetup luksOpen \
        --key-file=/tmp/passphrase.key \
        --perf-no_read_workqueue \
        --perf-no_write_workqueue \
        --allow-discards \
        --persistent \
        $disk $name
end

# HDDs: sda..sdj map to crypt_hdd0..crypt_hdd9
for index in (seq 0 9)
    set disk /dev/sd(echo $index | tr "0-9" "a-j")
    set name crypt_hdd$index
    echo luksOpen $disk @ $name
    cryptsetup luksOpen --key-file=/tmp/passphrase.key --persistent $disk $name
end
```

## Undo

Close the LUKS mappings:

``` fish
for path in /dev/mapper/crypt*
    cryptsetup close $path
end
```

## ZFS Mod params

Parameter list:

``` toml
l2arc_exclude_special=1        # cache and special live on the same SSDs, so duplicating special-vdev data into L2ARC is useless here
zfs_dirty_data_max=17179869184 # 16 GiB (default is 10% of RAM, i.e. 12.8 GB on this machine)
zfs_txg_timeout=60             # default is 5 s; a longer txg interval means less fragmentation
l2arc_write_boost=67108864     # 64 MiB
l2arc_write_max=16777216       # 16 MiB (default 8 MiB)
```
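These can be tried out at runtime before being persisted: ZFS module parameters are writable under `/sys/module/zfs/parameters` while the module is loaded. A minimal sketch (run as root; changes are lost on reboot):

``` fish
# apply the parameters from the list above at runtime; persist them via
# the modprobe options below once they look right
for kv in l2arc_exclude_special=1 zfs_dirty_data_max=17179869184 \
        zfs_txg_timeout=60 l2arc_write_boost=67108864 l2arc_write_max=16777216
    set pair (string split "=" $kv)
    echo $pair[2] > /sys/module/zfs/parameters/$pair[1]
end
```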
NixOS config format for the module parameters:

``` nix
# boot.zfs.enableUnstable = true;
boot.extraModprobeConfig = ''
  options zfs l2arc_exclude_special=1 zfs_dirty_data_max=17179869184 zfs_txg_timeout=60 l2arc_write_boost=67108864 l2arc_write_max=16777216
'';
```

## Zpool

Flag rationale (kept out of the command itself, since inline comments after line-continuation backslashes would break it):

- `-n` — dry run; remove after checking the layout
- `-f` — required because zpool otherwise refuses to mix mirrors and raidz in the same pool
- `-o ashift=12` — force 4 KiB sectors, since 11 of the 12 drives use them
- `-o autoreplace=on` — after Hetzner swaps a failed drive, everything else happens automagically
- `-o autotrim=on` — TRIM; only effective because discards are allowed on the LUKS layer (see LuksOpen)
- `-o feature@lz4_compress=enabled` — fast compression
- `-o feature@large_blocks=enabled` — allow records larger than the usual maximum
- `-O recordsize=1M` — bigger records than the 128 KiB default; database datasets should override this (16K for MySQL; 8K for PostgreSQL, though 16K seems to help sequential scans) — see the dataset sketch below
- `-O compression=on` — defaults to the best available algorithm, so it picks up improvements in newer releases
- `-O atime=on` with `-O relatime=on` — access times are useful, but full atime is slow, so update them lazily

``` fish
zpool create -n \
    -f \
    -o ashift=12 \
    -o autoreplace=on \
    -o autotrim=on \
    -o feature@lz4_compress=enabled \
    -o feature@large_blocks=enabled \
    -O recordsize=1M \
    -O compression=on \
    -O atime=on \
    -O relatime=on \
    vault \
    raidz2 \
    /dev/mapper/crypt_hdd0 \
    /dev/mapper/crypt_hdd1 \
    /dev/mapper/crypt_hdd2 \
    /dev/mapper/crypt_hdd3 \
    /dev/mapper/crypt_hdd4 \
    /dev/mapper/crypt_hdd5 \
    /dev/mapper/crypt_hdd6 \
    /dev/mapper/crypt_hdd7 \
    /dev/mapper/crypt_hdd8 \
    /dev/mapper/crypt_hdd9 \
    cache \
    /dev/mapper/crypt_ssd0_cache \
    /dev/mapper/crypt_ssd1_cache \
    special \
    mirror \
    /dev/mapper/crypt_ssd0_special \
    /dev/mapper/crypt_ssd1_special \
    log \
    mirror \
    /dev/mapper/crypt_ssd0_log \
    /dev/mapper/crypt_ssd1_log
```
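A hypothetical dataset layout to go with the pool, matching the filesystems in the diagram. The dataset names and `mountpoint=legacy` (common on NixOS, where mounts are declared in `fileSystems`) are assumptions, not from the original setup; the recordsize override follows the database notes above.

``` fish
# illustrative only: dataset names taken from the filesystem diagram
zfs create -o mountpoint=legacy vault/root
zfs create -o mountpoint=legacy vault/nix
zfs create -o mountpoint=legacy vault/var
zfs create -o mountpoint=legacy vault/home
# databases under /docker want smaller records than the 1M pool default
# (16K for MySQL; 8K or 16K for PostgreSQL, per the notes above)
zfs create -o mountpoint=legacy -o recordsize=16K vault/docker
```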