Commit 05c81fc7 authored by Ryan Jones's avatar Ryan Jones
Browse files

updated files

parent 5c5745b2
Simple playbooks to install OpenHPC version 1.0 using Ansible.
See the doc/README.md for a tutorial on using these scripts in a VirtualBox environment.
The Ansible layout is fairly simple, using a series of roles for different parts of the installation process.
This repo will get you to the point of a working slurm installation across your cluster. It does not
currently provide any scientific software or user management options!
The basic usage is to set up the master node with the first three roles (pre\_ohpc, ohpc\_install, ohpc\_config),
then use the remaining roles to build node images and deploy the actual nodes (these use Warewulf as a provisioner by default).
Trigger the roles individually via tags, like:
```
ansible-playbook -t pre_ohpc -i inventory/headnode headnode.yml
```
None of these Ansible roles actually touch the compute nodes directly - at most, they build a new vnfs image and
trigger a reboot.
A more detailed description is available in the /doc folder.
# Ansible configuration for this repository.
[defaults]
# Do not leave *.retry files behind after a failed play.
retry_files_enabled = False
# Inventory used when no -i option is given on the command line.
inventory = ./inventory/headnode
[ssh_connection]
# "%%" is an escaped literal "%" in this file, so the resulting
# socket path template is ~/.ssh/ansible-%r@%h:%p.
control_path = ~/.ssh/ansible-%%r@%%h:%%p
This diff is collapsed.
---
# OpenHPC release RPM that is installed on the head node.
openhpc_release_rpm: 'https://github.com/openhpc/ohpc/releases/download/v1.3.GA/ohpc-release-1.3-1.el7.x86_64.rpm'
# The full list of available versions for CentOS can be generated via
#   curl -s https://github.com/openhpc/ohpc/releases/ | grep rpm | grep -v sle | grep -v strong | sed 's/.*="\(.*\)".*".*".*/\1/'

# Head node info
public_interface: 'eth0'  # NIC that allows access to the public internet
private_interface: 'eth1'  # NIC that allows access to compute nodes
headnode_private_ip: '10.1.1.1'
build_kernel_ver: '3.10.0-957.1.3.el7.x86_64'  # `uname -r` at build time... for wwbootstrap

# Private network info
private_network: '10.1.1.0'
private_network_mask: '24'
private_network_long_netmask: '255.255.255.0'
compute_ip_minimum: '10.1.1.2'
# NOTE(review): 10.1.1.255 is the broadcast address of 10.1.1.0/24;
# confirm that whatever consumes this bound treats it as exclusive.
compute_ip_maximum: '10.1.1.255'
gpu_ip_minimum: '10.1.1.128'  # This could be more clever, like compute_ip_minimum + num_nodes...

# slurm.conf variables
cluster_name: 'ohpc'
# gres_types: "gpu"

# sacct user list - one username per list entry
cluster_users:
  - vagrant

# Stateful compute or not?
stateful_nodes: false

# Node config vars - partition layout used for stateful nodes
sda1: 'mountpoint=/boot:dev=sda1:type=ext3:size=500'
sda2: 'dev=sda2:type=swap:size=500'
sda3: 'mountpoint=/:dev=sda3:type=ext3:size=fill'
# GPU node vars
# Download the nvidia cuda installer, and run with only
# --extract=$path_to_CRI_XCBC/roles/gpu_build_vnfs/files to get these three installers.
nvidia_driver_installer: 'NVIDIA-Linux-x86_64-387.26.run'
cuda_toolkit_installer: 'cuda-linux.9.1.85-23083092.run'
cuda_samples_installer: 'cuda-samples.9.1.85-23083092-linux.run'

# Warewulf template names for wwmkchroot
template_path: '/usr/libexec/warewulf/wwmkchroot/'
compute_template: 'compute-nodes'
gpu_template: 'gpu-nodes'
login_template: 'login-nodes'

# Chroot variables: image name first, then the directory derived from it
compute_chroot: 'centos7-compute'
compute_chroot_loc: '/opt/ohpc/admin/images/{{ compute_chroot }}'
gpu_chroot: 'centos7-gpu'
gpu_chroot_loc: '/opt/ohpc/admin/images/{{ gpu_chroot }}'
login_chroot: 'centos7-login'
login_chroot_loc: '/opt/ohpc/admin/images/{{ login_chroot }}'

# Node inventory method - automatic (true) or manual (false)
node_inventory_auto: true

# Node naming variables - no need to change
compute_node_prefix: 'c'
num_compute_nodes: 1
gpu_node_prefix: 'gpu-compute-'
num_gpu_nodes: 1
login_node_prefix: 'login-'
num_login_nodes: 0
# Open OnDemand
ood_nodename: "ood"
# NOTE(review): this is parsed as a float; a version such as 1.10 would
# silently become 1.1. Quote it once all consumers are known to accept a string.
ood_version: 1.5
ood_ip_addr: "10.1.1.254"
ood_rpm_repo: "https://yum.osc.edu/ondemand/{{ ood_version }}/ondemand-release-web-{{ ood_version }}-1.el7.noarch.rpm"

# Node inventory - not in the Ansible inventory sense! Just for WW and Slurm config.
# Someday I will need a role that can run wwnodescan, and add nodes to this file! Probably horrifying practice.
# There is a real difference between building from scratch, and using these for maintenance / node addition!
#
# Each entry describes one node: its name, the vnfs image it uses,
# its CPU topology, and the MAC/IP it is registered with.
compute_private_nic: "eth0"

compute_nodes:
  - name: "compute-1"
    vnfs: "{{ compute_chroot }}"
    cpus: 1
    sockets: 1
    corespersocket: 1
    mac: "08:00:27:EC:E2:FF"
    ip: "10.0.0.254"

login_nodes:
  - name: "login-1"
    vnfs: "{{ login_chroot }}"
    cpus: 8
    sockets: 2
    corespersocket: 4
    mac: "00:26:b9:2e:21:ed"
    ip: "10.2.255.137"

gpu_nodes:
  - name: "gpu-compute-1"
    vnfs: "{{ gpu_chroot }}"
    gpus: 4
    gpu_type: "gtx_TitanX"
    cpus: 16
    sockets: 2
    corespersocket: 8
    mac: "0c:c4:7a:6e:9d:6e"
    ip: "10.2.255.47"

# The mac and ip values below are placeholders and must be replaced
# before this entry is used.
viz_nodes:
  - name: "viz-node-0-0"
    # Templated like the other node lists; the bare word gpu_chroot
    # would be taken as a literal image name.
    vnfs: "{{ gpu_chroot }}"
    gpus: 2
    gpu_type: "nvidia_gtx_780"
    cpus: 8
    sockets: 2
    corespersocket: 4
    mac: "foo"
    ip: "bar"
# Slurm accounting variables - little need to change these
slurm_acct_db: "slurmdb"
slurmdb_storage_port: "7031"
slurmdb_port: "1234"
# NOTE(review): plaintext default password kept in version control;
# override it per site (for example with ansible-vault).
slurmdb_sql_pass: "password"  # could force this to be a hash...
slurmdb_sql_user: "slurm"

# Automatic variables for internal use.
# Don't edit these!
# The *_glob values use bracket range syntax (prefix[0-N]); the *_bash
# values use bash brace expansion (prefix{0..N}). N is the node count minus one.
compute_node_glob: "{{ compute_node_prefix }}[0-{{ num_compute_nodes|int - 1}}]"
gpu_node_glob: "{{ gpu_node_prefix }}[0-{{ num_gpu_nodes|int - 1}}]"
node_glob_bash: "{{ compute_node_prefix }}{0..{{ num_compute_nodes|int - 1}}}"
# Built from the GPU prefix and GPU node count so that it names GPU
# nodes rather than repeating the compute node list.
gpu_node_glob_bash: "{{ gpu_node_prefix }}{0..{{ num_gpu_nodes|int - 1}}}"
# Jupyter
jupyter_provision: false

# EasyBuild
cluster_shared_folder: '/export'
easybuild_prefix: '{{ cluster_shared_folder }}/eb'
easybuild_tmpdir: '/tmp'
easybuild_buildpath: '/tmp/build'
easybuild_sourcepath: '/tmp/source'

# MATLAB install
matlab_provision: false
matlab_download_url: 'https://uab.box.com/shared/static/y01qu7oo1gpne6j2s6nqwcuee63epivo.gz'
matlab_clustershare: '/opt/ohpc/pub/apps/matlab/'
matlab_destination: '/tmp/matlab.tar.gz'
# MATLAB module file vars
matlab_install_root: '/opt/ohpc/pub-master/apps/matlab/M2/'
matlab_docs_url: 'http://{{ ood_nodename }}'
matlab_license_file: '{{ matlab_install_root }}/licenses/licenses.lic'
matlab_module_path: '{{ easybuild_prefix }}/modules/all'
matlab_module_appdir: 'matlab'
matlab_module_file: 'r2018a'
matlab_ver: '{{ matlab_module_file }}'

# SAS install
sas_provision: false
sas_clustershare: '/export/apps/sas/'
sas_module_path: '{{ easybuild_prefix }}/modules/all'
sas_module_appdir: 'sas'
sas_module_file: '9.4'
sas_ver: '{{ sas_module_file }}'

# RStudio
rstudio_provision: false
singularity_ver: "2.4.2"
# R releases, each given as a full version and its major.minor short form
r_versions:
  - full: "3.5.1"
    short: "3.5"
  - full: "3.4.4"
    short: "3.4"
# Copr repos: each entry names a repo and lists the hosts it applies to
enable_copr: true
copr_repos:
  - repo_name: 'louistw/mod_wsgi-3.4-18-httpd24'
    host:
      - '{{ ood_nodename }}'
  - repo_name: 'louistw/slurm-17.11.11-ohpc-1.3.6'
    host:
      - '{{ cluster_name }}'
      - '{{ ood_nodename }}'
  - repo_name: 'atlurie/shibboleth-3.0-ood'
    host:
      - '{{ ood_nodename }}'

# Shibboleth SSO
enable_shib: false

# User registration
enable_user_reg: false
user_register_app: 'flask_user_reg'
user_register_app_path: '/var/www/ood/register/{{ user_register_app }}'
user_register_app_repo: 'https://gitlab.rc.uab.edu/mmoo97/flask_user_reg.git'
mod_wsgi_pkg_name: 'uab-httpd24-mod_wsgi'
RegUser_app_user: 'reggie'
RegUser_app_user_full_name: 'RegUser of user register app'
# NOTE(review): plaintext default password kept in version control.
RegUser_app_user_passwd: 'qweasd'

# User create scripts
enable_user_create_scripts: false
user_create_scripts: 'ohpc_user_create'
user_create_scripts_path: '/opt/{{ user_create_scripts }}'
user_create_script_repo: 'https://gitlab.rc.uab.edu/tr27p/ohpc_user_create.git'
# Inventory group holding the single head node. It is reached over SSH
# as root at the address given by the headnode_private_ip variable.
[headnode]
headnode ansible_host="{{ headnode_private_ip }}" ansible_connection=ssh ansible_ssh_user=root
......@@ -99,4 +99,4 @@
# vars:
# - compute_node_glob: "{{ compute_node_prefix }}[0-{{ num_compute_nodes|int - 1}}]"
# - node_glob_bash: "{{ compute_node_prefix }}{0..{{ num_compute_nodes|int - 1}}}"
# - last_node: "{{ node_prefix }}{{ num_nodes|int - 1 }}"
\ No newline at end of file
# - last_node: "{{ node_prefix }}{{ num_nodes|int - 1 }}"
---
# Sanity checks that run before the compute image is built: the running
# kernel must be the most recently installed one and must match
# build_kernel_ver.

# - name: fix broken wwmkchroot file
#   lineinfile:
#     dest: /usr/libexec/warewulf/wwmkchroot/centos-7.tmpl
#     regexp: "^YUM_MIRROR(.*)7.2.1511(.*)"
#     line: 'YUM_MIRROR\g<1>7\g<2>' # use \g<1> for backref followed by digit!
#     backrefs: yes
#

# Both probes below only read state, so they never report "changed".
- name: check current kernel version
  # Strips the trailing architecture suffix from `uname -r`.
  shell: uname -r | sed "s/.$(uname -m)//"
  register: running_kernel_version
  changed_when: false

- name: check most recent installed kernel version
  shell: yum list installed | grep 'kernel\.' | tail -n 1 | awk '{print $2}'
  register: installed_kernel_version
  changed_when: false

- name: fail if the newest installed kernel is not the running kernel
  fail:
    msg: "Most recently installed kernel is not currently loaded version! Consider rebooting before building the vnfs"
  when: running_kernel_version.stdout != installed_kernel_version.stdout

- name: fail if the running kernel does not match build_kernel_ver
  fail:
    msg: "Loaded kernel does not match the build_kernel_ver in group_vars/all"
  # Substring test: the running version has no architecture suffix,
  # while build_kernel_ver includes one.
  when: running_kernel_version.stdout not in build_kernel_ver
# Rebuild the compute chroot from scratch: drop any previous image,
# install the wwmkchroot templates, create the chroot and install the
# compute node packages into it.
- name: remove old vnfs if it exists
  file:
    path: "{{ compute_chroot_loc }}"
    state: absent

- name: install compute node wwmkchroot template
  template:
    src: compute_template.j2
    dest: "{{ template_path }}{{ compute_template }}.tmpl"

- name: install extended compute package list
  template:
    src: extend_compute_packages.j2
    dest: "{{ template_path }}extend_compute_packages"

- name: install base package list
  template:
    src: base_packages.j2
    dest: "{{ template_path }}base_packages"

- name: make chroot
  command: wwmkchroot "{{ compute_template }}" "{{ compute_chroot_loc }}"

- name: copy resolv.conf into image
  copy:
    src: /etc/resolv.conf
    dest: "{{ compute_chroot_loc }}/etc/resolv.conf"
    # Take the file from the managed host rather than from the machine
    # running ansible-playbook.
    remote_src: true

- name: yum install into the image chroot
  yum:
    state: present
    installroot: "{{ compute_chroot_loc }}"
    name:
      - chrony
      # Same kernel version as the one currently running on this host.
      - 'kernel-{{ running_kernel_version.stdout }}'
      - lmod-ohpc
      - grub2
      - freeipmi
      - ipmitool
      - ohpc-slurm-client
      - ohpc-base-compute
      - tmux
      - ruby
      - turbojpeg
      - nc
      - '@X Window System'
      - '@Xfce'
# one method to install TurboVNC
- name: download TurboVNC rpm
  get_url:
    url: https://sourceforge.net/projects/turbovnc/files/2.2/turbovnc-2.2.x86_64.rpm
    dest: /var/tmp/turbovnc-2.2.x86_64.rpm
    # NOTE(review): md5 only guards against corruption; prefer a sha256
    # checksum once one has been verified for this file.
    checksum: md5:25711ad32bfae63031aff20528d4af79

- name: install TurboVNC via rpm into chroot image
  yum:
    name: /var/tmp/turbovnc-2.2.x86_64.rpm
    state: present
    installroot: "{{ compute_chroot_loc }}"

# Another method to install TurboVNC, tested
# All information comes from TurboVNC official website:
# https://turbovnc.org/pmwiki/uploads/Downloads/TurboVNC.repo
# - name: add TurboVNC repo into yum inside compute node image
#   yum_repository:
#     name: TurboVNC
#     description: TurboVNC official RPMs
#     baseurl: https://sourceforge.net/projects/turbovnc/files
#     gpgcheck: yes
#     gpgkey: http://pool.sks-keyservers.net/pks/lookup?op=get&search=0x6BBEFA1972FEB9CE
#     exclude: 'turbovnc-*.*.9[0-9]-*' # exclude beta releases
#     reposdir: "{{ compute_chroot_loc }}/etc/yum.repos.d"
#
# - name: install TurboVNC via yum into chroot image
#   yum:
#     name: turbovnc
#     state: present
#     installroot: "{{ compute_chroot_loc }}"

- name: download Websockify source code
  get_url:
    url: https://github.com/novnc/websockify/archive/v0.8.0.tar.gz
    dest: /var/tmp/websockify-0.8.0.tar.gz

- name: extract Websockify source code into chroot env
  unarchive:
    src: /var/tmp/websockify-0.8.0.tar.gz
    dest: '{{ compute_chroot_loc }}/tmp'
    # The tarball was downloaded onto the managed host by the previous
    # task, so it must be read from there.
    remote_src: true

- name: install Websockify inside chroot env
  # "&&" stops the install from running in the wrong directory if the
  # cd fails.
  command: "chroot {{ compute_chroot_loc }} /bin/bash -c 'cd /tmp/websockify-0.8.0 && python setup.py install'"
# After we installed Xfce, the compute node is set to bootup in graphical mode.
# This task is to unset that back to multi-user mode.
- name: set compute node to boot with multi-user mode
  command: chroot '{{ compute_chroot_loc }}' systemctl set-default multi-user.target

# NFS mounts served by the head node, appended to the image's fstab.
- name: put NFS home mount info in image
  lineinfile:
    dest: "{{ compute_chroot_loc }}/etc/fstab"
    line: "{{ headnode_private_ip }}:/home /home nfs nfsvers=3,rsize=1024,wsize=1024,cto 0 0"
    state: present

- name: put NFS opt mount info in image
  lineinfile:
    dest: "{{ compute_chroot_loc }}/etc/fstab"
    line: "{{ headnode_private_ip }}:/opt/ohpc/pub /opt/ohpc/pub-master nfs nfsvers=3 0 0"
    state: present

# Named for the /export share it mounts, so each fstab task has a
# distinct name in the play output.
- name: put NFS export mount info in image
  lineinfile:
    dest: "{{ compute_chroot_loc }}/etc/fstab"
    line: "{{ headnode_private_ip }}:/export /export nfs nfsvers=3 0 0"
    state: present

# Service state inside the image is changed by running systemctl in the chroot.
- name: firewalld on compute image disabled
  command: chroot '{{ compute_chroot_loc }}' systemctl disable firewalld

- name: chronyd on compute image enabled
  command: chroot '{{ compute_chroot_loc }}' systemctl enable chronyd

- name: add headnode to compute chrony.conf
  lineinfile:
    dest: "{{ compute_chroot_loc }}/etc/chrony.conf"
    line: "server {{ headnode_private_ip }}"
    state: present

- name: slurmd on compute image enabled
  command: chroot '{{ compute_chroot_loc }}' systemctl enable slurmd
# Register the files Warewulf will hand out to nodes.
- name: wwimport file into image (passwd)
  command: wwsh file import /etc/passwd

- name: wwimport file into image (group)
  command: wwsh file import /etc/group

- name: wwimport file into image (shadow)
  command: wwsh file import /etc/shadow

- name: wwimport file into image (slurm)
  command: wwsh file import /etc/slurm/slurm.conf --name slurm.conf

- name: wwimport file into image (munge)
  command: wwsh file import /etc/munge/munge.key

- name: wwimport file into image (lmod.sh)
  command: wwsh file import /etc/profile.d/lmod.sh

- name: wwimport file into image (lmod.csh)
  command: wwsh file import /etc/profile.d/lmod.csh

# Build the bootstrap for build_kernel_ver and the vnfs from the chroot.
- name: build bootstrap image
  shell: wwbootstrap {{ build_kernel_ver }}

- name: build the vnfs
  command: wwvnfs -y --chroot "{{ compute_chroot_loc }}/"

# Generate a one-line network file and import it under the name
# "network", with /etc/sysconfig/network as its path.
- name: set up provisioning interface
  lineinfile:
    dest: /tmp/network.ww
    line: "GATEWAYDEV={{ private_interface }}"
    create: yes

- name: add network file to import
  command: wwsh -y file import /tmp/network.ww --name network

- name: set network file path
  command: wwsh -y file set network --path /etc/sysconfig/network --mode=0644 --uid=0
# Base package list, exposed as the space-separated shell variable
# PKGLIST for whichever script sources this file.
# NOTE(review): util-linux and shadow-utils each appear twice below.
PKGLIST="basesystem bash redhat-release chkconfig coreutils e2fsprogs \
ethtool filesystem findutils gawk grep initscripts iproute iputils \
mingetty mktemp net-tools nfs-utils pam portmap procps psmisc rdate rsync \
sed setup shadow-utils rsyslog tcp_wrappers tzdata util-linux words zlib \
tar less gzip which util-linux module-init-tools udev openssh-clients \
openssh-server dhclient pciutils vim-minimal shadow-utils strace cronie \
crontabs cpuspeed cpufrequtils cpio wget yum numactl libicu"
#DESC: A clone of Red Hat Enterprise Linux 7
# The general RHEL include has all of the necessary functions, but requires
# some basic variables specific to each chroot type to be defined.
# with additional procedure to add packages from variable EXTEND_COMPUTE
. include-rhel-xcbc
# Define the location of the YUM repository
# The "$" is escaped so the literal text $basearch is stored in
# YUM_MIRROR instead of being expanded by this shell.
# YUM_MIRROR="http://mirror.centos.org/centos-7/7/os/\$basearch/"
YUM_MIRROR="http://mirror.centos.org/centos-7/7/os/\$basearch/"
# Include the basic packages
. base_packages
# Additional packages to get closer to the definition of compute node I had in rocks.
. extend_compute_packages
# NOTE(review): because of the quotes this array has a single element
# holding the whole package string; confirm include-rhel-xcbc expects that.
ADDITIONALPACKAGES=( "$EXTEND_COMPUTE" )
# vim:filetype=sh:syntax=sh:expandtab:ts=4:sw=4:
# Extra packages for compute nodes, exposed as the space-separated shell
# variable EXTEND_COMPUTE for whichever script sources this file.
# Every continuation line must keep a space before the trailing
# backslash; without it the last package on one line is fused with the
# first package on the next.
EXTEND_COMPUTE="PyPAM abrt-addon-ccpp abrt-addon-kerneloops abrt-addon-python abrt-cli \
abrt-python aide alsa-utils atlas atlas-sse3 audispd-plugins augeas-libs \
authd biosdevname blktrace bridge-utils brltty cim-schema cpupowerutils \
crash-gcore-command crash-trace-command device-mapper-multipath device-mapper-persistent-data \
dstat dumpet edac-utils fftw fftw-devel fftw-static flex flex-devel fprintd-pam \
freeglut GConf2 gdb-gdbserver gdk-pixbuf2 glibc-utils glibc-devel.i686 gnuplot gsl gsl-devel \
hardlink hunspell i2c-tools iotop json-c lapack latencytop latencytop-tui latrace \
ledmon linuxptp lm_sensors lksctp-tools ltrace lvm2 memtest86+ ncurses-term numpy \
oprofile oprofile-jit papi perf powertop python-volume_key rfkill rsyslog-gnutls rsyslog-gssapi rsyslog-relp \
scipy scl-utils sdparm sg3_utils sox squashfs-tools star strace tboot \
trace-cmd udftools units uuidd valgrind vim-X11 vim-enhanced \
virt-what volume_key wodim x86info zsh SDL abrt abrt-libs abrt-tui audit autoconf automake \
blas dejavu-fonts-common dejavu-sans-fonts device-mapper-multipath-libs flac \
fontpackages-filesystem fprintd giflib gnuplot-common gsm jline jpackage-utils latencytop-common libcmpiCppImpl0 \
libao libasyncns libfprint libesmtp libjpeg-turbo-devel libIDL libproxy libproxy-bin libproxy-python \
librelp libreport libreport-cli libreport-compat libreport-filesystem libreport-plugin-kerneloops \
libreport-plugin-logger libreport-plugin-mailx libreport-plugin-reportuploader libreport-plugin-rhtsupport \
libreport-plugin-ureport libreport-python libsamplerate libsndfile \
libtar libXdmcp libxkbfile libxshmfence lvm2-libs numpy-f2py ORBit2 pulseaudio-libs pycairo \
python-argparse python-crypto python-dateutil python-matplotlib python-nose python-paramiko \
python-setuptools pytz qt-sqlite rhino satyr sg3_utils-libs sgml-common suitesparse theora-tools \
trousers tzdata-java vim-common vim-filesystem volume_key-libs wavpack xkeyboard-config xinetd xmlrpc-c \
xmlrpc-c-client xorg-x11-server-common xorg-x11-server-Xvfb xorg-x11-xkb-utils xterm libwsman1 \
net-snmp-utils openwsman-client openwsman-server perl-Compress-Raw-Zlib perl-Compress-Zlib perl-HTML-Parser \
perl-HTML-Tagset perl-IO-Compress-Base perl-IO-Compress-Zlib perl-libwww-perl perl-URI sblim-sfcb sblim-sfcc"
---
# - name: print single node info
#   debug:
#     var: item.mac
#   with_items: "{{ gpu_nodes }}"

# Manual inventory: every task in this block loops over the gpu_nodes
# list and runs only when node_inventory_auto is false.
- block:

    - name: add node to ww db
      command: wwsh -y node new {{ item.name }} --ipaddr={{ item.ip }} --hwaddr={{ item.mac }} -D {{ private_interface }}
      with_items: "{{ gpu_nodes }}"

    - name: blacklist nouveau on first boot
      command: wwsh -y object modify -s kargs='modprobe.blacklist=nouveau,quiet' -t node {{ item.name }}
      with_items: "{{ gpu_nodes }}"

    # Disk layout on sda, using the sda1/sda2/sda3 definitions from the vars.
    - name: set nodes bootloader
      command: wwsh -y object modify -s bootloader=sda -t node {{ item.name }}
      with_items: "{{ gpu_nodes }}"

    - name: set nodes partitions
      command: wwsh -y object modify -s diskpartition=sda -t node {{ item.name }}
      with_items: "{{ gpu_nodes }}"

    - name: format partitions
      command: wwsh -y object modify -s diskformat=sda1,sda2,sda3 -t node {{ item.name }}
      with_items: "{{ gpu_nodes }}"

    - name: define filesystems
      command: wwsh -y object modify -s filesystems="{{ sda1 }},{{ sda2 }},{{ sda3 }}" -t node {{ item.name }}
      with_items: "{{ gpu_nodes }}"

    # slurm.conf: drop any existing line for the node, then add a fresh
    # one ahead of the "# PARTITIONS" marker.
    - name: remove node from slurm.conf if it exists already # to avoid duplication!
      lineinfile:
        dest: /etc/slurm/slurm.conf
        regexp: "^NodeName={{ item.name }}"
        state: absent
      with_items: "{{ gpu_nodes }}"

    - name: add node to slurm.conf
      lineinfile:
        dest: /etc/slurm/slurm.conf
        line: "NodeName={{ item.name }} Gres=gpu:{{ item.gpu_type }}:{{ item.gpus }} Sockets={{ item.sockets }} CoresPerSocket={{ item.corespersocket }} State=UNKNOWN"
        insertbefore: "^# PARTITIONS"
        state: present
      with_items: "{{ gpu_nodes }}"

    # gres.conf: same remove-then-add pattern; device files are numbered
    # from 0 to gpus - 1.
    - name: remove node from gres.conf if it exists already # to avoid duplication!
      lineinfile:
        dest: /etc/slurm/gres.conf
        regexp: "^NodeName={{ item.name }}"
        state: absent
      with_items: "{{ gpu_nodes }}"

    - name: add node info to slurm/gres.conf
      lineinfile:
        dest: /etc/slurm/gres.conf
        line: "NodeName={{ item.name }} Name=gpu Type={{ item.gpu_type }} File=/dev/nvidia[0-{{ item.gpus - 1 }}]"
        insertafter: "^#######"
        state: present
      with_items: "{{ gpu_nodes }}"

  when: node_inventory_auto == false # END NON-AUTO-INVENTORY BLOCK
# Automatic inventory: nodes are discovered by wwnodescan as they boot
# and are then configured by name pattern. These tasks run only when
# node_inventory_auto is true.
- name: add nodes via wwnodescan - BOOT NODES NOW, IN ORDER
  # Uses the GPU image and the configured long netmask, in line with
  # the provisioning task below.
  shell: wwnodescan --ip={{ gpu_ip_minimum }} --netdev={{ private_interface }} --netmask={{ private_network_long_netmask }} --bootstrap={{ build_kernel_ver }} --vnfs={{ gpu_chroot }} {{ gpu_node_glob_bash }}
  when: node_inventory_auto == true

- name: blacklist nouveau on first boot
  # gpu_node_prefix is the variable defined in the group vars for GPU node names.
  command: wwsh -y object modify -s kargs='modprobe.blacklist=nouveau,quiet' -t node "{{ gpu_node_prefix }}*"
  when: node_inventory_auto == true

- name: set files to provision
  command: wwsh -y provision set {{ gpu_node_glob }} --vnfs={{ gpu_chroot }} --bootstrap={{ build_kernel_ver }} --files=passwd,group,shadow,munge.key,slurm.conf,dynamic_hosts,network,gres.conf
  when: node_inventory_auto == true
# Push the file definitions and restart DHCP.
- name: wwsh file sync
  command: wwsh file sync

- name: restart dhcp
  service:
    name: dhcpd
    state: restarted

# Exactly one of the next two tasks runs under automatic inventory,
# selected by stateful_nodes.
- name: update pxeconfig to let node boot from pxe
  command: wwsh -y object modify -D bootlocal -t node {{ gpu_node_glob }}
  when: stateful_nodes == false and node_inventory_auto == true

- name: update pxeconfig to let node boot from local disk
  command: wwsh -y object modify -s bootlocal=EXIT -t node {{ gpu_node_glob}}
  when: stateful_nodes == true and node_inventory_auto == true

# The task counts as failed unless the output mentions building either
# iPXE or Pxelinux.
- name: wwsh pxe update
  command: wwsh -v pxe update
  register: command_result
  failed_when: "'Building iPXE' not in command_result.stdout and 'Building Pxelinux' not in command_result.stdout"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment