Commit 50f8e5a4 authored by Angelina Elizabeth Uno-Antonison's avatar Angelina Elizabeth Uno-Antonison
Browse files

A Snapshot version of the CGDS manual of operations from June 19th, 2020.

# Development stage: runs MkDocs' built-in dev server against sources mounted
# at /mkdocs so edits on the host are picked up live.
FROM python:3.7-alpine AS development
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir mkdocs==1.0.4
# WORKDIR creates /mkdocs when it does not exist, so no separate mkdir is needed.
WORKDIR /mkdocs
VOLUME ["/mkdocs"]
# Documentation only: the dev server listens on 8000.
EXPOSE 8000
# MkDocs binds to 127.0.0.1:8000 by default, which is unreachable through a
# published container port; bind to all interfaces instead.
CMD ["mkdocs", "serve", "--dev-addr=0.0.0.0:8000"]
# Build stage: renders the static site into /mkdocs/site for the production
# stage to copy; nothing else from this stage reaches the final image.
FROM python:3.7-alpine AS build
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir mkdocs==1.0.4
COPY . /mkdocs
WORKDIR /mkdocs
RUN mkdocs build
# Production stage: serves the pre-built static site with nginx.
FROM nginx:1.17-alpine AS production
# Only the rendered site is copied from the `build` stage into nginx's web root.
COPY --from=build /mkdocs/site /usr/share/nginx/html
\ No newline at end of file
// Declarative pipeline: static analysis, image build, and (on `master` only)
// registry push and swarm deploy. Each stage, and the pipeline as a whole,
// reports success/failure with a curl POST carrying the GitLab token.
// NOTE(review): the curl target appears to be missing its URL prefix before
// ${GIT_COMMIT} in this view — confirm against the real Jenkinsfile.
// NOTE(review): the curl commands are double-quoted Groovy strings, so
// ${GITLAB_API_TOKEN} is interpolated by Groovy before the shell runs —
// consider single-quoted strings so the shell expands it instead.
pipeline {
    agent any

    environment {
        // Bound from the Jenkins credential with ID 'GitLabToken'.
        GITLAB_API_TOKEN = credentials('GitLabToken')
    }

    stages {
        stage('Static Analysis') {
            agent {
                // NOTE(review): the image name is empty in this view — confirm.
                docker { image '' }
            }
            steps {
                wrap([$class: 'AnsiColorBuildWrapper', 'colorMapName': 'XTerm']) {
                    // NOTE(review): the command path looks truncated — confirm.
                    sh '/bin/'
                }
            }
            post {
                success {
                    sh "curl --request POST --header \"PRIVATE-TOKEN: ${GITLAB_API_TOKEN}\" \"${GIT_COMMIT}?state=success&name=jenkins_static_analysis\""
                }
                failure {
                    sh "curl --request POST --header \"PRIVATE-TOKEN: ${GITLAB_API_TOKEN}\" \"${GIT_COMMIT}?state=failed&name=jenkins_static_analysis\""
                }
            }
        }

        stage('build') {
            steps {
                sh 'docker-compose -f docker-compose.production.yml build moo-production'
            }
            post {
                success {
                    sh "curl --request POST --header \"PRIVATE-TOKEN: ${GITLAB_API_TOKEN}\" \"${GIT_COMMIT}?state=success&name=jenkins_system_tests\""
                }
                failure {
                    sh "curl --request POST --header \"PRIVATE-TOKEN: ${GITLAB_API_TOKEN}\" \"${GIT_COMMIT}?state=failed&name=jenkins_system_tests\""
                }
            }
        }

        stage('Publish Updates') {
            when { branch 'master' }
            steps {
                // note -- using `weborg` until we have CGDS @TODO swap out
                // The token is piped via stdin rather than passed with -p, so it
                // does not appear in the docker command's argument list.
                sh 'printf "%s" "${GITLAB_API_TOKEN}" | docker login -u weborg --password-stdin'
                sh 'docker-compose -f docker-compose.production.yml push'
            }
            post {
                success {
                    sh "curl --request POST --header \"PRIVATE-TOKEN: ${GITLAB_API_TOKEN}\" \"${GIT_COMMIT}?state=success&name=updates_published\""
                }
                failure {
                    sh "curl --request POST --header \"PRIVATE-TOKEN: ${GITLAB_API_TOKEN}\" \"${GIT_COMMIT}?state=failed&name=updates_published\""
                }
            }
        }

        stage('swarm deploy') {
            when { branch 'master' }
            steps {
                sh 'docker stack deploy --prune --with-registry-auth --compose-file docker-compose.production.yml moo-prod'
            }
            post {
                success {
                    sh "curl --request POST --header \"PRIVATE-TOKEN: ${GITLAB_API_TOKEN}\" \"${GIT_COMMIT}?state=success&name=swarm_deployed\""
                }
                failure {
                    sh "curl --request POST --header \"PRIVATE-TOKEN: ${GITLAB_API_TOKEN}\" \"${GIT_COMMIT}?state=failed&name=swarm_deployed\""
                }
            }
        }
    }

    // Overall pipeline status, reported under the name `jenkins`.
    post {
        success {
            sh "curl --request POST --header \"PRIVATE-TOKEN: ${GITLAB_API_TOKEN}\" \"${GIT_COMMIT}?state=success&name=jenkins\""
        }
        failure {
            sh "curl --request POST --header \"PRIVATE-TOKEN: ${GITLAB_API_TOKEN}\" \"${GIT_COMMIT}?state=failed&name=jenkins\""
        }
    }
}
\ No newline at end of file
# Center for Computational Genomics and Data Science Manual
The Manual of Operations (MOO) for the Center for Computational Genomics and Data Science provides guidance to members
of the center on its mission and operations. The manual of operations is a statically generated website using
[MkDocs](https://www.mkdocs.org/), which creates a static website from MarkDown files. The following website
<> is a useful guide and cheatsheet to MarkDown syntax for formatting.
## Local Development with Docker
### Docker Pre-requisites
* [Docker](
### Deploy Locally within Docker
If you do not want to install Python or MkDocs directly onto your system, you can use docker and docker-compose to
develop locally. To start the self-hosted MkDocs service with live updates, execute the following.
> `docker-compose up`
## Local Development native on computer
### Native Pre-requisites
* [MkDocs](
### Installation
#### macOS
> [Homebrew](
`brew install mkdocs`
#### Windows
> [Chocolatey](
`choco install mkdocs`
## Deploy Locally
Use the following command to locally update and view hot-loaded changes live.
> `mkdocs serve`
## Add new pages
There are two steps to add new pages to the manual of operations.
> 1. Create a new MarkDown file within the `docs/` directory. MarkDown file names use Kebab Case naming scheme.
An example is ``.
> 2. Open the `mkdocs.yml` file within the root project directory. Add a new line under the `nav` section within
the YAML file with the following format `- <Page Name>: <>`. MkDocs orders the menu
the same way as the `nav` section in `mkdocs.yml`.
## Production
The following commands can only be executed if MkDocs is directly installed on the machine. However, this will rarely
be needed when updating the MOO.
### Build Website
> `mkdocs build`
### Build and Deploy to Gitlab Pages
Reference the `.gitlab-ci.yml` file for configuration of Gitlab Pages continuous integration for deploying the mkdocs
generated website to Gitlab pages.
> `mkdocs build -d public`
# Compose file (format 3.4) referenced by the Jenkins build/push/deploy commands
# as docker-compose.production.yml.
# NOTE(review): the parent keys of the entries below (presumably `services`,
# `build`, `labels`, `networks`) are not visible in this view — confirm the
# nesting against the real file before relying on these comments.
version: '3.4'
# Build from the repository root, selecting the `production` Dockerfile stage.
context: .
target: production
# Traefik labels: expose the service and route /manual-of-operations to it.
- "traefik.enable=true"
# NOTE(review): empty label — its content appears to be missing; confirm.
- ""
- "traefik.docker.lbswarm=true"
- "traefik.http.routers.moo-production.rule=PathPrefix(`/manual-of-operations`)"
- "traefik.http.routers.moo-production.service=moo-production"
# The router applies the `moo-strip-prefix` middleware defined on the next line.
- "traefik.http.routers.moo-production.middlewares=moo-strip-prefix"
- "traefik.http.middlewares.moo-strip-prefix.stripprefix.prefixes=/manual-of-operations"
# NOTE(review): empty label — its content appears to be missing; confirm.
- ""
# presumably the network attachment and its real name — verify against the full file
- traefik
name: traefik-public
\ No newline at end of file
# Local Compose file (format 3.4) with a development and a production build.
# NOTE(review): the parent keys of the entries below (presumably `services`,
# `build`, `volumes`, `ports`, `labels`, `networks`) are not visible in this
# view — confirm the nesting against the real file.
version: '3.4'
# Development build: `development` Dockerfile stage, with the repository
# mounted at /mkdocs/ and port 8000 published.
context: .
target: development
- .:/mkdocs/
- "8000:8000"
# Production build: `production` Dockerfile stage, with port 80 published.
context: .
target: production
- "80:80"
# NOTE(review): `traefik.backend` and `traefik.port` look like Traefik v1
# labels mixed with v2-style router/middleware labels — confirm which
# Traefik version is targeted.
- "traefik.backend=moo-production"
- "traefik.http.routers.moo-production.rule=PathPrefix(`/manual-of-operations`)"
- "traefik.http.middlewares.moo-strip-prefix.stripprefix.prefixes=/manual-of-operations"
# The router must reference the middleware by the name it is defined under
# (`moo-strip-prefix`, on the line above); `moo-production` is not a middleware.
- "traefik.http.routers.moo-production.middlewares=moo-strip-prefix"
- "traefik.port=80"
external: true
# The Center for Computational Genomics and Data Science (CGDS)
## Team Purpose
The Center for Computational Genomics and Data Science (CGDS) seeks to develop and apply tools and methods to uncover
new and novel information that provide benefits for patients suffering from rare, undiagnosed, or misdiagnosed diseases,
their families, and the people that care for them. CGDS focuses on interpreting molecular variation with regards to its
correlation with patient phenotypes. We seek to identify and interpret causal molecular variation as well as variations
that alter patients’ clinical severity and progression through modification of phenotype or through alteration of response
to therapeutics (pharmacogenomics). Although we aim to generate knowledge that can be utilized for the study of human
disease in general, we strive to produce and apply knowledge for individuals (patients, their families, and the clinicians
who take care of them) in need of information to assist with clinical decision making in the present. CGDS diligently
works to create and maintain a productive and friendly team environment where we continuously educate one another,
contribute to scientific education, and disseminate new information locally and globally.
The Center for Computational Genomics and Data Science (CGDS) seeks to advance delivery of precision medicine to
patients and to advance research discoveries through the application of multi-omics data science and innovations in alignment
with the UAB School of Medicine (SOM) goals. Transfer of advancements is accomplished through education, patient care,
basic research, translational research, and clinical research.
## Duration and Time Commitment
The CGDS work week consists of core hours that each member is expected to include in their workday; remaining hours in
the work day are flexible. We ask that you do your best to be in your office during core hours and schedule non-work
appointments outside those core hours. Flexibility for team members is absolutely supported and does not reflect a lack
of their commitment, but rather an acknowledgment of how life really happens.
In return for this flexibility, team members are asked to be flexible as well, by working in the evenings or on
weekends in exchange for taking time off during the week. Absences that can be planned outside of core hours (like
doctor/dentist checkups) should be planned for early or late in the day. Minimizing absences within core hours whenever
possible will help to dispel misunderstandings and reduce project impact.
If you have an emergency or sickness, notify Liz and communicate with the team as soon as you can. Your commitment to
communicating planned and unplanned events will reduce the possibility of tension and/or conflicts.
## Scope
At this point, the science in our areas of interest is just beginning to unravel the complexities of a multitude of
questions. It’s important for us to keep focus on our goals and keep project scope in manageable tasks.
We also acknowledge that every project we take on contributes towards improving patient outcomes and patients’ quality
of life even when negative results are achieved. This approach demands perseverance to stay on track, pursuing wherever
the data may lead in planned increments.
1. Development of expanded capabilities in the area of molecular diagnostics for oncology and rare disease and other
mis- or undiagnosed diseases.
2. Organization of data science innovations across the UAB SOM as a whole.
3. Driving or enabling differentiation and growth for key UAB clinical service lines using omics-based data science.
4. Supporting the empowerment of physicians by providing the information they need to enable data science driven
precision care.
5. Development of grant revenue streams for Data Science related to precision medicine, phenotype-to-genotype studies,
polygenic risk score analyses, pharmacogenomics, molecular diagnostics, etc.
6. Development of materials for education of students, residents, and healthcare providers.
7. Evaluation, prioritization, and development of partnership and other opportunities both within and external to UAB.
8. Development of philanthropic opportunities in the area of -omics data science in support of UAB SOM missions.
## Keys to Success
Team success is achieved by building on existing talent, relationships, investments, and accomplishments. Success also
relies on strengthening intra- and inter- institution communications and collaborations. It is important to initiate,
support, and reinforce interdisciplinary collaborations among our CGDS teams and with UAB researchers, clinicians,
pathologists, laboratory administrators, and other specialists and other teams who bring important knowledge to these
efforts. Although there may be a gap between research goals and the application of new discoveries in a clinically useful
manner, our team will work towards development of tests and procedures that are useful for patients and can be reimbursed
through health insurance providers.
## Members (Nov 2019)
In order to meet the needs of our distinctive units, the Department of Pathology, the Department of Pediatrics, the UAB
School of Medicine Omics-based Data Science Program, and the clinical- and research-focused Worthey Lab Research Program,
teams of software developers, data scientists, statisticians and researchers peopled by junior and senior research
scientists, and students will focus their efforts into Software Development, Data Science, and Molecular Variation
Interpretation. Many CGDS members will work across many of these foci including an administrative associate, the lab
manager, a system administrator, and research IT. This structure allows CGDS to define teams and reporting structures,
and ensures cohesion among the teams given that success will require significant amounts of cross team interaction.
### Dr. Liz Worthey
**Director** for the _Center for Computational Genomics and Data Science_ within the departments of _Pediatrics and
Pathology_ in _UAB School of Medicine_
**Director** of the _Bioinformatics Section in Division of Genomics Diagnostics and Bioinformatics_ within the
department of _Pathology_ in _UAB School of Medicine_
**Associate Director** in _Hugh Kaul Precision Medicine Institute_
**Associate Professor** _Pediatrics and Pathology_
**Scientist** in the _O’Neal Comprehensive Cancer Center_
### Arthur Weborg
Software Architect - Genomics
### Angelina Uno-Antonison
Software Developer III - Genomics
### Donna Brown
Molecular Variant Analyst
### Brandon Wilk
Software Developer III - Genomics
### Dr. Manavalan Gajapathy
Data Scientist - Genomics
### Alex Moss
Software Developer I - Genomics
## Team Sponsor Dr. Selwyn Vickers
Senior Vice President of Medicine
Dean of the University of Alabama at Birmingham School of Medicine
Dr. Vickers has held these posts since October 2013. He is a world-renowned surgeon, pancreatic cancer researcher, and
pioneer in health disparities research. Dr. Vickers is a member of the National Academy of Medicine
(Institute of Medicine) and of the Johns Hopkins Society of Scholars. Dr. Vickers personally recruited this team to UAB.
## Other sponsors or key stakeholders
* **Dr. Matt Might**
* **Dr. George Netto**
* **Dr. Bruce Korf**
* **Dr. Mitch Cohen**
## Desired End Result
We will focus on development and application of tools and methods to identify and interpret causal molecular variation
i.e. ending of diagnostic odysseys, as well as identification of variation that otherwise alters a patient's clinical
picture through modification of their phenotype or through alteration of response to therapeutics (pharmacogenomics).
Critically our goals are clinical; although we aim to generate generalizable knowledge for future studies, we strive to
produce and apply knowledge for these individuals today; not ten years down the road in their journey.
## Supporting Resources
* [UAB School of Medicine Pediatrics Department](
* [UAB School of Medicine Pathology Department](
* Pediatrics IT
* [Children's of Alabama](
* [UAB IT Research Computing Department](
* [Hugh Kaul Precision Medicine Institute](
* [O'Neal Comprehensive Cancer Center](
The supporting resources include other people not assigned as team members but who aid our overall purpose. Other
resources may be called upon to supply needed resources (blue prints, meeting rooms, travel budgets, corporate
authority, software, etc.)
## Management of Projects
Due to the vast variety of project types within the CGDS, each project shall have a project charter using the CGDS
project charter template. The Project Charter will be shared among all project members to communicate scope and work
required. With regard to reports, the charter informs report format, method of delivery, frequency of delivery, and to
whom it must be delivered.
## Deliverable
Project charters characterize the summation of work within the CGDS. This document defines the project intentions,
resources, activities, and goals. Each project charter uses the CGDS project charter template as a guide to ensure all
members understand the scope and purpose of the work. Goals within the project charter must map directly to the goals
of the CGDS, which derive directly from those of the UAB School of Medicine. The individual goals for members of the CGDS
must fall within the scope of the project’s goals. Project charters also define project status reports and final
project deliverables.
# UAB Compute Cluster a.k.a. Cheaha
## Overview
Cheaha is a large, multi-unit computational system for running massively
parallel compute tasks. It is managed by the [UAB Research Computing Group](
Cheaha is currently the fastest supercomputer in the state of Alabama with a theoretical throughput of
approximately 450 TFlop/s (HUGE COMPUTE!) and consists of over 3000 CPU cores and 72 NVIDIA-P100 GPUs. Cheaha is
supported by a high-speed parallel filesystem (GPFS) that can store 6 PB non-redundantly and 4 PB redundantly (with
more to come!) interconnected by a high-speed InfiniBand network. UAB researchers use Cheaha for a wide variety of
research such as genomics, neuro-imaging, machine learning, statistical genetics, cancer detection etc.
Use of this resource is governed by the
[UAB Acceptable Use Policy for Computer and Network Resources](
For more information on Cheaha and the tools available to support research please review the documentation:
## Access
To get set up with cluster access, you'll need your BlazerID and must send an email to the cluster
support group (``).
You can use this template email filled in with your information to make this request.
My name is __YOUR_NAME__ and I’m a __TITLE__ in Dr. Liz Worthey’s lab.
I’d like to request access to the cluster for our Genomics, Genetics and Data
Science research. In particular I will be doing data analysis, pipeline
development, and genomics research using the compute resources of the cluster.
Dr. Liz Worthey’s Lab
Center For Computational Genomics and Data Science
## Storage spaces
* Scratch Space
* 1 TB of fast storage (i.e. close to the compute for super fast input/output)
* Home Space
* 50 GB of fast-ish storage for small data, scripts, small analyses, etc.
* User Data Directory
* 20 TB of fast-ish storage for larger data needs
* Lab/Project Space
* 50 - 100 TB per lab of fast-ish storage for project level data and analysis
* Commodity Storage (coming soon!)
* ??? TB of slower storage but HUGE for bigger datasets
## Submitting Jobs
You can SSH into the cluster via
The cluster uses the Slurm queue management system (stands for Simple Linux Utility for Resource Management) for
scheduling, distributing, and managing compute "jobs". A "job" is just a general term used to describe doing a specific
task, or set of tasks (specified in script) on the compute contained within the cluster.
For a complete description and tutorial of writing and executing jobs on the cluster see Research Computing's helpful
[guide]( on Slurm and executing compute tasks on the cluster. You can also check
out the below tutorial for a quick high level view of the cluster.
## Python on the Cluster
Anaconda is a free and open-source distribution of the Python and R programming languages for scientific computing, that
aims to simplify package management and deployment. Package versions are managed by the package management system conda.
CGDS plans on using conda on the cluster for multiple projects involving the use of Python.
### Conda Shortcuts for cluster
* Enabling Conda Module on Cluster
`module avail Anaconda`
* Creating new Conda Environment
`conda create --name test_env`
Packages can be included within the new environment with a similar command
`conda create --name test_env PACKAGE_NAME`
* List available virtual environments
`conda env list`
Virtual environment with the asterisk(\*) next to it is the one that's currently active
* Activating conda virtual environment
`source activate test_env`
* Deactivating Virtual Environment
`source deactivate`
* Export Conda virtual environment to share
`conda env export -n test_env > environment.yml`
* Creating Conda Virtual Environment from environment.yml
`conda env create -f environment.yml -n test_env`
* Deleting a Conda Virtual Environment
`conda remove --name test_env --all`
For a complete tutorial and the most up-to-date version, please use the tutorial from
[UAB Research Compute's Anaconda Wiki](
## Briefings and Highlights
# Continuing Education
All of us are expected to be engaged in continuing to build our expertise in pursuit of achieving our goal to advance
science. As a member of this group you are required to document your efforts by **accumulating at least 12 continuing
education (CE) hours per year**, with **at least one hour in every quarter**.
## CE Activities
Any of the following activities can satisfy that goal.
* Attend seminars related to project goals
* Attend individual sessions at meetings/conferences (document each session/talk separately)
* Participate in workshops
* Attend Grand Rounds in Birmingham or Huntsville
* Complete an online course connected to your work
* Working on higher education training relevant to the field or position held (i.e. a Masters, Ph. D., etc.)
## CE Logging
Create and maintain a log of your activities towards this goal for each calendar year. A template is set up in
[Box/CGDS/General/Continuing Education]( Open the template then
Save As and add your name to the file name within the /Continuing_Education folder. Add new CE activities to this same
file throughout your tenure with CGDS.
For each activity record:
* Title
* Activity type
* Location (URL if online course, course(s) code when applicable)
* Date(s)
* Time spent:
Brief statement describing the activity and how it furthered your knowledge
For example:
Grand Rounds @ Hsv 24-Sep-19 45min
M+M presentation by 3rd yr resident on patient w/Pyridoxine-dependent epilepsy and the
problems encountered getting patient treatment established due to mistakes and unnecessary
delays. Note: Perhaps EHR could support rare disease alerts for professionals. Also resident
was unaware of how to test for this condition so some education in OMIM, and other databases
to support diagnosis would be useful.
If you have other activities you would like to add to this list to fulfill the requirements, please check with Liz to
get approval. Also note that HIPAA training does not count toward this requirement.
# Data Policies and Standards
## Locations
The following table describes what types of data/files should go where for CGDS
| Data Type | Examples | Storage | Location |
| --- | --- | --- | --- |
| Anything containing PHI | Pictures, Medical Records, Clinical Descriptions | Box | <> |
| Project analysis files not used for computation | IRB Documents, project charter, deidentified data | Box | CGDS/Projects/{project identifier}/ |
| Project sequence files | Fastqs | Cluster | /data/project/worthey_lab/projects/{project identifier}/raw/{sample identifier}/ |
| Project analysis files used for computation | BAM, VCF, output from other tools | Cluster | /data/project/worthey_lab/projects/{project identifier}/analysis/{sample identifier}/ |
## Box
In general here's what the overall layout should be of the Box directories:
```
├── Alexander and Worthey Labs Collaboration
└── CGDS
    ├── General
    │   ├── Grants\ Info
    │   ├── Job\ Descriptions
    │   ├── LabCharter
    │   └── MeetingNotes
    └── Projects
        ├── CF\ Projects
        │   ├── CF-Brothers\ Three
        │   ├── CF-First
        │   └── Gerber\ CF
        ├── Data\ Requests
        ├── KidsNetwork\ RO3
        └── TSC
```
### Collaborations
From time to time, it is important for our lab to share files with other labs. To this end, we will create collaborative
directories in Box that grant access between our lab and the collaborator's lab. For example, a collaboration directory
for work done with the Alexander lab would exist at the same level as the Root CGDS directory (see below). When first
entering Box you would see the directory `Alexander and Worthey Labs Collaboration` at the same level as the `CGDS`
directory (see above diagram of the directory structure).
Remember the collaboration directory is for sharing large-ish files and results between the labs. Permissions are set
according to the particular collaboration needs.
### Root CGDS Directory
The root directory in Box is for all files related to the CGDS. The only directories within the root directory are the
`Projects` and `General` directories. Descriptions and organization of those directories are below.
### Projects Directory
This directory is the space for storing all "project" related data. The term `Project` is meant to encompass a wide
range of activities executed in the lab. A project is not defined by the amount of time, size, space, or work it will
involve. A project is used to describe and organize information about a topic of work, research, or development. For
example the following all define projects in the lab (this is not an exhaustive list):
- application development
- pipeline development
- grant funded research
- hypothesis driven research
- Standard Operating Procedures
#### Projects Directory Structure Guidelines
Under this directory there should be only other directories. Each directory should represent a single project, or a host
of projects with a common theme. For instance, above, the CF Projects directory contains several directories of projects
all related to CF under it. Either way of representing your project is fine as long as it sticks to this paradigm and
suits your needs.