diff --git a/.bashrc b/.bashrc
new file mode 100644
index 00000000..0439f70f
--- /dev/null
+++ b/.bashrc
@@ -0,0 +1,2 @@
+source /usr/share/modules/init/bash
+module use /modules/gsi/modulator/modulefiles/Ubuntu18.04
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..3fc1ca7d
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,52 @@
+# The base image tag is a build argument so a build can be pinned to a known
+# modulator image (--build-arg MODULATOR_TAG=...); the default still tracks latest.
+ARG MODULATOR_TAG=latest
+FROM modulator:${MODULATOR_TAG}
+
+# MAINTAINER is deprecated; record the author as an OCI image label instead
+LABEL org.opencontainers.image.authors="Fenglin Chen"
+
+# packages should already be set up in the modulator base image
+USER root
+
+# move in the yaml to build modulefiles from
+COPY recipes/sequenza_recipe.yaml /modulator/code/gsi/recipe.yaml
+
+# build the modules and set folder / file permissions; "-exec ... +" batches
+# the paths into a few chmod calls instead of spawning one process per file
+# NOTE(review): 777 leaves /modules world-writable - tighten (e.g. a+rX) if
+# nothing has to write there at runtime
+RUN ./build-local-code /modulator/code/gsi/recipe.yaml --initsh /usr/share/modules/init/sh --output /modules && \
+    find /modules \( -type d -o -type f \) -exec chmod 777 {} +
+
+# install required packages; the apt lists are deleted in the same layer so
+# they do not end up in the image
+RUN apt-get -m update && \
+    apt-get install -y --no-install-recommends \
+        gzip \
+        unzip \
+        zip && \
+    rm -rf /var/lib/apt/lists/*
+
+# add the user; without -m a system account gets no home directory and the
+# COPY below would create /home/ubuntu owned by root
+RUN groupadd -r -g 1000 ubuntu && useradd -r -m -g ubuntu -u 1000 ubuntu
+USER ubuntu
+
+# copy the setup file to load the modules at startup, owned by the runtime user
+COPY --chown=ubuntu:ubuntu .bashrc /home/ubuntu/.bashrc
+
+# set environment paths for modules
+#ENV BIOCONDUCTOR_ROOT="/modules/gsi/modulator/sw/Ubuntu18.04/bioconductor-3.8-rstats3.6"
+#ENV RSTATS_ROOT="/modules/gsi/modulator/sw/Ubuntu18.04/rstats-3.6"
+#ENV SEQUENZA_ROOT="/modules/gsi/modulator/sw/Ubuntu18.04/sequenza-2.1.2"
+#ENV SEQUENZA_RES_ROOT="/modules/gsi/modulator/sw/Ubuntu18.04/sequenza-res-2.1.2"
+#ENV SEQUENZA_SCRIPTS_ROOT="/modules/gsi/modulator/sw/Ubuntu18.04/sequenza-scripts-2.1.2"
+
+#ENV PATH="/modules/gsi/modulator/sw/Ubuntu18.04/rstats-3.6/bin:/modules/gsi/modulator/sw/Ubuntu18.04/sequenza-scripts-2.1.2/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+#ENV MANPATH="/modules/gsi/modulator/sw/Ubuntu18.04/rstats-3.6/share/man"
+#ENV LD_LIBRARY_PATH="/modules/gsi/modulator/sw/Ubuntu18.04/rstats-3.6/lib:/modules/gsi/modulator/sw/Ubuntu18.04/sequenza-2.1.2/lib:/modules/gsi/modulator/sw/Ubuntu18.04/sequenza-scripts-2.1.2/lib:/modules/gsi/modulator/sw/Ubuntu18.04/bioconductor-3.8-rstats3.6/lib"
+#ENV R_LIBS_SITE="/modules/gsi/modulator/sw/Ubuntu18.04/rstats-3.6/lib/R/library:/modules/gsi/modulator/sw/Ubuntu18.04/sequenza-2.1.2/lib/R/library:/modules/gsi/modulator/sw/Ubuntu18.04/sequenza-scripts-2.1.2/lib/R/library:/modules/gsi/modulator/sw/Ubuntu18.04/bioconductor-3.8-rstats3.6/lib/R/library"
+
+# exec form: bash is started directly instead of through "/bin/sh -c"
+CMD ["/bin/bash"]
diff --git a/Dockstore_README.md b/Dockstore_README.md
new file mode 100644
index 00000000..d92da979
--- /dev/null
+++ b/Dockstore_README.md
@@ -0,0 +1,82 @@
+# dockstore_sequenza
+
+The workflow is made to run in Docker and uploaded to [Dockstore](https://docs.dockstore.org/en/develop/getting-started/getting-started.html).
+You can find OICR's Dockstore page [here](https://dockstore.org/organizations/OICR).
+The Docker container is based on [Modulator](https://gitlab.oicr.on.ca/ResearchIT/modulator), which builds environment modules to set up the docker runtime environment.
+
+### Set Up and Run
+Currently, this WDL must be run with Cromwell.
+It uses Cromwell configuration files to mount a directory to the docker container.
+The directory contains data modules built with Modulator, which the WDL tasks need to access.
+In addition, you must obtain run files locally and build data modules to a local directory.
+
+#### 1. 
Build Data Modules +- Create a local `data_modules/` directory to store the data modules + - make sure you have enough disk space + - each data module could be 5-30 GB in size +- In future iterations of the workflow, this process will be simplified +- Enter the container: +``` +# Mount this repository as /pipeline/; mount the data module destination directory as /data_modules/ +docker run -it --rm -v [PWD]:/pipeline -v [data_modules]:/data_modules [CONTAINER ID (find in options.json)] + +# Copy prerequisite code module YAMLs into the Modulator code directory (code/gsi/) +cp /pipeline/recipes/sequenza_data_modules_prep.yaml code/gsi/data_modules_recipe_prep.yaml + +# Build the prerequisite code modules +./build-local-code code/gsi/data_modules_recipe_prep.yaml --output /data_modules --initsh /usr/share/modules/init/sh + +# Copy data module YAMLs into the Modulator data directory (data/gsi/) +cp /pipeline/recipes/sequenza_data_modules.yaml data/gsi/data_modules_recipe.yaml + +# Build the data modules +./build-local-data data/gsi/data_modules_recipe.yaml --output /data_modules --initsh /usr/share/modules/init/sh + +# Change resulting file permissions +find /data_modules/ -type d -exec chmod 777 {} \; && \ +find /data_modules/ -type f -exec chmod 777 {} \; + +# /data_modules/ should now contain gsi/modulator/modulefiles/Ubuntu18.04/ and gsi/modulator/modulefiles/data/ +``` +For run directories that are not part of modules, copy them from UGE's archive at `/.mounts/labs/gsi/src/` + +#### 2. Obtain Files Locally +In the test json, change file paths like so: +- File type files should be copied to local + - E.g. 
use scp to copy from UGE + - In the json, change the file path from the UGE path to the local path +- String type files should be copied or moved to the mounted directory, if they are not already part of a module + - In the json, change the file path to how the file would be accessed from inside the docker container +- $MODULE_ROOT paths can stay the same +``` +# File type files +# File is copied to the local machine +UGE: "/.mounts/labs/gsi/testdata/wgsPipeline/input_data/wgsPipeline_test_pcsi/hg19_random.genome.sizes.bed" +Dockstore: "/home/ubuntu/data/sample_data/callability/hg19_random.genome.sizes.bed" + +# String type files +# /data_modules/ is a directory mounted to the docker container +UGE: "/.mounts/labs/gsi/modulator/sw/data/hg19-p13/hg19_random.fa" +Dockstore: "/data_modules/gsi/modulator/sw/data/hg19-p13/hg19_random.fa" + +# Root type paths +# The value of $MODULE_ROOT changes, but the path stays the same +UGE: "$HG19_BWA_INDEX_ROOT/hg19_random.fa" +Dockstore: "$HG19_BWA_INDEX_ROOT/hg19_random.fa" +``` + +#### 3. 
Run with Cromwell +Submit the preprocessed subworkflow and modified json to Cromwell, with configs and options attached +``` +# Validate the wrapper workflow and json +java -jar $womtool validate [WDL] --inputs [TEST JSON] + +# For example: +java -jar $womtool validate wgsPipeline.wdl --inputs tests/wgsPipeline_test_cre_uge.json + +# Submit to Cromwell +java -Dconfig.file=[CONFIG] -jar $cromwell run [WRAPPER WDL] --inputs [JSON] --options [OPTIONS] + +# For example: +java -Dconfig.file=local.config -jar $cromwell run wgsPipeline.wdl --inputs tests/wgsPipeline_test_cre.json --options options.json +``` \ No newline at end of file diff --git a/local.config b/local.config new file mode 100644 index 00000000..960ab1ad --- /dev/null +++ b/local.config @@ -0,0 +1,49 @@ +backend { + default = "Local" # selects the Local provider declared under providers below + providers { + Local { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + concurrent-job-limit = 10 + #run-in-background = true + runtime-attributes = """ + String? docker + String? docker_volume + String? 
modules + """ + submit = "/usr/bin/env bash ${script}" + submit-docker = """ + docker run \ + --rm -i \ + -v ${cwd}:${docker_cwd} \ + ${"-v " + docker_volume} \ + ${docker} /bin/bash -c 'source /home/ubuntu/.bashrc; ${"module load " + modules + " || exit 1; "} /bin/bash ${docker_script}' + """ # inside the container: source /home/ubuntu/.bashrc, load the requested modules (abort if that fails), then run the task script + root = "cromwell-executions" + dockerRoot = "/cromwell-executions" + } + } + } +} +call-caching { + enabled = true + invalidate-bad-cache-results = true +} +database { + profile = "slick.jdbc.HsqldbProfile$" # HSQLDB, stored in the file path given in db.url below + db { + driver = "org.hsqldb.jdbcDriver" + url = """ + jdbc:hsqldb:file:/tmp/cromwell-executions/cromwell-db/cromwell-db; + shutdown=false; + hsqldb.default_table_type=cached;hsqldb.tx=mvcc; + hsqldb.result_max_memory_rows=10000; + hsqldb.large_data=true; + hsqldb.applog=1; + hsqldb.lob_compressed=true; + hsqldb.script_format=3 + """ + connectionTimeout = 120000 # presumably milliseconds - TODO confirm against the Cromwell docs + numThreads = 2 + } +} diff --git a/options.json b/options.json new file mode 100644 index 00000000..9961d618 --- /dev/null +++ b/options.json @@ -0,0 +1,6 @@ +{ + "default_runtime_attributes": { + "docker": "g3chen/sequenza@sha256:f6ebd82705ecc3bdf5f2fead408333ae60492f36b23259d0730d6c70fc7fbb00", + "docker_volume": "/home/ubuntu/Downloads/sample_data:/data" + } +} diff --git a/recipes/sequenza_data_modules.yaml b/recipes/sequenza_data_modules.yaml new file mode 100644 index 00000000..012f8fb7 --- /dev/null +++ b/recipes/sequenza_data_modules.yaml @@ -0,0 +1,19 @@ +# sequenza-res/2.1.2 +- name: sequenza-res + version: 2.1.2 + build_type: custom + build_args: + prereq_type: download + prereq_args: + sha256: b6604cf111849db6997c4ab0fc618f233fd63fc87a6381491470dfa6b1243bdc + url: http://api.gdc.cancer.gov/data/dea893cd-9189-4091-9611-e761a1d31ebe + steps: + - command: [ "Rscript", "{sequenza-scripts_root}/bin/Generate_plody_priors_table.R", "download" ] + - command: [ "cp", "PANCAN_ASCAT_ploidy_prob.Rdata", "{output_dir}/" ] + depends: + - name: sequenza-scripts + version: 2.1.2 + required_at: build + - name: 
rstats + version: 3.6 + required_at: build \ No newline at end of file diff --git a/recipes/sequenza_data_modules_prep.yaml b/recipes/sequenza_data_modules_prep.yaml new file mode 100644 index 00000000..141af129 --- /dev/null +++ b/recipes/sequenza_data_modules_prep.yaml @@ -0,0 +1,62 @@ +# rstats/3.6 +- name: rstats + version: 3.6 + build_type: custom + build_args: + steps: + - build_args: + prereq_args: + prereq_args: + md5: f5003472d58a3d3765a1c537fdae71d5 + url: http://cran.utstat.utoronto.ca/src/base/R-3/R-3.6.1.tar.gz + prereq_type: download + prereq_type: extract + configure: ["--with-x=no"] + build_type: autotools + - build_args: + package: devtools + # version: don't set, devtools is needed for versioned cran installs + build_type: cran + scan_dirs: true + system_depends: + - name: libcurl4-openssl-dev + - name: libbz2-dev + - name: libncurses5-dev + - name: libreadline-dev + permitted_os: ["Ubuntu18.04"] + - name: libreadline6-dev + permitted_os: ["Debian8.11"] + - name: liblzma-dev + - name: libpcre3-dev + - name: gfortran + required_at: build + - name: libgfortran-7-dev + permitted_os: ["Ubuntu18.04"] + - name: libgfortran-4.9-dev + permitted_os: ["Debian8.11"] + +# sequenza-scripts/2.1.2 -- NOTE(review): recipes/sequenza_recipe.yaml builds this same module version from archive v1.1.tar.gz (different sha256); confirm the two recipes are meant to differ +- name: sequenza-scripts + version: 2.1.2 + build_type: custom + build_args: + prereq_args: + prereq_args: + sha256: aec68d6f47d3084a65ef5accbb12c8e70eeffb35ae04b371413cdf8323e34f42 + url: https://github.com/oicr-gsi/sequenza/archive/2.1.2.tar.gz + prereq_type: download + prereq_type: extract + steps: + - build_type: cran + build_args: + package: openxlsx + - build_type: copy + build_args: + from: + - Generate_plody_priors_table.R + - SequenzaPreProcess_v2.2.R + - SequenzaProcess_v2.2.R + to: bin/ + depends: + - name: rstats + version: 3.6 diff --git a/recipes/sequenza_recipe.yaml b/recipes/sequenza_recipe.yaml new file mode 100644 index 00000000..48e8d2ee --- /dev/null +++ b/recipes/sequenza_recipe.yaml @@ -0,0 +1,116 @@ +# rstats/3.6 +- name: rstats + version: 3.6 + 
build_type: custom + build_args: + steps: + - build_args: + prereq_args: + prereq_args: + md5: f5003472d58a3d3765a1c537fdae71d5 + url: http://cran.utstat.utoronto.ca/src/base/R-3/R-3.6.1.tar.gz + prereq_type: download + prereq_type: extract + configure: ["--with-x=no"] + build_type: autotools + - build_args: + package: devtools + # version: don't set, devtools is needed for versioned cran installs + build_type: cran + scan_dirs: true + system_depends: + - name: libcurl4-openssl-dev + - name: libbz2-dev + - name: libncurses5-dev + - name: libreadline-dev + permitted_os: ["Ubuntu18.04"] + - name: libreadline6-dev + permitted_os: ["Debian8.11"] + - name: liblzma-dev + - name: libpcre3-dev + - name: gfortran + required_at: build + - name: libgfortran-7-dev + permitted_os: ["Ubuntu18.04"] + - name: libgfortran-4.9-dev + permitted_os: ["Debian8.11"] + +# bioconductor/3.8-rstats3.6 +- name: bioconductor + version: 3.8-rstats3.6 + build_type: cran + build_args: + package: BiocManager + version: 1.30.4 + depends: + - name: rstats + version: 3.6 + +# sequenza/2.1.2 +- name: sequenza + version: 2.1.2 + build_type: custom + build_args: + steps: + - build_type: bioc + build_args: + package: copynumber + - build_type: cran + build_args: + package: optparse + - build_type: cran + build_args: + package: sequenza + version: 2.1.2 + depends: + - name: rstats + version: 3.6 + - name: bioconductor + version: 3.8-rstats3.6 + required_at: build + +# sequenza-scripts/2.1.2 -- NOTE(review): the url below fetches archive v1.1.tar.gz, whereas recipes/sequenza_data_modules_prep.yaml builds this same module version from 2.1.2.tar.gz with a different sha256; confirm which tag is intended +- name: sequenza-scripts + version: 2.1.2 + build_type: custom + build_args: + prereq_args: + prereq_args: + sha256: 493b8ce6ae2397fbbc8f946a76cb5903e37ba224e9dfae0531f9d0f82ba86e93 + url: https://github.com/oicr-gsi/sequenza/archive/v1.1.tar.gz + prereq_type: download + prereq_type: extract + steps: + - build_type: cran + build_args: + package: openxlsx + - build_type: copy + build_args: + from: + - Generate_plody_priors_table.R + - SequenzaPreProcess_v2.2.R + - SequenzaProcess_v2.2.R + to: bin/ + depends: + - name: 
rstats + version: 3.6 + +# sequenza-res/2.1.2 +- name: sequenza-res + version: 2.1.2 + build_type: custom + build_args: + prereq_type: download + prereq_args: + sha256: b6604cf111849db6997c4ab0fc618f233fd63fc87a6381491470dfa6b1243bdc + url: http://api.gdc.cancer.gov/data/dea893cd-9189-4091-9611-e761a1d31ebe + steps: + - command: [ "Rscript", "{sequenza-scripts_root}/bin/Generate_plody_priors_table.R", "download" ] + - command: [ "cp", "PANCAN_ASCAT_ploidy_prob.Rdata", "{output_dir}/" ] + depends: + - name: sequenza-scripts + version: 2.1.2 + required_at: build + - name: rstats + version: 3.6 + required_at: build