version development ## Copyright (c) 2021 Giulio Genovese ## ## Version 2021-10-15 ## ## Contact Giulio Genovese ## ## This WDL workflow runs allelic shift imbalance analysis in a given region ## ## Cromwell version support ## - Successfully tested on v70 ## ## Distributed under terms of the MIT License struct Reference { File? cyto_file String? chr_prefix Array[Int] len Array[Int] pcen Array[Int] qcen } workflow shift { input { String sample_set_id File? remove_samples_file File pheno_tsv_file String as_id = "AS" String ext_string = "as" String ref_name = "GRCh38" String? ref_path String? chr_prefix File? cyto_file String? data_path File batch_tsv_file # batch_id path chr1_imp_vcf chr1_imp_vcf_index chr2_imp_vcf chr2_imp_vcf_index ... Boolean fisher_exact = true Boolean drop_genotypes = true Boolean phred_score = true Boolean plot = true String basic_bash_docker = "debian:stable-slim" String docker_repository = "us.gcr.io/mccarroll-mocha" String bcftools_docker = "bcftools:1.13-20211015" String r_mocha_docker = "r_mocha:1.13-20211015" } String docker_repository_with_sep = docker_repository + if docker_repository != "" && docker_repository == sub(docker_repository, "/$", "") then "/" else "" String ref_path_with_sep = select_first([ref_path, ""]) + if defined(ref_path) && select_first([ref_path]) == sub(select_first([ref_path]), "/$", "") then "/" else "" Reference ref = object { cyto_file: if defined(ref_path) || defined(cyto_file) then ref_path_with_sep + select_first([cyto_file, "cytoBand.txt.gz"]) else None, chr_prefix: if ref_name == "GRCh38" then "chr" else if ref_name == "GRCh37" then "" else chr_prefix, len: if ref_name == "GRCh38" then [248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, 138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, 83257441, 80373285, 58617616, 64444167, 46709983, 50818468, 156040895] else if ref_name == "GRCh37" then [249250621, 243199373, 198022430, 191154276, 180915260, 171115067, 159138663, 146364022, 141213431, 135534747, 135006516, 133851895, 115169878, 107349540, 102531392, 90354753, 81195210, 78077248, 59128983, 63025520, 48129895, 51304566, 155270560] else None, pcen: if ref_name == "GRCh38" then [122026459, 92188145, 90772458, 49712061, 46485900, 58553888, 58169653, 44033744, 43389635, 39686682, 51078348, 34769407, 16000000, 16000000, 17083673, 36311158, 22813679, 15460899, 24498980, 26436232, 10864560, 12954788, 58605579] else if ref_name == "GRCh37" then [121535434, 92326171, 90504854, 49660117, 46405641, 58830166, 58054331, 43838887, 47367679, 39254935, 51644205, 34856694, 16000000, 16000000, 17000000, 35335801, 22263006, 15460898, 24681782, 26369569, 11288129, 13000000, 58632012] else None, qcen: if ref_name == "GRCh38" then [124932724, 94090557, 93655574, 51743951, 50059807, 59829934, 61528020, 45877265, 45518558, 41593521, 54425074, 37185252, 18051248, 18173523, 19725254, 38265669, 26616164, 20861206, 27190874, 30038348, 12915808, 15054318, 62412542] else if ref_name == "GRCh37" then [124535434, 95326171, 93504854, 52660117, 49405641, 61830166, 61054331, 46838887, 50367679, 42254935, 54644205, 37856694, 19000000, 19000000, 20000000, 38335801, 25263006, 18460898, 27681782, 29369569, 14288129, 16000000, 61632012] else None } # read table with batches information (scatter could be avoided if there was a tail() function) Array[Array[String]] batch_tsv = read_tsv(batch_tsv_file) Int n_batches = length(batch_tsv)-1 scatter (idx in range(n_batches)) { Array[String] batch_tsv_rows = batch_tsv[(idx+1)] } Map[String, Array[String]] batch_tbl = as_map(zip(batch_tsv[0], transpose(batch_tsv_rows))) # compute data paths for each batch, if available (scatter could be avoided if there was a contains_key() function) scatter (key in keys(batch_tbl)) { Boolean? is_key_equal_path = if key == "path" then true else None } scatter (idx in range(n_batches)) { String data_paths = select_first([data_path, if length(select_all(is_key_equal_path))>0 then batch_tbl["path"][idx] else ""]) String data_paths_with_sep = data_paths + (if data_paths == "" || sub(data_paths, "/$", "") != data_paths then "" else "/") } call lst_header { input: pheno_tsv_file = pheno_tsv_file, docker = basic_bash_docker } # compute phenotype regions to test scatter (pheno in lst_header.phenos) { String region_name = sub(pheno, "_.*$", "") String chr_string = sub(sub(region_name, "[pq]*$", ""), "Y", "X") Int chr_idx = sub(chr_string, "^X$", "23") String arm = sub(region_name, "^[0-9XY]*", "") String pheno_regions = ref.chr_prefix + chr_string + if arm == "p" then ":1-" + ref.pcen[(chr_idx - 1)] else if arm == "q" then ":" + ref.qcen[(chr_idx - 1)] + "-" + ref.len[(chr_idx - 1)] else "" } # scatter target genotypes scatter (p in cross(range(n_batches), range(length(lst_header.phenos)))) { String cross_idx = p.right call vcf_summary { input: vcf_file = data_paths_with_sep[p.left] + batch_tbl[("chr" + chr_string[p.right] + "_imp_vcf")][p.left], vcf_idx = data_paths_with_sep[p.left] + batch_tbl[("chr" + chr_string[p.right] + "_imp_vcf_index")][p.left], pheno_name = lst_header.phenos[p.right], region = pheno_regions[p.right], remove_samples_file = remove_samples_file, pheno_tsv_file = pheno_tsv_file, fisher_exact = fisher_exact, as_id = as_id, ext_string = ext_string, drop_genotypes = drop_genotypes, docker = docker_repository_with_sep + bcftools_docker } } scatter (idx in range(length(lst_header.phenos))) { Map[Int, Array[File]] idx2vcf_files = collect_by_key(zip(cross_idx, vcf_summary.as_vcf_file)) Map[Int, Array[File]] idx2vcf_idxs = collect_by_key(zip(cross_idx, vcf_summary.as_vcf_idx)) call vcf_merge { input: vcf_files = idx2vcf_files[idx], vcf_idxs = idx2vcf_idxs[idx], as_id = as_id, fisher_exact = fisher_exact, ext_string = ext_string, filebase = sample_set_id + "." + lst_header.phenos[idx], phred_score = phred_score, docker = docker_repository_with_sep + bcftools_docker } if (plot) { call shift_plot { input: vcf_file = vcf_merge.as_vcf_file, vcf_idx = vcf_merge.as_vcf_idx, region = pheno_regions[idx], cyto_file = ref.cyto_file, filebase = sample_set_id + "." + lst_header.phenos[idx], docker = docker_repository_with_sep + r_mocha_docker } } } output { Array[File] vcf_files = vcf_merge.as_vcf_file Array[File] vcf_idxs = vcf_merge.as_vcf_idx Array[File?] png_files = shift_plot.png_file } } task lst_header { input { File pheno_tsv_file String space_character = '_' String docker Int cpu = 1 Int disk_size = 10 Float memory = 3.5 Int preemptible = 1 Int maxRetries = 0 } command <<< set -euo pipefail mv "~{pheno_tsv_file}" . head -n1 "~{basename(pheno_tsv_file)}" | cut -f2- | tr '\t ' '\n~{space_character}' rm "~{basename(pheno_tsv_file)}" >>> output { Array[String] phenos = read_lines(stdout()) } runtime { docker: docker cpu: cpu disks: "local-disk " + disk_size + " HDD" memory: memory + " GiB" preemptible: preemptible maxRetries: maxRetries } } # for this task BCFtools 1.14 is needed https://github.com/samtools/bcftools/issues/1566 task vcf_summary { input { File vcf_file File vcf_idx String pheno_name String region File? remove_samples_file File pheno_tsv_file Boolean fisher_exact String as_id String ext_string Boolean drop_genotypes = true String docker Int cpu = 1 Int? disk_size_override Float memory = 3.5 Int preemptible = 1 Int maxRetries = 0 } Float vcf_size = size(vcf_file, "GiB") Int disk_size = select_first([disk_size_override, ceil(10.0 + 2.0 * vcf_size)]) String filebase = basename(basename(basename(basename(vcf_file, ".bcf"), ".vcf.gz"), ".as"), ".chr" + sub(sub(region, ":.*$", ""), "^chr", "")) + '.' + pheno_name command <<< set -euo pipefail echo "~{sep="\n" select_all([vcf_file, vcf_idx, remove_samples_file, pheno_tsv_file])}" | \ tr '\n' '\0' | xargs -0 mv -t . ~{if fisher_exact then "awk -F\"\\t\" 'NR==1 {for (i=1; i<=NF; i++) f[$i] = i}\n" + " NR>1 && $(f[\"" + pheno_name + "\"])==0 {print $(f[\"sample_id\"])}' \"" + basename(pheno_tsv_file) + "\" > \"" + filebase + ".controls.lines\"" else ""} awk -F"\t" 'NR==1 {for (i=1; i<=NF; i++) f[$i] = i} NR>1 && $(f["~{pheno_name}"])==1 {print $(f["sample_id"])}' "~{basename(pheno_tsv_file)}" > "~{filebase}.cases.lines" bcftools annotate \ --no-version \ --output-type u \ --regions "~{region}" \ --remove ID,QUAL,FILTER,INFO,^FMT/GT,FMT/~{as_id} \ "~{basename(vcf_file)}" | \ ~{if defined(remove_samples_file) then "bcftools view \\\n" + " --no-version \\\n" + " --output-type u \\\n" + " --samples-file \"^" + basename(select_first([remove_samples_file])) + "\" \\\n" + " --force-samples |" else ""} \ ~{if fisher_exact then "bcftools +contrast \\\n" + " --output-type u \\\n" + " --annots NASSOC \\\n" + " --control-samples \"" + filebase + ".controls.lines\" \\\n" + " --case-samples \"" + filebase + ".cases.lines\" \\\n" + " --force-samples |" else ""} \ bcftools +mochatools \ --no-version \ --output-type b \ -- --summary ~{as_id} \ --samples-file "~{filebase}.cases.lines" \ --force-samples \ ~{if drop_genotypes then "--drop-genotypes" else ""} | \ tee "~{filebase}.~{ext_string}.bcf" | \ bcftools index --force --output "~{filebase}.~{ext_string}.bcf.csi" rm~{if fisher_exact then " \"" + filebase + ".controls.lines\"" else ""} "~{filebase}.cases.lines" echo "~{sep="\n" select_all([vcf_file, vcf_idx, remove_samples_file, pheno_tsv_file])}" | \ sed 's/^.*\///' | tr '\n' '\0' | xargs -0 rm >>> output { File as_vcf_file = filebase + "." + ext_string + ".bcf" File as_vcf_idx = filebase + "." + ext_string + ".bcf.csi" } runtime { memory: memory + " GiB" disks: "local-disk " + disk_size + " HDD" cpu: cpu docker: docker preemptible: preemptible maxRetries: maxRetries } } task vcf_merge { input { Array[File] vcf_files Array[File] vcf_idxs Boolean fisher_exact String as_id String ext_string String filebase Boolean phred_score = true String docker Int cpu = 1 Int? disk_size_override Float memory = 3.5 Int preemptible = 1 Int maxRetries = 0 } Float vcf_size = size(vcf_files, "GiB") Int disk_size = select_first([disk_size_override, ceil(10.0 + 2.0 * vcf_size)]) command <<< set -euo pipefail vcf_files=~{write_lines(vcf_files)} vcf_idxs=~{write_lines(vcf_idxs)} cat $vcf_files $vcf_idxs | tr '\n' '\0' | xargs -0 mv -t . sed -i 's/^.*\///' $vcf_files $vcf_idxs bcftools merge \ --no-version \ --output-type u \ --info-rules ~{if fisher_exact then "NASSOC:sum," else ""}~{as_id}:sum \ --file-list $vcf_files \ --merge none | \ ~{if fisher_exact then "bcftools +mochatools \\\n" + " --no-version \\\n" + " --output-type u \\\n" + " -- --test NASSOC \\\n" + (if phred_score then " --phred" else "") + " |" else ""} \ bcftools +mochatools \ --no-version \ --output-type b \ -- --test ~{as_id} \ ~{if phred_score then "--phred" else ""} | \ tee "~{filebase}.~{ext_string}.bcf" | \ bcftools index --force --output "~{filebase}.~{ext_string}.bcf.csi" cat $vcf_files $vcf_idxs | tr '\n' '\0' | xargs -0 rm >>> output { File as_vcf_file = filebase + "." + ext_string + ".bcf" File as_vcf_idx = filebase + "." + ext_string + ".bcf.csi" } runtime { memory: memory + " GiB" disks: "local-disk " + disk_size + " HDD" cpu: cpu docker: docker preemptible: preemptible maxRetries: maxRetries } } task shift_plot { input { File vcf_file File vcf_idx String region File? cyto_file String filebase String docker Int cpu = 1 Int? disk_size_override Float memory = 3.5 Int preemptible = 1 Int maxRetries = 0 } Float vcf_size = size(vcf_file, "GiB") Int disk_size = select_first([disk_size_override, ceil(10.0 + vcf_size)]) command <<< set -euo pipefail mv "~{vcf_file}" . mv "~{vcf_idx}" . ~{if defined(cyto_file) then "mv \"" + select_first([cyto_file]) + "\" ." else ""} shift_plot.R \ ~{if defined(cyto_file) then "--cytoband \"" + basename(select_first([cyto_file])) + "\"" else ""} \ --vcf "~{basename(vcf_file)}" \ --region ~{region} \ --png "~{filebase}.png" rm "~{basename(vcf_file)}" rm "~{basename(vcf_idx)}" ~{if defined(cyto_file) then "rm \"" + basename(select_first([cyto_file])) + "\"" else ""} >>> output { File png_file = filebase + '.png' } runtime { docker: docker cpu: cpu disks: "local-disk " + disk_size + " HDD" memory: memory + " GiB" preemptible: preemptible maxRetries: maxRetries } }