version development ## Copyright (c) 2021-2025 Giulio Genovese ## ## Version 2025-08-19 ## ## Contact Giulio Genovese ## ## This WDL workflow runs allelic shift imbalance analysis in a given region ## ## Cromwell version support ## - Successfully tested on v90 ## ## Distributed under terms of the MIT License struct Reference { File fasta File fasta_fai String n_x_chr File? cyto_file String? chr_prefix Array[Int] len Array[Int] pcen Array[Int] qcen File? gff3_file # http://ftp.ensembl.org/pub/current_gff3/homo_sapiens/ File? rsid_vcf_file # http://ftp.ncbi.nlm.nih.gov/snp/latest_release/VCF/ File? rsid_vcf_idx } workflow shift { input { String sample_set_id File? keep_samples_file File? remove_samples_file File pheno_tsv_file String as_id = "AS" String ext_string = "as" String? pop String ref_name = "GRCh38" String? ref_fasta String? ref_fasta_fai String? ref_path String? chr_prefix Int? n_x_chr String? cyto_file String? gff3_file String? rsid_vcf_file String? rsid_vcf_idx File impute_tsv_file # batch_id path chr1_imp_vcf chr1_imp_vcf_index chr2_imp_vcf chr2_imp_vcf_index ... String? impute_data_path Boolean fisher_exact = true Boolean drop_genotypes = true Boolean phred_score = true Boolean plot = true String basic_bash_docker = "debian:stable-slim" String docker_repository = "us.gcr.io/mccarroll-mocha" String bcftools_docker = "bcftools:1.22-20250819" String r_mocha_docker = "r_mocha:1.22-20250819" } String docker_repository_with_sep = docker_repository + if docker_repository != "" && docker_repository == sub(docker_repository, "/$", "") then "/" else "" String ref_path_with_sep = select_first([ref_path, ""]) + if defined(ref_path) && select_first([ref_path]) == sub(select_first([ref_path]), "/$", "") then "/" else "" Reference ref = object { fasta: if defined(ref_fasta) then ref_path_with_sep + select_first([ref_fasta]) else if ref_name == "GRCh38" then ref_path_with_sep + "GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" else if ref_name == "GRCh37" then ref_path_with_sep + "human_g1k_v37.fasta" else None, fasta_fai: if defined(ref_fasta_fai) then ref_path_with_sep + select_first([ref_fasta_fai]) else if ref_name == "GRCh38" then ref_path_with_sep + "GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai" else if ref_name == "GRCh37" then ref_path_with_sep + "human_g1k_v37.fasta.fai" else None, n_x_chr: select_first([n_x_chr, 23]), cyto_file: if defined(ref_path) || defined(cyto_file) then ref_path_with_sep + select_first([cyto_file, "cytoBand.txt.gz"]) else None, chr_prefix: if ref_name == "GRCh38" then "chr" else if ref_name == "GRCh37" then "" else chr_prefix, len: if ref_name == "GRCh38" then [248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, 138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, 83257441, 80373285, 58617616, 64444167, 46709983, 50818468, 156040895] else if ref_name == "GRCh37" then [249250621, 243199373, 198022430, 191154276, 180915260, 171115067, 159138663, 146364022, 141213431, 135534747, 135006516, 133851895, 115169878, 107349540, 102531392, 90354753, 81195210, 78077248, 59128983, 63025520, 48129895, 51304566, 155270560] else None, pcen: if ref_name == "GRCh38" then [122026459, 92188145, 90772458, 49712061, 46485900, 58553888, 58169653, 44033744, 43389635, 39686682, 51078348, 34769407, 16000000, 16000000, 17083673, 36311158, 22813679, 15460899, 24498980, 26436232, 10864560, 12954788, 58605579] else if ref_name == "GRCh37" then [121535434, 92326171, 90504854, 49660117, 46405641, 58830166, 58054331, 43838887, 47367679, 39254935, 51644205, 34856694, 16000000, 16000000, 17000000, 35335801, 22263006, 15460898, 24681782, 26369569, 11288129, 13000000, 58632012] else None, qcen: if ref_name == "GRCh38" then [124932724, 94090557, 93655574, 51743951, 50059807, 59829934, 61528020, 45877265, 45518558, 41593521, 54425074, 37185252, 18051248, 18173523, 19725254, 38265669, 26616164, 20861206, 27190874, 30038348, 12915808, 15054318, 62412542] else if ref_name == "GRCh37" then [124535434, 95326171, 93504854, 52660117, 49405641, 61830166, 61054331, 46838887, 50367679, 42254935, 54644205, 37856694, 19000000, 19000000, 20000000, 38335801, 25263006, 18460898, 27681782, 29369569, 14288129, 16000000, 61632012] else None, gff3_file: if defined(gff3_file) then ref_path_with_sep + select_first([gff3_file]) else None, rsid_vcf_file: if defined(rsid_vcf_file) then ref_path_with_sep + select_first([rsid_vcf_file]) else None, rsid_vcf_idx: if defined(rsid_vcf_idx) then ref_path_with_sep + select_first([rsid_vcf_idx]) else None } # read table with batches information (scatter could be avoided if there was a tail() function) Array[Array[String]] impute_tsv = read_tsv(impute_tsv_file) Int n_batches = length(impute_tsv)-1 scatter (idx in range(n_batches)) { Array[String] impute_tsv_rows = impute_tsv[(idx+1)] } Map[String, Array[String]] impute_tbl = as_map(zip(impute_tsv[0], transpose(impute_tsv_rows))) # check if path is in impute table (see http://github.com/openwdl/wdl/issues/305) Boolean is_path_in_impute_tbl = length(collect_by_key(zip(flatten([keys(impute_tbl),["path"]]),range(length(keys(impute_tbl))+1)))["path"])>1 # compute data paths for each batch scatter (idx in range(n_batches)) { String impute_data_paths = select_first([impute_data_path, if is_path_in_impute_tbl then impute_tbl["path"][idx] else ""]) String impute_data_paths_with_sep = impute_data_paths + (if impute_data_paths == "" || sub(impute_data_paths, "/$", "") != impute_data_paths then "" else "/") } call lst_header { input: pheno_tsv_file = pheno_tsv_file, docker = basic_bash_docker } # compute phenotype regions to test scatter (idx in range(length(lst_header.phenos))) { String region_name = sub(lst_header.phenos[idx], "_.*$", "") String chr_string = sub(sub(region_name, "[pq]*$", ""), "Y", "X") if (sub(chr_string, "[0-9X]+", "") == "") { Int pheno_idx = idx String pheno_chr = chr_string String pheno_name = lst_header.phenos[idx] String vcf_file_suffix = pheno_name + "." + ext_string + ".bcf" # http://github.com/broadinstitute/cromwell/issues/5549 String vcf_idx_suffix = pheno_name + "." + ext_string + ".bcf.csi" # http://github.com/broadinstitute/cromwell/issues/5549 Int chr_idx = sub(chr_string, "^X$", ref.n_x_chr) String arm = sub(region_name, "^[0-9XY]*", "") String pheno_regions = ref.chr_prefix + chr_string + if arm == "p" then ":1-" + ref.pcen[(chr_idx - 1)] else if arm == "q" then ":" + ref.qcen[(chr_idx - 1)] + "-" + ref.len[(chr_idx - 1)] else "" } } # generate list of all chromosomes to be processed Map[String, Array[String]] chr2pheno_idx = collect_by_key(zip(select_all(pheno_chr), select_all(pheno_idx))) Map[String, Array[String]] chr2pheno_names = collect_by_key(zip(select_all(pheno_chr), select_all(pheno_name))) Map[String, Array[String]] chr2regions = collect_by_key(zip(select_all(pheno_chr), select_all(pheno_regions))) Map[String, Array[String]] chr2vcf_file_suffix = collect_by_key(zip(select_all(pheno_chr), select_all(vcf_file_suffix))) # http://github.com/broadinstitute/cromwell/issues/5549 Map[String, Array[String]] chr2vcf_idx_suffix = collect_by_key(zip(select_all(pheno_chr), select_all(vcf_idx_suffix))) # http://github.com/broadinstitute/cromwell/issues/5549 scatter (p in cross(range(n_batches), keys(chr2pheno_names))) { Array[Int] cross_idx = chr2pheno_idx[p.right] call vcf_summary { input: vcf_file = impute_data_paths_with_sep[p.left] + impute_tbl[("chr" + p.right + "_imp_vcf")][p.left], vcf_idx = impute_data_paths_with_sep[p.left] + impute_tbl[("chr" + p.right + "_imp_vcf_index")][p.left], pheno_names = chr2pheno_names[p.right], regions = chr2regions[p.right], vcf_file_suffix = chr2vcf_file_suffix[p.right], # http://github.com/broadinstitute/cromwell/issues/5549 vcf_idx_suffix = chr2vcf_idx_suffix[p.right], # http://github.com/broadinstitute/cromwell/issues/5549 keep_samples_file = keep_samples_file, remove_samples_file = remove_samples_file, pheno_tsv_file = pheno_tsv_file, fisher_exact = fisher_exact, filebase = sample_set_id + "." + p.left, as_id = as_id, ext_string = ext_string, drop_genotypes = drop_genotypes, docker = docker_repository_with_sep + bcftools_docker } } scatter (idx in select_all(pheno_idx)) { Map[Int, Array[File]] idx2vcf_files = collect_by_key(zip(flatten(cross_idx), flatten(vcf_summary.as_vcf_files))) Map[Int, Array[File]] idx2vcf_idxs = collect_by_key(zip(flatten(cross_idx), flatten(vcf_summary.as_vcf_idxs))) call vcf_merge { input: vcf_files = idx2vcf_files[idx], vcf_idxs = idx2vcf_idxs[idx], ref_fasta = if defined(ref.gff3_file) then ref.fasta else None, fasta_fai = ref.fasta_fai, gff3_file = ref.gff3_file, rsid_vcf_file = ref.rsid_vcf_file, rsid_vcf_idx = ref.rsid_vcf_idx, region = select_first([pheno_regions[idx]]), fisher_exact = fisher_exact, as_id = as_id, filebase = sample_set_id + "." + ext_string + "_" + lst_header.phenos[idx] + if defined(pop) then "." + select_first([pop]) else "", pheno_name = ext_string + "_" + lst_header.phenos[idx] + if defined(pop) then "." + select_first([pop]) else "", phred_score = phred_score, docker = docker_repository_with_sep + bcftools_docker } if (plot) { if (vcf_merge.n > 0) { call assoc_plot { input: vcf_file = vcf_merge.gwas_vcf_file, vcf_idx = vcf_merge.gwas_vcf_idx, region = select_first([pheno_regions[idx]]), cyto_file = ref.cyto_file, csq = defined(ref.gff3_file), filebase = sample_set_id + "." + ext_string + "_" + lst_header.phenos[idx] + if defined(pop) then "." + select_first([pop]) else "", docker = docker_repository_with_sep + r_mocha_docker } } } } output { Array[File] gwas_vcf_file = vcf_merge.gwas_vcf_file Array[File] gwas_vcf_idx = vcf_merge.gwas_vcf_idx Array[File]? png_files = if plot then select_all(assoc_plot.png_file) else None } meta { author: "Giulio Genovese" email: "giulio.genovese@gmail.com" description: "See the [MoChA](http://github.com/freeseek/mocha) website for more information" } } task lst_header { input { File pheno_tsv_file String space_character = '_' String docker Int cpu = 1 Int disk_size = 10 Float memory = 3.5 Int preemptible = 1 Int maxRetries = 0 } command <<< set -euo pipefail mv "~{pheno_tsv_file}" . head -n1 "~{basename(pheno_tsv_file)}" | cut -f2- | tr '\t ' '\n~{space_character}' rm "~{basename(pheno_tsv_file)}" >>> output { Array[String] phenos = read_lines(stdout()) } runtime { docker: docker cpu: cpu disks: "local-disk " + disk_size + " HDD" memory: memory + " GiB" preemptible: preemptible maxRetries: maxRetries } } # the command requires BCFtools 1.14 due to bug http://github.com/samtools/bcftools/issues/1566 task vcf_summary { input { File vcf_file File vcf_idx Array[String]+ pheno_names Array[String]+ regions Array[String]+ vcf_file_suffix # suffix array passed due to bug http://github.com/broadinstitute/cromwell/issues/5549 Array[String]+ vcf_idx_suffix # suffix array passed due to bug http://github.com/broadinstitute/cromwell/issues/5549 File? keep_samples_file File? remove_samples_file File pheno_tsv_file Boolean fisher_exact String filebase String as_id String ext_string Boolean drop_genotypes = true String docker Int cpu = 1 Int? disk_size_override Float memory = 3.5 Int preemptible = 1 Int maxRetries = 0 } Float vcf_size = size(vcf_file, "GiB") Int disk_size = select_first([disk_size_override, ceil(10.0 + 2.0 * vcf_size)]) command <<< set -euo pipefail echo "~{sep("\n", select_all([vcf_file, vcf_idx, keep_samples_file, remove_samples_file, pheno_tsv_file]))}" | \ tr '\n' '\0' | xargs -0 mv -t . pheno_names=~{write_lines(pheno_names)} regions=~{write_lines(regions)} paste $pheno_names $regions | while IFS=$'\t' read pheno_name region; do awk -F"\t" -v pheno_name=$pheno_name 'NR==1 {for (i=1; i<=NF; i++) f[$i] = i} NR>1 && ($(f[pheno_name])==0 || $(f[pheno_name])==1) {print $(f["sample_id"])"\t"$(f[pheno_name])}' \ "~{basename(pheno_tsv_file)}" > "~{filebase}.pheno.tsv" cut -f1 "~{filebase}.pheno.tsv"~{if defined(keep_samples_file) then " | \\\n" + "awk -F\"\\t\" 'NR==FNR {x[$1]++} NR>FNR && $1 in x' \"" + basename(select_first([keep_samples_file])) + "\" -" else ""}~{if defined(remove_samples_file) then " | \\\n" + "awk -F\"\\t\" 'NR==FNR {x[$1]++} NR>FNR && !($1 in x)' \"" + basename(select_first([remove_samples_file])) + "\" -" else ""} > "~{filebase}.samples.lines" ~{if fisher_exact then "awk -F\"\\t\" '$2==0 {print $1}' \"" + filebase + ".pheno.tsv\" > \"" + filebase + ".controls.lines\"" else ""} awk -F"\t" '$2==1 {print $1}' "~{filebase}.pheno.tsv" > "~{filebase}.cases.lines" bcftools +mochatools \ --no-version \ --output-type u \ --regions $region \ "~{basename(vcf_file)}" \ -- --tags MACH \ --samples-file "~{filebase}.samples.lines" \ --force-samples | \ bcftools annotate \ --no-version \ --output-type u \ --remove ID,QUAL,FILTER,^INFO/MACH,^FMT/GT,FMT/~{as_id}~{if fisher_exact then " | \\\n" + "bcftools +contrast \\\n" + " --output-type u \\\n" + " --annots NASSOC \\\n" + " --control-samples \"" + filebase + ".controls.lines\" \\\n" + " --case-samples \"" + filebase + ".cases.lines\" \\\n" + " --force-samples" else ""} | \ bcftools +mochatools \ --no-version \ --output-type u \ -- --summary ~{as_id} \ --samples-file "~{filebase}.cases.lines" \ --force-samples \ ~{if drop_genotypes then "--drop-genotypes" else ""} | \ bcftools view --no-version --output "~{filebase}.$pheno_name.~{ext_string}.bcf" --output-type b --include 'sum(~{as_id})>0' --write-index rm "~{filebase}.pheno.tsv" "~{filebase}.samples.lines" ~{if fisher_exact then "\"" + filebase + ".controls.lines\" " else ""}"~{filebase}.cases.lines" done echo "~{sep("\n", select_all([vcf_file, vcf_idx, keep_samples_file, remove_samples_file, pheno_tsv_file]))}" | \ sed 's/^.*\///' | tr '\n' '\0' | xargs -0 rm >>> output { Array[File] as_vcf_files = prefix(filebase + ".", vcf_file_suffix) Array[File] as_vcf_idxs = prefix(filebase + ".", vcf_idx_suffix) } runtime { memory: memory + " GiB" disks: "local-disk " + disk_size + " HDD" cpu: cpu docker: docker preemptible: preemptible maxRetries: maxRetries } } # bcftools annotate needs --regions due to bug http://github.com/samtools/bcftools/issues/1199 # see http://github.com/MRCIEU/gwas-vcf-specification for VCF output # see Marchini, J., Howie, B. Genotype imputation for genome-wide association studies. Nat Rev Genet 11, 499–511 (2010). http://doi.org/10.1038/nrg2796 task vcf_merge { input { Array[File]+ vcf_files Array[File]+ vcf_idxs File? ref_fasta File fasta_fai File? gff3_file File? rsid_vcf_file File? rsid_vcf_idx String region Boolean fisher_exact String as_id String filebase String pheno_name Boolean phred_score = true String docker Int cpu = 1 Int? disk_size_override Float memory = 3.5 Int preemptible = 1 Int maxRetries = 0 } Float vcf_size = size(vcf_files, "GiB") Float ref_size = size(ref_fasta, "GiB") Float dbsnp_size = size(rsid_vcf_file, "GiB") Int disk_size = select_first([disk_size_override, ceil(10.0 + 2.0 * vcf_size + ref_size + dbsnp_size)]) command <<< set -euo pipefail vcf_files=~{write_lines(vcf_files)} vcf_idxs=~{write_lines(vcf_idxs)} echo "~{sep("\n", select_all([ref_fasta, fasta_fai, gff3_file, rsid_vcf_file, rsid_vcf_idx]))}" | \ cat - $vcf_files $vcf_idxs | tr '\n' '\0' | xargs -0 mv -t . sed -i 's/^.*\///' $vcf_files $vcf_idxs bcftools merge \ --no-version \ --output-type u \ --force-single \ --info-rules MACH:sum,~{if fisher_exact then "NASSOC:sum," else ""}~{as_id}:sum \ --file-list $vcf_files \ --merge none | \ bcftools reheader \ --fai "~{basename(fasta_fai)}" \ --temp-prefix ./bcftools. | \ ~{if fisher_exact then "bcftools +mochatools \\\n" + " --no-version \\\n" + " --output-type u \\\n" + " -- --test NASSOC \\\n" + (if phred_score then " --phred" else "") + " |" else ""} \ bcftools +mochatools \ --no-version \ --output-type ~{if defined(gff3_file) then "u" else "b"} \ -- --test ~{as_id} \ ~{if phred_score then "--phred" else ""} \ ~{if defined(gff3_file) then " | \\\n" + "bcftools csq \\\n" + " --no-version \\\n" + " --output-type b \\\n" + " --fasta-ref \"" + basename(select_first([ref_fasta])) + "\" \\\n" + " --gff-annot \"" + basename(select_first([gff3_file])) + "\" \\\n" + " --trim-protein-seq 1 \\\n" + " --custom-tag CSQ \\\n" + " --local-csq \\\n" + " --ncsq 64 \\\n" + " --samples -" else ""} \ --output "~{filebase + if defined(rsid_vcf_file) then ".tmp" else ""}.bcf" \ --write-index ~{if defined(rsid_vcf_file) then "bcftools annotate \\\n" + " --no-version \\\n" + " --annotations \"" + basename(select_first([rsid_vcf_file])) + "\" \\\n" + " --columns ID \\\n" + " --output \"" + filebase + ".bcf\" \\\n" + " --output-type b \\\n" + " --regions \"" + region + "\" \\\n" + " \"" + filebase + ".tmp.bcf\" \\\n" + " --write-index\n" + " rm \"" + filebase + ".tmp.bcf\" \"" + filebase + ".tmp.bcf.csi\"" else ""} (echo "CHROM GENPOS ALLELE0 ALLELE1 N INFO N_CASES BETA SE LOG10P A1FREQ AC_ALLELE1"; bcftools query --format "%CHROM\t%POS\t%REF\t%ALT\t%MACH{0}\t%MACH{1}\t%MACH{2}\t%AS{0}\t%AS{1}\t%pbinom_AS\n" "~{filebase}.bcf" | \ awk -F"\t" '{if ($6*($5-$6/2)==0) si=1; else si=($5*$7/$6-$6)/($5-$6/2); nc=$8+$9; es=log(($9+.5)/($8+.5)); se=sqrt(1/($8+.5)+1/($9+.5)); lp=$10/10; af=$6/$5/2; ac=$6; print $1,$2,$3,$4,$5,si,nc,es,se,lp,af,ac}') | \ bcftools +munge --no-version -c REGENIE --fai "~{basename(fasta_fai)}" -s ~{pheno_name} | \ bcftools merge --no-version --merge none --no-index --output "~{filebase}.gwas.bcf" --output-type b "~{filebase}.bcf" --write-index - bcftools query -f "\n" -i 'sum(~{as_id})>1' "~{filebase}.gwas.bcf" | wc -l rm "~{filebase}.bcf" "~{filebase}.bcf.csi" echo "~{sep("\n", select_all([ref_fasta, fasta_fai, gff3_file, rsid_vcf_file, rsid_vcf_idx]))}" | \ cat - $vcf_files $vcf_idxs | sed 's/^.*\///' | tr '\n' '\0' | xargs -0 rm >>> output { Int n = read_int(stdout()) File gwas_vcf_file = filebase + ".gwas.bcf" File gwas_vcf_idx = filebase + ".gwas.bcf.csi" } runtime { memory: memory + " GiB" disks: "local-disk " + disk_size + " HDD" cpu: cpu docker: docker preemptible: preemptible maxRetries: maxRetries } } task assoc_plot { input { File vcf_file File vcf_idx String region File? cyto_file Boolean csq = false String filebase String docker Int cpu = 1 Int? disk_size_override Float memory = 3.5 Int preemptible = 1 Int maxRetries = 0 } Float vcf_size = size(vcf_file, "GiB") Int disk_size = select_first([disk_size_override, ceil(10.0 + vcf_size)]) command <<< set -euo pipefail mv "~{vcf_file}" . mv "~{vcf_idx}" . ~{if defined(cyto_file) then "mv \"" + select_first([cyto_file]) + "\" ." else ""} assoc_plot.R \ ~{if defined(cyto_file) then "--cytoband \"" + basename(select_first([cyto_file])) + "\"" else ""} \ --vcf "~{basename(vcf_file)}" \ --as \ ~{if csq then "--csq" else ""} \ --region ~{region} \ --png "~{filebase}.png" rm "~{basename(vcf_file)}" rm "~{basename(vcf_idx)}" ~{if defined(cyto_file) then "rm \"" + basename(select_first([cyto_file])) + "\"" else ""} >>> output { File png_file = filebase + '.png' } runtime { docker: docker cpu: cpu disks: "local-disk " + disk_size + " HDD" memory: memory + " GiB" preemptible: preemptible maxRetries: maxRetries } }