version development

## Copyright (c) 2020-2021 Giulio Genovese
##
## Version 2021-03-15
##
## Contact Giulio Genovese <giulio.genovese@gmail.com>
##
## This WDL workflow runs MoChA on a cohort of samples genotyped with either Illumina or Affymetrix DNA microarrays
##
## Cromwell version support
## - Successfully tested on v58
##
## Distributed under terms of the MIT License

struct Reference {
  String name
  File fasta
  Int n_chrs
  String mhc_reg
  String kir_reg
  File dup_file
  File genetic_map_file
  File cnp_file
  File cyto_file
  String panel_pfx
  String panel_sfx
  String panel_idx
  Int n_panel_smpls
}

workflow mocha {
  input {
    String sample_set_id
    String mode # idat gtc cel chp txt vcf pvcf
    String target = "pngs" # vcf pvcf calls pngs
    Boolean realign = false
    Boolean wgs = false
    Boolean gtc_output = false # only for idat mode
    Boolean chp_output = false # only for cel mode
    Int idat_batch_size = 48
    Int gtc_batch_size = 1024
    Int chp_batch_size = 1024
    Float max_win_size_cm = 300.0
    Float overlap_size_cm = 5.0
    Float sample_call_rate_thr = 0.97
    Float variant_call_rate_thr = 0.97
    Float baf_auto_thr = 0.03
    String ext_string = "as"

    String ref_name = "GRCh38"
    String? ref_path
    String? ref_fasta
    Int? n_chrs
    String? mhc_reg
    String? kir_reg
    String? dup_file
    String? genetic_map_file
    String? cnp_file
    String? cyto_file
    String? panel_pfx
    String? panel_sfx
    String? panel_idx
    Int? n_panel_smpls

    String manifest_path = ""
    String? data_path
    File sample_tsv_file # sample_id batch_id green_idat red_idat gtc cel chp computed_gender call_rate
    File batch_tsv_file # batch_id path bpm csv egt sam xml zip probeset_ids snp report calls confidences summary vcf vcf_index xcl_vcf xcl_vcf_index
    File? ped_file
    File? duplicate_samples_file
    File? extra_xcl_vcf_file
    String? phase_extra_args
    String? mocha_extra_args
    String? mocha_plot_extra_args
    String basic_bash_docker = "ubuntu:latest"
    String pandas_docker = "amancevice/pandas:slim"
    String bcftools_docker = "us.gcr.io/mccarroll-mocha/bcftools:1.11-20210315"
    String iaap_cli_docker = "us.gcr.io/mccarroll-mocha/iaap_cli:1.11-20210315"
    String autoconvert_docker = "us.gcr.io/mccarroll-mocha/autoconvert:1.11-20210315"
    String apt_docker = "us.gcr.io/mccarroll-mocha/apt:1.11-20210315"
    String shapeit4_docker = "us.gcr.io/mccarroll-mocha/shapeit4:1.11-20210315"
    String eagle_docker = "us.gcr.io/mccarroll-mocha/eagle:1.11-20210315"
    String mocha_plot_docker = "us.gcr.io/mccarroll-mocha/mocha_plot:1.11-20210315"
    Boolean autoconvert = false
    Boolean? table_output
    Boolean? do_not_check_bpm
    Boolean? do_not_use_reference
    Boolean eagle = false
    String delim = "~"
    Array[String]? chip_type
    String tags = "GT,BAF,LRR"
    Int? gc_window_size
  }

  Boolean mode_is_vcf = mode == "vcf" || mode == "pvcf"

  String ref_path_with_sep = if defined(ref_path) then select_first([ref_path]) + (if sub(select_first([ref_path]), "/$", "") != select_first([ref_path]) then "" else "/") else ""
  Reference ref = object {
    name: ref_name,
    fasta: ref_path_with_sep + select_first([ref_fasta, if ref_name == "GRCh38" then "GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" else if ref_name == "GRCh37" then "human_g1k_v37.fasta" else None]),
    n_chrs: select_first([n_chrs, 23]),
    mhc_reg: select_first([mhc_reg, if ref_name == "GRCh38" then "chr6:27518932-33480487" else if ref_name == "GRCh37" then "6:27486711-33448264" else None]),
    kir_reg: select_first([kir_reg, if ref_name == "GRCh38" then "chr19:54071493-54992731" else if ref_name == "GRCh37" then "19:54574747-55504099" else None]),
    dup_file: ref_path_with_sep + select_first([dup_file, if ref_name == "GRCh38" || ref_name == "GRCh37" then "segdups.bed.gz" else None]),
    genetic_map_file: ref_path_with_sep + select_first([genetic_map_file, if ref_name == "GRCh38" then "genetic_map_hg38_withX.txt.gz" else if ref_name == "GRCh37" then "genetic_map_hg19_withX.txt.gz" else None]),
    cnp_file: ref_path_with_sep + select_first([cnp_file, if ref_name == "GRCh38" || ref_name == "GRCh37" then "cnps.bed" else None]),
    cyto_file: ref_path_with_sep + select_first([cyto_file, if ref_name == "GRCh38" || ref_name == "GRCh37" then "cytoBand.txt.gz" else None]),
    panel_pfx: ref_path_with_sep + select_first([panel_pfx, if ref_name == "GRCh38" then "CCDG_14151_B01_GRM_WGS_2020-08-05_" else if ref_name == "GRCh37" then "ALL.chr" else None]),
    panel_sfx: select_first([panel_sfx, if ref_name == "GRCh38" then ".filtered.phased.bcf" else if ref_name == "GRCh37" then ".phase3_integrated.20130502.genotypes.bcf" else None]),
    panel_idx: select_first([panel_idx, ".csi"]),
    n_panel_smpls: select_first([n_panel_smpls, if ref_name == "GRCh38" then 3202 else if ref_name == "GRCh37" then 2504 else None])
  }

  # read table with batches information (scatter could be avoided if there was a tail() function)
  call tsv_sorted as batch_sorted_tsv { input: tsv_file = batch_tsv_file, column = "batch_id", docker = basic_bash_docker }
  Array[Array[String]] batch_tsv = read_tsv(batch_sorted_tsv.file)
  Int n_batches = length(batch_tsv)-1
  scatter (idx in range(n_batches)) { Array[String] batch_tsv_rows = batch_tsv[(idx+1)] }
  Map[String, Array[String]] batch_tbl = as_map(zip(batch_tsv[0], transpose(batch_tsv_rows)))
  Array[String] batches = batch_tbl["batch_id"]

  String manifest_path_with_sep = manifest_path + (if manifest_path == "" || sub(manifest_path, "/$", "") != manifest_path then "" else "/")
  # compute data paths for each batch, if available (scatter could be avoided if there was a contains_key() function)
  scatter (key in keys(batch_tbl)) { Boolean? is_key_equal_path = if key == "path" then true else None }
  scatter (idx in range(n_batches)) {
    String data_paths = select_first([data_path, if length(select_all(is_key_equal_path))>0 then batch_tbl["path"][idx] else ""])
    String data_paths_with_sep = data_paths + (if data_paths == "" || sub(data_paths, "/$", "") != data_paths then "" else "/")
    String pfxs = sample_set_id + (if n_batches == 1 then "" else "." + batches[idx])
  }

  # aligns manifest file to human genome reference if requested
  if (realign && !mode_is_vcf) {
    # hack due to lack of unique function
    Array[String] csv_files = keys(collect_by_key(zip(batch_tbl["csv"], batches)))
    scatter (csv_file in csv_files) {
      call csv2bam {
        input:
          plugin = if mode == "idat" || mode == "gtc" then "gtc2vcf" else "affy2vcf",
          csv_file = manifest_path_with_sep + csv_file,
          ref_fasta = ref.fasta,
          ref_fasta_idxs = prefix(ref.fasta + ".", ["amb", "ann", "bwt", "pac", "sa"]),
          docker = bcftools_docker,
      }
    }
    Map[String, File] csv2sam = as_map(zip(csv_files, csv2bam.bam_file))
  }
  # compute sam file for each batch, if available (scatter could be avoided if there was a contains_key() function)
  scatter (key in keys(batch_tbl)) { Boolean? is_key_equal_sam = if key == "sam" then true else None }
  scatter (idx in range(n_batches)) {
    File? sams = if realign && !mode_is_vcf then select_first([csv2sam])[(batch_tbl["csv"][idx])]
      else if length(select_all(is_key_equal_sam))>0 then manifest_path_with_sep + batch_tbl["sam"][idx] else None
  }

  if (!mode_is_vcf) {
    # resort table with sample information and extract sample_id column
    call tsv_sorted as sample_sorted_tsv { input: tsv_file = sample_tsv_file, column = "batch_id", docker = basic_bash_docker }
    call tsv_column as sample_id_lines { input: tsv_file = sample_sorted_tsv.file, column = "sample_id", docker = basic_bash_docker }
    call tsv_column as batch_id_lines { input: tsv_file = sample_sorted_tsv.file, column = "batch_id", docker = basic_bash_docker }

    # process Illumina data
    if (mode == "idat") {
      call tsv_column as green_idat_lines { input: tsv_file = sample_sorted_tsv.file, column = "green_idat", docker = basic_bash_docker }
      call tsv_column as red_idat_lines { input: tsv_file = sample_sorted_tsv.file, column = "red_idat", docker = basic_bash_docker }
      # group samples by IDAT batches
      call batch_scatter as idat { input: batch_id_file = batch_id_lines.file, sub_batch_size = idat_batch_size, delim = delim, docker = basic_bash_docker }
      Map[String, Array[String]] idat_batch2green_idat_files = collect_by_key(zip(read_lines(idat.sub_batch_id), read_lines(green_idat_lines.file)))
      Map[String, Array[String]] idat_batch2red_idat_files = collect_by_key(zip(read_lines(idat.sub_batch_id), read_lines(red_idat_lines.file)))
      scatter (idx in range(length(idat.sub_batches))) {
        call idat2gtc {
          input:
            bpm_file = manifest_path_with_sep + batch_tbl["bpm"][(idat.idxs[idx])],
            egt_file = manifest_path_with_sep + batch_tbl["egt"][(idat.idxs[idx])],
            green_idat_files = prefix(data_paths_with_sep[(idat.idxs[idx])], idat_batch2green_idat_files[(idat.sub_batches[idx])]),
            red_idat_files = prefix(data_paths_with_sep[(idat.idxs[idx])], idat_batch2red_idat_files[(idat.sub_batches[idx])]),
            autoconvert = autoconvert,
            filebase = sample_set_id + "." + idat.sub_batches[idx],
            docker = if autoconvert then autoconvert_docker else iaap_cli_docker,
        }
      }
      call tsv_concat as green_idat_tsv { input: tsv_files = idat2gtc.green_idat_tsv, filebase = sample_set_id + ".green_idat", docker = basic_bash_docker }
      call tsv_concat as red_idat_tsv { input: tsv_files = idat2gtc.red_idat_tsv, filebase = sample_set_id + ".red_idat", docker = basic_bash_docker }
    }

    if (mode == "gtc") {
      call tsv_column as gtc_lines { input: tsv_file = sample_sorted_tsv.file, column = "gtc", docker = basic_bash_docker }
    }

    if (mode == "idat" || mode == "gtc") {
      # group samples by GTC batches
      call batch_scatter as gtc { input: batch_id_file = batch_id_lines.file, sub_batch_size = gtc_batch_size, delim = delim, docker = basic_bash_docker }
      call get_reheader_maps as gtc_reheader { input: batches = gtc.sub_batches, barcodes_file = select_first([green_idat_lines.file, gtc_lines.file]), sample_id_file = sample_id_lines.file, batch_id_file = gtc.sub_batch_id, docker = basic_bash_docker }
      Array[String]+ input_gtc_files = if mode == "idat" then flatten(select_first([idat2gtc.gtc_files])) else read_lines(select_first([gtc_lines.file]))
      Map[String, Array[String]] gtc_batch2gtc_files = collect_by_key(zip(read_lines(gtc.sub_batch_id), input_gtc_files))
      scatter (idx in range(length(gtc.sub_batches))) {
        call gtc2vcf {
          input:
            tags = tags,
            bpm_file = manifest_path_with_sep + batch_tbl["bpm"][(gtc.idxs[idx])],
            csv_file = manifest_path_with_sep + batch_tbl["csv"][(gtc.idxs[idx])],
            egt_file = manifest_path_with_sep + batch_tbl["egt"][(gtc.idxs[idx])],
            ref_fasta = ref.fasta,
            ref_fasta_fai = ref.fasta + ".fai",
            gc_window_size = gc_window_size,
            gtc_files = prefix(if mode == "idat" then "" else data_paths_with_sep[(gtc.idxs[idx])], gtc_batch2gtc_files[(gtc.sub_batches[idx])]),
            do_not_check_bpm = do_not_check_bpm,
            sam_file = sams[(gtc.idxs[idx])],
            reheader_file = gtc_reheader.map_files[idx],
            filebase = sample_set_id + "." + gtc.sub_batches[idx],
            docker = bcftools_docker,
        }
      }
      call tsv_concat as gtc_tsv { input: tsv_files = gtc2vcf.gtc_tsv, filebase = sample_set_id + ".gtc", docker = basic_bash_docker }

      # this job can be long, so it is better to run as non-preemptible
      Map[Int, Array[String]] idx2gtc2vcf_files = collect_by_key(zip(gtc.idxs, gtc2vcf.vcf_file))
      Map[Int, Array[String]] idx2gtc2vcf_idxs = collect_by_key(zip(gtc.idxs, gtc2vcf.vcf_idx))
      Map[Int, Array[String]] idx2gtc2vcf_n_smpls = collect_by_key(zip(gtc.idxs, gtc2vcf.n_smpls))
      scatter (idx in range(n_batches)) {
        if (length(idx2gtc2vcf_files[idx]) > 1) {
          call vcf_merge as gtc2vcf_merge {
            input:
              vcf_files = idx2gtc2vcf_files[idx],
              filebase = pfxs[idx],
              docker = bcftools_docker
          }
        }
        File gtc2vcf_files = select_first([gtc2vcf_merge.vcf_file, idx2gtc2vcf_files[idx][0]])
        File gtc2vcf_idxs = select_first([gtc2vcf_merge.vcf_idx, idx2gtc2vcf_idxs[idx][0]])
        Int gtc2vcf_n_smpls = select_first([gtc2vcf_merge.n_smpls, idx2gtc2vcf_n_smpls[idx][0]])
      }
    }

    if (mode == "cel" || mode == "chp" || mode == "txt") {
      scatter (key in keys(batch_tbl)) { Boolean? is_key_equal_probeset = if key == "probeset_ids" then true else None }
    }

    if (mode == "cel" || mode == "txt") {
      call tsv_column as cel_lines { input: tsv_file = sample_sorted_tsv.file, column = "cel", docker = basic_bash_docker }
    }

    # process Affymetrix data
    if (mode == "cel") {
      Map[String, Array[String]] batch2cel_files = collect_by_key(zip(read_lines(batch_id_lines.file), read_lines(select_first([cel_lines.file]))))
      scatter (idx in range(n_batches)) {
        call cel2affy as cel2chp {
          input:
            xml_file = manifest_path_with_sep + batch_tbl["xml"][idx],
            zip_file = manifest_path_with_sep + batch_tbl["zip"][idx],
            cel_files = prefix(data_paths_with_sep[idx], batch2cel_files[(batches[idx])]),
            probeset_file = if length(select_all(select_first([is_key_equal_probeset])))>0 && batch_tbl["probeset_ids"][idx] != ""
              then manifest_path_with_sep + batch_tbl["probeset_ids"][idx] else None,
            chip_type = chip_type,
            table_output = table_output,
            filebase = pfxs[idx],
            docker = apt_docker
        }
      }
      call tsv_concat as cel_tsv { input: tsv_files = cel2chp.cel_tsv, filebase = sample_set_id + ".cel", docker = basic_bash_docker }
    }

    if (mode == "chp") {
      call tsv_column as chp_lines { input: tsv_file = sample_sorted_tsv.file, column = "chp", docker = basic_bash_docker }
    }

    if (mode == "cel" || mode == "chp") {
      # group samples by CHP batches
      call batch_scatter as chp { input: batch_id_file = batch_id_lines.file, sub_batch_size = chp_batch_size, delim = delim, docker = basic_bash_docker }
      call get_reheader_maps as chp_reheader { input: batches = chp.sub_batches, barcodes_file = select_first([cel_lines.file, chp_lines.file]), sample_id_file = sample_id_lines.file, batch_id_file = chp.sub_batch_id, docker = basic_bash_docker }
      Array[String]+ input_chp_files = if mode == "cel" then flatten(select_first([cel2chp.chp_files])) else read_lines(select_first([chp_lines.file]))
      Map[String, Array[String]] chp_batch2chp_files = collect_by_key(zip(read_lines(chp.sub_batch_id), input_chp_files))
      scatter (idx in range(length(chp.sub_batches))) {
        call chp2vcf {
          input:
            tags = tags,
            csv_file = manifest_path_with_sep + batch_tbl["csv"][(chp.idxs[idx])],
            ref_fasta = ref.fasta,
            ref_fasta_fai = ref.fasta + ".fai",
            gc_window_size = gc_window_size,
            probeset_file = if length(select_all(select_first([is_key_equal_probeset])))>0 && batch_tbl["probeset_ids"][(chp.idxs[idx])] != ""
              then manifest_path_with_sep + batch_tbl["probeset_ids"][(chp.idxs[idx])] else None,
            snp_file = if mode == "cel" then select_first([cel2chp.snp_file])[(chp.idxs[idx])] else data_paths_with_sep[(chp.idxs[idx])] + batch_tbl["snp"][(chp.idxs[idx])],
            chp_files = prefix(if mode == "cel" then "" else data_paths_with_sep[(chp.idxs[idx])], chp_batch2chp_files[(chp.sub_batches[idx])]),
            sam_file = sams[(chp.idxs[idx])],
            reheader_file = chp_reheader.map_files[idx],
            filebase = sample_set_id + "." + chp.sub_batches[idx],
            docker = bcftools_docker
        }
      }

      # this job can be long, so it is better to run as non-preemptible
      Map[Int, Array[String]] idx2chp2vcf_files = collect_by_key(zip(chp.idxs, chp2vcf.vcf_file))
      Map[Int, Array[String]] idx2chp2vcf_idxs = collect_by_key(zip(chp.idxs, chp2vcf.vcf_idx))
      Map[Int, Array[String]] idx2chp2vcf_n_smpls = collect_by_key(zip(chp.idxs, chp2vcf.n_smpls))
      scatter (idx in range(n_batches)) {
        if (length(idx2chp2vcf_files[idx]) > 1) {
          call vcf_merge as chp2vcf_merge {
            input:
              vcf_files = idx2chp2vcf_files[idx],
              filebase = pfxs[idx],
              docker = bcftools_docker
          }
        }
        File chp2vcf_files = select_first([chp2vcf_merge.vcf_file, idx2chp2vcf_files[idx][0]])
        File chp2vcf_idxs = select_first([chp2vcf_merge.vcf_idx, idx2chp2vcf_idxs[idx][0]])
        Int chp2vcf_n_smpls = select_first([chp2vcf_merge.n_smpls, idx2chp2vcf_n_smpls[idx][0]])
      }
    }

    if (mode == "txt") {
      call get_reheader_maps as txt_reheader { input: batches = batches, barcodes_file = select_first([cel_lines.file]), sample_id_file = sample_id_lines.file, batch_id_file = batch_id_lines.file, docker = basic_bash_docker }
      scatter (key in keys(batch_tbl)) { Boolean? is_key_equal_confidences = if key == "confidences" then true else None }
      scatter (idx in range(n_batches)) {
        # this job can be long, so it is better to run as non-preemptible
        call txt2vcf {
          input:
            tags = tags,
            csv_file = manifest_path_with_sep + batch_tbl["csv"][idx],
            ref_fasta = ref.fasta,
            ref_fasta_fai = ref.fasta + ".fai",
            gc_window_size = gc_window_size,
            probeset_file = if length(select_all(select_first([is_key_equal_probeset])))>0 && batch_tbl["probeset_ids"][idx] != ""
              then manifest_path_with_sep + batch_tbl["probeset_ids"][idx] else None,
            calls_file = data_paths_with_sep[idx] + batch_tbl["calls"][idx],
            confidences_file = if length(select_all(is_key_equal_confidences))>0 then data_paths_with_sep[idx] + batch_tbl["confidences"][idx] else None,
            summary_file = data_paths_with_sep[idx] + batch_tbl["summary"][idx],
            report_file = data_paths_with_sep[idx] + batch_tbl["report"][idx],
            snp_file = data_paths_with_sep[idx] + batch_tbl["snp"][idx],
            sam_file = sams[idx],
            reheader_file = txt_reheader.map_files[idx],
            filebase = pfxs[idx],
            docker = bcftools_docker
        }
      }
    }

    if (mode == "cel" || mode == "chp" || mode == "txt") {
      call tsv_concat as affy_tsv { input: tsv_files = select_first([chp2vcf.affy_tsv, txt2vcf.affy_tsv]), filebase = sample_set_id + ".affy", docker = basic_bash_docker }
    }

    Array[File] unphased_vcf_files = select_first([gtc2vcf_files, chp2vcf_files, txt2vcf.vcf_file])
    Array[File] unphased_vcf_idxs = select_first([gtc2vcf_idxs, chp2vcf_idxs, txt2vcf.vcf_idx])
    Array[Int] unphased_vcf_n_smpls = select_first([gtc2vcf_n_smpls, chp2vcf_n_smpls, txt2vcf.n_smpls])
    call lst_flatten as flatten_sample_id_lines { input: lst_files = select_first([gtc2vcf.sample_id_lines, chp2vcf.sample_id_lines, txt2vcf.sample_id_lines]), filebase = "sample_id", docker = basic_bash_docker }
  }

  if (mode_is_vcf) {
    call tsv_column as vcf_sample_id_lines { input: tsv_file = sample_tsv_file, column = "sample_id", docker = basic_bash_docker }
  }

  call tsv_column as computed_gender_lines { input: tsv_file = select_first([gtc_tsv.file, affy_tsv.file, sample_tsv_file]), column = "computed_gender", docker = basic_bash_docker }
  call tsv_column as call_rate_lines { input: tsv_file = select_first([gtc_tsv.file, affy_tsv.file, sample_tsv_file]), column = "call_rate", docker = basic_bash_docker }
  if (mode != "pvcf" && target != "vcf") {
    Array[Array[String]] ref_fasta_fai_tbl = transpose(read_tsv(ref.fasta + ".fai"))
    scatter (idx in range(ref.n_chrs)) {
      String chrs = ref_fasta_fai_tbl[0][idx]
      Int lens = ref_fasta_fai_tbl[1][idx]
    }
    call ref_scatter {
      input:
        chrs = chrs,
        lens = lens,
        genetic_map_file = ref.genetic_map_file,
        max_win_size_cm = max_win_size_cm,
        overlap_size_cm = overlap_size_cm,
        docker = pandas_docker
    }
    scatter (idx in range(n_batches)) {
      call vcf_scatter {
        input:
          vcf_file = if mode_is_vcf then data_paths_with_sep[idx] + batch_tbl["vcf"][idx] else select_first([unphased_vcf_files])[idx],
          intervals_bed = ref_scatter.intervals_bed,
          docker = bcftools_docker
      }
    }
    if (defined(extra_xcl_vcf_file)) {
      call vcf_scatter as xcl_vcf_scatter {
        input:
          vcf_file = select_first([extra_xcl_vcf_file]),
          intervals_bed = ref_scatter.intervals_bed,
          docker = bcftools_docker
      }
    }

    call lst_concat as sample_id_split_tsv { input: lst_files = vcf_scatter.sample_id_lines, filebase = "split_sample_id", docker = basic_bash_docker }
    Int n_smpls = length(flatten(read_tsv(sample_id_split_tsv.file)))
    Boolean use_reference = !select_first([do_not_use_reference, n_smpls > 2 * ref.n_panel_smpls])
    Array[Array[File]] interval_slices = transpose(vcf_scatter.vcf_files)
    Array[Array[String]] intervals_tbl = transpose(read_tsv(ref_scatter.intervals_bed))
    String mhc_chr = sub(ref.mhc_reg, ":.*$", "")
    Int mhc_beg = sub(sub(ref.mhc_reg, "-.*$", ""), "^.*:", "")
    Int mhc_end = sub(ref.mhc_reg, "^.*-", "")
    scatter (idx in range(length(intervals_tbl[0]))) {
      call vcf_merge {
        input:
          vcf_files = interval_slices[idx],
          filebase = sample_set_id + "." + idx,
          docker = bcftools_docker
      }
      call vcf_qc {
        input:
          vcf_file = vcf_merge.vcf_file,
          vcf_idx = vcf_merge.vcf_idx,
          dup_file = ref.dup_file,
          sample_id_file = select_first([flatten_sample_id_lines.file, vcf_sample_id_lines.file]),
          computed_gender_file = computed_gender_lines.file,
          call_rate_file = call_rate_lines.file,
          sample_call_rate_thr = sample_call_rate_thr,
          variant_call_rate_thr = variant_call_rate_thr,
          duplicate_samples_file = duplicate_samples_file,
          extra_xcl_vcf_file = if defined(extra_xcl_vcf_file) then select_first([xcl_vcf_scatter.vcf_files])[idx] else None,
          docker = bcftools_docker
      }
      Int beg = intervals_tbl[1][idx] # cast string to integer
      Int end = intervals_tbl[2][idx] # cast string to integer
      call vcf_phase {
        input:
          n_smpls = n_smpls,
          n_markers = vcf_qc.n_markers,
          unphased_vcf_file = vcf_qc.qc_vcf_file,
          unphased_vcf_idx = vcf_qc.qc_vcf_idx,
          genetic_map_file = ref.genetic_map_file,
          n_panel_smpls = if use_reference then ref.n_panel_smpls else None,
          ref_vcf_file = if use_reference then ref.panel_pfx + intervals_tbl[0][idx] + ref.panel_sfx else None,
          ref_vcf_idx = if use_reference then ref.panel_pfx + intervals_tbl[0][idx] + ref.panel_sfx + ref.panel_idx else None,
          ref_fasta_fai = ref.fasta + ".fai",
          chr = intervals_tbl[0][idx],
          mhc = intervals_tbl[0][idx] == mhc_chr && beg < mhc_end && end > mhc_beg,
          wgs = wgs,
          eagle = eagle,
          phase_extra_args = phase_extra_args,
          docker = if eagle then eagle_docker else shapeit4_docker
      }
      call vcf_split {
        input:
          vcf_file = vcf_phase.pgt_vcf_file,
          batches = batches,
          sample_id_file = sample_id_split_tsv.file,
          ped_file = ped_file,
          rule = if defined(ped_file) then ref.name else None,
          docker = bcftools_docker
      }
    }

    call vcf_concat as xcl_vcf_concat {
      input:
        vcf_files = vcf_qc.xcl_vcf_file,
        filebase = sample_set_id + ".xcl",
        docker = bcftools_docker
    }

    Array[Array[File]] batch_slices = transpose(vcf_split.vcf_files)
    scatter (idx in range(n_batches)) {
      call vcf_concat {
        input:
          vcf_files = batch_slices[idx],
          filebase = pfxs[idx] + ".pgt",
          docker = bcftools_docker
      }
      call vcf_import {
        input:
          pgt_vcf_file = vcf_concat.vcf_file,
          pgt_vcf_idx = vcf_concat.vcf_idx,
          unphased_vcf_file = if mode_is_vcf then data_paths_with_sep[idx] + batch_tbl["vcf"][idx] else select_first([unphased_vcf_files])[idx],
          unphased_vcf_idx = if mode_is_vcf then data_paths_with_sep[idx] + batch_tbl["vcf_index"][idx] else select_first([unphased_vcf_idxs])[idx],
          docker = bcftools_docker
      }
    }
  }

  if (target == "calls" || target == "pngs") {
    scatter (idx in range(n_batches)) {
      call vcf_mocha {
        input:
          rule = ref.name,
          pvcf_file = if mode == "pvcf" then data_paths_with_sep[idx] + batch_tbl["vcf"][idx] else select_first([vcf_import.vcf_file])[idx],
          pvcf_idx = if mode == "pvcf" then data_paths_with_sep[idx] + batch_tbl["vcf_index"][idx] else select_first([vcf_import.vcf_idx])[idx],
          sample_id_file = select_first([flatten_sample_id_lines.file, vcf_sample_id_lines.file]),
          computed_gender_file = computed_gender_lines.file,
          call_rate_file = call_rate_lines.file,
          xcl_vcf_file = if mode == "pvcf" then data_paths_with_sep[idx] + batch_tbl["xcl_vcf"][idx] else xcl_vcf_concat.vcf_file,
          xcl_vcf_idx = if mode == "pvcf" then data_paths_with_sep[idx] + batch_tbl["xcl_vcf_index"][idx] else xcl_vcf_concat.vcf_idx,
          cnp_file = ref.cnp_file,
          mhc_reg = if ref.mhc_reg != "" then ref.mhc_reg else None,
          kir_reg = if ref.kir_reg != "" then ref.kir_reg else None,
          mocha_extra_args = mocha_extra_args,
          ext_string = ext_string,
          docker = bcftools_docker
      }
      if (target == "pngs") {
        call mocha_plot {
          input:
            vcf_file = vcf_mocha.mocha_vcf_file,
            vcf_idx = vcf_mocha.mocha_vcf_idx,
            stats_tsv = vcf_mocha.stats_tsv,
            calls_tsv = vcf_mocha.calls_tsv,
            cyto_file = ref.cyto_file,
            call_rate_thr = sample_call_rate_thr,
            baf_auto_thr = baf_auto_thr,
            mocha_plot_extra_args = mocha_plot_extra_args,
            wgs = wgs,
            docker = mocha_plot_docker
        }
      }
    }
    call tsv_concat as mocha_stats_tsv { input: tsv_files = vcf_mocha.stats_tsv, filebase = sample_set_id + ".stats", docker = basic_bash_docker }
    call tsv_concat as mocha_calls_tsv { input: tsv_files = vcf_mocha.calls_tsv, filebase = sample_set_id + ".calls", docker = basic_bash_docker }
    call mocha_summary {
      input:
        calls_tsv = mocha_calls_tsv.file,
        stats_tsv = mocha_stats_tsv.file,
        ucsc_beds = vcf_mocha.ucsc_bed,
        cyto_file = ref.cyto_file,
        filebase = sample_set_id,
        call_rate_thr = sample_call_rate_thr,
        baf_auto_thr = baf_auto_thr,
        docker = mocha_plot_docker
    }
  }

  # generate a table summarizing the main output files and serialize the table to disk
  # vcf_files and vcf_idxs are defined in the output section
  scatter (idx in range(n_batches)) {
    String basename_vcf_files = basename(vcf_files[idx])
    String basename_vcf_idxs = basename(vcf_idxs[idx])
    String basename_xcl_vcf_files = basename(if mode == "pvcf" then batch_tbl["xcl_vcf"][idx] else select_first([xcl_vcf_concat.vcf_file, ""]))
    String basename_xcl_vcf_idxs = basename(if mode == "pvcf" then batch_tbl["xcl_vcf_index"][idx] else select_first([xcl_vcf_concat.vcf_idx, ""]))
    String basename_pgt_vcf_files = basename(if mode != "pvcf" && target != "vcf" then select_first([vcf_concat.vcf_file])[idx] else "")
    String basename_pgt_vcf_idxs = basename(if mode != "pvcf" && target != "vcf" then select_first([vcf_concat.vcf_idx])[idx] else "")
  }
  Array[Pair[String,Array[String]]] output_pairs = flatten([[("batch_id", batch_tbl["batch_id"]),
    ("n_smpls", select_first([vcf_mocha.n_smpls, vcf_import.n_smpls, unphased_vcf_n_smpls])),
    ("vcf", basename_vcf_files),
    ("vcf_index", basename_vcf_idxs)],
    if mode == "pvcf" || target == "vcf" then [] else
    [("xcl_vcf", basename_xcl_vcf_files),
    ("xcl_vcf_index", basename_xcl_vcf_idxs),
    ("pgt_vcf", basename_pgt_vcf_files),
    ("pgt_vcf_index", basename_pgt_vcf_idxs)]])
  Map[String, Array[String]] output_map = as_map(output_pairs)
  # cannot use output_keys = keys(output_map) because of unresolved Cromwell bug
  # https://github.com/broadinstitute/cromwell/issues/5559
  scatter (p in output_pairs) {
    String output_keys = p.left
    Array[String] output_tsv_cols = output_map[p.left]
  }
  # this is run as a separate task rather than using write_tsv() as Cromwell can break the WDL specification
  # https://support.terra.bio/hc/en-us/community/posts/360071465631-write-lines-write-map-write-tsv-write-json-fail-when-run-in-a-workflow-rather-than-in-a-task
  call write_tsv {
    input:
      tsv = flatten([[output_keys], transpose(output_tsv_cols)]),
      filebase = sample_set_id + ".output",
      docker = basic_bash_docker
  }

  output {
    File? green_idat_tsv_file = green_idat_tsv.file
    File? red_idat_tsv_file = red_idat_tsv.file
    File? gtc_tsv_file = gtc_tsv.file
    File? cel_tsv_file = cel_tsv.file
    File? affy_tsv_file = affy_tsv.file
    File sample_id_file = select_first([flatten_sample_id_lines.file, vcf_sample_id_lines.file])
    File computed_gender_file = computed_gender_lines.file
    File call_rate_file = call_rate_lines.file
    File? mocha_stats_file = mocha_stats_tsv.file
    File? mocha_calls_file = mocha_calls_tsv.file
    File? mocha_ucsc_bed = mocha_summary.ucsc_bed
    File? mocha_summary_pdf = mocha_summary.summary_pdf
    File? mocha_pileup_pdf = mocha_summary.pileup_pdf
    Array[File]? png_files = if target == "pngs" then flatten(select_all(select_first([mocha_plot.png_files]))) else None
    Array[File]? mendel_files = if mode != "pvcf" && target != "vcf" && defined(ped_file) then select_all(select_first([vcf_split.mendel_tsv])) else None
    Array[File]? gtc_files = if mode == "idat" && gtc_output then flatten(select_first([idat2gtc.gtc_files])) else None
    Array[File]? chp_files = if mode == "cel" && chp_output then flatten(select_first([cel2chp.chp_files])) else None
    Array[File]? snp_files = if mode == "cel" && chp_output then select_first([cel2chp.snp_file]) else None
    Array[File] vcf_files = select_first([vcf_mocha.mocha_vcf_file, vcf_import.vcf_file, unphased_vcf_files])
    Array[File] vcf_idxs = select_first([vcf_mocha.mocha_vcf_idx, vcf_import.vcf_idx, unphased_vcf_idxs])
    File? xcl_vcf_file = xcl_vcf_concat.vcf_file
    File? xcl_vcf_idx = xcl_vcf_concat.vcf_idx
    Array[File]? pgt_vcf_files = vcf_concat.vcf_file
    Array[File]? pgt_vcf_idxs = vcf_concat.vcf_idx
    File output_tsv_file = write_tsv.file
  }

  meta {
    author: "Giulio Genovese (with help from Chris Whelan)"
    email: "giulio.genovese@gmail.com"
    description: "See the [MoChA](https://github.com/freeseek/mocha) website for more information."
  }
}

task tsv_sorted {
  input {
    File tsv_file
    String column

    String docker
    Int cpu = 1
    Int disk_size = 10
    Float memory = 3.5
    Int preemptible = 1
    Int maxRetries = 0
  }

  String filebase = basename(tsv_file, ".tsv")

  command <<<
    set -euo pipefail
    mv "~{tsv_file}" .
    col=$(head -n1 "~{basename(tsv_file)}" | tr '\t' '\n' | awk -F"\t" '$0=="~{column}" {print NR}')
    if [ "$col" == "" ]; then
      echo "Column \"~{column}\" does not exist" 1>&2
      exit 1
    fi
    cat "~{basename(tsv_file)}" | (read -r; printf "%s\n" "$REPLY"; sort -k $col,$col -s -t $'\t') > "~{filebase}.sorted.tsv"
    rm "~{basename(tsv_file)}"
  >>>

  output {
    File file = filebase + ".sorted.tsv"
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task tsv_column {
  input {
    File tsv_file
    String column

    String docker
    Int cpu = 1
    Int disk_size = 10
    Float memory = 3.5
    Int preemptible = 1
    Int maxRetries = 0
  }

  command <<<
    set -euo pipefail
    mv "~{tsv_file}" .
    col=$(head -n1 "~{basename(tsv_file)}" | tr '\t' '\n' | awk -F"\t" '$0=="~{column}" {print NR}')
    if [ "$col" == "" ]; then
      echo "Column \"~{column}\" does not exist" 1>&2
      exit 1
    fi
    ~{if column != "call_rate" then "tail -n+2 \"" + basename(tsv_file) + "\" | cut -f$col > \"" + column + ".lines\""
    else "max_call_rate=$(tail -n+2 \"" + basename(tsv_file) + "\" | cut -f$col | sort -g | tail -n1)\n" +
    "tail -n+2 \"" + basename(tsv_file) + "\" | cut -f$col | if [[ $max_call_rate > 1.0 ]]; then awk '{print $0/100}'; else cat; fi > \"" + column + ".lines\"\n"}
    rm "~{basename(tsv_file)}"
  >>>

  output {
    File file = column + ".lines"
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task tsv_concat {
  input {
    Array[File]+ tsv_files
    String filebase

    String docker
    Int cpu = 1
    Int disk_size = 10
    Float memory = 3.5
    Int preemptible = 1
    Int maxRetries = 0
  }

  command <<<
    set -euo pipefail
    tsv_files=~{write_lines(tsv_files)}
    ~{if length(tsv_files) > 1 then
    "cat $tsv_files | tr '\\n' '\\0' | xargs -0 mv -t .\n" +
    "sed -i 's/^.*\\///' $tsv_files\n" +
    "(head -n1 \"" + basename(tsv_files[0]) + "\";\n" +
    "cat $tsv_files | tr '\\n' '\\0' | xargs -0 tail -qn+2) > \"" + filebase + ".tsv\"\n" +
    "cat $tsv_files | tr '\\n' '\\0' | xargs -0 rm"
    else "mv \"" + tsv_files[0] + "\" \"" + filebase + ".tsv\""}
  >>>

  output {
    File file = filebase + ".tsv"
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task lst_flatten {
  input {
    Array[File] lst_files
    String filebase

    String docker
    Int cpu = 1
    Int disk_size = 10
    Float memory = 3.5
    Int preemptible = 1
    Int maxRetries = 0
  }

  command <<<
    set -euo pipefail
    lst_files=~{write_lines(lst_files)}
    cat $lst_files | tr '\n' '\0' | xargs -0 mv -t .
    sed -i 's/^.*\///' $lst_files
    cat $lst_files | tr '\n' '\0' | xargs -0 awk 1 > "~{filebase}.lines"
    cat $lst_files | tr '\n' '\0' | xargs -0 rm
  >>>

  output {
    File file = filebase + ".lines"
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task lst_concat {
  input {
    Array[File] lst_files
    String filebase

    String docker
    Int cpu = 1
    Int disk_size = 10
    Float memory = 3.5
    Int preemptible = 1
    Int maxRetries = 0
  }

  command <<<
    set -euo pipefail
    lst_files=~{write_lines(lst_files)}
    cat $lst_files | tr '\n' '\0' | xargs -0 mv -t .
    sed -i 's/^.*\///' $lst_files
    cat $lst_files | tr '\n' '\0' | xargs -0 -n 1 awk '{printf $0"\t"} END {printf "\n"}' | sed 's/\t$//' > "~{filebase}.tsv"
    cat $lst_files | tr '\n' '\0' | xargs -0 rm
  >>>

  output {
    File file = filebase + ".tsv"
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

# this task generates sub batches from batches and then returns them in the same order as batches
# batch_id_file should contain batch names without the delim string, or else behavior is undefined
task batch_scatter {
  input {
    File batch_id_file
    Int sub_batch_size
    String delim

    String docker
    Int cpu = 1
    Int disk_size = 10
    Float memory = 3.5
    Int preemptible = 1
    Int maxRetries = 0
  }

  command <<<
    set -euo pipefail
    mv "~{batch_id_file}" .
    awk -F"\t" 'NR==FNR {x[$0]++} NR>FNR {n=x[$0]/int((x[$0]-1)/~{sub_batch_size}+1);
      if (x[$0]>n) print $0"~{delim}"int(y[$0]/n); else print $0; y[$0]++}' \
      "~{basename(batch_id_file)}" "~{basename(batch_id_file)}" > sub_batch_ids.lines
    uniq sub_batch_ids.lines
    uniq sub_batch_ids.lines | cut -d"~{delim}" -f1 | awk '!x[$0]++ {idx++} {print idx-1}' 1>&2
    rm "~{basename(batch_id_file)}"
  >>>

  output {
    File sub_batch_id = "sub_batch_ids.lines"
    Array[String] sub_batches = read_lines(stdout())
    Array[Int] idxs = read_lines(stderr())
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task get_reheader_maps {
  input {
    Array[String] batches
    File barcodes_file
    File sample_id_file
    File batch_id_file

    String docker
    Int cpu = 1
    Int disk_size = 10
    Float memory = 3.5
    Int preemptible = 1
    Int maxRetries = 0
  }

  command <<<
    set -euo pipefail
    batches=~{write_lines(batches)}
    mv "~{barcodes_file}" .
    mv "~{sample_id_file}" .
    mv "~{batch_id_file}" .
    mkdir maps
    sed 's/^.*\///;s/.gz$//;s/_Grn.idat$//;s/.gtc$//;s/.CEL$//;s/.AxiomGT1.chp$//;s/.birdseed-v2.chp$//' "~{basename(barcodes_file)}" | \
      paste -d$'\t' - "~{basename(sample_id_file)}" | \
      paste -d$'\t' - "~{basename(batch_id_file)}" | \
      awk -F"\t" -v OFS="\t" '{print $1,$2>"maps/"$3".map"}'
    sed 's/^/maps\//;s/$/.map/' $batches
    rm "~{basename(barcodes_file)}"
    rm "~{basename(sample_id_file)}"
    rm "~{basename(batch_id_file)}"
  >>>

  output {
    Directory maps = "maps"
    # cannot use suffix(batches, ".map") because of unresolved Cromwell bug
    # https://github.com/broadinstitute/cromwell/issues/5549
    Array[File] map_files = read_lines(stdout())
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task write_tsv {
  input {
    Array[Array[String]] tsv
    String filebase

    String docker
    Int cpu = 1
    Int disk_size = 10
    Float memory = 3.5
    Int preemptible = 1
    Int maxRetries = 0
  }

  command <<<
    set -euo pipefail
    mv ~{write_tsv(tsv)} "~{filebase}.tsv"
  >>>

  output {
    File file = filebase + ".tsv"
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task csv2bam {
  input {
    String plugin = "gtc2vcf"
    File csv_file
    File ref_fasta
    Array[File]+ ref_fasta_idxs

    String docker
    Int? cpu_override
    Int? disk_size_override
    Float? memory_override
    Int preemptible = 1
    Int maxRetries = 0
  }

  Float csv_size = (if basename(csv_file) != basename(csv_file, ".gz") then 4.0 else 1.0) * size(csv_file, "GiB")
  Float ref_size = size(ref_fasta, "GiB")
  Float index_size = size(ref_fasta_idxs, "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10.0 + 2.0 * csv_size + ref_size + index_size)])
  # if gtc2vcf was memory efficient this requirement could be relaxed
  Float memory = select_first([memory_override, ceil(7.25 + 2.5 * csv_size)])
  Int cpu = select_first([cpu_override, if memory > 6.5 then 2 * ceil(memory / 13) else 1])

  command <<<
    set -euo pipefail
    echo "~{sep="\n" flatten([[csv_file, ref_fasta], ref_fasta_idxs])}" | \
      tr '\n' '\0' | xargs -0 mv -t .
    bcftools +~{plugin} \
      --csv "~{basename(csv_file)}" \
      --fasta-flank | \
      bwa mem~{if cpu > 1 then " -t " + cpu else ""} -M "~{basename(ref_fasta)}" - | \
      samtools view -bS -o "~{basename(basename(csv_file, ".gz"), ".csv")}.bam"
    echo "~{sep="\n" flatten([[csv_file, ref_fasta], ref_fasta_idxs])}" | \
      sed 's/^.*\///' | tr '\n' '\0' | xargs -0 rm
  >>>

  output {
    File bam_file = basename(basename(csv_file, ".gz"), ".csv") + ".bam"
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

# https://support.terra.bio/hc/en-us/community/posts/360071476431-Terra-fails-to-delocalize-files-listed-through-read-lines-
task idat2gtc {
  input {
    File bpm_file
    File egt_file
    Array[File]+ green_idat_files
    Array[File]+ red_idat_files
    Boolean is_gender_autocall = true # from dsde-pipelines/tasks/IlluminaGenotypingArrayTasks.wdl
    Boolean autoconvert = false
    String filebase

    String docker
    Int? cpu_override
    Int? disk_size_override
    Float? memory_override
    Int preemptible = 1
    Int maxRetries = 0

    Float mult = 6.0 # to estimate the amount of memory required given the size of the manifest files
  }

  Float bpm_size = (if basename(bpm_file) != basename(bpm_file, ".gz") then 4.0 else 1.0) * size(bpm_file, "GiB")
  Float egt_size = (if basename(egt_file) != basename(egt_file, ".gz") then 2.0 else 1.0) * size(egt_file, "GiB")
  Float green_idat_size = length(green_idat_files) * size(green_idat_files[0], "GiB")
  Float red_idat_size = length(red_idat_files) * size(red_idat_files[0], "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10.0 + bpm_size + egt_size + 4.0 * (green_idat_size + red_idat_size))])
  Float memory = select_first([memory_override, 3.5 + mult * (bpm_size + egt_size)])
  Int cpu = select_first([cpu_override, if memory > 6.5 then 2 * ceil(memory / 13) else 1])

  command <<<
    set -euo pipefail
    green_idat_files=~{write_lines(green_idat_files)}
    red_idat_files=~{write_lines(red_idat_files)}
    mv "~{bpm_file}" .
    mv "~{egt_file}" .
    cat $green_idat_files $red_idat_files | tr '\n' '\0' | xargs -0 mv -t .
    sed -i 's/^.*\///' $green_idat_files $red_idat_files
    ~{if basename(bpm_file) != basename(bpm_file, ".gz") then "gunzip --force \"" + basename(bpm_file) + "\"" else ""}
    ~{if basename(egt_file) != basename(egt_file, ".gz") then "gunzip --force \"" + basename(egt_file) + "\"" else ""}
    (grep -h "\.gz" $green_idat_files $red_idat_files || if [[ $? -eq 1 ]]; then true; else exit $?; fi) | \
      tr '\n' '\0' | xargs -0 gunzip --force
    sed -i 's/.gz$//' $green_idat_files $red_idat_files
    bcftools +gtc2vcf --idat --gtcs $green_idat_files --output "~{filebase}.green_idat.tsv"
    bcftools +gtc2vcf --idat --gtcs $red_idat_files --output "~{filebase}.red_idat.tsv"
    ~{if autoconvert then
      "sed -i 's/^   <NumberOfThreads>4<\\/NumberOfThreads>\\r$/   <NumberOfThreads>" + cpu + "<\\/NumberOfThreads>\\r/' \\\n" +
      "  \"/opt/AutoConvert 2.0/AutoCallConfig.xml\"\n" +
      "mkdir gtcs\n" +
      "cat $green_idat_files | tr '\\n' '\\0' | \\\n" +
      "  xargs -0 -i mono \\\n" +
      "  \"/opt/AutoConvert 2.0/AutoConvert.exe\" \\\n" +
      "  \"{}\" \\\n" +
      "  gtcs \\\n" +
      "  \"" + basename(bpm_file, ".gz") + "\" \\\n" +
      "  \"" + basename(egt_file, ".gz") + "\" \\\n"
    else
      "iaap-cli \\\n" +
      "  gencall \\\n" +
      "  \"" + basename(bpm_file, ".gz") + "\" \\\n" +
      "  \"" + basename(egt_file, ".gz") + "\" \\\n" +
      "  gtcs \\\n" +
      "  --idat-folder . \\\n" +
      "  --output-gtc \\\n" +
      (if is_gender_autocall then "  --gender-estimate-call-rate-threshold -0.1 \\\n" else "") +
      (if cpu > 1 then "  --num-threads " + cpu + " \\\n" else "")}  1>&2
    sed 's/^/gtcs\//;s/_Grn\.idat$/.gtc/' $green_idat_files
    rm "~{basename(bpm_file, ".gz")}"
    rm "~{basename(egt_file, ".gz")}"
    cat $green_idat_files $red_idat_files | tr '\n' '\0' | xargs -0 rm
  >>>

  output {
    File green_idat_tsv = filebase + ".green_idat.tsv"
    File red_idat_tsv = filebase + ".red_idat.tsv"
    Directory gtcs = "gtcs"
    Array[File] gtc_files = read_lines(stdout())
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

# https://support.terra.bio/hc/en-us/community/posts/360071476431-Terra-fails-to-delocalize-files-listed-through-read-lines-
task cel2affy {
  input {
    File? xml_file
    File? zip_file
    Array[File]+ cel_files
    File? cdf
    File? chrX_snps
    File? special_snps
    File? chrX_probes
    File? chrY_probes
    File? probeset_file
    Array[String]? chip_type
    Boolean table_output = false
    String? qmethod_spec
    File? read_models_brlmmp
    String? set_analysis_name
    File? target_sketch
    String? set_gender_method
    String? em_gender
    Float? female_thresh
    Float? male_thresh
    String filebase

    String docker
    Int? cpu_override # unfortunately this task is not parallelizable
    Int? disk_size_override
    Float? memory_override
    Int preemptible = 0
    Int maxRetries = 0
  }

  Float xml_size = size(xml_file, "GiB")
  Float zip_size = size(zip_file, "GiB")
  Float cel_size = length(cel_files) * size(cel_files[0], "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10.0 + xml_size + zip_size + (if table_output then 8.0 else 6.0) * cel_size)])
  Float memory = select_first([memory_override, 2 * 7.25])
  Int cpu = select_first([cpu_override, if memory > 6.5 then 2 * ceil(memory / 13) else 1])

  String? zip_dir = if defined(zip_file) then basename(select_first([zip_file]), ".zip") else None
  String analysis_name = if defined(set_analysis_name) then set_analysis_name else "AxiomGT1"

  command <<<
    set -euo pipefail
    cel_files=~{write_lines(cel_files)}
    echo "~{sep="\n" select_all([xml_file, zip_file, cdf, chrX_snps, special_snps, chrX_probes, chrY_probes, read_models_brlmmp, target_sketch])}" | \
      cat - $cel_files | tr '\n' '\0' | xargs -0 mv -t .
    sed -i 's/^.*\///' $cel_files
    (grep "\.gz" $cel_files  || if [[ $? -eq 1 ]]; then true; else exit $?; fi) | \
      tr '\n' '\0' | xargs -0 gunzip --force
    sed -i 's/.gz$//' $cel_files
    ~{if defined(zip_file) then "unzip -jd \"" + zip_dir + "\" \"" + basename(select_first([zip_file])) + "\" 1>&2" else ""}
    bcftools +affy2vcf --cel --chps $cel_files --output "~{filebase}.cel.tsv"
    echo "cel_files" | cat - $cel_files > cel_files.lines
    apt-probeset-genotype \
      ~{if defined(zip_dir) then "--analysis-files-path \"" + zip_dir + "\"" else ""} \
      ~{if defined(xml_file) then "--xml-file \"" + basename(select_first([xml_file])) + "\"" else ""} \
      --cel-files cel_files.lines \
      ~{if defined(cdf) then "--cdf-file \"" + basename(select_first([cdf])) + "\"" else ""} \
      ~{if defined(chrX_snps) then "--chrX-snps \"" + basename(select_first([chrX_snps])) + "\"" else ""} \
      ~{if defined(special_snps) then "--special-snps \"" + basename(select_first([special_snps])) + "\"" else ""} \
      ~{if defined(chrX_probes) then "--chrX-probes \"" + basename(select_first([chrX_probes])) + "\"" else ""} \
      ~{if defined(chrY_probes) then "--chrY-probes \"" + basename(select_first([chrY_probes])) + "\"" else ""} \
      ~{if defined(probeset_file) then "--probeset-ids \"" + select_first([probeset_file]) + "\"" else ""} \
      ~{if defined(chip_type) then "--chip-type " else ""}~{sep=" --chip-type " chip_type} \
      --table-output ~{table_output} \
      --cc-chp-output \
      ~{if table_output then "--summaries" else ""} \
      ~{if defined(qmethod_spec) then "--qmethod-spec " + qmethod_spec else ""} \
      ~{if defined(read_models_brlmmp) then "--read-models-brlmmp \"" + basename(select_first([read_models_brlmmp])) + "\"" else ""} \
      --write-models \
      ~{if defined(target_sketch) then "--target-sketch \"" + basename(select_first([target_sketch])) + "\"" else ""} \
      ~{if defined(set_analysis_name) then "--set-analysis-name " + set_analysis_name else ""} \
      ~{if defined(set_gender_method) then "--set-gender-method " + set_gender_method else ""} \
      ~{if defined(em_gender) then "--em-gender " + em_gender else ""} \
      ~{if defined(female_thresh) then "--female-thresh " + female_thresh else ""} \
      ~{if defined(male_thresh) then "--male-thresh " + male_thresh else ""}
    for sfx in snp-posteriors report~{if table_output then " calls confidences summary normalized-summary" else ""}; do
      mv "~{analysis_name}.$sfx.txt" "~{filebase}.$sfx.txt"
    done
    sed 's/^/cc-chp\//;s/\.CEL$/.~{analysis_name}.chp/' $cel_files
    echo "~{sep="\n" select_all([xml_file, zip_file, cdf, chrX_snps, special_snps, chrX_probes, chrY_probes, read_models_brlmmp, target_sketch])}" | \
      sed 's/^.*\///' | cat - $cel_files | tr '\n' '\0' | xargs -0 rm
    ~{if defined(zip_file) then "rm -r \"" + zip_dir + "\"" else ""}
    rm cel_files.lines
  >>>

  output {
    File cel_tsv = filebase + ".cel.tsv"
    Directory chps = "cc-chp"
    Array[File] chp_files = read_lines(stdout())
    File log_file = "apt-probeset-genotype.log"
    File snp_file = filebase + ".snp-posteriors.txt"
    File report_file = filebase + ".report.txt"
    File? calls_file = if table_output then filebase + ".calls.txt" else None
    File? confidences_file = if table_output then filebase + ".confidences.txt" else None
    File? summary_file = if table_output then filebase + ".summary.txt" else None
    File? normalized_summary_file = if table_output then filebase + ".normalized-summary.txt" else None
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task gtc2vcf {
  input {
    String? tags
    File bpm_file
    File csv_file
    File egt_file
    File ref_fasta
    File ref_fasta_fai
    Int? gc_window_size
    Array[File]+ gtc_files
    Int capacity = 32768
    Boolean use_gtc_sample_names = false
    Boolean do_not_check_bpm = false
    File? sam_file
    File? reheader_file
    String filebase
    Boolean uncompressed = false

    String docker
    Int? cpu_override
    Int? disk_size_override
    Float? memory_override
    Int preemptible = 1
    Int maxRetries = 0
  }

  Float bpm_size = (if basename(bpm_file) != basename(bpm_file, ".gz") then 4.0 else 1.0) * size(bpm_file, "GiB")
  Float csv_size = (if basename(csv_file) != basename(csv_file, ".gz") then 4.0 else 1.0) * size(csv_file, "GiB")
  Float egt_size = (if basename(egt_file) != basename(egt_file, ".gz") then 2.0 else 1.0) * size(egt_file, "GiB")
  Float ref_size = size(ref_fasta, "GiB")
  Float gtc_size = length(gtc_files) * size(gtc_files[0], "GiB")
  Float sam_size = size(sam_file, "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10.0 + bpm_size + csv_size + egt_size + ref_size + 8.0 * gtc_size + sam_size)])
  # due to heavy random access to the reference genome, it is important here that enough memory to cache the reference is provided
  Float memory = select_first([memory_override, 3.5 + 2.0 * csv_size + 2.0 * egt_size + ref_size +
    length(gtc_files) * capacity * 19 / 1024 / 1024 / 1024])
  Int cpu = select_first([cpu_override, if memory > 6.5 then 2 * ceil(memory / 13) else 1])

  command <<<
    set -euo pipefail
    gtc_files=~{write_lines(gtc_files)}
    echo "~{sep="\n" select_all([bpm_file, csv_file, egt_file, ref_fasta, ref_fasta_fai, sam_file, reheader_file])}" | \
      cat - $gtc_files | tr '\n' '\0' | xargs -0 mv -t .
    sed -i 's/^.*\///' $gtc_files
    ~{if basename(bpm_file) != basename(bpm_file, ".gz") then "gunzip --force \"" + basename(bpm_file) + "\"" else ""}
    ~{if basename(egt_file) != basename(egt_file, ".gz") then "gunzip --force \"" + basename(egt_file) + "\"" else ""}
    (grep "\.gz" $gtc_files || if [[ $? -eq 1 ]]; then true; else exit $?; fi) | \
      tr '\n' '\0' | xargs -0 gunzip --force
    sed -i 's/.gz$//' $gtc_files
    bcftools +gtc2vcf \
      --no-version \
      --output-type u \
      ~{if defined(tags) then "--tags " + tags else ""} \
      --bpm "~{basename(bpm_file, ".gz")}" \
      --csv "~{basename(csv_file)}" \
      --egt "~{basename(egt_file, ".gz")}" \
      --fasta-ref "~{basename(ref_fasta)}" \
      ~{if defined(gc_window_size) then "--gc-window-size " + select_first([gc_window_size]) else ""} \
      --gtcs $gtc_files \
      ~{if capacity != 32768 then "--capacity " + capacity else ""} \
      ~{if use_gtc_sample_names then "--use-gtc-sample-names" else ""} \
      ~{if do_not_check_bpm then "--do-not-check-bpm" else ""} \
      --extra "~{filebase}.gtc.tsv" \
      ~{if cpu > 1 then "--threads " + (cpu - 1) else ""} \
      ~{if defined(sam_file) then "--sam-flank \"" + basename(select_first([sam_file])) + "\"" else ""} | \
      ~{if defined(reheader_file) then "bcftools reheader --samples \"" + basename(select_first([reheader_file])) + "\" |" else ""} \
      bcftools sort --output-type u --temp-dir ./bcftools-sort.XXXXXX | \
      bcftools norm --no-version --output-type ~{if uncompressed then "u" else "b"} --output "~{filebase}.bcf" --check-ref x --fasta-ref "~{basename(ref_fasta)}"~{if cpu > 1 then " --threads " + (cpu - 1) else ""}
    bcftools index --force "~{filebase}.bcf"
    bcftools query --list-samples "~{filebase}.bcf" | tee "~{filebase}.sample_id.lines" | wc -l
    rm "~{basename(bpm_file, ".gz")}"
    rm "~{basename(egt_file, ".gz")}"
    echo "~{sep="\n" select_all([csv_file, ref_fasta, ref_fasta_fai, sam_file, reheader_file])}" | \
      sed 's/^.*\///' | cat - $gtc_files | tr '\n' '\0' | xargs -0 rm
  >>>

  output {
    File gtc_tsv = filebase + ".gtc.tsv"
    File vcf_file = filebase + ".bcf"
    File vcf_idx = filebase + ".bcf.csi"
    File sample_id_lines = filebase + ".sample_id.lines"
    Int n_smpls = read_int(stdout())
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task chp2vcf {
  input {
    String? tags
    File csv_file
    File ref_fasta
    File ref_fasta_fai
    Int? gc_window_size
    File? probeset_file
    File snp_file
    Array[File] chp_files
    File? sam_file
    File? reheader_file
    String filebase
    Boolean uncompressed = false

    String docker
    Int? cpu_override
    Int? disk_size_override
    Float? memory_override
    Int preemptible = 1
    Int maxRetries = 0
  }

  Float csv_size = (if basename(csv_file) != basename(csv_file, ".gz") then 4.0 else 1.0) * size(csv_file, "GiB")
  Float ref_size = size(ref_fasta, "GiB")
  Float snp_size = (if basename(snp_file) != basename(snp_file, ".gz") then 2.0 else 1.0) * size(snp_file, "GiB")
  Float chp_size = length(chp_files) * size(chp_files[0], "GiB")

  Float sam_size = size(sam_file, "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10.0 + csv_size + ref_size + 8.0 * chp_size + snp_size + sam_size)])
  # due to heavy random access to the reference genome, it is important here that enough memory to cache the reference is provided
  Float memory = select_first([memory_override, 3.5 + 2.0 * csv_size + 2.0 * snp_size + ref_size +
    length(chp_files) * 32768 / 1024 / 1024 / 1024])
  Int cpu = select_first([cpu_override, if memory > 6.5 then 2 * ceil(memory / 13) else 1])

  command <<<
    set -euo pipefail
    chp_files=~{write_lines(chp_files)}
    echo "~{sep="\n" select_all([csv_file, ref_fasta, ref_fasta_fai, snp_file, sam_file, reheader_file])}" | \
      cat - $chp_files | tr '\n' '\0' | xargs -0 mv -t .
    sed -i 's/^.*\///' $chp_files
    (grep "\.gz" $chp_files || if [[ $? -eq 1 ]]; then true; else exit $?; fi) | \
      tr '\n' '\0' | xargs -0 gunzip --force
    sed -i 's/.gz$//' $chp_files
    bcftools +affy2vcf \
      --no-version \
      --output-type u \
      ~{if defined(tags) then "--tags " + tags else ""} \
      --csv "~{basename(csv_file)}" \
      --fasta-ref "~{basename(ref_fasta)}" \
      ~{if defined(gc_window_size) then "--gc-window-size " + select_first([gc_window_size]) else ""} \
      ~{if defined(probeset_file) then "--probeset-ids \"" + select_first([probeset_file]) + "\"" else ""} \
      --snp "~{basename(snp_file)}" \
      --chps $chp_files \
      --extra "~{filebase}.affy.tsv" \
      ~{if cpu > 1 then "--threads " + (cpu - 1) else ""} \
      ~{if defined(sam_file) then "--sam-flank \"" + basename(select_first([sam_file])) + "\"" else ""} | \
      ~{if defined(reheader_file) then "bcftools reheader --samples \"" + basename(select_first([reheader_file])) + "\" |" else ""} \
      bcftools sort --output-type u --temp-dir ./bcftools-sort.XXXXXX | \
      bcftools norm --no-version --output-type ~{if uncompressed then "u" else "b"} --output "~{filebase}.bcf" --check-ref x --fasta-ref "~{basename(ref_fasta)}"~{if cpu > 1 then " --threads " + (cpu - 1) else ""}
    bcftools index --force "~{filebase}.bcf"
    bcftools query --list-samples "~{filebase}.bcf" | tee "~{filebase}.sample_id.lines" | wc -l
    echo "~{sep="\n" select_all([csv_file, ref_fasta, ref_fasta_fai, snp_file, sam_file, reheader_file])}" | \
      sed 's/^.*\///' | cat - $chp_files | tr '\n' '\0' | xargs -0 rm
  >>>

  output {
    File affy_tsv = filebase + ".affy.tsv"
    File vcf_file = filebase + ".bcf"
    File vcf_idx = filebase + ".bcf.csi"
    File sample_id_lines = filebase + ".sample_id.lines"
    Int n_smpls = read_int(stdout())
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task txt2vcf {
  input {
    String? tags
    File csv_file
    File ref_fasta
    File ref_fasta_fai
    Int? gc_window_size
    File? probeset_file
    File calls_file
    File? confidences_file
    File summary_file
    File report_file
    File snp_file
    File? sam_file
    File? reheader_file
    String filebase
    Boolean uncompressed = false

    String docker
    Int? cpu_override
    Int? disk_size_override
    Float? memory_override
    Int preemptible = 1
    Int maxRetries = 0
  }

  Float csv_size = (if basename(csv_file) != basename(csv_file, ".gz") then 4.0 else 1.0) * size(csv_file, "GiB")
  Float ref_size = size(ref_fasta, "GiB")
  Float calls_size = (if basename(calls_file) != basename(calls_file, ".gz") then 2.0 else 1.0) * size(calls_file, "GiB")
  Float confidences_size = (if defined(confidences_file) && basename(select_first([confidences_file])) != basename(select_first([confidences_file]), ".gz") then 2.0 else 1.0) * size(confidences_file, "GiB")
  Float summary_size = (if basename(summary_file) != basename(summary_file, ".gz") then 2.0 else 1.0) * size(summary_file, "GiB")
  Float report_size = (if basename(report_file) != basename(report_file, ".gz") then 2.0 else 1.0) * size(report_file, "GiB")
  Float snp_size = (if basename(snp_file) != basename(snp_file, ".gz") then 2.0 else 1.0) * size(snp_file, "GiB")

  Float sam_size = size(sam_file, "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10.0 + csv_size + ref_size + 8.0 * (calls_size + confidences_size + summary_size) + 2.0 * report_size + snp_size + sam_size)])
  # due to heavy random access to the reference genome, it is important here that enough memory to cache the reference is provided
  Float memory = select_first([memory_override, 3.5 + 2.0 * csv_size + 2.0 * snp_size + ref_size])
  Int cpu = select_first([cpu_override, if memory > 6.5 then 2 * ceil(memory / 13) else 1])

  command <<<
    set -euo pipefail
        echo "~{sep="\n" select_all([csv_file, ref_fasta, ref_fasta_fai, calls_file, confidences_file, summary_file, report_file, snp_file, sam_file, reheader_file])}" | \
      tr '\n' '\0' | xargs -0 mv -t .
    ~{if basename(report_file) != basename(report_file, ".gz") then "gunzip --force \"" + basename(report_file) + "\"" else ""}
    (~{if basename(calls_file) != basename(calls_file, ".gz") then "z" else ""}grep -v ^# "~{basename(calls_file)}" || if [[ $? -eq 141 ]]; then true; else exit $?; fi) | \
      head -n1 | tr '\t' '\n' | tail -n+2 | \
      awk -F"\t" 'NR==FNR && $0!~"^#" {if ($1=="cel_files") print; else x[$1]=$0} NR>FNR {print x[$1]}' \
      "~{basename(report_file, ".gz")}" - > ~{filebase}.affy.tsv
    rm "~{basename(report_file, ".gz")}"
    bcftools +affy2vcf \
      --no-version \
      --output-type u \
      ~{if defined(tags) then "--tags " + tags else ""} \
      --csv "~{basename(csv_file)}" \
      --fasta-ref "~{basename(ref_fasta)}" \
      ~{if defined(gc_window_size) then "--gc-window-size " + select_first([gc_window_size]) else ""} \
      ~{if defined(probeset_file) then "--probeset-ids \"" + select_first([probeset_file]) + "\"" else ""} \
      --calls "~{basename(calls_file)}" \
      ~{if defined(confidences_file) then "--confidences \"" + basename(select_first([confidences_file])) + "\"" else ""} \
      --summary "~{basename(summary_file)}" \
      --snp "~{basename(snp_file)}" \
      ~{if cpu > 1 then "--threads " + (cpu - 1) else ""} \
      ~{if defined(sam_file) then "--sam-flank \"" + basename(select_first([sam_file])) + "\"" else ""} | \
      ~{if defined(reheader_file) then "bcftools reheader --samples \"" + basename(select_first([reheader_file])) + "\" |" else ""} \
      bcftools sort --output-type u --temp-dir ./bcftools-sort.XXXXXX | \
      bcftools norm --no-version --output-type ~{if uncompressed then "u" else "b"} --output "~{filebase}.bcf" --check-ref x --fasta-ref "~{basename(ref_fasta)}"~{if cpu > 1 then " --threads " + (cpu - 1) else ""}
    bcftools index --force "~{filebase}.bcf"
    bcftools query --list-samples "~{filebase}.bcf" | tee "~{filebase}.sample_id.lines" | wc -l
    echo "~{sep="\n" select_all([csv_file, ref_fasta, ref_fasta_fai, calls_file, confidences_file, summary_file, snp_file, sam_file, reheader_file])}" | \
      sed 's/^.*\///' | tr '\n' '\0' | xargs -0 rm
  >>>

  output {
    File affy_tsv = filebase + ".affy.tsv"
    File vcf_file = filebase + ".bcf"
    File vcf_idx = filebase + ".bcf.csi"
    File sample_id_lines = filebase + ".sample_id.lines"
    Int n_smpls = read_int(stdout())
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task ref_scatter {
  input {
    Array[String] chrs
    Array[String] lens
    File genetic_map_file
    Float max_win_size_cm
    Float overlap_size_cm

    String docker
    Int cpu = 1
    Int disk_size = 10
    Float memory = 3.5
    Int preemptible = 1
    Int maxRetries = 0
  }

  command <<<
    set -euo pipefail
    mv "~{genetic_map_file}" .
    chrs=~{write_lines(chrs)}
    lens=~{write_lines(lens)}
    paste -d $'\t' $chrs $lens > chr2len.tsv
    python3 <<CODE
    import sys, pandas as pd, numpy as np
    chr2len = {}
    with open('chr2len.tsv') as f:
      for line in f:
        (key, val) = line.split('\t')
        chr2len[key] = int(val)
    df_map = pd.read_csv('~{basename(genetic_map_file)}', delim_whitespace = True, header = 0, names = ['CHR', 'POS' ,'RATE', 'CM'])
    df_out = {}
    for chr, df_group in df_map.groupby('CHR'):
      fai_chr = str(chr) if str(chr) in chr2len else 'chr' + str(chr) if 'chr' + str(chr) in chr2len else 'X' if 'X' in chr2len else 'chrX' if 'chrX' in chr2len else None
      if fai_chr:
        chr_cm_len = max(df_group['CM'])
        n_win = np.ceil((chr_cm_len - ~{overlap_size_cm})/(~{max_win_size_cm} - ~{overlap_size_cm}))
        win_size = (chr_cm_len - ~{overlap_size_cm}) / n_win + ~{overlap_size_cm}
        cm_begs = (win_size - ~{overlap_size_cm}) * np.arange(1, n_win)
        cm_ends = (win_size - ~{overlap_size_cm}) * np.arange(1, n_win) + ~{overlap_size_cm}
        pos_begs = np.concatenate(([1], np.interp(cm_begs, df_group['CM'], df_group['POS'], period = np.inf).astype(int)))
        pos_ends = np.concatenate((np.interp(cm_ends, df_group['CM'], df_group['POS'], period = np.inf).astype(int), [chr2len[fai_chr]]))
        df_out[fai_chr] = pd.DataFrame.from_dict({'CHR': fai_chr, 'BEG': pos_begs, 'END': pos_ends})
    df = pd.concat([df_out[fai_chr] for fai_chr in chr2len.keys()])
    df[['CHR', 'BEG', 'END']].to_csv('ref_scatter.bed', sep='\t', header = False, index = False)
    CODE
    rm chr2len.tsv
    rm "~{basename(genetic_map_file)}"
  >>>

  output {
    File intervals_bed = "ref_scatter.bed"
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

# https://support.terra.bio/hc/en-us/community/posts/360071476431-Terra-fails-to-delocalize-files-listed-through-read-lines-
task vcf_scatter {
  input {
    File vcf_file
    File intervals_bed
    String? extra
    Boolean uncompressed = false

    String docker
    Int? cpu_override
    Int? disk_size_override
    Float? memory_override
    Int preemptible = 1
    Int maxRetries = 0
  }

  Float vcf_size = size(vcf_file, "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10.0 + 3.0 * vcf_size)])
  Float memory = select_first([memory_override, 3.5])
  Int cpu = select_first([cpu_override, if memory > 6.5 then 2 * ceil(memory / 13) else 1])
  String filebase = basename(basename(vcf_file, ".bcf"), ".vcf.gz")

  command <<<
    set -euo pipefail
    mv "~{vcf_file}" .
    mv "~{intervals_bed}" .
    awk -F"\t" '{print $1":"$2"-"$3"\t"NR-1}' "~{basename(intervals_bed)}" > regions.lines
    bcftools query --list-samples "~{basename(vcf_file)}" | tee "~{filebase}.sample_id.lines" | wc -l > n_smpls.int
    bcftools annotate \
      --no-version \
      --output-type u \
      --remove ID,QUAL,INFO,^FMT/GT \
      ~{if cpu > 1 then "--threads " + (cpu - 1) else ""} \
      "~{basename(vcf_file)}" | \
    bcftools +scatter \
      --no-version \
      --output-type ~{if uncompressed then "u" else "b"} \
      --output vcfs \
      ~{if cpu > 1 then "--threads " + (cpu - 1) else ""} \
      --scatter-file regions.lines \
      --prefix "~{filebase}." \
      ~{if defined(extra) then "--extra \"" + extra + "\"" else ""}
    cut -f2 regions.lines | sed 's/^/vcfs\/~{filebase}./;s/$/.bcf/'
    rm "~{basename(vcf_file)}"
    rm "~{basename(intervals_bed)}"
    rm regions.lines
  >>>

  output {
    File sample_id_lines = filebase + ".sample_id.lines"
    Int n_smpls = read_int("n_smpls.int")
    Directory vcfs = "vcfs"
    Array[File] vcf_files = read_lines(stdout())
    File? extra_vcf_file = if defined(extra) then "vcfs/" + filebase + "." + extra + ".bcf" else None
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task vcf_merge {
  input {
    Array[File]+ vcf_files
    String filebase
    Boolean wgs = false
    Boolean uncompressed = false

    String docker
    Int? cpu_override
    Int? disk_size_override
    Float? memory_override
    Int preemptible = 1
    Int maxRetries = 0
  }

  Float vcf_size = size(vcf_files, "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10 + 2.0 * vcf_size)])
  Float memory = select_first([memory_override, 3.5])
  Int cpu = select_first([cpu_override, if memory > 6.5 then 2 * ceil(memory / 13) else 1])

  command <<<
    set -euo pipefail
    ~{if length(vcf_files) > 1 then "vcf_files=" else ""}~{if length(vcf_files) > 1 then write_lines(vcf_files) else ""}
    ~{if length(vcf_files) > 1 then "cat $vcf_files | tr '\\n' '\\0' | xargs -0 mv -t .\n" +
      "sed -i 's/^.*\\///' $vcf_files\n" +
      "bcftools merge \\\n" +
      "  --no-version \\\n" +
      "  --output-type " + (if uncompressed then "u" else "b") + " \\\n" +
      "  --output \"" + filebase + ".bcf\" \\\n" +
      (if wgs then "--missing-to-ref \\\n" else "") +
      "  --file-list $vcf_files \\\n" +
      "  --merge none \\\n" +
      "  --no-index \\\n" +
      (if cpu > 1 then "  --threads " + (cpu - 1) else "")
    else "mv \"" + vcf_files[0] + "\" \"" + filebase + ".bcf\""}
    bcftools index --force "~{filebase}.bcf"
    bcftools query --list-samples "~{filebase}.bcf" | wc -l
    ~{if length(vcf_files) > 1 then "cat $vcf_files | tr '\\n' '\\0' | xargs -0 rm" else ""}
  >>>

  output {
    File vcf_file = filebase + ".bcf"
    File vcf_idx = filebase + ".bcf.csi"
    Int n_smpls = read_int(stdout())
  }

  runtime {
    memory: memory + " GiB"
    disks: "local-disk " + disk_size + " HDD"
    cpu: cpu
    docker: docker
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task vcf_qc {
  input {
    File vcf_file
    File vcf_idx
    File dup_file
    File sample_id_file
    File computed_gender_file
    File call_rate_file
    Float sample_call_rate_thr = 0.97
    Float variant_call_rate_thr = 0.97
    Float dup_divergence_thr = 0.02
    Float genotype_exc_het_thr = 0.000001
    File? duplicate_samples_file
    File? extra_xcl_vcf_file
    Boolean uncompressed = false

    String docker
    Int? cpu_override
    Int? disk_size_override
    Float? memory_override
    Int preemptible = 1
    Int maxRetries = 0
  }

  Float vcf_size = size(vcf_file, "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10.0 + vcf_size)])
  Float memory = select_first([memory_override, 3.5])
  Int cpu = select_first([cpu_override, if memory > 6.5 then 2 * ceil(memory / 13) else 1])
  String filebase = basename(basename(vcf_file, ".bcf"), ".vcf.gz")

  command <<<
    set -euo pipefail
    echo "~{sep="\n" select_all([vcf_file, vcf_idx, dup_file, sample_id_file, computed_gender_file, call_rate_file, duplicate_samples_file, extra_xcl_vcf_file])}" | \
      tr '\n' '\0' | xargs -0 mv -t .
    paste -d $'\t' "~{basename(sample_id_file)}" "~{basename(computed_gender_file)}" > computed_gender.map
    paste -d $'\t' "~{basename(sample_id_file)}" "~{basename(call_rate_file)}" | \
      awk -F"\t" '$2<~{sample_call_rate_thr} {print $1}' ~{if defined(duplicate_samples_file) then "| \\\n" +
      "  cat - \"" + basename(select_first([duplicate_samples_file])) + "\" | \\\n" +
      "  sort | uniq " else ""}> samples_xcl.lines
    echo '##INFO=<ID=JK,Number=1,Type=Float,Description="Jukes Cantor">' | \
      bcftools annotate --no-version --output-type u --annotations "~{basename(dup_file)}" --columns CHROM,FROM,TO,JK --header-lines /dev/stdin~{if cpu > 1 then " --threads " + (cpu - 1) else ""} "~{basename(vcf_file)}" | \
      bcftools view --no-version --output-type u~{if cpu > 1 then " --threads " + (cpu - 1) else ""} --samples-file ^samples_xcl.lines | \
      bcftools +fill-tags --no-version --output-type u --targets ^Y,MT,chrY,chrM~{if cpu > 1 then " --threads " + (cpu - 1) else ""} -- --tags ExcHet,F_MISSING | \
      bcftools annotate --no-version --output-type ~{if uncompressed then "u" else "b"} --output "~{filebase}.xcl.bcf" \
      --include 'FILTER!="." && FILTER!="PASS" || INFO/JK<~{dup_divergence_thr} || INFO/ExcHet<~{genotype_exc_het_thr} || INFO/F_MISSING>1-~{variant_call_rate_thr}' \
      --remove ^INFO/JK,^INFO/ExcHet,^INFO/F_MISSING~{if cpu > 1 then " --threads " + (cpu - 1) else ""}
    bcftools index --force "~{filebase}.xcl.bcf"
    ~{if defined(extra_xcl_vcf_file) then "mv \"" + filebase + ".xcl.bcf\" \"" + filebase + ".tmp.bcf\"\n" +
      "mv \"" + filebase + ".xcl.bcf.csi\" \"" + filebase + ".tmp.bcf.csi\"\n" +
      "bcftools index --force \"" + basename(select_first([extra_xcl_vcf_file])) + "\"\n" +
      "bcftools merge --no-version --output-type " + (if uncompressed then "u" else "b") + " --output \"" + filebase + ".xcl.bcf\" --merge none \"" + filebase + ".tmp.bcf\" \"" + basename(select_first([extra_xcl_vcf_file])) + "\"\n" +
      "bcftools index --force \"" + filebase + ".xcl.bcf\"\n" +
      "rm \"" + filebase + ".tmp.bcf\" \"" + filebase + ".tmp.bcf.csi\"" else ""}
    bcftools isec \
      --no-version \
      --output-type ~{if uncompressed then "u" else "b"} \
      --output "~{filebase}.qc.bcf" \
      --complement \
      --exclude "N_ALT>1" \
      --write 1 \
      "~{basename(vcf_file)}" \
      "~{filebase}.xcl.bcf"
    bcftools index --force "~{filebase}.qc.bcf"
    bcftools index --nrecords "~{filebase}.qc.bcf"
    echo "~{sep="\n" select_all([vcf_file, vcf_idx, dup_file, sample_id_file, computed_gender_file, call_rate_file, duplicate_samples_file, extra_xcl_vcf_file])}" | \
      sed 's/^.*\///' | tr '\n' '\0' | xargs -0 rm
    rm computed_gender.map samples_xcl.lines
  >>>

  output {
    File xcl_vcf_file = filebase + ".xcl.bcf"
    File xcl_vcf_idx = filebase + ".xcl.bcf.csi"
    File qc_vcf_file = filebase + ".qc.bcf"
    File qc_vcf_idx = filebase + ".qc.bcf.csi"
    Int n_markers = read_int(stdout())
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

# hack https://github.com/samtools/bcftools/issues/1425 is employed in the end to fix the header
task vcf_phase {
  input {
    Int n_markers
    Int n_smpls
    File unphased_vcf_file
    File unphased_vcf_idx
    File genetic_map_file
    Int? n_panel_smpls
    File? ref_vcf_file
    File? ref_vcf_idx
    File? ref_fasta_fai
    String chr
    Boolean mhc = false # requires additional resources if phasing the MHC region
    Boolean wgs = false
    Boolean eagle = false
    Boolean uncompressed = false
    String? phase_extra_args

    String docker
    Int? cpu_override
    Int? disk_size_override
    Float? mult_override
    Float? memory_override
    Int? preemptible_override
    Int maxRetries = 0
  }

  Float vcf_size = size(unphased_vcf_file, "GiB")
  Float ref_size = size(ref_vcf_file, "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10.0 + 3.0 * vcf_size + ref_size)])
  Float mult = select_first([mult_override, if eagle then 1.5 else 10.0]) # requires additional memory if using SHAPEIT4
  Float memory = select_first([memory_override, 3.5 + mult * n_markers * (n_smpls + select_first([n_panel_smpls, 0])) / 1024 / 1024 / 1024])
  Int cpu = select_first([cpu_override, 2 * ceil(memory / 13)]) * (if mhc then 2 else 1) # always require at least two CPUs
  Int preemptible = select_first([preemptible_override, if mhc then 0 else 1]) # as the MHC phases slowly, do not run as preemptible

  String filebase = basename(basename(unphased_vcf_file, ".bcf"), ".vcf.gz")
  String dollar = "$"

  command <<<
    set -euo pipefail
    echo "~{sep="\n" select_all([unphased_vcf_file, unphased_vcf_idx, genetic_map_file, ref_vcf_file, ref_vcf_idx])}" | \
      tr '\n' '\0' | xargs -0 mv -t .
    ~{if defined(ref_fasta_fai) then "mv \"" + select_first([ref_fasta_fai]) + "\" ." else ""}
    ~{if eagle then
      "bio-eagle \\\n" +
      "  --geneticMapFile \"" + basename(genetic_map_file) + "\" \\\n" +
      "  --outPrefix \"" + filebase + ".pgt\" \\\n" +
      (if cpu > 1 then "  --numThreads " + cpu + " \\\n" else "") +
      (if defined(ref_vcf_file) then "  --vcfRef" + basename(select_first([ref_vcf_file])) + "\" \\\n" +
      "  --vcfTarget \"" else "  --vcf") + " \"" + basename(unphased_vcf_file) + "\" \\\n" +
      "  --vcfOutFormat " + (if uncompressed then "u" else "b") + " \\\n" +
      "  --chrom " + chr + " \\\n" +
      (if defined(phase_extra_args) then phase_extra_args + " \\\n" else "") +
      "  1>&2"
    else
      "chr=" + chr + "; zcat \"" + basename(genetic_map_file) + "\" | \\\n" +
      "  sed 's/^23/X/' | awk -v chr=" + dollar + "{chr#chr} '$1==chr {print $2,$3,$4}' > genetic_map.txt\n" +
      "shapeit4.2 \\\n" +
      (if cpu > 1 then "  --thread " + cpu + " \\\n" else "") +
      "  --input \"" + basename(unphased_vcf_file) + "\" \\\n" +
      (if defined(ref_vcf_file) then "  --reference \"" + basename(select_first([ref_vcf_file])) + "\" \\\n" else "") +
      "  --map genetic_map.txt \\\n" +
      "  --region " + chr + " \\\n" +
      (if wgs then "--sequencing \\\n" else "") +
      "  --output \"" + filebase + ".pgt.bcf\" \\\n" +
      (if defined(phase_extra_args) then phase_extra_args + " \\\n" else "") +
      "  1>&2\n" +
      "rm genetic_map.txt"}
    ~{if defined(ref_fasta_fai) then
      "(echo -en \"##fileformat=VCFv4.2\\n#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\t\"\n" +
      "bcftools query -l \"" + filebase + ".pgt.bcf\" | tr '\\n' '\\t' | sed 's/\\t$/\\n/') > tmp.vcf\n" +
      "bcftools reheader --fai \"" + basename(select_first([ref_fasta_fai])) + "\" --output fai.vcf tmp.vcf\n" +
      "mv  \"" + filebase + ".pgt.bcf\" \"" + filebase + ".tmp.bcf\"\n" +
      "bcftools concat \\\n" +
      "  --no-version \\\n" +
      "  --output-type b \\\n" +
      "  --output \"" + filebase + ".pgt.bcf\" \\\n" +
      "  fai.vcf \"" + filebase + ".tmp.bcf\"\n" +
      "rm \"" + filebase + ".tmp.bcf\" fai.vcf tmp.vcf \"" + basename(select_first([ref_fasta_fai])) + "\""
      else ""}
    bcftools index --force "~{filebase}.pgt.bcf"
    echo "~{sep="\n" select_all([unphased_vcf_file, unphased_vcf_idx, genetic_map_file, ref_vcf_file, ref_vcf_idx])}" | \
      sed 's/^.*\///' | tr '\n' '\0' | xargs -0 rm
  >>>

  output {
    File pgt_vcf_file = filebase + ".pgt.bcf"
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

# https://support.terra.bio/hc/en-us/community/posts/360071476431-Terra-fails-to-delocalize-files-listed-through-read-lines-
task vcf_split {
  input {
    File vcf_file
    Array[String] batches
    File sample_id_file
    File? ped_file
    String? rule
    Boolean uncompressed = false

    String docker
    Int? disk_size_override
    Int? cpu_override
    Float? memory_override
    Int preemptible = 1
    Int maxRetries = 0
  }

  Float vcf_size = size(vcf_file, "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10.0 + 2.0 * vcf_size)])
  Float memory = select_first([memory_override, 3.5])
  Int cpu = select_first([cpu_override, if memory > 6.5 then 2 * ceil(memory / 13) else 1])
  String filebase = basename(basename(vcf_file, ".bcf"), ".vcf.gz")

  command <<<
    set -euo pipefail
    filebases=~{write_lines(prefix(filebase + '.', batches))}
    mv "~{vcf_file}" .
    mv "~{sample_id_file}" .
    ~{if defined(ped_file) then "mv -t . \"" + ped_file + "\"" else ""}
    mkdir mendel
    ~{if defined(ped_file) && defined(rule) then "bcftools +mendelian --output \"mendel/" + filebase + ".mendel.tsv\" --rules \"" + select_first([rule]) + "\" --ped \"" + basename(select_first([ped_file])) + "\" \"" + basename(vcf_file) + "\"" else ""}
    sed 's/\t/,/g;s/$/\t-/' "~{basename(sample_id_file)}" | paste -d $'\t' - $filebases > samples_file.txt
    ~{if defined(ped_file) then "bcftools +trio-phase --no-version --output-type u" + (if cpu > 1 then " --threads " + (cpu - 1) else "") + " \"" + basename(vcf_file) + "\" -- --ped \"" + basename(select_first([ped_file])) + "\" | \\\n" +
      "  bcftools +split --output-type " + (if uncompressed then "u" else "b") + " --output vcfs --samples-file samples_file.txt"
      else "bcftools +split --output-type " + (if uncompressed then "u" else "b") + " --output vcfs --samples-file samples_file.txt \"" + basename(vcf_file) + "\""}
    cut -f1 $filebases | sed 's/^/vcfs\//;s/$/.bcf/'
    rm "~{basename(vcf_file)}"
    rm "~{basename(sample_id_file)}"
    ~{if defined(ped_file) then "rm \"" + basename(select_first([ped_file])) + "\"" else ""}
    rm samples_file.txt
  >>>

  output {
    File? mendel_tsv = if defined(ped_file) then "mendel/" + filebase + ".mendel.tsv" else None
    Directory vcfs = "vcfs"
    Array[File] vcf_files = read_lines(stdout())
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task vcf_concat {
  input {
    Array[File]+ vcf_files
    Boolean ligate = true
    String filebase
    Boolean uncompressed = false

    String docker
    Int? cpu_override
    Int? disk_size_override
    Float? memory_override
    Int preemptible = 1
    Int maxRetries = 0

    Float mult = 12.0
  }

  Float vcf_size = size(vcf_files, "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10.0 + 2.0 * vcf_size)])
  Float memory = select_first([memory_override, 3.5 + if ligate then mult * 2.0 * vcf_size / length(vcf_files) else 0.0])
  Int cpu = select_first([cpu_override, if memory > 6.5 then 2 * ceil(memory / 13) else 1])

  command <<<
    set -euo pipefail
    vcf_files=~{write_lines(vcf_files)}
    cat $vcf_files | tr '\n' '\0' | xargs -0 mv -t .
    sed -i 's/^.*\///' $vcf_files
    ~{if ligate then "cat $vcf_files | tr '\\n' '\\0' | xargs -0 -n 1 bcftools index --force" else ""}
    bcftools concat \
      --no-version \
      --output-type ~{if uncompressed then "u" else "b"} \
      ~{if ligate then "--compact-PS" else ""} \
      --file-list $vcf_files \
      ~{if ligate then "--ligate" else ""} \
      ~{if cpu > 1 then "--threads " + (cpu - 1) else ""} \
      --output "~{filebase}.bcf"
    cat $vcf_files | tr '\n' '\0' | xargs -0 rm
    ~{if ligate then "cat $vcf_files | sed 's/$/.csi/' | tr '\\n' '\\0' | xargs -0 rm" else ""}
    bcftools index --force "~{filebase}.bcf"
    bcftools query --list-samples "~{filebase}.bcf" | wc -l
  >>>

  output {
    File vcf_file = filebase + ".bcf"
    File vcf_idx = filebase + ".bcf.csi"
    Int n_smpls = read_int(stdout())
  }

  runtime {
    memory: memory + " GiB"
    disks: "local-disk " + disk_size + " HDD"
    cpu: cpu
    docker: docker
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task vcf_import {
  input {
    File pgt_vcf_file
    File pgt_vcf_idx
    File unphased_vcf_file
    File unphased_vcf_idx
    Boolean uncompressed = false

    String docker
    Int? cpu_override
    Int? disk_size_override
    Float? memory_override
    Int preemptible = 1
    Int maxRetries = 0
  }

  Float pgt_size = size(pgt_vcf_file, "GiB")
  Float vcf_size = size(unphased_vcf_file, "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10.0 + 2.0 * pgt_size + 2.0 * vcf_size)])
  Float memory = select_first([memory_override, 3.5])
  Int cpu = select_first([cpu_override, if memory > 6.5 then 2 * ceil(memory / 13) else 1])
  String filebase = basename(basename(unphased_vcf_file, ".bcf"), ".vcf.gz")

  command <<<
    set -euo pipefail
    mv "~{pgt_vcf_file}" .
    mv "~{pgt_vcf_idx}" .
    mv "~{unphased_vcf_file}" .
    mv "~{unphased_vcf_idx}" .
    bcftools annotate \
      --no-version \
      --output-type ~{if uncompressed then "u" else "b"} \
      --output "~{filebase}.phased.bcf" \
      --annotations "~{basename(pgt_vcf_file)}" \
      --columns -FMT/GT \
      ~{if cpu > 1 then "--threads " + (cpu - 1) else ""} \
      "~{basename(unphased_vcf_file)}"
    bcftools index --force "~{filebase}.phased.bcf"
    bcftools query --list-samples "~{filebase}.phased.bcf" | wc -l
    rm "~{basename(pgt_vcf_file)}"
    rm "~{basename(pgt_vcf_idx)}"
    rm "~{basename(unphased_vcf_file)}"
    rm "~{basename(unphased_vcf_idx)}"
  >>>

  output {
    File vcf_file = filebase + ".phased.bcf"
    File vcf_idx = filebase + ".phased.bcf.csi"
    Int n_smpls = read_int(stdout())
  }

  runtime {
    memory: memory + " GiB"
    disks: "local-disk " + disk_size + " HDD"
    cpu: cpu
    docker: docker
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task vcf_mocha {
  input {
    String rule
    File pvcf_file
    File pvcf_idx
    File? sample_id_file
    File? computed_gender_file
    File? call_rate_file
    File? xcl_vcf_file
    File? xcl_vcf_idx
    File? cnp_file
    String? mhc_reg
    String? kir_reg
    String? mocha_extra_args
    String ext_string
    Boolean uncompressed = false

    String docker
    Int? cpu_override
    Int? disk_size_override
    Float? memory_override
    Int preemptible = 1
    Int maxRetries = 0
  }

  Float pvcf_size = size(pvcf_file, "GiB")
  Float xcl_size = size(xcl_vcf_file, "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10.0 + 2.0 * pvcf_size + xcl_size)])
  Float memory = select_first([memory_override, ceil(4.0 + 0.1 * pvcf_size)])
  Int cpu = select_first([cpu_override, if memory > 6.5 then 2 * ceil(memory / 13) else 1])
  String filebase = basename(basename(basename(pvcf_file, ".bcf"), ".vcf.gz"), '.phased')

  command <<<
    set -euo pipefail
    echo "~{sep="\n" select_all([pvcf_file, pvcf_idx, sample_id_file, computed_gender_file, call_rate_file, xcl_vcf_file, xcl_vcf_idx, cnp_file])}" | \
      tr '\n' '\0' | xargs -0 mv -t .
    ~{if defined(sample_id_file) && defined(computed_gender_file) then "paste -d $'\\t' \"" + basename(select_first([sample_id_file])) + "\" \"" +
      basename(select_first([computed_gender_file])) + "\" > computed_gender.map" else ""}
    ~{if defined(sample_id_file) && defined(call_rate_file) then "paste -d $'\\t' \"" + basename(select_first([sample_id_file])) + "\" \"" +
      basename(select_first([call_rate_file])) + "\" > call_rate.map" else ""}
    bcftools +mocha \
      --rules "~{rule}" \
      --no-version \
      ~{if defined(sample_id_file) && defined(computed_gender_file) then "--sex computed_gender.map" else ""} \
      ~{if defined(sample_id_file) && defined(call_rate_file) then "--call-rate call_rate.map" else ""} \
      ~{if defined(xcl_vcf_file) then "--variants \"^" + basename(select_first([xcl_vcf_file])) + "\"" else ""} \
      ~{if defined(cnp_file) then "--cnp \"" + basename(select_first([cnp_file])) + "\"" else ""} \
      ~{if defined(mhc_reg) then "--mhc \"" + mhc_reg + "\"" else ""} \
      ~{if defined(kir_reg) then "--kir \"" + kir_reg + "\"" else ""} \
      ~{if cpu > 1 then "--threads " + (cpu - 1) else ""} \
      --output-type ~{if uncompressed then "u" else "b"} \
      --output "~{filebase}.~{ext_string}.bcf" \
      --mosaic-calls "~{filebase}.calls.tsv" \
      --genome-stats "~{filebase}.stats.tsv" \
      --ucsc-bed "~{filebase}.ucsc.bed" \
      "~{basename(pvcf_file)}" \
      ~{mocha_extra_args}
    bcftools index --force "~{filebase}.~{ext_string}.bcf"
    bcftools query --list-samples "~{filebase}.~{ext_string}.bcf" | wc -l
    echo "~{sep="\n" select_all([pvcf_file, pvcf_idx, sample_id_file, computed_gender_file, call_rate_file, xcl_vcf_file, xcl_vcf_idx, cnp_file])}" | \
      sed 's/^.*\///' | tr '\n' '\0' | xargs -0 rm
    ~{if defined(sample_id_file) && defined(computed_gender_file) then "rm computed_gender.map" else ""}
  >>>

  output {
    File mocha_vcf_file = filebase + "." + ext_string + ".bcf"
    File mocha_vcf_idx = filebase + "." + ext_string + ".bcf.csi"
    Int n_smpls = read_int(stdout())
    File calls_tsv = filebase + ".calls.tsv"
    File stats_tsv = filebase + ".stats.tsv"
    File ucsc_bed = filebase + ".ucsc.bed"
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

# https://support.terra.bio/hc/en-us/community/posts/360071476431-Terra-fails-to-delocalize-files-listed-through-read-lines-
task mocha_plot {
  input {
    File vcf_file
    File vcf_idx
    File calls_tsv
    File stats_tsv
    File cyto_file
    Float call_rate_thr = 0.97
    Float baf_auto_thr = 0.03
    String? mocha_plot_extra_args
    Boolean wgs = false
    Boolean do_not_plot_sex_chromosomes = false

    String docker
    Int cpu = 1
    Int? disk_size_override
    Float memory = 3.5
    Int preemptible = 1
    Int maxRetries = 0
  }

  Float vcf_size = size(vcf_file, "GiB")
  Int disk_size = select_first([disk_size_override, ceil(10.0 + vcf_size)])

  command <<<
    set -euo pipefail
    mv "~{vcf_file}" .
    mv "~{vcf_idx}" .
    mv "~{calls_tsv}" .
    mv "~{stats_tsv}" .
    mv "~{cyto_file}" .
    mkdir pngs
    beg_pos=$(head -n1 "~{basename(calls_tsv)}" | tr '\t' '\n' | grep ^beg_)
    end_pos=$(head -n1 "~{basename(calls_tsv)}" | tr '\t' '\n' | grep ^end_)
    awk -F"\t" -v OFS="\t" -v beg_pos=$beg_pos -v end_pos=$end_pos '
      NR==FNR && FNR==1 {for (i=1; i<=NF; i++) f[$i] = i}
      NR==FNR && FNR>1 {sample_id=$(f["sample_id"]); call_rate=$(f["call_rate"]); baf_auto=$(f["baf_auto"])}
      NR==FNR && FNR>1 && (call_rate<~{call_rate_thr} || baf_auto>~{baf_auto_thr}) {xcl[sample_id]++}
      NR>FNR && FNR==1 {for (i=1; i<=NF; i++) g[$i] = i}
      NR>FNR && FNR>1 {sample_id=$(g["sample_id"]); chrom=$(g["chrom"]); beg=$(g[beg_pos]); end=$(g[end_pos]);
        len=$(g["length"]); p_arm=$(g["p_arm"]); q_arm=$(g["q_arm"]); bdev=$(g["bdev"]); rel_cov=$(g["rel_cov"]);
        lod_baf_phase=$(g["lod_baf_phase"]); type=$(g["type"]); if (lod_baf_phase=="nan") lod_baf_phase=0}
      NR>FNR && FNR>1 && !(sample_id in xcl) && rel_cov>0.5 && type!~"^CNP" &&~{if do_not_plot_sex_chromosomes
        then "\n    chrom!=\"X\" && chrom!=\"chrX\" && chrom!=\"Y\" && chrom!=\"chrY\" &&" else ""}
        ( len>5e6 + 5e6 * (p_arm!="N" && q_arm!="N") ||
          len>5e5 && (bdev<1/10 && rel_cov<2.5) && lod_baf_phase>10 ||
          rel_cov<2.1 && lod_baf_phase>10 ) {print sample_id,chrom,beg,end}' \
      "~{basename(stats_tsv)}" "~{basename(calls_tsv)}" > "~{basename(calls_tsv, ".tsv")}.coords.tsv"
    while read sample_id chrom beg_pos end_pos; do
      mocha_plot.R \
        --cytoband "~{basename(cyto_file)}" \
        ~{if wgs then "--wgs" else ""} \
        --mocha \
        --stats "~{basename(stats_tsv)}" \
        --png pngs/${sample_id//[:\\\/ $'\t']/_}.${chrom}_${beg_pos}_$end_pos.png \
        --vcf "~{basename(vcf_file)}" \
        --samples $sample_id \
        --regions $chrom:$beg_pos-$end_pos \
        ~{mocha_plot_extra_args}
      echo pngs/${sample_id//[:\\\/ $'\t']/_}.${chrom}_${beg_pos}_$end_pos.png
    done < "~{basename(calls_tsv, ".tsv")}.coords.tsv"
    rm "~{basename(calls_tsv, ".tsv")}.coords.tsv"
    rm "~{basename(vcf_file)}"
    rm "~{basename(vcf_idx)}"
    rm "~{basename(calls_tsv)}"
    rm "~{basename(stats_tsv)}"
    rm "~{basename(cyto_file)}"
  >>>

  output {
    Directory pngs = "pngs"
    Array[File] png_files = read_lines(stdout())
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}

task mocha_summary {
  input {
    File calls_tsv
    File stats_tsv
    Array[File]+ ucsc_beds
    File cyto_file
    String filebase
    Float call_rate_thr = 0.97
    Float baf_auto_thr = 0.03

    String docker
    Int cpu = 1
    Int disk_size = 10
    Float memory = 3.5
    Int preemptible = 1
    Int maxRetries = 0
  }

  command <<<
    set -euo pipefail
    ucsc_files=~{write_lines(ucsc_beds)}
    mv "~{calls_tsv}" .
    mv "~{stats_tsv}" .
    mv "~{cyto_file}" .
    cat $ucsc_files | tr '\n' '\0' | xargs -0 mv -t .
    sed -i 's/^.*\///' $ucsc_files
    summary_plot.R \
      --stats "~{basename(stats_tsv)}" \
      --calls "~{basename(calls_tsv)}" \
      --call-rate-thr ~{call_rate_thr} \
      --baf-auto-thr ~{baf_auto_thr} \
      --pdf "~{filebase}.summary.pdf"
    awk -F "\t" 'NR==FNR && FNR==1 {for (i=1; i<=NF; i++) f[$i] = i}
      NR==FNR && FNR>1 {sample_id=$(f["sample_id"]); call_rate=$(f["call_rate"]); baf_auto=$(f["baf_auto"])}
      NR==FNR && FNR>1 && (call_rate<~{call_rate_thr} || baf_auto>~{baf_auto_thr}) {xcl[sample_id]++}
      NR>FNR && FNR==1 {for (i=1; i<=NF; i++) g[$i] = i; print}
      NR>FNR && FNR>1 {sample_id=$(g["sample_id"]); len=$(g["length"]); p_arm=$(g["p_arm"]); q_arm=$(g["q_arm"]);
        bdev=$(g["bdev"]); rel_cov=$(g["rel_cov"]); lod_baf_phase=$(g["lod_baf_phase"]); type=$(g["type"]);
        if (lod_baf_phase=="nan") lod_baf_phase=0}
      NR>FNR && FNR>1 && !(sample_id in xcl) && rel_cov>0.5 && type!~"^CNP" &&
        ( len>5e6 + 5e6 * (p_arm!="N" && q_arm!="N") ||
          len>5e5 && (bdev<1/10 && rel_cov<2.5) && lod_baf_phase>10 ||
          rel_cov<2.1 && lod_baf_phase>10 )' \
      "~{basename(stats_tsv)}" "~{basename(calls_tsv)}" > "~{basename(calls_tsv, ".tsv")}.filtered.tsv"
    pileup_plot.R \
      --cytoband "~{basename(cyto_file)}" \
      --stats "~{basename(stats_tsv)}" \
      --calls "~{basename(calls_tsv, ".tsv")}.filtered.tsv" \
      --call-rate-thr ~{call_rate_thr} \
      --baf-auto-thr ~{baf_auto_thr} \
      --pdf "~{filebase}.pileup.pdf"
    rm "~{basename(calls_tsv, ".tsv")}.filtered.tsv"
    cat $ucsc_files | tr '\n' '\0' | xargs -0 cat | \
      awk '{if ($0~"^track") track=$0; else bed[track]=bed[track]$0"\n"}
      END {for (track in bed) printf track"\n"bed[track]}' > "~{filebase}.ucsc.bed"
    rm "~{basename(calls_tsv)}"
    rm "~{basename(stats_tsv)}"
    rm "~{basename(cyto_file)}"
    cat $ucsc_files | tr '\n' '\0' | xargs -0 rm
  >>>

  output {
    File ucsc_bed = filebase + ".ucsc.bed"
    File summary_pdf = filebase + ".summary.pdf"
    File pileup_pdf = filebase + ".pileup.pdf"
  }

  runtime {
    docker: docker
    cpu: cpu
    disks: "local-disk " + disk_size + " HDD"
    memory: memory + " GiB"
    preemptible: preemptible
    maxRetries: maxRetries
  }
}