#! /bin/bash
# The Broad Institute
# SOFTWARE COPYRIGHT NOTICE AGREEMENT
# This software and its documentation are copyright 2009 by the
# Broad Institute/Massachusetts Institute of Technology. All rights are
# reserved.

# This software is supplied without any warranty or guaranteed support
# whatsoever. Neither the Broad Institute nor MIT can be responsible for its
# use, misuse, or functionality.

# Run recalibrator on given BAM file, then re-index

# Name of this script, as shown in the usage message.
# "$0" is quoted so that a script path containing whitespace is not word-split.
PROGNAME=$(basename -- "$0")

# Abort on the first command that exits with a non-zero status.
set -e

# Print a two-line usage summary on stderr.
# Globals: PROGNAME (read)
usage() {
    printf '%s\n' \
        "USAGE: $PROGNAME <Bam-to-be-recalibrated>" \
        "Recalibrates the given BAM, replaces the original file with the recalibrated one, and then generates a BAM index." \
        >&2
}

# One-off recalibrator build that is not checked in: it recomputes the NM tag,
# deletes the U2 tag, and removes UQ and NM from unmapped reads.
ONE_OFF_GATK='/seq/dirseq/alecw/RecalibrateBackProcess/GATK-Picard.jar'
# samtools binary used below for indexing and flagstat.
SAMTOOLS='/seq/software/picard/current/3rd_party/samtools/samtools'

# Exactly one argument, the BAM to recalibrate, is required.
if [[ $# -ne 1 ]]; then
    echo "ERROR: Incorrect number of arguments." >&2
    usage
    exit 1
fi

# File name of the BAM with and without its .bam extension.
# "$1" is quoted so that paths containing whitespace are not word-split.
bam=$(basename -- "$1")
base=$(basename -- "$1" .bam)

# basename only strips the suffix when it is present, so identical results
# mean the argument does not end in .bam.  Both sides are quoted: unquoted,
# a name with whitespace makes the test itself fail, which `if` treats as
# false and would silently skip this check.
if [[ "$bam" == "$base" ]]; then
    # Diagnostics go to stderr, like every other error in this script.
    echo "Argument $1 does not end with .bam extension." >&2
    usage
    exit 1
fi

# Directory containing the BAM and its params.txt.
bamdir=$(dirname -- "$1")

# Get reference fasta: the text after '=' on the REFERENCE_SEQUENCE= line of
# params.txt.  -F is the portable spelling of GNU awk's --field-separator.
refseq=$(awk -F '=' '/REFERENCE_SEQUENCE=/{print $2}' "$bamdir/params.txt")

if [[ ! -e "$refseq" ]]; then
    echo "ERROR: refseq $refseq not found." >&2
    exit 1
fi

# Check if there is a dbsnp file for the reference fasta: same directory and
# base name as the fasta, with the .fasta extension replaced by .dbsnp.
# "$refseq" is quoted so a reference path with whitespace is handled intact.
dbsnp="$(dirname -- "$refseq")/$(basename -- "$refseq" .fasta).dbsnp"

# dbsnp_args is either empty or a "-B <binding>" option pair; it is expanded
# unquoted on the GATK command line so that the empty case adds no argument.
if [[ -e "$dbsnp" ]]; then
    dbsnp_args="-B dbsnp,PicardDbSNP,$dbsnp"
else
    dbsnp_args=
fi

# Print a BAM file name with a trailing ".aligned.bam" or
# ".aligned.duplicates_marked.bam" removed.  The suffix is matched literally
# and only at the end of the name; names with neither suffix are printed
# unchanged.
#   $1 - BAM file name (no directory part)
strip_aligned_suffix() {
    case "$1" in
        *.aligned.duplicates_marked.bam)
            printf '%s\n' "${1%.aligned.duplicates_marked.bam}"
            ;;
        *.aligned.bam)
            printf '%s\n' "${1%.aligned.bam}"
            ;;
        *)
            printf '%s\n' "$1"
            ;;
    esac
}

# Extract flowcell and lane from the bam file name, in order to create
# name of recalibration table file.
fc_lane=$(strip_aligned_suffix "$bam")

recal_data=$bamdir/$fc_lane.recal_data.csv

# Refuse to run if a recalibration table already exists, plain or gzipped.
if [[ -e "$recal_data" ]]; then
    echo "ERROR: recal file $recal_data already exists." >&2
    exit 1
fi

if [[ -e "$recal_data.gz" ]]; then
    echo "ERROR: recal file $recal_data.gz already exists." >&2
    exit 1
fi

# Require finished.txt to exist in the BAM directory before doing any work.
# The path is quoted: unquoted, a directory name containing whitespace makes
# the test itself fail, which `if` treats as false and silently skips the check.
if [[ ! -e "$bamdir/finished.txt" ]]; then
    echo "ERROR: No finished file in $bamdir." >&2
    exit 1
fi

# Trace every command from here on.
set -x
# Generate recalibration table
# $dbsnp_args is deliberately left unquoted: it is either empty or a
# "-B <binding>" option pair that must split into separate words.
# shellcheck disable=SC2086
java -Xmx4g -jar "$ONE_OFF_GATK" -R "$refseq" $dbsnp_args -I "$1" \
    -T CountCovariates -cov ReadGroupCovariate -cov QualityScoreCovariate \
    -cov CycleCovariate -cov DinucCovariate -cov TileCovariate \
    -recalFile "$recal_data" --use_original_quals -fmq0

# Recalibrate into a new bam
newbam=$bamdir/$base.new.bam

java -Xmx2g -jar "$ONE_OFF_GATK" -R "$refseq" -I "$1" -T TableRecalibration \
    -recalFile "$recal_data" --use_original_quals \
    -outputBam "$newbam"

# reindex

newindex=$bamdir/$base.new.bai
"$SAMTOOLS" index "$newbam" "$newindex"

# Run flagstat on old and new bams.  They should be identical.
new_flagstat_out=$(mktemp "$bamdir/new.flagstat.XXXXXXXXXX")

# Capture the outcome instead of letting `set -e` abort mid-way, so that the
# temporary file is always removed and a diagnostic is printed on failure.
flagstat_status=0
{
    "$SAMTOOLS" flagstat "$newbam" > "$new_flagstat_out" &&
        "$SAMTOOLS" flagstat "$1" | cmp - "$new_flagstat_out"
} || flagstat_status=$?
rm -f -- "$new_flagstat_out"

if (( flagstat_status != 0 )); then
    echo "ERROR: flagstat of $newbam could not be verified against $1." >&2
    exit "$flagstat_status"
fi

# Compress the recalibration table.
# All paths below are quoted so that whitespace in them is preserved, and
# `--` keeps a path starting with '-' from being parsed as an option.
gzip -- "$recal_data"

index=$bamdir/$base.bai

# Replace old bam with new bam
mv -- "$newbam" "$1"
mv -- "$newindex" "$index"


# Update finished.txt
touch -- "$bamdir/finished.txt"

ls -l -- "$1" "$index" "$bamdir/finished.txt"


echo "Recalibration and re-index succeeded."
