#!/bin/bash
#
# READ_ME file (08 Dec 2015)
# 
# 1000 Genomes Project Phase 3 data release (version 5a) in VCF format for use with Beagle version 4.x
#    Data Source: ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/*
#
# NOTES:
# 
# 1) Markers with <5 copies of the reference allele or <5 copies of the non-reference alleles have been excluded.
#
# 2) Structural variants have been excluded.
#
# 3) All non-unique identifiers in the ID column are removed from the ID column
#
# 4) Additional marker filtering may be performed using the gtstats.jar and filterlines.jar utilities
#
# 5) Sample information is in files: 
#      integrated_call_samples.20130502.ALL.ped
#      integrated_call_samples_v3.20130502.ALL.panel
#
# 6) Male haploid chromosome X genotypes are encoded as diploid homozygous genotypes.
#
############################################################################
#  The following shell script was used to create the files in this folder  #
############################################################################
#
gts="java -ea -jar gtstats.jar"
fl="java -ea -jar filterlines.jar"
rmids="java -ea -jar remove.ids.jar"
simplify="java -ea -jar simplify-vcf.jar"
min_minor="5"

src="../phase3.v5a/"
mkdir ${src}
cd ${src}
wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/*
cd -

for chr in $(seq 1 21); do
  echo "chr${chr}"
  in="${src}ALL.chr${chr}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
  vcf="chr${chr}.1kg.phase3.v5a.vcf.gz"
  excl="chr${chr}.1kg.phase3.v5a.excl"

  zcat ${in} | grep -v 'SVTYPE' | ${gts} | ${fl} \# -13 ${min_minor} 1000000 | cut -f2 > ${excl}
  zcat ${in} | grep -v 'SVTYPE' | grep -v '^#' | cut -f3 | tr ';' '\n' | sort | uniq -d > chr${chr}.dup.id

  # BEGIN: add 4 duplicate markers to exclusion list
  if [ ${chr} == "8" ]; then echo "89329148"; fi >> ${excl}
  if [ ${chr} == "12" ]; then echo "8400000"; fi >> ${excl}
  if [ ${chr} == "14" ]; then echo "21649957"; fi  >> ${excl}
  if [ ${chr} == "17" ]; then echo "1144632"; fi  >> ${excl}
  # END:  add 4 duplicate markers to exclusion list

  zcat ${in} | grep -v 'SVTYPE' | ${fl} \# \-2 ${excl} | ${simplify} | ${rmids} chr${chr}.dup.id | bgzip -c > ${vcf}
  tabix ${vcf}
done

chr="X"
in="${src}ALL.chr${chr}.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz"
vcf="chr${chr}.1kg.phase3.v5a.vcf.gz"
excl="chr${chr}.1kg.phase3.v5a.excl"

### sed command recodes male chrom X genotypes as homozygote diploid genotypes
### sed command makes temporary change to floating point numbers and diploid genotypes to permit use of word-boundary '\>'
cat_in=" zcat ${in} | grep -v 'SVTYPE' | sed \
-e 's/\t0\t\.\t/\t111\tPASS\t/' \
-e 's/0\tPASS/222\tPASS/' \
-e 's/\([0-9]\)\./\1WXYZ/g' \
-e 's/\([0-9]\)|\([0-9]\)/\1X\2/g' \
-e 's/\t\([0-9]\)\>/\t\1|\1/g' \
-e 's/\([0-9]\)WXYZ/\1./g' \
-e 's/\([0-9]\)X\([0-9]\)/\1|\2/g';"

echo ${chr}
  eval ${cat_in} | grep -v '^#' | cut -f3 | tr ';' '\n' | sort | uniq -d > chr${chr}.dup.id
  eval ${cat_in} | ${gts} | ${fl} \# -13 ${min_minor} 1000000 | cut -f2 > ${excl}

  # BEGIN: add duplicate markers to exclusion list
  if [ ${chr} == "X" ]; then echo "5375295"; fi >> ${excl}
  if [ ${chr} == "X" ]; then echo "32362662"; fi >> ${excl}
  if [ ${chr} == "X" ]; then echo "68204280"; fi >> ${excl}
  # END:  add duplicate markers to exclusion list

eval ${cat_in} | ${fl} \# \-2 ${excl} | ${simplify} | ${rmids} chr${chr}.dup.id | bgzip -c > ${vcf}
tabix ${vcf}
