#! /usr/bin/perl
# $Id: gmap_build.pl.in 110636 2013-10-10 00:22:02Z twu $

use warnings;	

# Default genome database directory: used as the destination directory when
# no -D option is given on the command line.
# NOTE(review): this looks like a path substituted at build/configure time
# (here it points into a temporary build tree) -- confirm it is valid on the
# machine where the script is actually run.
$gmapdb = "/tmp/RtmpDO8InZ/Rbuild176b5772603050/gmapR/src/../inst/usr/share";
# Package version string, shown in the usage text.
$package_version = "2013-11-01";

use File::Copy;	
use Getopt::Long;

# Both upper- and lower-case single-letter options are used below
# (-b/-B, -d/-D), so option names must be matched case-sensitively, and
# abbreviations of long option names are not accepted.
Getopt::Long::Configure(qw(no_auto_abbrev no_ignore_case_always));


# Default values (may be overridden by -B, -T, -q and -w respectively)
$bindir = "/tmp/RtmpDO8InZ/Rbuild176b5772603050/gmapR/src/../inst/usr/bin";
$builddir = ".";
$sampling = 3;
$sleeptime = 2;

# Parse the command line.  GetOptions returns false when it meets an unknown
# or malformed option; in that case stop immediately instead of silently
# continuing with default settings.
GetOptions(
    '0|no-sarray' => \$skip_sarray_p, # skip suffix array

    'B=s' => \$bindir,          # binary directory
    'T=s' => \$builddir,        # temporary build directory

    'D|dir=s' => \$destdir,     # destination directory
    'd|db=s' => \$dbname,       # genome name
    'n|names=s' => \$chrnamefile,   # substitute chromosomal names

    'M|mdfile=s' => \$mdfile,   # NCBI MD file

    'z|compression=s' => \$compression_types, # compression types
    'k|kmer=s' => \$kmersize, # k-mer size for genomic index (allowed: 16 or less)
    'b|basesize=s' => \$basesize, # offsetscomp basesize
    'q=s' => \$sampling,           # sampling interval for genome (default: 3)

    's|sort=s' => \$sorting,    # Sorting
    'g|gunzip' => \$gunzipp,    # gunzip files
    'w=s' => \$sleeptime, # waits (sleeps) this many seconds between steps.  Useful if there is a delay in the filesystem.

    'c|circular=s' => \$circular,  # Circular chromosomes

    'e|nmessages=s' => \$nmessages  # Max number of warnings or messages to print
    ) or do {
    print_usage();
    die "Invalid command-line options; see usage above.\n";
};



# Settle the k-mer size first: the base-size defaults below depend on it.
unless (defined($kmersize)) {
    print STDERR "-k flag not specified, so building with default 15-mers\n";
    $kmersize = 15;
}

# Compression types are passed through as a -z flag.  If one of the
# comma-separated types is "none", the base size is forced to the k-mer size.
$compression_flag = "";
if (defined($compression_types)) {
    $compression_flag = "-z $compression_types";

    foreach my $ctype (split(",", $compression_types)) {
        next if $ctype ne "none";

        if (!defined($basesize)) {
            print STDERR "-b flag not specified, but since compression type is none, setting base size to be the k-mer size\n";
            $basesize = $kmersize;
        } elsif ($basesize != $kmersize) {
            print STDERR "Since compression type is none, setting base size to be the k-mer size\n";
            $basesize = $kmersize;
        }
    }
}

# Base size still unset: 12 for the default 15-mers, otherwise the k-mer size.
unless (defined($basesize)) {
    if ($kmersize != 15) {
        print STDERR "-k flag specified (not as 15), but not -b, so building with base size == kmer size\n";
        $basesize = $kmersize;
    } else {
        print STDERR "-b flag not specified, so building with default base size of 12\n";
        $basesize = 12;
    }
}


# A genome name (-d) is mandatory.
unless (defined($dbname)) {
    print_usage();
    die "Must specify genome database name with -d flag.";
}

# A -d value of the form "dir/name" is split into a directory and a name.
if ($dbname =~ /(\S+)\/(\S+)/) {
    ($dbdir, $dbname) = ($1, $2);
    if (defined($destdir) && $destdir =~ /\S/) {
        # NOTE(review): -D is extended with the database name here, not with
        # the directory part of -d -- confirm this is intended.
        $destdir = "$destdir/$dbname";
    } else {
        $destdir = $dbdir;
    }
}

$dbname =~ s{/$}{};	# In case user gives -d argument with trailing slash

# Fall back to the default database directory when -D is missing or blank.
unless (defined($destdir) && $destdir =~ /\S/) {
    print STDERR "Destination directory not defined with -D flag, so writing to $gmapdb\n";
    $destdir = $gmapdb;
}

# Translate the parsed options into the flag strings that are interpolated
# into the external command lines further down.
if (!defined($sorting)) {
    # Default is to order genomes
    print STDERR "Sorting chromosomes in chrom order.  To turn off or sort other ways, use the -s flag.\n";
    $chr_order_flag = "";
} else {
    $chr_order_flag = "-s $sorting";
}

$gunzip_flag    = defined($gunzipp)   ? "-g"            : "";
$circular_flag  = defined($circular)  ? "-c $circular"  : "";
$nmessages_flag = defined($nmessages) ? "-e $nmessages" : "";

# The suffix array is built unless -0/--no-sarray was given.
$sarrayp = defined($skip_sarray_p) ? 0 : 1;

# Remaining arguments are the FASTA files; each one is wrapped in double
# quotes and the results are joined with single spaces.
$genome_fasta = join(" ", map { "\"$_\"" } @ARGV);



#####################################################################################

# Build pipeline: the steps run strictly in this order; the suffix array
# step is included only when it has not been disabled.
my @build_steps = (
    \&create_coords,
    \&create_genome_version,
    \&make_contig,
    \&compress_genome,
    \&unshuffle_genome,
    \&create_index_offsets,
    \&create_index_positions,
);
push @build_steps, \&make_suffix_array if $sarrayp == 1;
push @build_steps, \&install_db;

foreach my $step (@build_steps) {
    $step->();
}

exit;


#####################################################################################


# create_coords: runs md_coords (when an NCBI MD file was given) or fa_coords
# on the FASTA input, with "$builddir/$dbname.coords" as the -o output file.
# Dies if the command returns a non-zero status, then sleeps for $sleeptime
# seconds.
sub create_coords {
    my $coords_file = "$builddir/$dbname.coords";
    my @words;

    if (defined($mdfile)) {
        # MD file cannot specify that a chromosome is circular
        @words = ("$bindir/md_coords", "-o", $coords_file, $mdfile);
    } else {
        @words = ("$bindir/fa_coords", $gunzip_flag, $circular_flag);
        # Optional file of substitute chromosome names
        push @words, "-n", $chrnamefile if defined($chrnamefile);
        push @words, "-o", $coords_file, $genome_fasta;
    }

    my $command = join(" ", @words);
    print STDERR "Running $command\n";
    my $status = system($command);
    if ($status != 0) {
        die "$command failed with return code $status";
    }
    sleep($sleeptime);
    return;
}

# check_coords_exists: dies unless "$builddir/$dbname.coords" exists and is
# not empty.  A missing file and an empty file are reported with different
# messages so the cause is clear.  Returns nothing on success.
sub check_coords_exists {
    my $coords_file = "$builddir/$dbname.coords";

    if (!-e $coords_file) {
        die "ERROR: $coords_file not found.\n";
    }
    if (!-s $coords_file) {
        die "ERROR: $coords_file is empty.\n";
    }
    return;
}

# create_genome_version: writes the genome name followed by a newline to
# "$builddir/$dbname.version", then sleeps for $sleeptime seconds.
# Dies with the file name and system error if the file cannot be opened,
# written or closed.
sub create_genome_version {
    my $version_file = "$builddir/$dbname.version";

    # Three-argument open: the path is never interpreted as a mode
    open(my $fh, '>', $version_file)
        or die "Cannot open $version_file for writing: $!";
    print {$fh} "$dbname\n"
        or die "Cannot write to $version_file: $!";
    close($fh)
        or die "Cannot close $version_file: $!";

    sleep($sleeptime);
    return;
}

# make_contig: after checking that the coords file is usable, pipes the
# output of gmap_process into gmapindex (-A mode, with the chromosome
# ordering flag).  Dies on a non-zero status, then sleeps for $sleeptime
# seconds.
# NOTE(review): the command goes through the shell as a pipeline, so the
# status checked is presumably that of the last command only -- confirm.
sub make_contig {
    check_coords_exists();

    my $reader  = "$bindir/gmap_process $gunzip_flag -c $builddir/$dbname.coords $genome_fasta";
    my $indexer = "$bindir/gmapindex $nmessages_flag -d $dbname -D $builddir -A $chr_order_flag";
    my $pipeline = "$reader | $indexer";

    print STDERR "Running $pipeline\n";
    my $status = system($pipeline);
    die "$pipeline failed with return code $status" if $status != 0;

    sleep($sleeptime);
    return;
}

# compress_genome: pipes the output of gmap_process into gmapindex run in
# -G mode, using $builddir for both -F and -D.  Dies on a non-zero status,
# then sleeps for $sleeptime seconds.
sub compress_genome {
    my $pipeline = join(" | ",
        "$bindir/gmap_process $gunzip_flag -c $builddir/$dbname.coords $genome_fasta",
        "$bindir/gmapindex $nmessages_flag -d $dbname -F $builddir -D $builddir -G");

    print STDERR "Running $pipeline\n";
    my $status = system($pipeline);
    unless ($status == 0) {
        die "$pipeline failed with return code $status";
    }

    sleep($sleeptime);
    return;
}

# unshuffle_genome: feeds "$builddir/$dbname.genomecomp" to gmapindex -U and
# redirects its output to "$builddir/$dbname.genomebits".  Dies on a non-zero
# status, then sleeps for $sleeptime seconds.
sub unshuffle_genome {
    my $prefix = "$builddir/$dbname";
    my $command = sprintf("cat %s.genomecomp | %s/gmapindex -d %s -U > %s.genomebits",
                          $prefix, $bindir, $dbname, $prefix);

    print STDERR "Running $command\n";
    my $status = system($command);
    die "$command failed with return code $status" unless $status == 0;

    sleep($sleeptime);
    return;
}

# full_ASCII_genome: checks the coords file, runs make_contig(), and then
# pipes gmap_process into gmapindex with the "-l -G" flags.  Dies on a
# non-zero status, then sleeps for $sleeptime seconds.
# Not called from the main pipeline in this file.
sub full_ASCII_genome {
    check_coords_exists();
    make_contig();

    my @stages = (
        "$bindir/gmap_process $gunzip_flag -c $builddir/$dbname.coords $genome_fasta",
        "$bindir/gmapindex $nmessages_flag -d $dbname -F $builddir -D $builddir -l -G",
    );
    my $pipeline = join(" | ", @stages);

    print STDERR "Running $pipeline\n";
    my $status = system($pipeline);
    if ($status != 0) {
        die "$pipeline failed with return code $status";
    }

    sleep($sleeptime);
    return;
}

# create_index_offsets: feeds "$builddir/$dbname.genomecomp" to gmapindex in
# -O mode with the chosen base size, k-mer size, sampling interval and
# compression flag.  Dies on a non-zero status, then sleeps for $sleeptime
# seconds.
sub create_index_offsets {
    my $index_params = "-b $basesize -k $kmersize -q $sampling";
    my $source  = "cat $builddir/$dbname.genomecomp";
    my $indexer = "$bindir/gmapindex $index_params $nmessages_flag -d $dbname -F $builddir -D $builddir -O $compression_flag";
    my $pipeline = "$source | $indexer";

    print STDERR "Running $pipeline\n";
    my $status = system($pipeline);
    die "$pipeline failed with return code $status" if $status != 0;

    sleep($sleeptime);
    return;
}

# create_index_positions: feeds "$builddir/$dbname.genomecomp" to gmapindex in
# -P mode with the chosen base size, k-mer size and sampling interval.  Dies
# on a non-zero status, then sleeps for $sleeptime seconds.
sub create_index_positions {
    my @indexer = (
        "$bindir/gmapindex",
        "-b", $basesize, "-k", $kmersize, "-q", $sampling,
        $nmessages_flag,
        "-d", $dbname, "-F", $builddir, "-D", $builddir,
        "-P",
    );
    my $pipeline = "cat $builddir/$dbname.genomecomp | " . join(" ", @indexer);

    print STDERR "Running $pipeline\n";
    my $status = system($pipeline);
    unless ($status == 0) {
        die "$pipeline failed with return code $status";
    }

    sleep($sleeptime);
    return;
}

# make_suffix_array: runs gmapindex in -S mode, using $builddir for both -F
# and -D.  Dies on a non-zero status, then sleeps for $sleeptime seconds.
sub make_suffix_array {
    my $command = join(" ",
        "$bindir/gmapindex", "-d", $dbname, "-F", $builddir, "-D", $builddir, "-S");

    print STDERR "Running $command\n";
    my $status = system($command);
    die "$command failed with return code $status" unless $status == 0;

    sleep($sleeptime);
    return;
}

# install_db: installs the generated files into "$destdir/$dbname".
#
# Builds the list of file suffixes from $kmersize, $basesize, $sampling and
# $sarrayp, creates the destination directory and its "$dbname.maps"
# subdirectory, moves every listed file that exists in $builddir into place
# (mode 644), and finally removes the temporary coords file.  Listed files
# that do not exist in $builddir are skipped.
#
# External commands are run in list form, so they do not pass through the
# shell and names containing characters such as '$' or '"' are handled
# literally.  Dies if a directory cannot be created or a file cannot be
# moved; chmod and symlink failures only produce a warning.
sub install_db {
    my @suffixes = (
        "chromosome",
        "chromosome.iit",
        "chrsubset",
        "contig",
        "contig.iit",
        "genomecomp",
        "genomebits",
        "version");

    if ($sarrayp == 1) {
        push @suffixes, "sarray", "saindex", "salcp", "salcpptrs", "salcpcomp";
    }

    # Maps an installed suffix to the name of a compatibility symlink
    my %symlink;

    if ($kmersize > $basesize) {
        push @suffixes, sprintf("ref%02d%02d%dbitpackptrs", $kmersize - 3, $kmersize, $sampling);
        push @suffixes, sprintf("ref%02d%02d%dbitpackcomp", $kmersize - 3, $kmersize, $sampling);
        push @suffixes, sprintf("ref%02d%02d%dgammaptrs", $basesize, $kmersize, $sampling);
        push @suffixes, sprintf("ref%02d%02d%doffsetscomp", $basesize, $kmersize, $sampling);
    } else {
        my $offsets_suffix = sprintf("ref%02d%doffsets", $kmersize, $sampling);
        push @suffixes, $offsets_suffix;

        # For backward compatibility with versions before 2013-07, which expect to see offsetscomp
        $symlink{$offsets_suffix} = sprintf("ref%02d%02d%doffsetscomp", $kmersize, $kmersize, $sampling);
    }
    push @suffixes, sprintf("ref%02d%dpositions", $kmersize, $sampling);

    my $install_dir = "$destdir/$dbname";
    my $maps_dir = "$install_dir/$dbname.maps";

    print STDERR "Copying files to directory $install_dir\n";
    foreach my $dir ($install_dir, $maps_dir) {
        system("mkdir", "-p", $dir) == 0
            or die "Could not create directory $dir (mkdir status $?)\n";
    }
    system("chmod", "755", $maps_dir) == 0
        or warn "Warning: could not set permissions on $maps_dir\n";

    foreach my $suffix (@suffixes) {
        my $source = "$builddir/$dbname.$suffix";
        next unless -e $source;

        my $target = "$install_dir/$dbname.$suffix";
        system("mv", $source, $target) == 0
            or die "Could not move $source to $target (mv status $?)\n";
        system("chmod", "644", $target) == 0
            or warn "Warning: could not set permissions on $target\n";

        if (defined($symlink{$suffix})) {
            # Relative link, so the database directory stays relocatable
            my $link = "$install_dir/$dbname.$symlink{$suffix}";
            system("ln", "-s", "$dbname.$suffix", $link) == 0
                or warn "Warning: could not create symlink $link\n";
        }
    }

    system("rm", "-f", "$builddir/$dbname.coords");
    return;
}



# print_usage: prints the command-line help text, including the package
# version, to standard output.  Takes no arguments and returns nothing.
sub print_usage {
  print <<TEXT1;

gmap_build: Builds a gmap database for a genome to be used by GMAP or GSNAP.
Part of GMAP package, version $package_version.

A simplified alternative to using the program gmap_setup, which creates a Makefile.

Usage: gmap_build [options...] -d <genomename> <fasta_files>

Options:
    -D, --dir=STRING          Destination directory for installation (defaults to gmapdb directory specified at configure time)
    -d, --db=STRING           Genome name

    -n, --names=STRING        Substitute names for chromosomes, provided in a file.  The file should have one line
                                for each chromosome name to be changed, with the original FASTA name in column 1 and
                                the desired chromosome name in column 2.  This provides an easy way to change the
                                names of chromosomes, for example, to add or remove the "chr" prefix.

    -T STRING                 Temporary build directory (may need to specify if you run out of space in your current directory)

    -M, --mdfile=STRING       Use MD file from NCBI for mapping contigs to chromosomal coordinates

    -z, --compression=STRING  Use given compression types (separated by commas; default is bitpack,gamma)
                                bitpack - optimized for modern computers with SIMD instructions (recommended)
                                gamma - old implementation.  Needed only for backward compatibility with old versions
                                all - create all available compression types, currently bitpack and gamma
                                none - do not compress offset files

    -k, --kmer=INT            k-mer value for genomic index (allowed: 16 or less, default is 15)
    -b, --basesize=INT        Basesize for offsetscomp (if kmer chosen and not 15, default is kmer; else default is 12)
    -q INT                    sampling interval for genome (allowed: 1-3, default 3)

    -s, --sort=STRING         Sort chromosomes using given method:
			        none - use chromosomes as found in FASTA file(s)
			        alpha - sort chromosomes alphabetically (chr10 before chr 1)
			        numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY
			        chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU

    -g, --gunzip              Files are gzipped, so need to gunzip each file first
    -w INT                    Wait (sleep) this many seconds after each step (default 2)

    -c, --circular=STRING     Circular chromosomes (either a list of chromosomes separated by a comma, or
                                a filename containing circular chromosomes, one per line)

    -e, --nmessages=INT       Maximum number of messages (warnings, contig reports) to report (default 50)

    -0, --no-sarray           Skip build of suffix array
TEXT1
  return;
}

