#!/usr/bin/perl
unless(exists $ARGV[0])
{
	print "==============================================================================\n";
	print "                         ROS FRAGMENT SEARCH\n";
	print "==============================================================================\n";
	print " -fas     fasta of sequence (hhblits will be used to build an alignment)\n";
	print " -msa     fasta of alignment (input alignment will be used)\n";
	print "          -fas or -msa [REQUIRED]\n";
	print "=============================================================================\n";
	die();
}
while ($arg = shift())
{
	if ($arg =~ m/^-/)
	{
		$arg = lc($arg);
		if ($arg eq "-fas")	{$FAS = shift(); next;}
		if ($arg eq "-msa")	{$MSA = shift(); next;}
	}
}
my $MODE = "ERROR";
if(defined $FAS and -e $FAS){$MODE = "FAS";}
if(defined $MSA and -e $MSA){$MODE = "MSA";}
if($MODE eq "ERROR"){die("-fas '$FAS' or -msa '$MSA' not found");}


my %AA_frq = ('A',78.05,'R',51.29,'N',44.87,'D',53.64,'C',19.25,'E',62.95,'Q',42.64,'G',73.77,'H',21.99,'I',51.42,'L',90.19,'K',57.44,'M',22.43,'F',38.56,'P',52.03,'S',71.20,'T',58.41,'W',13.30,'Y',32.16,'V',64.41);
my @AA_order = ("A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V");

# Where the NCBI programs have been installed
my $ncbidir = "/home/work/tools/blast-2.2.26/bin";

# Where the PSIPRED V2 programs have been installed
my $execdir = "/home/work/tools/psipred/bin";

# Where the PSIPRED V2 data files have been installed
my $datadir = "/home/work/tools/psipred/data";

#system("export HHLIB=/work/krypton/lib/hh");
#my $hhblits = "/work/krypton/bin/hhblits -o /dev/null -d /lab/databases/hhsuite/uniprot/uniprot20_2015_06/uniprot20_2015_06 -cpu 20";
#my $hhsearch = "/work/krypton/bin/hhsearch";
#my $hhmake = "/work/krypton/bin/hhmake";

my $i_a3m;
if($MODE eq "FAS")
{
	$i_a3m = substr($FAS,0,-3)."a3m";
	unless(-e $i_a3m){system("$hhblits -i $FAS -oa3m $i_a3m");}
}
else{$i_a3m = $MSA;}

my $header;
my $sequence;
my $n = 0;
open(A3M,$i_a3m);
while($line = <A3M> and $n <= 1)
{
	chomp($line);
	if(substr($line,0,1) eq ">")
	{
		if($n == 0){$header = $line;}
		$n++;
	}
	else{$sequence .= $line;}
}
close(A3M);
unless(-e "$i_a3m.fasta")
{
	open(FAS,">$i_a3m.fasta");print FAS "$header\n$sequence\n";close(FAS);
}

my $ss2 = "$i_a3m.csb.hhblits.ss2";
my $chk = "$i_a3m.checkpoint";

unless(-e $ss2)
{
	system("/home/work/tools/csblast/bin/csbuild -i $i_a3m -I a3m -D /home/work/tools/csblast/data/K4000.crf -o $i_a3m.chk -O chk");
	system("echo '$i_a3m.chk' > $i_a3m.pn");
	system("echo '$i_a3m.fasta' > $i_a3m.sn");

	system("$ncbidir/makemat -P $i_a3m");
	system("$execdir/psipred $i_a3m.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 > $i_a3m.csb.hhblits.ss");
	system("$execdir/psipass2 $datadir/weights_p2.dat 1 1.0 1.0 $i_a3m.csb.hhblits.ss2 $i_a3m.csb.hhblits.ss > $i_a3m.csb.hhblits.horiz");

	system("rm $i_a3m.pn $i_a3m.sn $i_a3m.mtx $i_a3m.mn $i_a3m.aux");

	my @checkpoint_matrix;
	@checkpoint_matrix = parse_checkpoint_file("$i_a3m.chk");
	#@checkpoint_matrix = finish_checkpoint_matrix( $sequence, @checkpoint_matrix );
	write_checkpoint_file( "$i_a3m.checkpoint", $sequence, @checkpoint_matrix );
}


my $fp = "fragment_picker.default.linuxgccrelease";
$fp .= " -in:file:vall /home/work/tools/pick_frag/id30_d110719_sprf-7m-50f_filt.vall";
$fp .= " -frags:n_candidates 1000 -frags:n_frags 200";
$fp .= " -frags:write_ca_coordinates";
$fp .= " -in:file:fasta $i_a3m.fasta";
$fp .= " -in:file:checkpoint $chk";
$fp .= " -frags:ss_pred $ss2 psipred";
$fp .= " -out:file:frag_prefix $i_a3m";
if($nohoms){$fp .= " -frags:denied_pdb $i_a3m.nohoms.homolog";}
for my $frag (3,9)
{
	unless(-e "$i_a3m.200.$frag\mers")
	{
		my $FP = $fp;
		$FP .= " -frags:scoring:config /home/work/tools/pick_frag/t000__scores$frag.cfg";
		$FP .= " -frags:describe_fragments $i_a3m.$frag.score";
		$FP .= " -frags:frag_sizes $frag";
		system("($FP; gzip $i_a3m.$frag.score.200.$frag\mers) ");
	}
}

sub parse_checkpoint_file
{
    my $filename = shift;
    my $buf;
    my $seqlen;
    my $seqstr;
    my $i;
    my $j;
    my @aa_order = split( //, 'ACDEFGHIKLMNPQRSTVWY' );
    my @altschul_mapping =
      ( 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18 );
    my @w;
    my @output;

    open( INPUT, $filename ) or die("Couldn't open $filename for reading.\n");

    read( INPUT, $buf, 4 ) or die("Couldn't read $filename!\n");
    $seqlen = unpack( "i", $buf );

    print_debug("Sequence length: $seqlen");

    read( INPUT, $buf, $seqlen ) or die("Premature end: $filename.\n");
    $seqstr = unpack( "a$seqlen", $buf );

    for ( $i = 0 ; $i < $seqlen ; ++$i ) {
        read( INPUT, $buf, 160 ) or die("Premature end: $filename, line: $i\n");
        @w = unpack( "d20", $buf );

        for ( $j = 0 ; $j < 20 ; ++$j ) {
            $output[$i][$j] = $w[ $altschul_mapping[$j] ];
        }
    }

    return @output;
}

sub write_checkpoint_file
{
    my ($filename, $sequence, @matrix) = @_;
    my $row;
    my $col;
    my @seq = split( //, $sequence );

    open( OUTPUT, ">$filename" );

    die("Length mismatch between sequence and checkpoint matrix!\n")
      unless ( length($sequence) == scalar(@matrix) );

    print OUTPUT scalar(@matrix), "\n";

    for ( $row = 0 ; $row <= $#matrix ; ++$row ) {
        print OUTPUT "$seq[$row] ";
        for ( $col = 0 ; $col < 20 ; ++$col ) {
            printf OUTPUT "%6.4f ", $matrix[$row][$col];
        }
        print OUTPUT "\n";
    }

    print OUTPUT "END";

    close(OUTPUT);
}

sub finish_checkpoint_matrix {
    my ( $s, @matrix ) = @_;
    my @sequence = split( //, $s );
    my $i;
    my $j;
    my $sum;
    my @words;
    my @b62;
    my @blos_aa =
      ( 0, 14, 11, 2, 1, 13, 3, 5, 6, 7, 9, 8, 10, 4, 12, 15, 16, 18, 19, 17 );

    my %aaNum = (
        A => 0,
        C => 1,
        D => 2,
        E => 3,
        F => 4,
        G => 5,
        H => 6,
        I => 7,
        K => 8,
        L => 9,
        M => 10,
        N => 11,
        P => 12,
        Q => 13,
        R => 14,
        S => 15,
        T => 16,
        V => 17,
        W => 18,
        Y => 19,
        X => 0
    );

    ( length($s) == scalar(@matrix) )
      or die("Length mismatch between sequence and checkpoint file!\n");

    my $blosum62 = <<BLOSUM;
#  BLOSUM Clustered Target Frequencies=qij
#  Blocks Database = /data/blocks_5.0/blocks.dat
#  Cluster Percentage: >= 62
   A      R      N      D      C      Q      E      G      H      I      L      K      M      F      P      S      T      W      Y      V
0.0215
0.0023 0.0178
0.0019 0.0020 0.0141
0.0022 0.0016 0.0037 0.0213
0.0016 0.0004 0.0004 0.0004 0.0119
0.0019 0.0025 0.0015 0.0016 0.0003 0.0073
0.0030 0.0027 0.0022 0.0049 0.0004 0.0035 0.0161
0.0058 0.0017 0.0029 0.0025 0.0008 0.0014 0.0019 0.0378
0.0011 0.0012 0.0014 0.0010 0.0002 0.0010 0.0014 0.0010 0.0093
0.0032 0.0012 0.0010 0.0012 0.0011 0.0009 0.0012 0.0014 0.0006 0.0184
0.0044 0.0024 0.0014 0.0015 0.0016 0.0016 0.0020 0.0021 0.0010 0.0114 0.0371
0.0033 0.0062 0.0024 0.0024 0.0005 0.0031 0.0041 0.0025 0.0012 0.0016 0.0025 0.0161
0.0013 0.0008 0.0005 0.0005 0.0004 0.0007 0.0007 0.0007 0.0004 0.0025 0.0049 0.0009 0.0040
0.0016 0.0009 0.0008 0.0008 0.0005 0.0005 0.0009 0.0012 0.0008 0.0030 0.0054 0.0009 0.0012 0.0183
0.0022 0.0010 0.0009 0.0012 0.0004 0.0008 0.0014 0.0014 0.0005 0.0010 0.0014 0.0016 0.0004 0.0005 0.0191
0.0063 0.0023 0.0031 0.0028 0.0010 0.0019 0.0030 0.0038 0.0011 0.0017 0.0024 0.0031 0.0009 0.0012 0.0017 0.0126
0.0037 0.0018 0.0022 0.0019 0.0009 0.0014 0.0020 0.0022 0.0007 0.0027 0.0033 0.0023 0.0010 0.0012 0.0014 0.0047 0.0125
0.0004 0.0003 0.0002 0.0002 0.0001 0.0002 0.0003 0.0004 0.0002 0.0004 0.0007 0.0003 0.0002 0.0008 0.0001 0.0003 0.0003 0.0065
0.0013 0.0009 0.0007 0.0006 0.0003 0.0007 0.0009 0.0008 0.0015 0.0014 0.0022 0.0010 0.0006 0.0042 0.0005 0.0010 0.0009 0.0009 0.0102
0.0051 0.0016 0.0012 0.0013 0.0014 0.0012 0.0017 0.0018 0.0006 0.0120 0.0095 0.0019 0.0023 0.0026 0.0012 0.0024 0.0036 0.0004 0.0015 0.0196
BLOSUM

    $i = 0;
    my @blosum62 = split( /\n/, $blosum62 );

    # read/build the blosum matrix
    foreach my $blosumline (@blosum62) {
        next if ( $blosumline !~ /^\d/ );
        chomp $blosumline;
        @words = split( /\s/, $blosumline );

        for ( $j = 0 ; $j <= $#words ; ++$j ) {
            $b62[ $blos_aa[$i] ][ $blos_aa[$j] ] = $words[$j];
            $b62[ $blos_aa[$j] ][ $blos_aa[$i] ] = $words[$j];
        }

        ++$i;
    }

    # normalize the blosum matrix so that each row sums to 1.0
    for ( $i = 0 ; $i < 20 ; ++$i ) {
        $sum = 0.0;

        for ( $j = 0 ; $j < 20 ; ++$j ) {
            $sum += $b62[$i][$j];
        }

        for ( $j = 0 ; $j < 20 ; ++$j ) {
            $b62[$i][$j] = ( $b62[$i][$j] / $sum );
        }
    }

    # substitute appropriate blosum row for 0 rows
    for ( $i = 0 ; $i <= $#matrix ; ++$i ) {
        $sum = 0;

        for ( $j = 0 ; $j < 20 ; ++$j ) {
            $sum += $matrix[$i][$j];
        }

        if ( $sum == 0 ) {
            for ( $j = 0 ; $j < 20 ; ++$j ) {
                $matrix[$i][$j] = $b62[ $aaNum{ $sequence[$i] } ][$j];
            }
        }
    }

    return @matrix;
}


sub print_debug
{
	foreach my $line (@_)
	{
		print "$line\n";
	}
}

sub SR{my $str = $_[0]; $str =~ s/ //g; return $str;}

#    1 L   -5 -5 -6 -7 -5 -5 -6 -7 -3 -2  5 -5  1  1 -6 -5 -5  2  7 -3    0   0   0   0   0   0   0   0   0   0  56   0   3   5   0   0   0   2  35   0  1.48 0.56
#01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
#0000000000111111111122222222223333333333444444444455555555556666666666777777777788888888889999999999


sub exclude_outn
{
	my ($runid) = @_;
	my @hits;
	my $hit;
	my $hit_pdb;
	my $hit_chain;
	my %uniq_hits;

	open( OUTN, "$runid.outn" );@hits = grep( /^>/, <OUTN> );close(OUTN);
	foreach $hit (@hits) {$uniq_hits{$hit} = 1;}

	open( EXCL, ">$runid.homolog" );
	foreach $hit ( sort keys %uniq_hits )
	{
		($hit_pdb) = $hit =~ /^>\s*(\w+)/;
		$hit_chain = substr( $hit_pdb, 4, 1 );
		$hit_pdb   = substr( $hit_pdb, 0, 4 );

		$hit_pdb =~ tr/[A-Z]/[a-z]/;
		$hit_chain = '_' if ( $hit_chain eq ' ' );
		$hit_chain = '_' if ( $hit_chain eq '0' );

		print EXCL "$hit_pdb$hit_chain\n";
	}
	close(EXCL);
}
