#!/usr/bin/perl -w 

use strict;

if ($#ARGV ne 3) {
  print "perl $0 
    <protein sequences(../03_BLAST/Xanthomonas.fasta)>
    <transposase file (../05_TransposaseDB/Xanthomonas.faa.transposase)>
    <BLAST result(../03_BLAST/Xanthomonas.faa_transposase.blastp.e20)> 
    <the_orthomcl_group_file(../04_orthomcl/Xanthomonas_groups.txt)>\n";
  exit;
}

my ($seq_file, $transposase_file, $BLAST_res, $grp_file) = @ARGV;
my $cutoff_len   = 60;
my $cutoff_ratio = 0.5;

my ($string, $title, @tmp);

# LOAD SEQ
my %LENGTH = ();
my %PREFIX = ();
open(IN, $seq_file)||die ":$!";
while (<IN>) {
  chomp;
  if (/^\>(\S+)/) {
    $string = $1;
    @tmp = split(/\|/, $string);
    $PREFIX{$tmp[1]}= $tmp[0];
  } else {
    if (defined $LENGTH{$tmp[1]}) {
      $LENGTH{$tmp[1]} += length($_);
    } else {
      $LENGTH{$tmp[1]} = length($_);
    }
  }
}
close(IN)||die ":$!";

# LOAD transposase
my %TRANS_ID = ();
open(IN,$transposase_file)||die ":$!";
while (<IN>) {
  chomp;
  if (/^\>(\S+)/) {
    $TRANS_ID{$1} = 1;
  }
}
close(IN)||die ":$!";

# LOAD BLAST RES
my %IS = ();
open(IN,$BLAST_res)||die ":$!";
while (<IN>) {
  chomp;
  if (/^(\S+)/) {
    $IS{$1} = 1;
  }
}
close(IN)||die ":$!";

# Remove all proteins < 60 AAs
# For IS, if the percentage of proteins in this row >= 50%, remove this row. otherwise, keep this row.

my (@array, @res);
my $guard;
my $IS_num;
# LOAD GROUP FILE
my %HIT = ();
open(IN,$grp_file)||die ":$!";
open(OUT, ">$grp_file.FINAL_GRP")||die ":$!";
while (<IN>) {
  chomp;
  if (/^(\S+)\s+/) {
    $title  = $1;
    $string = $';
    @array  = split(/\s+/,$string);
    @res    = ();
    $guard  = 0;
    $IS_num = 0;
    for (my $i=0; $i<=$#array; $i++) {
      @tmp = split(/\|/, $array[$i]);
      $HIT{$tmp[1]} = 1;
      if ($LENGTH{$tmp[1]} < $cutoff_len) {
      } else {
        push(@res, $array[$i]);
        $guard++;
        if ((defined $IS{$tmp[1]})||(defined $TRANS_ID{$tmp[1]})) {
          $IS_num++;
        }
      }
    }
    # PRINT
    if ($#res < 0) {
    } elsif ($IS_num/$guard >= $cutoff_ratio) {
    } else {
      print OUT $title." ".join(" ", @res)."\n";
    }
  }
}

my $key;
foreach $key (sort keys(%LENGTH)) {
  if (defined $HIT{$key}) {
  } elsif ((defined $IS{$key})||(defined $TRANS_ID{$key})) {
  } elsif ($LENGTH{$key} >= $cutoff_len) {
    print OUT "Xan_SING: ".$PREFIX{$key}."|".$key."\n";
  }
}
close(OUT)||die ":$!";
close(IN)||die ":$!";
