
use LWP::Simple;

#                         #
#  INITIALIZE VARIABLES   #
#                         #

#Initalize the GRAIL website address where results will be 
#obtained from
$grailSite = 'http://www.broadinstitute.org/mpg/grail/results/';

#Initialize CutOffDates parameters that define the point
#at which GRAIL website became compatible with this script
$cutOffDate = 9;
$cutOffMonth = 2;
$cutOffYear = 2011;

#                         #
#  GET USER INPUT         #
#                         #

#get GRAIL ID from user
$id = $ARGV[0];
if(length($id)==0) {$id = "1297250676";}

if(exists($ARGV[1]))
   {$thinParam = $ARGV[1];}	
else
   {$thinParam = 0;}

#                         #
#  GET ONLINE INPUT       #
#                         #

print("Getting the results of GRAIL Run $id online...\n");

#Get results (out.html) url from the GRAIL site
my $url1 = $grailSite . $id . '_out.html';
my $content = get $url1;


#Get query regions from GRAIL
my $url2 = $grailSite  . $id . '_querySetRegions.html';
my $regions = get $url2;


#                             #
#  ERROR CHECK ONLINE INPUT   #
#                             #


#Make sure online files exist
die "Couldn't get $url.\n\n" unless defined $content;
die "Couldn't get $url.\n\n" unless defined $regions;

#Make sure that basic (_out.html) structure is intact
die "File error - please check $url1.\n\n"
  unless (($content =~ /\<B\>Analysis Summary\<\/b\>/) &&
	  ($content =~ /Genomic Regions/) &&
	  ($content =~ /REGION/));

#Make sure that basic (_querySetRegions.html) structure is intact
die "File error - please check $url2.\n\n"
  unless (($regions =~ /REGION\s+CHR\s+START\s+STOP\s+GENES/));

#Make sure seed and query regions were set equal in the run
die "Query and seed regions not equal.\n\n" 
  unless($content =~ /Queries and Seed Regions are set equal/);

#Make sure that the GRAIL query was since the cutOffDate

#get DATE of query
$content =~ /TIME\: (\d\d\d\d)-(\d\d)-(\d\d)/;
$year = $1; $month = $2+0; $date = $3+0;
#check date
die "GRAIL query must be after $cutOffYear-$cutOffMonth-$cutOffDate to be compatabile with this script.\n\n"
 unless( ($year>$cutOffYear) || (($year==$cutOffYear)&&($month>$cutOffMonth)) ||(($year==$cutOffYear)&&($month==$cutOffMonth)&& ($date>=$cutOffDate)));


#Make sure that the thinParameter is a floating point number
die "the second argument, the thinning parameter must be a positive floating point number"
  unless($thinParam =~ /^[+]?[0-9]*\.?[0-9]+$/);


#                                             #
#  EXTRACT INFORMATION ABOUT REGIONS & GENES  #
#                                             #


#go through each line of the region url content,
#one at a time, extract names of genes, and regions

@eachRegion = split(/\n/, $regions);
for($i=1; $i<@eachRegion; $i++)
  {

    @eachRegionLine = split(/\t/, $eachRegion[$i]);
    $regionName = $eachRegionLine[0];      #get the region name    
    $regionNameArray[$i-1] = $regionName;  #store region name in Array

    #Pull out gene names from each line

    $eachRegionLine[4] =~  s/<[^>]+>//g;   #remove url tags
    @geneList = split(/\s+/, $eachRegionLine[4]);
   
    $flag = 0; #flag variable - if a gene is mapping to multiple regions
               #  flag will become marked 1.

    for($j=0; $j<@geneList;$j++)
      {
	$geneName = $geneList[$j]; #Pull the actual gene Name
	
	#Error Check -check if the gene has already been mapped to some other region
	if( (exists($mapGeneToRegion{$geneName})) && ( $regionName ne $mapGeneToRegion{$geneName}))
	  {
	    print(stderr "The $geneName gene maps to multiple regions : $mapGeneToRegion{$geneName} and $regionName.\n\n");
	    $flag = 1;
	  }
	#If the gene is already mapped don't double count
	elsif(exists($mapGeneToRegion{$geneName}))
	  {  }
	#If the gene is not mapped to this reigon, note the region
	else
	  {
	    #store - map of genes to regions - in a hash
	    $mapGeneToRegion{$geneName} = $regionName; 

	    #store - list of genes for each region in an hash.
	    $regionNameHash{$regionName}{$geneName} =1;

	    #count - genes per region
	    if(exists( $regionGeneCount{$regionName})){
	      $regionGeneCount{$regionName} = $regionGeneCount{$regionName}+1;}
	    else{
	      $regionGeneCount{$regionName} = 1;} #if 1st gene, initilize 
	    
	  }	
      }	

    if($flag==1)
      {
	die("Genes mapping to multiple regions, please correct and rerun GRAIL query\n\n");
      }	

  }


#                                                        #
#  EXTRACT GENE & PAIRWISE GENE SIMILARITY INFORMATION   #
#                                                        #


#Go through the output file, line by line, pull key information

@contentArray = split(/\n/, $content); #split output content by line


$t = "";  #Intialize t; line at which we get gene specific informaiton
$GN = ""; # initialize Gene Count

#Go through each output line by line
for($i=0;$i<@contentArray;$i++)
  {
    
    #Pull out the number of genes in the analysis
    if($contentArray[$i] =~ /^A total of (\d+) genes/)
      {
	$GN = $1;	
      }	

    #go through each line until we find the "GENE"
    #header
    if($contentArray[$i] =~ /^GENE/)
      {
	$t = $i+1;
      }


    #if the line is past GENE, then get information
    if($i>$t && length($t))
      {
	$contentArray[$i] =~  s/<[^>]+>//g; #remove url tags
	$contentArray[$i] =~  s/\,//g; #remove commas

	#get each line that has information about a gene
	@geneInfo = split(/\s+/, $contentArray[$i]);

	$gene = $geneInfo[0];	#Get Gene name

	#ERROR CHECK - Make sure that the gene maps to a region
	print stderr "Warning : The $gene gene does not map to any gene region.\n\n" 
	  unless (exists($mapGeneToRegion{$gene})); 
	  
	$pvalues{$gene} = $geneInfo[1]; #Pull Gene pvalue

	
	for($j=2; $j<@geneInfo; $j++)
	  {
	    #pull out similar genes, and ranks
	    $geneInfo[$j] =~ /(.+)\((\d+)\)/; 
	    $gene2 = $1;
	    $rank = $2;

	    #ERROR CHECK - similar genes must map to regions also
	    print stderr  "WARNING: The $gene2 gene does not map to any gene region.\n\n" 
	      unless (exists($mapGeneToRegion{$gene}));

	    #Store rank similarity
	    $pairRanks{$gene}{$gene2} = $rank;
	    
	    #Convert The Rank into a connectivity SCORE

	    #Mult hyp test correction the percentile of the rank,
	    #for number of genes
	    $score{$gene}{$gene2} = 1- ( (1-$rank/$GN)**$regionGeneCount{$mapGeneToRegion{$gene2}} );

	    #Convert p-val of similarity into a thickness score

	    if(exists ($regionGeneCount{$mapGeneToRegion{$gene2}}))
	      {$score{$gene}{$gene2} = ($score{$gene}{$gene2}<0.1)   #Must be <0.1
		 *log($score{$gene}{$gene2}/0.1)*(-1/log(10));       #log scale it
	     }	
	  }	
      }	


  }




#                                   #
#  PRINT OUT DATA FILE              #
#  CONNECTS REGIONS TO GENES        #
#   ... AND HAS GENE SCORES (PVALS) #
#                                   #

print("Writing connection and data files for GRAIL Run $id...\n");
open(DATA, ">$id\_data");

#Iterate through each region
foreach $region (keys(%regionNameHash))
  {
    #pull genes from each region
    foreach $gene (keys(%{ $regionNameHash{$region} }))
      {
	if(!(exists($pvalues{$gene})))
	  {
	    $pvalues{$gene} = 1;
	  }	
	#Print - REGION, GENE, PVAL
	print(DATA "$region\t$gene\t$pvalues{$gene}\n");
      }
  }
close(DATA);

#                                          #
#  PRINT OUT CONNECTION FILE               #
#  CONNECTS GENES TO GENES                 #
#   ... AND STRENGHT OF CONNECTION (SCORE) #
#                                          #


open("CON", ">$id\_connections");

@allGenes = (keys(%pvalues)); #array of all genes

#Iterate through all gene pairs
for($i=0; ($i+1)<@allGenes; $i++)
  {
    $gene = $allGenes[$i];

    for($j=($i+1); $j<@allGenes; $j++)
      {
	$gene2 = $allGenes[$j];

	#If pair not scored, set to zero
	#Pair is ordered, so need to look at both
	#  ordered pairs
	if(!exists($score{$gene}{$gene2}))
	      {$score{$gene}{$gene2} = 0;}
	
	if(!exists($score{$gene2}{$gene}))
	  {$score{$gene2}{$gene} = 0 ;}
	  
	#average score for each ordered pair
	$num = ($score{$gene2}{$gene} + $score{$gene}{$gene2})/2;

	#print only non-zero pairs


	if($num>3){$num=3};
	#thin out $num
	$num = (1+$thinParam)*$num - $thinParam*3;
	
	if($num<0) {$num=0}

	#PRINT ONLY GENES THAT CAN BE MAPPED
	if( ($num>0) && (exists($mapGeneToRegion{$gene})) && (exists($mapGeneToRegion{$gene2})) ){
	  
	  print(CON "$gene\t$gene2\t$num\n");
	  
	}
      }
  }

