%       Copyright, February 2009, Ayellet Segre, Mark Daly, David Altshuler, Broad Institute, 7 Cambridge Center, Cambridge, MA 02142, USA
%
%       This code is part of the MAGENTA software package vs1.1 written in Matlab version R2009b, that tests for enrichment of multiple modest
%       genetic effects on a given complex disease or trait, in predefined sets of genes or loci. 
%       The main code from which all functions are run is called: Run_MAGENTA_vs1_May10_2010.m
%
%       This software accompanies the paper:
%       Ayellet V. Segre, DIAGRAM Consortium, MAGIC investigators, Leif Groop, Vamsi K. Mootha, Mark J. Daly, and David Altshuler. Common Inherited Variation in
%       Mitochondrial Genes is not Enriched for Associations with Type 2 Diabetes or Related Glycemic Traits. In revision at PLoS Genetics, May 2010.
%       
%       If your work benefits from the use of the MAGENTA software package, please cite the reference above.
%       
%       For questions or comments please contact Ayellet Segre at asegre@broadinstitute.org. You can check for updates at: http://www.broadinstitute.org/mpg/magenta
%       
%       Disclaimer: This software is distributed as is. The authors take no responsibility for any use or misuse.
%       
%       Last updated: May 10, 2010
% 

function [GSEA_p]=GSEA_GWAS_RankSum_092409(Uncorr_score,Corr_score,top_percen_cutoffs,num_rounds,find_gene_set,score_signif_direct,choose_unique_genes)
% GSEA_GWAS_RankSum_092409  Permutation p-values for a gene set's rank-sum z-score.
%
%   The rank-sum statistic of the given gene set is computed with
%   GSEA_RankSumStat_092409 and compared with the same statistic computed
%   for num_rounds randomly drawn gene sets of matching size. Within each
%   random set no two genes share the same values in Uncorr_score(:,1:2)
%   (per the original comments: chromosome number and position of the
%   gene's best SNP).
%
%   Inputs:
%     Uncorr_score        - matrix with one row per gene; only columns 1:2
%                           are read here (identity of the best SNP).
%     Corr_score          - vector of corrected gene scores, one per gene,
%                           indexed consistently with the rows of Uncorr_score.
%     top_percen_cutoffs  - not used by this function; kept so that existing
%                           callers do not have to change.
%     num_rounds          - number of random gene sets to draw.
%     find_gene_set       - indices (into Corr_score) of the genes that belong
%                           to the gene set.
%     score_signif_direct - passed through unchanged to GSEA_RankSumStat_092409.
%     choose_unique_genes - not used by this function; kept for interface
%                           compatibility.
%
%   Output:
%     GSEA_p - 1x3 row vector:
%       GSEA_p(1) fraction of random sets whose z-score is >= the observed one
%       GSEA_p(2) fraction of random sets whose z-score is <= the observed one
%       GSEA_p(3) 1 if the observed z-score is >= 0, otherwise 0
%     A fraction of exactly 0 is replaced by the lower bound 0.99/num_rounds.
%
%   Requires randsample (Statistics Toolbox).

num_genes = length(Corr_score);
all_gene_ind = 1:num_genes;          % candidate pool, hoisted out of the loops

% The random sets have as many genes as the gene set has genes with a
% (non-NaN) corrected score.
gene_set_size = nnz(~isnan(Corr_score(find_gene_set)));

% Identity of the best SNP of every gene, kept in the ORIGINAL gene order so
% that the indices sampled below are directly valid indices into Corr_score
% when they are handed to GSEA_RankSumStat_092409.
snp_chrpos = Uncorr_score(all_gene_ind,1:2);

% Without this check the resampling loop below could never finish when the
% genome offers fewer distinct best SNPs than the requested set size.
num_unique_snps = size(unique(snp_chrpos,'rows'),1);
if (gene_set_size > num_unique_snps)
    error('GSEA_GWAS_RankSum:notEnoughUniqueGenes', ...
        'Cannot draw %d genes with distinct best SNPs: only %d distinct best SNPs are available.', ...
        gene_set_size, num_unique_snps);
end

% NOTE(review): genes whose corrected score is NaN are part of the sampling
% pool (as they always were), and find_gene_set is passed on unfiltered;
% confirm that GSEA_RankSumStat_092409 handles NaN scores as intended.

rand_geneset_find = nan(gene_set_size,num_rounds);  % one random gene set (gene indices) per column

for i=1:num_rounds

    % draw gene_set_size distinct genes, then keep one gene per best SNP
    rand_geneset_ind = randsample(all_gene_ind,gene_set_size);
    [unused_rows,keep_ind] = unique(snp_chrpos(rand_geneset_ind,:),'rows','first');
    Saved_gene_ind = rand_geneset_ind(keep_ind);

    % top up the set until it holds gene_set_size genes with distinct best SNPs
    while (length(Saved_gene_ind)<gene_set_size)

        remain_num = gene_set_size - length(Saved_gene_ind); % genes still missing
        remain_ind = setdiff(all_gene_ind,Saved_gene_ind);   % genes not yet in the set

        if isscalar(remain_ind)
            % randsample(n,k) with a scalar n samples from 1:n rather than
            % from the one-element population, so take the single gene directly.
            rand_geneset_ind_add = remain_ind;
        else
            rand_geneset_ind_add = randsample(remain_ind,remain_num);
        end

        Updated_gene_ind = unique([Saved_gene_ind, rand_geneset_ind_add]);

        [unused_rows,keep_ind] = unique(snp_chrpos(Updated_gene_ind,:),'rows','first');
        Saved_gene_ind = Updated_gene_ind(keep_ind);

    end

    rand_geneset_find(:,i) = Saved_gene_ind(:);

end % for each permutation run

% observed statistic: wilcoxon rank sum z-score, pvalue
Obs_GS_RankSum_z_pval = GSEA_RankSumStat_092409(Corr_score,find_gene_set,score_signif_direct);

% same statistic for every random gene set
Rand_GS_RankSum_z_pval = nan(num_rounds,2);
for p=1:num_rounds
    Rand_GS_RankSum_z_pval(p,1:2) = GSEA_RankSumStat_092409(Corr_score,rand_geneset_find(:,p),score_signif_direct);
end

obs_z = Obs_GS_RankSum_z_pval(1,1);
rand_z = Rand_GS_RankSum_z_pval(:,1);

GSEA_p = nan(1,3);

% Upper tail: fraction of random sets with the same or a larger z-score.
GSEA_p(1) = nnz(rand_z >= obs_z)/num_rounds;

% Lower tail: fraction of random sets with the same or a smaller z-score.
% Ties are counted in both tails so that neither p-value is anti-conservative.
GSEA_p(2) = nnz(rand_z <= obs_z)/num_rounds;

% A permutation p-value of 0 only means "below the resolution of num_rounds".
if (GSEA_p(1)==0)
    GSEA_p(1) = 0.99/num_rounds; % lower bound
end

if (GSEA_p(2)==0)
    GSEA_p(2) = 0.99/num_rounds; % lower bound
end

% Direction flag: 1 when the observed z-score is non-negative, 0 otherwise
% (including when it is NaN).
GSEA_p(3) = double(obs_z >= 0);

