%       Copyright, February 2009, Ayellet Segre, Mark Daly, David Altshuler, Broad Institute, 7 Cambridge Center, Cambridge, MA 02142, USA
%
%       This code is part of the MAGENTA software package vs1.1 written in Matlab version R2009b, that tests for enrichment of multiple modest
%       genetic effects on a given complex disease or trait, in predefined sets of genes or loci.
%       The main code from which all functions are run is called: Run_MAGENTA_vs1_May10_2010.m
%
%       This software accompanies the paper:
%       Ayellet V. Segre, DIAGRAM Consortium, MAGIC investigators, Leif Groop, Vamsi K. Mootha, Mark J. Daly, and David Altshuler. Common Inherited Variation in
%       Mitochondrial Genes is not Enriched for Associations with Type 2 Diabetes or Related Glycemic Traits. In revision at PLoS Genetics, May 2010.
%
%       If your work benefits from the use of the MAGENTA  software package please cite the reference above.
%
%       For questions or comments please contact Ayellet Segre at asegre@broadinstitute.org. You can check for updates at: http://www.broadinstitute.org/mpg/magenta
%
%       Disclaimer: This software is distributed as is. The authors take no responsibility for any use or misuse.
%
%       Last updated: May 10, 2010
%

function [Scores_X,num_SNPs_per_gene,Best_SNP_rs]=ExtractGeneScoreBestSNP_PvalZscore_NumSNPsPerGene_092909(GeneSubsetChrPos,All_SNP_scores_pos,interval_up, interval_down,strand,best_pval_or_z,SNP_rs)
% Assign to each gene the most significant SNP found in its target region.
%
% The target region of a gene is its transcript span extended by
% 'interval_up' on the upstream side and 'interval_down' on the downstream
% side; which side is "upstream" depends on the gene's strand.
%
% Function based on 'ExtractGeneScoreBestSNPCountGenesPerSNP_101408'.
% Modified from: Simul_GSEA_power_121908.m, for: GeneSetEnrichAnal_GWAS_pipeline_021809.m
%
% Inputs:
%   GeneSubsetChrPos   - gene positions; columns: (1) chr number, (2) most left
%                        transcript boundary, (3) most right transcript boundary
%   All_SNP_scores_pos - SNP association scores; columns: (1) SNP chr num,
%                        (2) SNP chr pos, (3) SNP association test statistic,
%                        (4) SNP association p-value
%   interval_up        - extension added on the upstream side of the gene
%   interval_down      - extension added on the downstream side of the gene
%   strand             - per-gene strand indicator; 1 = positive strand, any
%                        other value is treated as negative strand
%   best_pval_or_z     - 0: pick the SNP with the largest absolute test statistic;
%                        any other value (documented as 1): pick the SNP with
%                        the smallest p-value
%   SNP_rs             - (optional) cell array of SNP identifiers, indexed like the
%                        rows of All_SNP_scores_pos. If omitted or of length <= 1,
%                        no identifiers are reported.
%
% Outputs:
%   Scores_X           - num_genes x 4; columns: (1) SNP chr num, (2) SNP chr pos,
%                        (3) test statistic (original sign kept), (4) p-value of
%                        the best SNP; a row of NaN if the gene has no scored SNP
%   num_SNPs_per_gene  - num_genes x 1; number of SNPs in the target region with a
%                        non-NaN test statistic; 0 if the region holds no SNPs,
%                        NaN if it holds SNPs but none could be scored
%   Best_SNP_rs        - 1 x num_genes cell; identifier of the best SNP, or NaN

if (nargin < 7)
    SNP_rs = [];  % identifiers are optional
end

num_genes = size(GeneSubsetChrPos,1);

% Every gene starts as "no best SNP"; rows are overwritten when one is found.
Scores_X = NaN(num_genes,4);  % columns: (1) SNP chr num, (2) SNP chr pos, (3) test statistic, (4) p-value

Best_SNP_rs = cell(1,num_genes);
Best_SNP_rs(:) = {NaN};

num_SNPs_per_gene = zeros(num_genes,1);  % vector that records number of SNPs assigned to a given gene

count_num_Genes_NaN_scores = 0;  % number of genes that have SNPs in their target region, none of which received an association score (NaN)

% Loop-invariant views of the SNP table
SNP_chr = All_SNP_scores_pos(:,1);
SNP_pos = All_SNP_scores_pos(:,2);
report_rs = (length(SNP_rs) > 1);

for gene=1:num_genes

    % Upstream is to the left of a positive-strand gene and to the right of a
    % negative-strand gene.
    if (strand(gene)==1)
        left_extension  = interval_up;
        right_extension = interval_down;
    else
        left_extension  = interval_down;
        right_extension = interval_up;
    end

    SNPs_near_gene = find(SNP_chr == GeneSubsetChrPos(gene,1) & ...
                          SNP_pos >= (GeneSubsetChrPos(gene,2) - left_extension) & ...
                          SNP_pos <= (GeneSubsetChrPos(gene,3) + right_extension));

    if isempty(SNPs_near_gene)
        continue  % no SNPs in the target region: outputs keep NaN / 0 defaults
    end

    local_stat = All_SNP_scores_pos(SNPs_near_gene,3);
    num_stat_not_NaN = sum(~isnan(local_stat));

    best_local = [];  % index (within SNPs_near_gene) of the best SNP

    if (num_stat_not_NaN > 0)

        if (best_pval_or_z==0)  % use test statistic to find most significant local SNP

            % max/min skip NaN entries. Keep the value with the larger
            % magnitude; a tie goes to the positive one.
            max_pos_val = max(local_stat);
            min_neg_val = min(local_stat);

            if (abs(max_pos_val) >= abs(min_neg_val))
                best_val = max_pos_val;
            else
                best_val = min_neg_val;
            end

            best_local = find(local_stat == best_val, 1);

        else  % use p-value to find most significant local SNP

            local_pval = All_SNP_scores_pos(SNPs_near_gene,4);

            % Empty when every p-value in the region is NaN (NaN == NaN is false)
            best_local = find(local_pval == min(local_pval), 1);

        end
    end

    if isempty(best_local)
        % SNPs are present but none carries a usable score
        count_num_Genes_NaN_scores = count_num_Genes_NaN_scores + 1;
        num_SNPs_per_gene(gene,1) = NaN;
        continue
    end

    best_SNP = SNPs_near_gene(best_local);  % row in All_SNP_scores_pos

    num_SNPs_per_gene(gene,1) = num_stat_not_NaN;  % number of SNPs per gene with an association score

    Scores_X(gene,:) = All_SNP_scores_pos(best_SNP,1:4);  % chr num, chr pos, signed test statistic, p-value

    if report_rs
        Best_SNP_rs{gene} = SNP_rs{best_SNP};
    end

end   % for each gene

% disp does not interpret escape sequences, so no '\n' is appended.
disp(['There are ', num2str(count_num_Genes_NaN_scores) , ' genes with SNPs in their target region that were not assigned an association score (NaN).'])
