% Dec. 15, 2008
% Ayellet Segre, Altshuler Lab

% Program extracts the genes that lie within a given physical distance (in bp) of each input SNP

function Extract_genes_around_SNPs(Input_rs_file_name,bound,disease_name)
% Extract_genes_around_SNPs  List the genes found within +/- 'bound' bp of each input SNP.
%
% Commands for running program on cluster (unix command line):
% -----------------------------------------------------------
% (1) First type: 'use matlab'
% (2) Then type:
% bsub -o output_file_name -e error_file_name matlab -nodisplay -nodesktop -nosplash -nojvm -r "Extract_genes_around_SNPs('Input_rs_file_name',boundary,'disease_name')"
%
% 'output_file_name' and 'error_file_name' contain information about the program run. These are not the output files with the gene lists.
% The 'output_file_name' contains the total number of genes found around all SNPs in the input list and the number of SNPs
% that do NOT have any gene within the specified window around the SNP.
%
% Commands for running program in matlab:
% -----------------------------------------
% Extract_genes_around_SNPs('Input_rs_file_name',boundary,'disease_name')
%
% Description of input files/variables:
% -------------------------------------
% (1) 'Input_rs_file_name': Name of a file that contains a table of SNP information in the following format:
%     Tab-delimited table of three columns (each row refers to a different SNP):
%     column 1: rs #; column 2: chromosome number of SNP; column 3: chromosome position of SNP in bp units
% (2) 'bound': distance on either side of each SNP in which to extract genes, in base pair units (e.g. 300000 (=300kb)).
%     Must be a non-negative numeric scalar.
% (3) 'disease_name': Name of disease or study (used only to build the output file names)
% (4) The following two files must also be readable from the current directory / MATLAB path:
%     AllHumanGeneChrPosStrand_18434Genes_RefFlat_111909 (columns 1-3 are read as: chr #, start position, end position)
%     RefFlatGeneSymbolGeneID_18434Genes_111909          (two columns: Gene ID, Gene symbol; same row order as the table above)
%
% Output files:
% -------------
% (1) 'Genes_near_SNP_Xkb_boundary_DISEASENAME_TODAYSDATE'
%     One row per input SNP: rs #, chr #, chr position, followed by one 'Gene symbol|Gene ID' field
%     for every gene within the +/-X kb window around the SNP. A SNP with no genes gets a row with
%     only its own three fields (exactly one row per SNP; no blank separator lines are written).
%
% (2) 'Distance_genes_near_SNP_Xkb_boundary_DISEASENAME_TODAYSDATE'
%     One row per (SNP, gene) pair with the distances of the SNP from the gene start and end.
%     For a SNP with no genes an empty line is written (behaviour kept from the original program).
%
% Both files are tab-delimited; the header line of each file is comma-separated (kept as in the
% original program so that existing downstream parsers keep working).
%
% Note: 18434 human genes are considered in this script (input file: 'AllHumanGeneChrPosStrand_18434Genes_RefFlat_111909'). 237 genes were
% removed as they have different mRNA transcripts that are either >1Mb apart on the same chromosome or on different chromosomes.
% Hence some genes may be missed.


%%%% 		PROGRAM BEGINS HERE		%%%%

% the window size is used both in arithmetic and in the file names, so reject anything that is not a sane scalar
if ~isnumeric(bound) || ~isscalar(bound) || isnan(bound) || bound < 0
    error('Extract_genes_around_SNPs:invalidBound', 'bound must be a non-negative numeric scalar (distance in bp).');
end

% record today's date as e.g. 'Dec15_08' ('date' returns 'dd-mmm-yyyy')
date_string = date;
positions_of_dash = strfind(date_string, '-');
todays_date = [date_string((positions_of_dash(1)+1):(positions_of_dash(2)-1)), date_string(1:(positions_of_dash(1)-1)), '_', date_string((positions_of_dash(2)+3):end)];

% load input tables:
% table of 18434 human genes with their chromosome positions.
% load() returns a struct for a MAT-file and a plain matrix for an ASCII table; accept both.
gene_table_name = 'AllHumanGeneChrPosStrand_18434Genes_RefFlat_111909';
loaded_gene_table = load(gene_table_name);
if isstruct(loaded_gene_table)
    if ~isfield(loaded_gene_table, gene_table_name)
        error('Extract_genes_around_SNPs:missingVariable', 'MAT-file %s does not contain a variable named %s.', gene_table_name, gene_table_name);
    end
    AllHumanGeneChrPos = loaded_gene_table.(gene_table_name);
else
    AllHumanGeneChrPos = loaded_gene_table;
end

% list of human Gene Symbols and Gene IDs
[GeneIDs, AllGeneRefFlatNames] = textread('RefFlatGeneSymbolGeneID_18434Genes_111909','%n%s');

% rows of the position table are matched to rows of the annotation list by index, so the two must agree in length
if size(AllHumanGeneChrPos,1) ~= length(GeneIDs)
    error('Extract_genes_around_SNPs:tableMismatch', 'Gene position table has %d rows but gene annotation list has %d rows.', size(AllHumanGeneChrPos,1), length(GeneIDs));
end

% list of input SNPs (rs #, chr #, chr position); the file name is passed directly (no eval)
[SNP_rs, SNP_ChrNum, SNP_ChrPos] = textread(num2str(Input_rs_file_name), '%s%n%n');

% open output file 1
outputfile = ['Genes_near_SNP_', num2str(bound/1000) , 'kb_boundary_', num2str(disease_name), '_' , todays_date];
FID = fopen(outputfile,'w');
if FID == -1
    error('Extract_genes_around_SNPs:cannotOpenOutput', 'Could not open output file %s for writing.', outputfile);
end

% open output file 2
outputfile2 = ['Distance_genes_near_SNP_', num2str(bound/1000), 'kb_boundary_', num2str(disease_name), '_' , todays_date];
FID2 = fopen(outputfile2,'w');
if FID2 == -1
    fclose(FID);
    error('Extract_genes_around_SNPs:cannotOpenOutput', 'Could not open output file %s for writing.', outputfile2);
end

count_num_SNPs_nogenes = 0;
tot_num_genes = 0;

% everything that writes to the output files is guarded so that both files are closed even if an error occurs
try

    % print headers
    fprintf(FID, '%s\n', 'rs #, SNP Chr #, SNP Chr position, Gene symbol|Gene ID');
    fprintf(FID2, '%s\n', 'rs #, SNP Chr #, SNP Chr position, Gene symbol, Gene ID, Gene Chr num, Transcript Chr start position (bp), Transcript Chr end position (bp), Presence of SNP in gene (1,0), SNP position minus gene start position (bp), SNP position minus gene end position (bp)');

    disp(['Output file names are: ', outputfile, ' and ', outputfile2]);

    gene_chr   = AllHumanGeneChrPos(:,1);
    gene_start = AllHumanGeneChrPos(:,2);
    gene_end   = AllHumanGeneChrPos(:,3);

    for i=1:length(SNP_rs)

        % find genes that either lie within the interval or overlap it partially or fully
        % (in the latter case a single gene overlaps and is larger than the whole interval)
        window_start = SNP_ChrPos(i) - bound;
        window_end   = SNP_ChrPos(i) + bound;

        on_SNP_chr           = (gene_chr == SNP_ChrNum(i));
        gene_end_in_window   = (gene_end   >= window_start) & (gene_end   <= window_end);
        gene_start_in_window = (gene_start >= window_start) & (gene_start <= window_end);
        gene_spans_window    = (gene_start <= window_start) & (gene_end   >= window_end);

        find_genes_rs = find(on_SNP_chr & (gene_end_in_window | gene_start_in_window | gene_spans_window));

        fprintf(FID, '%s\t', SNP_rs{i});
        fprintf(FID, '%1.0f\t%1.0f\t', SNP_ChrNum(i), SNP_ChrPos(i));

        if ~isempty(find_genes_rs)

            for j=1:length(find_genes_rs) % for each gene found in interval

                tot_num_genes = tot_num_genes + 1;

                gene_ind  = find_genes_rs(j);
                gene_name = AllGeneRefFlatNames{gene_ind};
                gene_ID   = GeneIDs(gene_ind);

                % output file 1: 'Gene symbol|Gene ID' field
                fprintf(FID, '%s|', gene_name);
                fprintf(FID, '%1.0f\t', gene_ID);

                % output file 2: one row per (SNP, gene) pair
                fprintf(FID2, '%s\t', SNP_rs{i});
                fprintf(FID2, '%1.0f\t%1.0f\t', SNP_ChrNum(i), SNP_ChrPos(i));
                fprintf(FID2, '%s\t', gene_name);	% gene symbol
                fprintf(FID2, '%1.0f\t', gene_ID);	% gene ID
                fprintf(FID2, '%1.0f\t%1.0f\t%1.0f\t', AllHumanGeneChrPos(gene_ind,1:3));	% gene chr num, chr start pos, chr end pos

                % 1 = SNP in gene, 0 = SNP outside gene
                SNP_in_gene = (SNP_ChrPos(i) >= gene_start(gene_ind)) && (SNP_ChrPos(i) <= gene_end(gene_ind));
                fprintf(FID2, '%1.0f\t', SNP_in_gene);

                % Distance of SNP from gene start position: positive = SNP downstream to start site, negative = SNP upstream to start site
                Dist_SNP_gene_start = SNP_ChrPos(i) - gene_start(gene_ind);
                % Distance of SNP from gene end position: positive = SNP downstream to end site, negative = SNP upstream to end site
                Dist_SNP_gene_end = SNP_ChrPos(i) - gene_end(gene_ind);

                fprintf(FID2, '%1.0f\t', Dist_SNP_gene_start);
                fprintf(FID2, '%1.0f\t', Dist_SNP_gene_end);

                fprintf(FID2, '\n');

            end 	% for each gene overlapping interval

        else

            % empty line in output file 2 for a SNP without genes (kept from the original program).
            % Output file 1 is NOT given an extra newline here: its row is terminated once, below.
            fprintf(FID2, '\n');

            count_num_SNPs_nogenes = count_num_SNPs_nogenes + 1;

        end % if genes found in interval or overlapping interval

        % terminate this SNP's row in output file 1
        fprintf(FID, '\n');

    end	% for each associated SNP

catch err
    % do not leak the file handles when something goes wrong part-way through
    fclose(FID);
    fclose(FID2);
    rethrow(err);
end

fclose(FID);
fclose(FID2);

disp(['There are ', num2str(count_num_SNPs_nogenes), ' SNPs with no genes within ', num2str(bound), ' base pairs around the SNP.']);
disp(['There are a total of ', num2str(tot_num_genes), ' genes within ', num2str(bound) , ' base pairs around all SNPs.']);
