From 4ca6f92ceb4b2f8c504431cf56f8a6135187a61c Mon Sep 17 00:00:00 2001 From: Calvin Date: Mon, 11 Mar 2013 12:36:42 -0400 Subject: adding matlab code --- src/matlab/quikrTrain.m | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 src/matlab/quikrTrain.m (limited to 'src/matlab/quikrTrain.m') diff --git a/src/matlab/quikrTrain.m b/src/matlab/quikrTrain.m new file mode 100644 index 0000000..7960cca --- /dev/null +++ b/src/matlab/quikrTrain.m @@ -0,0 +1,41 @@ +function mat=quikrTrain(inputfasta,k) +%matrix=QuikrTrain(inputfasta,k) Returns the (sparse) k-mer sensing +%matrix from the FASTA file located at 'inputfasta' for the k-mer size k. +%This serves to retrain the Quikr method. The ouput can then be fed to +%quikrCustomTrained(). +%Works for k=1:8 (typically the matrices get too large for k>9) +%The filename for the inputfasta must be the complete path. + +if nargin>2 + error('too many input arguments'); +end + + +[pathtofile,filename,ext]=fileparts(inputfasta); %get the parts of the file + + +outputfilename=fullfile(pathtofile, [filename sprintf('-sensingmatrixK%d.txt',k)]); %Currently this is coded to write a temporary file. In future versions, this will be all be done in RAM +%The reason for writing the file to disk first is that Matlab typically +%crashes when unix() returns as many entries as ./probabilities-by-read +%does (on the order of ~2*10^10). + +kmerfilename=sprintf('%dmers.txt',k); %This contains the list of 6-mers to count. In future versions this will be computed locally instead of being read in. + +if (isunix && not(ismac)) %this is for the linux version + unix(['./probabilities-by-read-linux ' sprintf('%d',k) ' ' inputfasta ' ' kmerfilename ' > ' outputfilename]); %obtain the k-mer counts of the inputfasta read-by-read +elseif ismac %mac version + unix(['./probabilities-by-read-osx ' sprintf('%d',k) ' ' inputfasta ' ' kmerfilename ' > ' outputfilename]); %obtain the k-mer counts of the inputfasta read-by-read +elseif ispc %No PC version + error('Windows is not yet supported'); +end + +fid=fopen(outputfilename); %open the output file + +%A=textscan(fid,'%f'); %get all the counts +%A=A{:}; +A=fscanf(fid,'%f'); +mat=sparse(reshape(A,4^k,length(A)/4^k)); %form into a matrix +mat=bsxfun(@rdivide,mat,sum(mat,1)); %column-normalize +fclose(fid); %close file +delete(outputfilename); %delete the file + -- cgit v1.2.3