adding matlab code

author: Calvin <calvin@EESI> 2013-03-11 12:36:42 -0400
committer: Calvin <calvin@EESI> 2013-03-11 12:36:42 -0400
commit: 4ca6f92ceb4b2f8c504431cf56f8a6135187a61c (patch)
tree: bda30f1f0710d19660c04c57f31be37cf4865c50 /src
parent: c18793c210ea44cef63db3568791cbad537bdb79 (diff)
6 files changed, 508 insertions, 0 deletions
diff --git a/src/matlab/Example.m b/src/matlab/Example.m
new file mode 100644
index 0000000..d2a3192
--- /dev/null
+++ b/src/matlab/Example.m
@@ -0,0 +1,52 @@
+% Here is an example of re-training the quikr method, performing the reconstruction, and outputing the results to a CSV file appropriate for Excel.
+mat=quikrTrain('TaxCollectortrainset7_112011.fa',6); %retrain using the TaxCollector version of trainset7_112011.fa
+xstar=quikrCustomTrained(mat,'testfastafile',6,10000); %perform the reconstruction
+quikrWriteOutput('/path/to/output/director','filenameforoutput',full(xstar),'/path/to/TaxCollectortrainset7_112011.fa',2) %write the output in CSV format, the last argument (2) corresponds to the phylum level. See the file quikrWriteOutput.m for more details.
+
+
+
+
+%This is an example of how to run Quikr
+
+%If you want to use the default training-set (that is, trainset7_112011.fa
+%from RPD), then execute the following
+
+%make sure Matlab/Octave is in the Quikr director
+cd /path/to/Quikr
+
+fastafilename='/path/to/your/fastafile.fasta'; %full path name to your data file
+xstar=quikr(fastafilename); %this will give the predicted reconstruction frequencies
+
+
+%xstar will be on the same basis as trainset7_112011.fa, so to get the
+%sequences that are predicted to be present in your sample:
+[headers,~]=fastaread('trainset7_112011.fa'); %read in the training database
+%note fastaread is not by default included in Octave, the fastaread.m file
+%is included in the Quikr download directory however, and is directly
+%compatible with Matlab.
+nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
+proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
+namescell=headers(nonzeroentries); %Get the names of the sequences
+namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
+%so to find which sequence is the most abundant in your mixture:
+[val,ind]=max(xstar(nonzeroentries)); %get the maximum value and it's position
+namesandproportions{1:2,ind} %note that this does not imply this specific strain or species is in your sample, just that phylum/class/order/family/genus this species belongs to is in your sample.
+
+
+% If you would like to use a custom training database, follow the following
+% steps
+fastafilename='/path/to/your/fastafile.fasta'; %full path name to your data file
+trainingdatabasefilename='/path/to/your/trainingdatabase.fasta'; %full path to the FASTA file you wish to use as a training database
+k=6; %pick a k-mer size
+trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database
+%then to do the reconstuction
+lambda=10000; %pick a lambda (larger lambda -> theoretically predicted concentrations are closer to actual concentrations), this depends on k-mer size picked, also size and condition of the TrainingMatrix
+xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda); %get the predicted reconstruction frequencies
+
+%again Xstar is on the same basis as the TrainingMatrix, so to get the
+%sequences that are predicted to be present in your sample:
+[headers,~]=fastaread(trainingdatabasefilename); %read in the training database
+nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
+proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
+namescell=headers(nonzeroentries); %Get the names of the sequences
+namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
diff --git a/src/matlab/fastaread.m b/src/matlab/fastaread.m
new file mode 100644
index 0000000..37d9b71
--- /dev/null
+++ b/src/matlab/fastaread.m
@@ -0,0 +1,232 @@
+function [data, seq] = fastaread(filename,varargin)
+%FASTAREAD reads FASTA format file.
+%
+%   S = FASTAREAD(FILENAME) reads a FASTA format file FILENAME, returning
+%   the data in the file as a structure. FILENAME can also be a URL or
+%   MATLAB character array that contains the text of a FASTA format file.
+%   S.Header is the header information. S.Sequence is the sequence stored
+%   as a string of characters.
+%
+%   [HEADER, SEQ] = FASTAREAD(FILENAME) reads the file into separate
+%   variables HEADER and SEQ. If the file contains more than one sequence,
+%   then HEADER and SEQ are cell arrays of header and sequence information.
+%
+%   FASTAREAD(...,'IGNOREGAPS',TF) removes any gap symbol ('-' or '.')
+%   from the sequence(s) when TF is true. Default is false.
+%
+%   FASTAREAD(...,'BLOCKREAD', M) allows you to read in a single entry or
+%   block of entries from a file containing multiple sequences. If M is a
+%   scalar then the M'th entry in the file is read. If M is a two element
+%   vector then the block of entries starting at entry M(1) and ending at
+%   entry M(2) will be read.  Use Inf for M(2) to read all entries in the
+%   file starting at position M(1). 
+%
+%   FASTAREAD(...,'TRIMHEADERS',TF) trims the header after the first
+%   whitespace when TF is true. White space characters include a horizontal
+%   tab (char(9)) and a space (char(32)). Default is false. 
+%
+%   FASTA format specified here:
+%   http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
+%
+%   Examples:
+%
+%       % Read the sequence for the human p53 tumor gene.
+%       p53nt = fastaread('p53nt.txt')
+%
+%       % Read the sequence for the human p53 tumor protein.
+%       p53aa = fastaread('p53aa.txt')
+%
+%       % Read a block of entries from a file
+%       pf2_5_10 = fastaread('pf00002.fa','blockread',[ 5 10], ...
+%                            'ignoregaps',true)
+%
+%       % Read the human mitochondrion genome in FASTA format.
+%       entrezSite = 'http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?'
+%       textOptions = '&txt=on&view=fasta'
+%       genbankID = '&list_uids=NC_012920'
+%       mitochondrion = fastaread([entrezSite textOptions genbankID])
+%
+%   See also EMBLREAD, FASTAINFO, FASTAWRITE, FASTQINFO, FASTQREAD,
+%   FASTQWRITE, GENBANKREAD, GENPEPTREAD, HMMPROFDEMO, MULTIALIGNREAD,
+%   MULTIALIGNVIEWER, SEQPROFILE, SEQTOOL, SFFINFO, SFFREAD.
+
+%   Copyright 2002-2010 The MathWorks, Inc.
+%   $Revision: 1.15.4.21 $  $Date: 2010/06/26 04:54:38 $
+
+% check input is char
+% in a future version we may accept also cells
+if ~ischar(filename)
+    error('Bioinfo:fastaread:InvalidInput','Input must be a character array')
+end
+
+% default
+ignoreGaps = false;
+trimh = false;
+blockRead = false;
+% get input arguments
+if  nargin > 1
+    if rem(nargin,2) ~= 1
+        error('Bioinfo:fastaread:IncorrectNumberOfArguments',...
+            'Incorrect number of arguments to %s.',mfilename);
+    end
+    okargs = {'ignoregaps','blockread','trimheaders'};
+    for j=1:2:nargin-1
+        pname = varargin{j};
+        pval = varargin{j+1};
+        k = find(strncmpi(pname,okargs,numel(pname)));
+        if isempty(k)
+            error('Bioinfo:fastaread:UnknownParameterName',...
+                'Unknown parameter name: %s.',pname);
+            %elseif length(k)>1
+            %    error('Bioinfo:fastaread:AmbiguousParameterName',...
+            %        'Ambiguous parameter name: %s.',pname);
+        else
+            switch(k)
+                case 1  % ignore gaps
+                    ignoreGaps = opttf(pval);
+                    if isempty(ignoreGaps)
+                        error('Bioinfo:fastaread:IgnoreGapsNotLogical',...
+                            '%s must be a logical value, true or false.',...
+                            upper(char(okargs(k))));
+                    end
+                case 2  % range
+                    range = pval;
+                    if ~isnumeric(range) || numel(range)> 2 || isempty(range)
+                        error('Bioinfo:fastaread:BadBlockRange',...
+                            'BlockRange must be numeric scalar or vector with two values.')
+                    end
+                    blockRead = true;
+                    range = sort(range);
+                case 3 % trimheaders
+			        trimh = opttf(pval, okargs{k}, mfilename);    
+            end
+        end
+    end
+end
+
+
+if size(filename,1)>1  % is padded string
+    if blockRead
+        warning('Bioinfo:fastaread:IgnoredRange',...
+            'BLOCKREAD is only allowed when parsing a file.')
+    end
+    ftext = cell(size(filename,1),1);
+    for i=1:size(filename,1)
+        ftext(i,1)=strtrim(strread(filename(i,:),'%s','whitespace','','delimiter','\n'));
+    end
+    % try then if it is an url
+elseif (strfind(filename(1:min(10,end)), '://'))
+    if blockRead
+        warning('Bioinfo:fastaread:IgnoredRange',...
+            'BLOCKREAD is only allowed when parsing a file.')
+    end
+    if (~usejava('jvm'))
+        error('Bioinfo:fastaread:NoJava',...
+            'Reading from a URL requires Java.')
+    end
+    try
+        ftext = urlread(filename);
+    catch allExceptions
+        error('Bioinfo:fastaread:CannotReadURL',...
+            'Cannot read URL "%s".', filename);
+    end
+    ftext = strread(ftext,'%s','delimiter','\n');
+
+    % try then if it is a valid filename
+elseif  (exist(filename,'file') || exist(fullfile(pwd,filename),'file'))
+    if blockRead
+        blockText = getfileblock(filename,range,'>');
+        try
+           ftext = strread(blockText,'%s','delimiter','\n');
+        catch theErr
+           if strcmpi(theErr.identifier,'MATLAB:nomem')
+                error( 'Bioinfo:fastaread:BlockTooBig',...
+                  'The block is too large to fit in memory. Try a smaller range.');
+           else
+                 rethrow(theErr);
+           end
+        end
+    else %=== read entire file
+        fid = fopen(filename);
+		c = onCleanup(@()fclose(fid));
+        try
+            ftext = textscan(fid,'%s','delimiter','\n','bufsize',10000000);
+            ftext = ftext{:}; 
+        catch theErr
+            if strcmpi(theErr.identifier,'MATLAB:nomem')
+                error( 'Bioinfo:fastaread:FileTooBig',...
+                    'The file is too large to fit in memory. Try using the BLOCKREAD option to read the file in blocks.');
+            else 
+                rethrow(theErr);
+            end
+        end
+    end
+else  % must be a string with '\n', convert to cell
+    if blockRead
+        warning('Bioinfo:fastaread:IgnoredRange',...
+            'BLOCKREAD is only allowed when parsing a file.')
+    end
+    ftext = strread(filename,'%s','delimiter','\n');
+end
+
+% it is possible that there will be multiple sequences
+commentLines = strncmp(ftext,'>',1);
+
+if ~any(commentLines)
+    error('Bioinfo:fastaread:FastaNotValid',...
+        'Input does not exist or is not a valid FASTA file.')
+end
+
+numSeqs = sum(commentLines);
+seqStarts = [find(commentLines); size(ftext,1)+1];
+data(numSeqs,1).Header = '';
+
+try
+    for theSeq = 1:numSeqs
+        % Check for > symbol ?
+        data(theSeq).Header = ftext{seqStarts(theSeq)}(2:end);
+        firstRow = seqStarts(theSeq)+1;
+        lastRow = seqStarts(theSeq+1)-1;
+        numChars = cellfun('length',ftext(firstRow:lastRow));
+        numSymbols = sum(numChars);
+        data(theSeq).Sequence = repmat(' ',1,numSymbols);
+        pos = 1;
+        for i=firstRow:lastRow,
+            str = strtrim(ftext{i});
+            len =  length(str);
+            if len == 0
+                break
+            end
+            data(theSeq).Sequence(pos:pos+len-1) = str;
+            pos = pos+len;
+        end
+        data(theSeq).Sequence = strtrim(data(theSeq).Sequence);
+        if ignoreGaps
+            data(theSeq).Sequence = strrep(data(theSeq).Sequence,'-','');
+            data(theSeq).Sequence = strrep(data(theSeq).Sequence,'.','');
+        end
+    end
+
+    % trim headers
+    if trimh
+       for i = 1:numSeqs
+          data(i).Header = sscanf(data(i).Header,'%s',1);
+       end
+    end
+    
+    % in case of two outputs
+    if nargout == 2
+        if numSeqs == 1
+            seq = data.Sequence;
+            data = data.Header;
+        else
+            seq = {data(:).Sequence};
+            data = {data(:).Header};
+        end
+    end
+
+catch allExceptions
+    error('Bioinfo:fastaread:IncorrectDataFormat',...
+        'Incorrect data format in fasta file')
+end
+
diff --git a/src/matlab/quikr.m b/src/matlab/quikr.m
new file mode 100644
index 0000000..d24596e
--- /dev/null
+++ b/src/matlab/quikr.m
@@ -0,0 +1,52 @@
+function xstar = quikr(inputfasta)
+%xstar=quikr('path/to/input/fasta/file') returns the estimated frequencies
+%of bacteria present when given an input FASTA file of amplicon (454)
+%reads. A k-mer based, L1-regularized, sparsity promoting algorithm is
+%utilized. This is the default implementation of Quikr, here lambda=10,000, k-mer
+%size is 6, and we use the trainset7_112011 database from RDP to estimate
+%the present species. In practice, reconstruction is accurate only down to
+%the genus level (not species or strain).
+%The program ./count-linux or ./count-osx needs to be placed in the same 
+%directory as this script. In Linux, you might need to: chmod a+rx count.
+%Also the inputfasta needs to be a standard FASTA file. The input FASTA file must have reads all in the same
+%orientation, so as a preprocessing step, make sure everything is in the
+%forward (+) orientation.
+if nargin>1
+    error('too many input arguments');
+end
+if (isunix && not(ismac))   %If linux, use ./count-linux
+    [status, counts]=unix(['./count-linux -r 6 -1 -u ' inputfasta]); %count the 6-mers in the fasta file, in the forward direction, return the counts without labels
+    if status ~= 0
+        error('./count-linux failed: ensure count-linux is an executable. Try chmod a+rx count-linux. Be sure matlab/octave is in the same directory as count-linux');
+    end
+elseif ismac %If mac, then use ./count-osx
+    [status, counts]=unix(['./count-osx -r 6 -1 -u ' inputfasta]); %count the 6-mers in the fasta file, in the forward direction, return the counts without labels
+    if status ~= 0
+        error('./count-osx failed: ensure count-linux is an executable. Try chmod a+rx count-osx. Be sure matlab/octave is in the same directory as count-osx');
+    end
+elseif ispc
+    error('Windows is not yet supported');
+end    
+counts=textscan(counts,'%f'); %convert into floats
+counts=counts{:}; %make into a vector
+counts=counts/sum(counts); %form a probability vector from the counts
+lambda=10000; %this is the default lambda value (see the paper), and is used in the formation of trainset7_112011N6Aaux.mat
+yaux=[0;lambda*counts]; % form the sample vector
+load('trainset7_112011N6Aaux.mat') %load the 6-mer sensing matrix...I need to rename this so the variable is just Aaux.
+if not(exist('Aaux2')) %make sure Aaux2 loaded
+    error('The default training database trainset7_112011N6Aaux.mat failed to load. Be sure it exists in the Quikr folder or current directory');
+end
+%based on RDP's trainset 7 from 11/2011
+isOctave = exist('OCTAVE_VERSION') ~= 0; %check to see if running Octave or Matlab
+if isOctave %if octave, need to manually read in sparse-matrix format
+    n=length(Aaux2.jc)-1;
+    ii=repelems(1:n,[1:n; diff(Aaux2.jc)]);
+    Aaux=sparse(Aaux2.ir+1,ii,Aaux2.data);
+else
+    Aaux=Aaux2; %fix the names
+    clear Aaux2;
+end %Matlab automatically reads it in correctly, just need to rename it
+warning off
+xstar=lsqnonneg(Aaux,yaux); %perform the nonnegative least squares
+warning on
+xstar=xstar/sum(xstar); %transform the output into a probability vector. Note this vector is on the same basis as Aaux, so the entries correspond to sequences in trainset7_112011.fa
+\ No newline at end of file
diff --git a/src/matlab/quikrCustomTrained.m b/src/matlab/quikrCustomTrained.m
new file mode 100644
index 0000000..b58aa31
--- /dev/null
+++ b/src/matlab/quikrCustomTrained.m
@@ -0,0 +1,42 @@
+function xstar = quikrCustomTrained(trainingmatrix,inputfasta,k,lambda)
+%xstar=quikrCustomTrained(traininmatrix,inputfasta,k,lambda) is the
+%implementation of qukir that lets you use a custom training matrix
+%"trainingmatrix" on the input data file "inputfasta" for the k-mer size
+%"k" (note this k-mer size must be such that the number of rows of
+%"trainingmatrix"=4^k), with the regularization value "lambda". The vector
+%of predicted concentrations "xstar" is returned on the same basis as
+%"trainingmatrix".
+if nargin~=4
+    error('There must be exactly 4 input arguments: the training matrix, the /path/to/input/fastafile, the k-mer size, and lambda');
+end
+
+[rws, clumns]=size(trainingmatrix); %get the size of the training matrix
+if rws~=4^k
+    error('Wrong k-mer size for input training matrix');
+end
+
+if (isunix && not(ismac))
+    [status, counts]=unix([sprintf('./count-linux -r %d -1 -u ',k) ' ' inputfasta]); %count the 6-mers in the fasta file, in the forward direction, return the counts without labels.
+     if status ~= 0
+        error('./count-linux failed: ensure count-linux is an executable. Try chmod a+rx count-linux. Be sure matlab/octave is in the same directory as count-linux');
+    end
+elseif ismac
+    [status, counts]=unix([sprintf('./count-osx -r %d -1 -u ',k) ' ' inputfasta]); %count the 6-mers in the fasta file, in the forward direction, return the counts without labels.
+    if status ~= 0
+        error('./count-osx failed: ensure count-linux is an executable. Try chmod a+rx count-osx. Be sure matlab/octave is in the same directory as count-osx');
+    end
+elseif ispc
+   error('Windows is not yet supported');
+end
+
+counts=textscan(counts,'%f'); %read them in as floats.
+counts=counts{:};
+counts=counts/sum(counts); %normalize the counts into a probability vector
+yaux=[0;lambda*counts]; %form the sample vector
+
+
+Aaux=[ones(1,clumns);lambda*trainingmatrix]; %form the k-mer sensing matrix
+warning off
+xstar=lsqnonneg(Aaux,yaux); %perform the non-negative lease squares
+warning on
+xstar=xstar/sum(xstar); %return the results as a probability vector 
+\ No newline at end of file
diff --git a/src/matlab/quikrTrain.m b/src/matlab/quikrTrain.m
new file mode 100644
index 0000000..7960cca
--- /dev/null
+++ b/src/matlab/quikrTrain.m
@@ -0,0 +1,41 @@
+function mat=quikrTrain(inputfasta,k)
+%matrix=QuikrTrain(inputfasta,k) Returns the (sparse) k-mer sensing
+%matrix from the FASTA file located at 'inputfasta' for the k-mer size k.
+%This serves to retrain the Quikr method. The ouput can then be fed to
+%quikrCustomTrained().
+%Works for k=1:8 (typically the matrices get too large for k>9)
+%The filename for the inputfasta must be the complete path.
+
+if nargin>2
+    error('too many input arguments');
+end
+
+
+[pathtofile,filename,ext]=fileparts(inputfasta); %get the parts of the file
+
+
+outputfilename=fullfile(pathtofile, [filename sprintf('-sensingmatrixK%d.txt',k)]); %Currently this is coded to write a temporary file. In future versions, this will be all be done in RAM
+%The reason for writing the file to disk first is that Matlab typically
+%crashes when unix() returns as many entries as ./probabilities-by-read
+%does (on the order of ~2*10^10).
+
+kmerfilename=sprintf('%dmers.txt',k); %This contains the list of 6-mers to count. In future versions this will be computed locally instead of being read in.
+
+if (isunix && not(ismac)) %this is for the linux version
+    unix(['./probabilities-by-read-linux ' sprintf('%d',k) ' ' inputfasta ' ' kmerfilename ' > ' outputfilename]); %obtain the k-mer counts of the inputfasta read-by-read
+elseif ismac %mac version
+    unix(['./probabilities-by-read-osx ' sprintf('%d',k) ' ' inputfasta ' ' kmerfilename ' > ' outputfilename]); %obtain the k-mer counts of the inputfasta read-by-read
+elseif ispc %No PC version
+    error('Windows is not yet supported');
+end
+
+fid=fopen(outputfilename); %open the output file
+
+%A=textscan(fid,'%f'); %get all the counts
+%A=A{:};
+A=fscanf(fid,'%f');
+mat=sparse(reshape(A,4^k,length(A)/4^k)); %form into a matrix
+mat=bsxfun(@rdivide,mat,sum(mat,1)); %column-normalize
+fclose(fid); %close file
+delete(outputfilename); %delete the file
+
diff --git a/src/matlab/quikrWriteOutput.m b/src/matlab/quikrWriteOutput.m
new file mode 100644
index 0000000..7dd5c3b
--- /dev/null
+++ b/src/matlab/quikrWriteOutput.m
@@ -0,0 +1,89 @@
+function quikrWriteOutput(directory,filename,xstar,pathtotrainingdatabase,taxonomicrank)
+%taxonomic rank: 1=kindom, 2=phylum, 3=class, 4=order, 5=family,
+%6=genus, 7=species, 8=strain
+%quikrWriteOutput(directory,filename,xstar,trainingdatabase,taxonomicrank)
+%will output a Comma Seperated (CSV) file with the file name 'filename' in the directory
+%'directory' that gives the summary of concentrations of the reconstruction
+%at the taxonomic rank specified by 'taxonomicrank'. The database located 
+%at'pathtotrainingdatabase' must be in the fasta format with the headers
+%in the format output by TaxCollector.
+%(see Giongo et al, 2010 TaxCollector: Modifying Current 16S rRNA
+%Databases for the Rapid Classification at Six Taxonomic Levels, Diversity.
+%doi:10.3390/d2071015).
+%The output excel file is suitable for use in visualiztion software such as
+%Krona (Ondov, Bergman, and Phillippy, 2011, Interactive Metagenomic
+%Visualiztion in a Web Browser, BMC bioinformatics,
+%doi:10.1186/1471-2105-12-385).
+if nargin>5
+    error('too many input arguments. Needs: directory, output filename, xstar, path to training database, and taxonomic rank');
+end
+if nargin<5
+    error('too few input arguments. Needs: directory, output filename, xstar, path to training database, and taxonomic rank');
+end
+[headers,~]=fastaread(pathtotrainingdatabase);%read in the headers of the training database
+
+for i=1:length(headers)
+    if not(length(regexp(headers{2},'\[\d\]'))==9)
+        error(sprintf('The header in entry %d in the training database is not in the Tax Collector format: eg ">[1]kingdom[2]phylum[3]class[4]order[5]family[6]genus[7]species[8]strain|accession"',i));
+    end
+end
+
+
+seqnames=headers(find(xstar)); %get the names of the sequences that are given nonzero concentration in the reconstruction xstar
+basis=find(xstar); %the indicies that have nonzero concentrations
+        %seqnamesnew=cell(length(basis),1);
+for i=1:length(basis)
+    seqnamesnew{i}=strcat(sprintf('%f[0] ',xstar(basis(i))),seqnames{i}); %format is concentration, and then the name as it is in the training database
+end
+        
+for i=1:length(seqnamesnew)
+    splitup(i,:)=regexp(seqnamesnew{i},'\[\d\]','split'); %split everything up on the token [%d] as per the TaxCollector format
+end
+%xlswrite(fullfile(directory,[filename '.xls']),splitup);
+%so there's a problem: trainset7_112011 is not in the tax collector format.
+%So one thing I could do is use the accessions or something like that to
+%get the corresponding headers from TaxCollectorRDP10_28 and create a new
+%trainset7_112011 (or at least, one with new headers that then work
+%according to TaxCollector).
+
+%So it does look like that for each header in trainset7_112011, after the >
+%and before the | that "accession" also appears at the end of some header in
+%TaxCollectorRDP10_28 (after the | on the end). So I'd like to make a new
+%trainset7_112011 in which the headers are simply replaced.
+
+
+%Now I'm going to try to unique at the given taxanomic rank, and then tally
+%up the concentrations of all the corresponding entries in splitup
+ranknames=unique(splitup(:,taxonomicrank+2)); %get the unique names at the given taxonomic rank
+concentrations=str2double(splitup(:,1)); %pull off the concentrations
+       
+for i=1:length(ranknames) %for each of the names
+    selectlist=strcmp(splitup(:,taxonomicrank+2),ranknames(i)); %get the rows that have the given name in the proper taxonomic rank
+    %newsplitup(i,:)={num2str(sum(concentrations(selectlist)),'%f'),ranknames(i)}; %sum up the concentrations in all these rows and put that next to the names just down to the given taxonomic rank
+    newsplitup(i,:)={num2str(sum(concentrations(selectlist)),'%f'),splitup{find(selectlist,1),3:taxonomicrank+2}};
+end
+%xlswrite(fullfile(directory,[filename '.xls']),newsplitup); %write this as the final output
+
+fid=fopen(fullfile(directory,[filename '.csv']),'w');
+if fid==-1
+    error(sprintf('The file %s is currently write-protected (e.g. open or in use in a different program)', fullfile(directory,[filename '.csv'])));
+end
+[nrows,ncols]=size(newsplitup);
+mystr=['%s,'];
+for cols=1:ncols-2
+    mystr=[mystr ' ' '%s,'];
+end
+%mystr=[mystr '\n'];
+for row=1:nrows
+   fprintf(fid,[mystr '%s\n'],newsplitup{row,:});
+end
+fclose(fid);
+
+
+
+
+
+
+
+
+
author	Calvin <calvin@EESI>	2013-03-11 12:36:42 -0400
committer	Calvin <calvin@EESI>	2013-03-11 12:36:42 -0400
commit	4ca6f92ceb4b2f8c504431cf56f8a6135187a61c (patch)
tree	bda30f1f0710d19660c04c57f31be37cf4865c50 /src
parent	c18793c210ea44cef63db3568791cbad537bdb79 (diff)