diff options
| author | Calvin <calvin@EESI> | 2013-03-11 12:36:42 -0400 | 
|---|---|---|
| committer | Calvin <calvin@EESI> | 2013-03-11 12:36:42 -0400 | 
| commit | 4ca6f92ceb4b2f8c504431cf56f8a6135187a61c (patch) | |
| tree | bda30f1f0710d19660c04c57f31be37cf4865c50 /src/matlab | |
| parent | c18793c210ea44cef63db3568791cbad537bdb79 (diff) | |
adding matlab code
Diffstat (limited to 'src/matlab')
| -rw-r--r-- | src/matlab/Example.m | 52 | ||||
| -rw-r--r-- | src/matlab/fastaread.m | 232 | ||||
| -rw-r--r-- | src/matlab/quikr.m | 52 | ||||
| -rw-r--r-- | src/matlab/quikrCustomTrained.m | 42 | ||||
| -rw-r--r-- | src/matlab/quikrTrain.m | 41 | ||||
| -rw-r--r-- | src/matlab/quikrWriteOutput.m | 89 | 
6 files changed, 508 insertions, 0 deletions
| diff --git a/src/matlab/Example.m b/src/matlab/Example.m new file mode 100644 index 0000000..d2a3192 --- /dev/null +++ b/src/matlab/Example.m @@ -0,0 +1,52 @@ +% Here is an example of re-training the quikr method, performing the reconstruction, and outputing the results to a CSV file appropriate for Excel.
 +mat=quikrTrain('TaxCollectortrainset7_112011.fa',6); %retrain using the TaxCollector version of trainset7_112011.fa
 +xstar=quikrCustomTrained(mat,'testfastafile',6,10000); %perform the reconstruction
 +quikrWriteOutput('/path/to/output/director','filenameforoutput',full(xstar),'/path/to/TaxCollectortrainset7_112011.fa',2) %write the output in CSV format, the last argument (2) corresponds to the phylum level. See the file quikrWriteOutput.m for more details.
 +
 +
 +
 +
 +%This is an example of how to run Quikr
 +
 +%If you want to use the default training-set (that is, trainset7_112011.fa
 +%from RPD), then execute the following
 +
 +%make sure Matlab/Octave is in the Quikr director
 +cd /path/to/Quikr
 +
 +fastafilename='/path/to/your/fastafile.fasta'; %full path name to your data file
 +xstar=quikr(fastafilename); %this will give the predicted reconstruction frequencies
 +
 +
 +%xstar will be on the same basis as trainset7_112011.fa, so to get the
 +%sequences that are predicted to be present in your sample:
 +[headers,~]=fastaread('trainset7_112011.fa'); %read in the training database
 +%note fastaread is not by default included in Octave, the fastaread.m file
 +%is included in the Quikr download directory however, and is directly
 +%compatible with Matlab.
 +nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
 +proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
 +namescell=headers(nonzeroentries); %Get the names of the sequences
 +namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
 +%so to find which sequence is the most abundant in your mixture:
 +[val,ind]=max(xstar(nonzeroentries)); %get the maximum value and it's position
 +namesandproportions{1:2,ind} %note that this does not imply this specific strain or species is in your sample, just that phylum/class/order/family/genus this species belongs to is in your sample.
 +
 +
 +% If you would like to use a custom training database, follow the following
 +% steps
 +fastafilename='/path/to/your/fastafile.fasta'; %full path name to your data file
 +trainingdatabasefilename='/path/to/your/trainingdatabase.fasta'; %full path to the FASTA file you wish to use as a training database
 +k=6; %pick a k-mer size
 +trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database
 +%then to do the reconstuction
 +lambda=10000; %pick a lambda (larger lambda -> theoretically predicted concentrations are closer to actual concentrations), this depends on k-mer size picked, also size and condition of the TrainingMatrix
 +xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda); %get the predicted reconstruction frequencies
 +
 +%again Xstar is on the same basis as the TrainingMatrix, so to get the
 +%sequences that are predicted to be present in your sample:
 +[headers,~]=fastaread(trainingdatabasefilename); %read in the training database
 +nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
 +proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
 +namescell=headers(nonzeroentries); %Get the names of the sequences
 +namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
 diff --git a/src/matlab/fastaread.m b/src/matlab/fastaread.m new file mode 100644 index 0000000..37d9b71 --- /dev/null +++ b/src/matlab/fastaread.m @@ -0,0 +1,232 @@ +function [data, seq] = fastaread(filename,varargin)
 +%FASTAREAD reads FASTA format file.
 +%
 +%   S = FASTAREAD(FILENAME) reads a FASTA format file FILENAME, returning
 +%   the data in the file as a structure. FILENAME can also be a URL or
 +%   MATLAB character array that contains the text of a FASTA format file.
 +%   S.Header is the header information. S.Sequence is the sequence stored
 +%   as a string of characters.
 +%
 +%   [HEADER, SEQ] = FASTAREAD(FILENAME) reads the file into separate
 +%   variables HEADER and SEQ. If the file contains more than one sequence,
 +%   then HEADER and SEQ are cell arrays of header and sequence information.
 +%
 +%   FASTAREAD(...,'IGNOREGAPS',TF) removes any gap symbol ('-' or '.')
 +%   from the sequence(s) when TF is true. Default is false.
 +%
 +%   FASTAREAD(...,'BLOCKREAD', M) allows you to read in a single entry or
 +%   block of entries from a file containing multiple sequences. If M is a
 +%   scalar then the M'th entry in the file is read. If M is a two element
 +%   vector then the block of entries starting at entry M(1) and ending at
 +%   entry M(2) will be read.  Use Inf for M(2) to read all entries in the
 +%   file starting at position M(1). 
 +%
 +%   FASTAREAD(...,'TRIMHEADERS',TF) trims the header after the first
 +%   whitespace when TF is true. White space characters include a horizontal
 +%   tab (char(9)) and a space (char(32)). Default is false. 
 +%
 +%   FASTA format specified here:
 +%   http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
 +%
 +%   Examples:
 +%
 +%       % Read the sequence for the human p53 tumor gene.
 +%       p53nt = fastaread('p53nt.txt')
 +%
 +%       % Read the sequence for the human p53 tumor protein.
 +%       p53aa = fastaread('p53aa.txt')
 +%
 +%       % Read a block of entries from a file
 +%       pf2_5_10 = fastaread('pf00002.fa','blockread',[ 5 10], ...
 +%                            'ignoregaps',true)
 +%
 +%       % Read the human mitochondrion genome in FASTA format.
 +%       entrezSite = 'http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?'
 +%       textOptions = '&txt=on&view=fasta'
 +%       genbankID = '&list_uids=NC_012920'
 +%       mitochondrion = fastaread([entrezSite textOptions genbankID])
 +%
 +%   See also EMBLREAD, FASTAINFO, FASTAWRITE, FASTQINFO, FASTQREAD,
 +%   FASTQWRITE, GENBANKREAD, GENPEPTREAD, HMMPROFDEMO, MULTIALIGNREAD,
 +%   MULTIALIGNVIEWER, SEQPROFILE, SEQTOOL, SFFINFO, SFFREAD.
 +
 +%   Copyright 2002-2010 The MathWorks, Inc.
 +%   $Revision: 1.15.4.21 $  $Date: 2010/06/26 04:54:38 $
 +
 +% check input is char
 +% in a future version we may accept also cells
 +if ~ischar(filename)
 +    error('Bioinfo:fastaread:InvalidInput','Input must be a character array')
 +end
 +
 +% default
 +ignoreGaps = false;
 +trimh = false;
 +blockRead = false;
 +% get input arguments
 +if  nargin > 1
 +    if rem(nargin,2) ~= 1
 +        error('Bioinfo:fastaread:IncorrectNumberOfArguments',...
 +            'Incorrect number of arguments to %s.',mfilename);
 +    end
 +    okargs = {'ignoregaps','blockread','trimheaders'};
 +    for j=1:2:nargin-1
 +        pname = varargin{j};
 +        pval = varargin{j+1};
 +        k = find(strncmpi(pname,okargs,numel(pname)));
 +        if isempty(k)
 +            error('Bioinfo:fastaread:UnknownParameterName',...
 +                'Unknown parameter name: %s.',pname);
 +            %elseif length(k)>1
 +            %    error('Bioinfo:fastaread:AmbiguousParameterName',...
 +            %        'Ambiguous parameter name: %s.',pname);
 +        else
 +            switch(k)
 +                case 1  % ignore gaps
 +                    ignoreGaps = opttf(pval);
 +                    if isempty(ignoreGaps)
 +                        error('Bioinfo:fastaread:IgnoreGapsNotLogical',...
 +                            '%s must be a logical value, true or false.',...
 +                            upper(char(okargs(k))));
 +                    end
 +                case 2  % range
 +                    range = pval;
 +                    if ~isnumeric(range) || numel(range)> 2 || isempty(range)
 +                        error('Bioinfo:fastaread:BadBlockRange',...
 +                            'BlockRange must be numeric scalar or vector with two values.')
 +                    end
 +                    blockRead = true;
 +                    range = sort(range);
 +                case 3 % trimheaders
 +			        trimh = opttf(pval, okargs{k}, mfilename);    
 +            end
 +        end
 +    end
 +end
 +
 +
 +if size(filename,1)>1  % is padded string
 +    if blockRead
 +        warning('Bioinfo:fastaread:IgnoredRange',...
 +            'BLOCKREAD is only allowed when parsing a file.')
 +    end
 +    ftext = cell(size(filename,1),1);
 +    for i=1:size(filename,1)
 +        ftext(i,1)=strtrim(strread(filename(i,:),'%s','whitespace','','delimiter','\n'));
 +    end
 +    % try then if it is an url
 +elseif (strfind(filename(1:min(10,end)), '://'))
 +    if blockRead
 +        warning('Bioinfo:fastaread:IgnoredRange',...
 +            'BLOCKREAD is only allowed when parsing a file.')
 +    end
 +    if (~usejava('jvm'))
 +        error('Bioinfo:fastaread:NoJava',...
 +            'Reading from a URL requires Java.')
 +    end
 +    try
 +        ftext = urlread(filename);
 +    catch allExceptions
 +        error('Bioinfo:fastaread:CannotReadURL',...
 +            'Cannot read URL "%s".', filename);
 +    end
 +    ftext = strread(ftext,'%s','delimiter','\n');
 +
 +    % try then if it is a valid filename
 +elseif  (exist(filename,'file') || exist(fullfile(pwd,filename),'file'))
 +    if blockRead
 +        blockText = getfileblock(filename,range,'>');
 +        try
 +           ftext = strread(blockText,'%s','delimiter','\n');
 +        catch theErr
 +           if strcmpi(theErr.identifier,'MATLAB:nomem')
 +                error( 'Bioinfo:fastaread:BlockTooBig',...
 +                  'The block is too large to fit in memory. Try a smaller range.');
 +           else
 +                 rethrow(theErr);
 +           end
 +        end
 +    else %=== read entire file
 +        fid = fopen(filename);
 +		c = onCleanup(@()fclose(fid));
 +        try
 +            ftext = textscan(fid,'%s','delimiter','\n','bufsize',10000000);
 +            ftext = ftext{:}; 
 +        catch theErr
 +            if strcmpi(theErr.identifier,'MATLAB:nomem')
 +                error( 'Bioinfo:fastaread:FileTooBig',...
 +                    'The file is too large to fit in memory. Try using the BLOCKREAD option to read the file in blocks.');
 +            else 
 +                rethrow(theErr);
 +            end
 +        end
 +    end
 +else  % must be a string with '\n', convert to cell
 +    if blockRead
 +        warning('Bioinfo:fastaread:IgnoredRange',...
 +            'BLOCKREAD is only allowed when parsing a file.')
 +    end
 +    ftext = strread(filename,'%s','delimiter','\n');
 +end
 +
 +% it is possible that there will be multiple sequences
 +commentLines = strncmp(ftext,'>',1);
 +
 +if ~any(commentLines)
 +    error('Bioinfo:fastaread:FastaNotValid',...
 +        'Input does not exist or is not a valid FASTA file.')
 +end
 +
 +numSeqs = sum(commentLines);
 +seqStarts = [find(commentLines); size(ftext,1)+1];
 +data(numSeqs,1).Header = '';
 +
 +try
 +    for theSeq = 1:numSeqs
 +        % Check for > symbol ?
 +        data(theSeq).Header = ftext{seqStarts(theSeq)}(2:end);
 +        firstRow = seqStarts(theSeq)+1;
 +        lastRow = seqStarts(theSeq+1)-1;
 +        numChars = cellfun('length',ftext(firstRow:lastRow));
 +        numSymbols = sum(numChars);
 +        data(theSeq).Sequence = repmat(' ',1,numSymbols);
 +        pos = 1;
 +        for i=firstRow:lastRow,
 +            str = strtrim(ftext{i});
 +            len =  length(str);
 +            if len == 0
 +                break
 +            end
 +            data(theSeq).Sequence(pos:pos+len-1) = str;
 +            pos = pos+len;
 +        end
 +        data(theSeq).Sequence = strtrim(data(theSeq).Sequence);
 +        if ignoreGaps
 +            data(theSeq).Sequence = strrep(data(theSeq).Sequence,'-','');
 +            data(theSeq).Sequence = strrep(data(theSeq).Sequence,'.','');
 +        end
 +    end
 +
 +    % trim headers
 +    if trimh
 +       for i = 1:numSeqs
 +          data(i).Header = sscanf(data(i).Header,'%s',1);
 +       end
 +    end
 +    
 +    % in case of two outputs
 +    if nargout == 2
 +        if numSeqs == 1
 +            seq = data.Sequence;
 +            data = data.Header;
 +        else
 +            seq = {data(:).Sequence};
 +            data = {data(:).Header};
 +        end
 +    end
 +
 +catch allExceptions
 +    error('Bioinfo:fastaread:IncorrectDataFormat',...
 +        'Incorrect data format in fasta file')
 +end
 +
 diff --git a/src/matlab/quikr.m b/src/matlab/quikr.m new file mode 100644 index 0000000..d24596e --- /dev/null +++ b/src/matlab/quikr.m @@ -0,0 +1,52 @@ +function xstar = quikr(inputfasta)
 +%xstar=quikr('path/to/input/fasta/file') returns the estimated frequencies
 +%of bacteria present when given an input FASTA file of amplicon (454)
 +%reads. A k-mer based, L1-regularized, sparsity promoting algorithm is
 +%utilized. This is the default implementation of Quikr, here lambda=10,000, k-mer
 +%size is 6, and we use the trainset7_112011 database from RDP to estimate
 +%the present species. In practice, reconstruction is accurate only down to
 +%the genus level (not species or strain).
 +%The program ./count-linux or ./count-osx needs to be placed in the same 
 +%directory as this script. In Linux, you might need to: chmod a+rx count.
 +%Also the inputfasta needs to be a standard FASTA file. The input FASTA file must have reads all in the same
 +%orientation, so as a preprocessing step, make sure everything is in the
 +%forward (+) orientation.
 +if nargin>1
 +    error('too many input arguments');
 +end
 +if (isunix && not(ismac))   %If linux, use ./count-linux
 +    [status, counts]=unix(['./count-linux -r 6 -1 -u ' inputfasta]); %count the 6-mers in the fasta file, in the forward direction, return the counts without labels
 +    if status ~= 0
 +        error('./count-linux failed: ensure count-linux is an executable. Try chmod a+rx count-linux. Be sure matlab/octave is in the same directory as count-linux');
 +    end
 +elseif ismac %If mac, then use ./count-osx
 +    [status, counts]=unix(['./count-osx -r 6 -1 -u ' inputfasta]); %count the 6-mers in the fasta file, in the forward direction, return the counts without labels
 +    if status ~= 0
 +        error('./count-osx failed: ensure count-linux is an executable. Try chmod a+rx count-osx. Be sure matlab/octave is in the same directory as count-osx');
 +    end
 +elseif ispc
 +    error('Windows is not yet supported');
 +end    
 +counts=textscan(counts,'%f'); %convert into floats
 +counts=counts{:}; %make into a vector
 +counts=counts/sum(counts); %form a probability vector from the counts
 +lambda=10000; %this is the default lambda value (see the paper), and is used in the formation of trainset7_112011N6Aaux.mat
 +yaux=[0;lambda*counts]; % form the sample vector
 +load('trainset7_112011N6Aaux.mat') %load the 6-mer sensing matrix...I need to rename this so the variable is just Aaux.
 +if not(exist('Aaux2')) %make sure Aaux2 loaded
 +    error('The default training database trainset7_112011N6Aaux.mat failed to load. Be sure it exists in the Quikr folder or current directory');
 +end
 +%based on RDP's trainset 7 from 11/2011
 +isOctave = exist('OCTAVE_VERSION') ~= 0; %check to see if running Octave or Matlab
 +if isOctave %if octave, need to manually read in sparse-matrix format
 +    n=length(Aaux2.jc)-1;
 +    ii=repelems(1:n,[1:n; diff(Aaux2.jc)]);
 +    Aaux=sparse(Aaux2.ir+1,ii,Aaux2.data);
 +else
 +    Aaux=Aaux2; %fix the names
 +    clear Aaux2;
 +end %Matlab automatically reads it in correctly, just need to rename it
 +warning off
 +xstar=lsqnonneg(Aaux,yaux); %perform the nonnegative least squares
 +warning on
 +xstar=xstar/sum(xstar); %transform the output into a probability vector. Note this vector is on the same basis as Aaux, so the entries correspond to sequences in trainset7_112011.fa
\ No newline at end of file diff --git a/src/matlab/quikrCustomTrained.m b/src/matlab/quikrCustomTrained.m new file mode 100644 index 0000000..b58aa31 --- /dev/null +++ b/src/matlab/quikrCustomTrained.m @@ -0,0 +1,42 @@ +function xstar = quikrCustomTrained(trainingmatrix,inputfasta,k,lambda)
 +%xstar=quikrCustomTrained(traininmatrix,inputfasta,k,lambda) is the
 +%implementation of qukir that lets you use a custom training matrix
 +%"trainingmatrix" on the input data file "inputfasta" for the k-mer size
 +%"k" (note this k-mer size must be such that the number of rows of
 +%"trainingmatrix"=4^k), with the regularization value "lambda". The vector
 +%of predicted concentrations "xstar" is returned on the same basis as
 +%"trainingmatrix".
 +if nargin~=4
 +    error('There must be exactly 4 input arguments: the training matrix, the /path/to/input/fastafile, the k-mer size, and lambda');
 +end
 +
 +[rws, clumns]=size(trainingmatrix); %get the size of the training matrix
 +if rws~=4^k
 +    error('Wrong k-mer size for input training matrix');
 +end
 +
 +if (isunix && not(ismac))
 +    [status, counts]=unix([sprintf('./count-linux -r %d -1 -u ',k) ' ' inputfasta]); %count the 6-mers in the fasta file, in the forward direction, return the counts without labels.
 +     if status ~= 0
 +        error('./count-linux failed: ensure count-linux is an executable. Try chmod a+rx count-linux. Be sure matlab/octave is in the same directory as count-linux');
 +    end
 +elseif ismac
 +    [status, counts]=unix([sprintf('./count-osx -r %d -1 -u ',k) ' ' inputfasta]); %count the 6-mers in the fasta file, in the forward direction, return the counts without labels.
 +    if status ~= 0
 +        error('./count-osx failed: ensure count-linux is an executable. Try chmod a+rx count-osx. Be sure matlab/octave is in the same directory as count-osx');
 +    end
 +elseif ispc
 +   error('Windows is not yet supported');
 +end
 +
 +counts=textscan(counts,'%f'); %read them in as floats.
 +counts=counts{:};
 +counts=counts/sum(counts); %normalize the counts into a probability vector
 +yaux=[0;lambda*counts]; %form the sample vector
 +
 +
 +Aaux=[ones(1,clumns);lambda*trainingmatrix]; %form the k-mer sensing matrix
 +warning off
 +xstar=lsqnonneg(Aaux,yaux); %perform the non-negative lease squares
 +warning on
 +xstar=xstar/sum(xstar); %return the results as a probability vector 
\ No newline at end of file diff --git a/src/matlab/quikrTrain.m b/src/matlab/quikrTrain.m new file mode 100644 index 0000000..7960cca --- /dev/null +++ b/src/matlab/quikrTrain.m @@ -0,0 +1,41 @@ +function mat=quikrTrain(inputfasta,k)
 +%matrix=QuikrTrain(inputfasta,k) Returns the (sparse) k-mer sensing
 +%matrix from the FASTA file located at 'inputfasta' for the k-mer size k.
 +%This serves to retrain the Quikr method. The ouput can then be fed to
 +%quikrCustomTrained().
 +%Works for k=1:8 (typically the matrices get too large for k>9)
 +%The filename for the inputfasta must be the complete path.
 +
 +if nargin>2
 +    error('too many input arguments');
 +end
 +
 +
 +[pathtofile,filename,ext]=fileparts(inputfasta); %get the parts of the file
 +
 +
 +outputfilename=fullfile(pathtofile, [filename sprintf('-sensingmatrixK%d.txt',k)]); %Currently this is coded to write a temporary file. In future versions, this will be all be done in RAM
 +%The reason for writing the file to disk first is that Matlab typically
 +%crashes when unix() returns as many entries as ./probabilities-by-read
 +%does (on the order of ~2*10^10).
 +
 +kmerfilename=sprintf('%dmers.txt',k); %This contains the list of 6-mers to count. In future versions this will be computed locally instead of being read in.
 +
 +if (isunix && not(ismac)) %this is for the linux version
 +    unix(['./probabilities-by-read-linux ' sprintf('%d',k) ' ' inputfasta ' ' kmerfilename ' > ' outputfilename]); %obtain the k-mer counts of the inputfasta read-by-read
 +elseif ismac %mac version
 +    unix(['./probabilities-by-read-osx ' sprintf('%d',k) ' ' inputfasta ' ' kmerfilename ' > ' outputfilename]); %obtain the k-mer counts of the inputfasta read-by-read
 +elseif ispc %No PC version
 +    error('Windows is not yet supported');
 +end
 +
 +fid=fopen(outputfilename); %open the output file
 +
 +%A=textscan(fid,'%f'); %get all the counts
 +%A=A{:};
 +A=fscanf(fid,'%f');
 +mat=sparse(reshape(A,4^k,length(A)/4^k)); %form into a matrix
 +mat=bsxfun(@rdivide,mat,sum(mat,1)); %column-normalize
 +fclose(fid); %close file
 +delete(outputfilename); %delete the file
 +
 diff --git a/src/matlab/quikrWriteOutput.m b/src/matlab/quikrWriteOutput.m new file mode 100644 index 0000000..7dd5c3b --- /dev/null +++ b/src/matlab/quikrWriteOutput.m @@ -0,0 +1,89 @@ +function quikrWriteOutput(directory,filename,xstar,pathtotrainingdatabase,taxonomicrank)
 +%taxonomic rank: 1=kindom, 2=phylum, 3=class, 4=order, 5=family,
 +%6=genus, 7=species, 8=strain
 +%quikrWriteOutput(directory,filename,xstar,trainingdatabase,taxonomicrank)
 +%will output a Comma Seperated (CSV) file with the file name 'filename' in the directory
 +%'directory' that gives the summary of concentrations of the reconstruction
 +%at the taxonomic rank specified by 'taxonomicrank'. The database located 
 +%at'pathtotrainingdatabase' must be in the fasta format with the headers
 +%in the format output by TaxCollector.
 +%(see Giongo et al, 2010 TaxCollector: Modifying Current 16S rRNA
 +%Databases for the Rapid Classification at Six Taxonomic Levels, Diversity.
 +%doi:10.3390/d2071015).
 +%The output excel file is suitable for use in visualiztion software such as
 +%Krona (Ondov, Bergman, and Phillippy, 2011, Interactive Metagenomic
 +%Visualiztion in a Web Browser, BMC bioinformatics,
 +%doi:10.1186/1471-2105-12-385).
 +if nargin>5
 +    error('too many input arguments. Needs: directory, output filename, xstar, path to training database, and taxonomic rank');
 +end
 +if nargin<5
 +    error('too few input arguments. Needs: directory, output filename, xstar, path to training database, and taxonomic rank');
 +end
 +[headers,~]=fastaread(pathtotrainingdatabase);%read in the headers of the training database
 +
 +for i=1:length(headers)
 +    if not(length(regexp(headers{2},'\[\d\]'))==9)
 +        error(sprintf('The header in entry %d in the training database is not in the Tax Collector format: eg ">[1]kingdom[2]phylum[3]class[4]order[5]family[6]genus[7]species[8]strain|accession"',i));
 +    end
 +end
 +
 +
 +seqnames=headers(find(xstar)); %get the names of the sequences that are given nonzero concentration in the reconstruction xstar
 +basis=find(xstar); %the indicies that have nonzero concentrations
 +        %seqnamesnew=cell(length(basis),1);
 +for i=1:length(basis)
 +    seqnamesnew{i}=strcat(sprintf('%f[0] ',xstar(basis(i))),seqnames{i}); %format is concentration, and then the name as it is in the training database
 +end
 +        
 +for i=1:length(seqnamesnew)
 +    splitup(i,:)=regexp(seqnamesnew{i},'\[\d\]','split'); %split everything up on the token [%d] as per the TaxCollector format
 +end
 +%xlswrite(fullfile(directory,[filename '.xls']),splitup);
 +%so there's a problem: trainset7_112011 is not in the tax collector format.
 +%So one thing I could do is use the accessions or something like that to
 +%get the corresponding headers from TaxCollectorRDP10_28 and create a new
 +%trainset7_112011 (or at least, one with new headers that then work
 +%according to TaxCollector).
 +
 +%So it does look like that for each header in trainset7_112011, after the >
 +%and before the | that "accession" also appears at the end of some header in
 +%TaxCollectorRDP10_28 (after the | on the end). So I'd like to make a new
 +%trainset7_112011 in which the headers are simply replaced.
 +
 +
 +%Now I'm going to try to unique at the given taxanomic rank, and then tally
 +%up the concentrations of all the corresponding entries in splitup
 +ranknames=unique(splitup(:,taxonomicrank+2)); %get the unique names at the given taxonomic rank
 +concentrations=str2double(splitup(:,1)); %pull off the concentrations
 +       
 +for i=1:length(ranknames) %for each of the names
 +    selectlist=strcmp(splitup(:,taxonomicrank+2),ranknames(i)); %get the rows that have the given name in the proper taxonomic rank
 +    %newsplitup(i,:)={num2str(sum(concentrations(selectlist)),'%f'),ranknames(i)}; %sum up the concentrations in all these rows and put that next to the names just down to the given taxonomic rank
 +    newsplitup(i,:)={num2str(sum(concentrations(selectlist)),'%f'),splitup{find(selectlist,1),3:taxonomicrank+2}};
 +end
 +%xlswrite(fullfile(directory,[filename '.xls']),newsplitup); %write this as the final output
 +
 +fid=fopen(fullfile(directory,[filename '.csv']),'w');
 +if fid==-1
 +    error(sprintf('The file %s is currently write-protected (e.g. open or in use in a different program)', fullfile(directory,[filename '.csv'])));
 +end
 +[nrows,ncols]=size(newsplitup);
 +mystr=['%s,'];
 +for cols=1:ncols-2
 +    mystr=[mystr ' ' '%s,'];
 +end
 +%mystr=[mystr '\n'];
 +for row=1:nrows
 +   fprintf(fid,[mystr '%s\n'],newsplitup{row,:});
 +end
 +fclose(fid);
 +
 +
 +
 +
 +
 +
 +
 +
 +
 | 
