summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCalvin <calvin@EESI>2013-03-11 12:36:42 -0400
committerCalvin <calvin@EESI>2013-03-11 12:36:42 -0400
commit4ca6f92ceb4b2f8c504431cf56f8a6135187a61c (patch)
treebda30f1f0710d19660c04c57f31be37cf4865c50 /src
parentc18793c210ea44cef63db3568791cbad537bdb79 (diff)
adding matlab code
Diffstat (limited to 'src')
-rw-r--r--src/matlab/Example.m52
-rw-r--r--src/matlab/fastaread.m232
-rw-r--r--src/matlab/quikr.m52
-rw-r--r--src/matlab/quikrCustomTrained.m42
-rw-r--r--src/matlab/quikrTrain.m41
-rw-r--r--src/matlab/quikrWriteOutput.m89
6 files changed, 508 insertions, 0 deletions
diff --git a/src/matlab/Example.m b/src/matlab/Example.m
new file mode 100644
index 0000000..d2a3192
--- /dev/null
+++ b/src/matlab/Example.m
@@ -0,0 +1,52 @@
+% Here is an example of re-training the quikr method, performing the reconstruction, and outputing the results to a CSV file appropriate for Excel.
+mat=quikrTrain('TaxCollectortrainset7_112011.fa',6); %retrain using the TaxCollector version of trainset7_112011.fa
+xstar=quikrCustomTrained(mat,'testfastafile',6,10000); %perform the reconstruction
+quikrWriteOutput('/path/to/output/director','filenameforoutput',full(xstar),'/path/to/TaxCollectortrainset7_112011.fa',2) %write the output in CSV format, the last argument (2) corresponds to the phylum level. See the file quikrWriteOutput.m for more details.
+
+
+
+
+%This is an example of how to run Quikr
+
+%If you want to use the default training-set (that is, trainset7_112011.fa
+%from RPD), then execute the following
+
+%make sure Matlab/Octave is in the Quikr director
+cd /path/to/Quikr
+
+fastafilename='/path/to/your/fastafile.fasta'; %full path name to your data file
+xstar=quikr(fastafilename); %this will give the predicted reconstruction frequencies
+
+
+%xstar will be on the same basis as trainset7_112011.fa, so to get the
+%sequences that are predicted to be present in your sample:
+[headers,~]=fastaread('trainset7_112011.fa'); %read in the training database
+%note fastaread is not by default included in Octave, the fastaread.m file
+%is included in the Quikr download directory however, and is directly
+%compatible with Matlab.
+nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
+proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
+namescell=headers(nonzeroentries); %Get the names of the sequences
+namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
+%so to find which sequence is the most abundant in your mixture:
+[val,ind]=max(xstar(nonzeroentries)); %get the maximum value and it's position
+namesandproportions{1:2,ind} %note that this does not imply this specific strain or species is in your sample, just that phylum/class/order/family/genus this species belongs to is in your sample.
+
+
+% If you would like to use a custom training database, follow the following
+% steps
+fastafilename='/path/to/your/fastafile.fasta'; %full path name to your data file
+trainingdatabasefilename='/path/to/your/trainingdatabase.fasta'; %full path to the FASTA file you wish to use as a training database
+k=6; %pick a k-mer size
+trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database
+%then to do the reconstuction
+lambda=10000; %pick a lambda (larger lambda -> theoretically predicted concentrations are closer to actual concentrations), this depends on k-mer size picked, also size and condition of the TrainingMatrix
+xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda); %get the predicted reconstruction frequencies
+
+%again Xstar is on the same basis as the TrainingMatrix, so to get the
+%sequences that are predicted to be present in your sample:
+[headers,~]=fastaread(trainingdatabasefilename); %read in the training database
+nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
+proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
+namescell=headers(nonzeroentries); %Get the names of the sequences
+namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
diff --git a/src/matlab/fastaread.m b/src/matlab/fastaread.m
new file mode 100644
index 0000000..37d9b71
--- /dev/null
+++ b/src/matlab/fastaread.m
@@ -0,0 +1,232 @@
+function [data, seq] = fastaread(filename,varargin)
+%FASTAREAD reads FASTA format file.
+%
+% S = FASTAREAD(FILENAME) reads a FASTA format file FILENAME, returning
+% the data in the file as a structure. FILENAME can also be a URL or
+% MATLAB character array that contains the text of a FASTA format file.
+% S.Header is the header information. S.Sequence is the sequence stored
+% as a string of characters.
+%
+% [HEADER, SEQ] = FASTAREAD(FILENAME) reads the file into separate
+% variables HEADER and SEQ. If the file contains more than one sequence,
+% then HEADER and SEQ are cell arrays of header and sequence information.
+%
+% FASTAREAD(...,'IGNOREGAPS',TF) removes any gap symbol ('-' or '.')
+% from the sequence(s) when TF is true. Default is false.
+%
+% FASTAREAD(...,'BLOCKREAD', M) allows you to read in a single entry or
+% block of entries from a file containing multiple sequences. If M is a
+% scalar then the M'th entry in the file is read. If M is a two element
+% vector then the block of entries starting at entry M(1) and ending at
+% entry M(2) will be read. Use Inf for M(2) to read all entries in the
+% file starting at position M(1).
+%
+% FASTAREAD(...,'TRIMHEADERS',TF) trims the header after the first
+% whitespace when TF is true. White space characters include a horizontal
+% tab (char(9)) and a space (char(32)). Default is false.
+%
+% FASTA format specified here:
+% http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
+%
+% Examples:
+%
+% % Read the sequence for the human p53 tumor gene.
+% p53nt = fastaread('p53nt.txt')
+%
+% % Read the sequence for the human p53 tumor protein.
+% p53aa = fastaread('p53aa.txt')
+%
+% % Read a block of entries from a file
+% pf2_5_10 = fastaread('pf00002.fa','blockread',[ 5 10], ...
+% 'ignoregaps',true)
+%
+% % Read the human mitochondrion genome in FASTA format.
+% entrezSite = 'http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?'
+% textOptions = '&txt=on&view=fasta'
+% genbankID = '&list_uids=NC_012920'
+% mitochondrion = fastaread([entrezSite textOptions genbankID])
+%
+% See also EMBLREAD, FASTAINFO, FASTAWRITE, FASTQINFO, FASTQREAD,
+% FASTQWRITE, GENBANKREAD, GENPEPTREAD, HMMPROFDEMO, MULTIALIGNREAD,
+% MULTIALIGNVIEWER, SEQPROFILE, SEQTOOL, SFFINFO, SFFREAD.
+
+% Copyright 2002-2010 The MathWorks, Inc.
+% $Revision: 1.15.4.21 $ $Date: 2010/06/26 04:54:38 $
+
+% check input is char
+% in a future version we may accept also cells
+if ~ischar(filename)
+ error('Bioinfo:fastaread:InvalidInput','Input must be a character array')
+end
+
+% default
+ignoreGaps = false;
+trimh = false;
+blockRead = false;
+% get input arguments
+if nargin > 1
+ if rem(nargin,2) ~= 1
+ error('Bioinfo:fastaread:IncorrectNumberOfArguments',...
+ 'Incorrect number of arguments to %s.',mfilename);
+ end
+ okargs = {'ignoregaps','blockread','trimheaders'};
+ for j=1:2:nargin-1
+ pname = varargin{j};
+ pval = varargin{j+1};
+ k = find(strncmpi(pname,okargs,numel(pname)));
+ if isempty(k)
+ error('Bioinfo:fastaread:UnknownParameterName',...
+ 'Unknown parameter name: %s.',pname);
+ %elseif length(k)>1
+ % error('Bioinfo:fastaread:AmbiguousParameterName',...
+ % 'Ambiguous parameter name: %s.',pname);
+ else
+ switch(k)
+ case 1 % ignore gaps
+ ignoreGaps = opttf(pval);
+ if isempty(ignoreGaps)
+ error('Bioinfo:fastaread:IgnoreGapsNotLogical',...
+ '%s must be a logical value, true or false.',...
+ upper(char(okargs(k))));
+ end
+ case 2 % range
+ range = pval;
+ if ~isnumeric(range) || numel(range)> 2 || isempty(range)
+ error('Bioinfo:fastaread:BadBlockRange',...
+ 'BlockRange must be numeric scalar or vector with two values.')
+ end
+ blockRead = true;
+ range = sort(range);
+ case 3 % trimheaders
+ trimh = opttf(pval, okargs{k}, mfilename);
+ end
+ end
+ end
+end
+
+
+if size(filename,1)>1 % is padded string
+ if blockRead
+ warning('Bioinfo:fastaread:IgnoredRange',...
+ 'BLOCKREAD is only allowed when parsing a file.')
+ end
+ ftext = cell(size(filename,1),1);
+ for i=1:size(filename,1)
+ ftext(i,1)=strtrim(strread(filename(i,:),'%s','whitespace','','delimiter','\n'));
+ end
+ % try then if it is an url
+elseif (strfind(filename(1:min(10,end)), '://'))
+ if blockRead
+ warning('Bioinfo:fastaread:IgnoredRange',...
+ 'BLOCKREAD is only allowed when parsing a file.')
+ end
+ if (~usejava('jvm'))
+ error('Bioinfo:fastaread:NoJava',...
+ 'Reading from a URL requires Java.')
+ end
+ try
+ ftext = urlread(filename);
+ catch allExceptions
+ error('Bioinfo:fastaread:CannotReadURL',...
+ 'Cannot read URL "%s".', filename);
+ end
+ ftext = strread(ftext,'%s','delimiter','\n');
+
+ % try then if it is a valid filename
+elseif (exist(filename,'file') || exist(fullfile(pwd,filename),'file'))
+ if blockRead
+ blockText = getfileblock(filename,range,'>');
+ try
+ ftext = strread(blockText,'%s','delimiter','\n');
+ catch theErr
+ if strcmpi(theErr.identifier,'MATLAB:nomem')
+ error( 'Bioinfo:fastaread:BlockTooBig',...
+ 'The block is too large to fit in memory. Try a smaller range.');
+ else
+ rethrow(theErr);
+ end
+ end
+ else %=== read entire file
+ fid = fopen(filename);
+ c = onCleanup(@()fclose(fid));
+ try
+ ftext = textscan(fid,'%s','delimiter','\n','bufsize',10000000);
+ ftext = ftext{:};
+ catch theErr
+ if strcmpi(theErr.identifier,'MATLAB:nomem')
+ error( 'Bioinfo:fastaread:FileTooBig',...
+ 'The file is too large to fit in memory. Try using the BLOCKREAD option to read the file in blocks.');
+ else
+ rethrow(theErr);
+ end
+ end
+ end
+else % must be a string with '\n', convert to cell
+ if blockRead
+ warning('Bioinfo:fastaread:IgnoredRange',...
+ 'BLOCKREAD is only allowed when parsing a file.')
+ end
+ ftext = strread(filename,'%s','delimiter','\n');
+end
+
+% it is possible that there will be multiple sequences
+commentLines = strncmp(ftext,'>',1);
+
+if ~any(commentLines)
+ error('Bioinfo:fastaread:FastaNotValid',...
+ 'Input does not exist or is not a valid FASTA file.')
+end
+
+numSeqs = sum(commentLines);
+seqStarts = [find(commentLines); size(ftext,1)+1];
+data(numSeqs,1).Header = '';
+
+try
+ for theSeq = 1:numSeqs
+ % Check for > symbol ?
+ data(theSeq).Header = ftext{seqStarts(theSeq)}(2:end);
+ firstRow = seqStarts(theSeq)+1;
+ lastRow = seqStarts(theSeq+1)-1;
+ numChars = cellfun('length',ftext(firstRow:lastRow));
+ numSymbols = sum(numChars);
+ data(theSeq).Sequence = repmat(' ',1,numSymbols);
+ pos = 1;
+ for i=firstRow:lastRow,
+ str = strtrim(ftext{i});
+ len = length(str);
+ if len == 0
+ break
+ end
+ data(theSeq).Sequence(pos:pos+len-1) = str;
+ pos = pos+len;
+ end
+ data(theSeq).Sequence = strtrim(data(theSeq).Sequence);
+ if ignoreGaps
+ data(theSeq).Sequence = strrep(data(theSeq).Sequence,'-','');
+ data(theSeq).Sequence = strrep(data(theSeq).Sequence,'.','');
+ end
+ end
+
+ % trim headers
+ if trimh
+ for i = 1:numSeqs
+ data(i).Header = sscanf(data(i).Header,'%s',1);
+ end
+ end
+
+ % in case of two outputs
+ if nargout == 2
+ if numSeqs == 1
+ seq = data.Sequence;
+ data = data.Header;
+ else
+ seq = {data(:).Sequence};
+ data = {data(:).Header};
+ end
+ end
+
+catch allExceptions
+ error('Bioinfo:fastaread:IncorrectDataFormat',...
+ 'Incorrect data format in fasta file')
+end
+
diff --git a/src/matlab/quikr.m b/src/matlab/quikr.m
new file mode 100644
index 0000000..d24596e
--- /dev/null
+++ b/src/matlab/quikr.m
@@ -0,0 +1,52 @@
+function xstar = quikr(inputfasta)
+%xstar=quikr('path/to/input/fasta/file') returns the estimated frequencies
+%of bacteria present when given an input FASTA file of amplicon (454)
+%reads. A k-mer based, L1-regularized, sparsity promoting algorithm is
+%utilized. This is the default implementation of Quikr, here lambda=10,000, k-mer
+%size is 6, and we use the trainset7_112011 database from RDP to estimate
+%the present species. In practice, reconstruction is accurate only down to
+%the genus level (not species or strain).
+%The program ./count-linux or ./count-osx needs to be placed in the same
+%directory as this script. In Linux, you might need to: chmod a+rx count.
+%Also the inputfasta needs to be a standard FASTA file. The input FASTA file must have reads all in the same
+%orientation, so as a preprocessing step, make sure everything is in the
+%forward (+) orientation.
+if nargin>1
+ error('too many input arguments');
+end
+if (isunix && not(ismac)) %If linux, use ./count-linux
+ [status, counts]=unix(['./count-linux -r 6 -1 -u ' inputfasta]); %count the 6-mers in the fasta file, in the forward direction, return the counts without labels
+ if status ~= 0
+ error('./count-linux failed: ensure count-linux is an executable. Try chmod a+rx count-linux. Be sure matlab/octave is in the same directory as count-linux');
+ end
+elseif ismac %If mac, then use ./count-osx
+ [status, counts]=unix(['./count-osx -r 6 -1 -u ' inputfasta]); %count the 6-mers in the fasta file, in the forward direction, return the counts without labels
+ if status ~= 0
+ error('./count-osx failed: ensure count-linux is an executable. Try chmod a+rx count-osx. Be sure matlab/octave is in the same directory as count-osx');
+ end
+elseif ispc
+ error('Windows is not yet supported');
+end
+counts=textscan(counts,'%f'); %convert into floats
+counts=counts{:}; %make into a vector
+counts=counts/sum(counts); %form a probability vector from the counts
+lambda=10000; %this is the default lambda value (see the paper), and is used in the formation of trainset7_112011N6Aaux.mat
+yaux=[0;lambda*counts]; % form the sample vector
+load('trainset7_112011N6Aaux.mat') %load the 6-mer sensing matrix...I need to rename this so the variable is just Aaux.
+if not(exist('Aaux2')) %make sure Aaux2 loaded
+ error('The default training database trainset7_112011N6Aaux.mat failed to load. Be sure it exists in the Quikr folder or current directory');
+end
+%based on RDP's trainset 7 from 11/2011
+isOctave = exist('OCTAVE_VERSION') ~= 0; %check to see if running Octave or Matlab
+if isOctave %if octave, need to manually read in sparse-matrix format
+ n=length(Aaux2.jc)-1;
+ ii=repelems(1:n,[1:n; diff(Aaux2.jc)]);
+ Aaux=sparse(Aaux2.ir+1,ii,Aaux2.data);
+else
+ Aaux=Aaux2; %fix the names
+ clear Aaux2;
+end %Matlab automatically reads it in correctly, just need to rename it
+warning off
+xstar=lsqnonneg(Aaux,yaux); %perform the nonnegative least squares
+warning on
+xstar=xstar/sum(xstar); %transform the output into a probability vector. Note this vector is on the same basis as Aaux, so the entries correspond to sequences in trainset7_112011.fa \ No newline at end of file
diff --git a/src/matlab/quikrCustomTrained.m b/src/matlab/quikrCustomTrained.m
new file mode 100644
index 0000000..b58aa31
--- /dev/null
+++ b/src/matlab/quikrCustomTrained.m
@@ -0,0 +1,42 @@
+function xstar = quikrCustomTrained(trainingmatrix,inputfasta,k,lambda)
+%xstar=quikrCustomTrained(traininmatrix,inputfasta,k,lambda) is the
+%implementation of qukir that lets you use a custom training matrix
+%"trainingmatrix" on the input data file "inputfasta" for the k-mer size
+%"k" (note this k-mer size must be such that the number of rows of
+%"trainingmatrix"=4^k), with the regularization value "lambda". The vector
+%of predicted concentrations "xstar" is returned on the same basis as
+%"trainingmatrix".
+if nargin~=4
+ error('There must be exactly 4 input arguments: the training matrix, the /path/to/input/fastafile, the k-mer size, and lambda');
+end
+
+[rws, clumns]=size(trainingmatrix); %get the size of the training matrix
+if rws~=4^k
+ error('Wrong k-mer size for input training matrix');
+end
+
+if (isunix && not(ismac))
+ [status, counts]=unix([sprintf('./count-linux -r %d -1 -u ',k) ' ' inputfasta]); %count the 6-mers in the fasta file, in the forward direction, return the counts without labels.
+ if status ~= 0
+ error('./count-linux failed: ensure count-linux is an executable. Try chmod a+rx count-linux. Be sure matlab/octave is in the same directory as count-linux');
+ end
+elseif ismac
+ [status, counts]=unix([sprintf('./count-osx -r %d -1 -u ',k) ' ' inputfasta]); %count the 6-mers in the fasta file, in the forward direction, return the counts without labels.
+ if status ~= 0
+ error('./count-osx failed: ensure count-linux is an executable. Try chmod a+rx count-osx. Be sure matlab/octave is in the same directory as count-osx');
+ end
+elseif ispc
+ error('Windows is not yet supported');
+end
+
+counts=textscan(counts,'%f'); %read them in as floats.
+counts=counts{:};
+counts=counts/sum(counts); %normalize the counts into a probability vector
+yaux=[0;lambda*counts]; %form the sample vector
+
+
+Aaux=[ones(1,clumns);lambda*trainingmatrix]; %form the k-mer sensing matrix
+warning off
+xstar=lsqnonneg(Aaux,yaux); %perform the non-negative lease squares
+warning on
+xstar=xstar/sum(xstar); %return the results as a probability vector \ No newline at end of file
diff --git a/src/matlab/quikrTrain.m b/src/matlab/quikrTrain.m
new file mode 100644
index 0000000..7960cca
--- /dev/null
+++ b/src/matlab/quikrTrain.m
@@ -0,0 +1,41 @@
+function mat=quikrTrain(inputfasta,k)
+%matrix=QuikrTrain(inputfasta,k) Returns the (sparse) k-mer sensing
+%matrix from the FASTA file located at 'inputfasta' for the k-mer size k.
+%This serves to retrain the Quikr method. The ouput can then be fed to
+%quikrCustomTrained().
+%Works for k=1:8 (typically the matrices get too large for k>9)
+%The filename for the inputfasta must be the complete path.
+
+if nargin>2
+ error('too many input arguments');
+end
+
+
+[pathtofile,filename,ext]=fileparts(inputfasta); %get the parts of the file
+
+
+outputfilename=fullfile(pathtofile, [filename sprintf('-sensingmatrixK%d.txt',k)]); %Currently this is coded to write a temporary file. In future versions, this will be all be done in RAM
+%The reason for writing the file to disk first is that Matlab typically
+%crashes when unix() returns as many entries as ./probabilities-by-read
+%does (on the order of ~2*10^10).
+
+kmerfilename=sprintf('%dmers.txt',k); %This contains the list of 6-mers to count. In future versions this will be computed locally instead of being read in.
+
+if (isunix && not(ismac)) %this is for the linux version
+ unix(['./probabilities-by-read-linux ' sprintf('%d',k) ' ' inputfasta ' ' kmerfilename ' > ' outputfilename]); %obtain the k-mer counts of the inputfasta read-by-read
+elseif ismac %mac version
+ unix(['./probabilities-by-read-osx ' sprintf('%d',k) ' ' inputfasta ' ' kmerfilename ' > ' outputfilename]); %obtain the k-mer counts of the inputfasta read-by-read
+elseif ispc %No PC version
+ error('Windows is not yet supported');
+end
+
+fid=fopen(outputfilename); %open the output file
+
+%A=textscan(fid,'%f'); %get all the counts
+%A=A{:};
+A=fscanf(fid,'%f');
+mat=sparse(reshape(A,4^k,length(A)/4^k)); %form into a matrix
+mat=bsxfun(@rdivide,mat,sum(mat,1)); %column-normalize
+fclose(fid); %close file
+delete(outputfilename); %delete the file
+
diff --git a/src/matlab/quikrWriteOutput.m b/src/matlab/quikrWriteOutput.m
new file mode 100644
index 0000000..7dd5c3b
--- /dev/null
+++ b/src/matlab/quikrWriteOutput.m
@@ -0,0 +1,89 @@
+function quikrWriteOutput(directory,filename,xstar,pathtotrainingdatabase,taxonomicrank)
+%taxonomic rank: 1=kindom, 2=phylum, 3=class, 4=order, 5=family,
+%6=genus, 7=species, 8=strain
+%quikrWriteOutput(directory,filename,xstar,trainingdatabase,taxonomicrank)
+%will output a Comma Seperated (CSV) file with the file name 'filename' in the directory
+%'directory' that gives the summary of concentrations of the reconstruction
+%at the taxonomic rank specified by 'taxonomicrank'. The database located
+%at'pathtotrainingdatabase' must be in the fasta format with the headers
+%in the format output by TaxCollector.
+%(see Giongo et al, 2010 TaxCollector: Modifying Current 16S rRNA
+%Databases for the Rapid Classification at Six Taxonomic Levels, Diversity.
+%doi:10.3390/d2071015).
+%The output excel file is suitable for use in visualiztion software such as
+%Krona (Ondov, Bergman, and Phillippy, 2011, Interactive Metagenomic
+%Visualiztion in a Web Browser, BMC bioinformatics,
+%doi:10.1186/1471-2105-12-385).
+if nargin>5
+ error('too many input arguments. Needs: directory, output filename, xstar, path to training database, and taxonomic rank');
+end
+if nargin<5
+ error('too few input arguments. Needs: directory, output filename, xstar, path to training database, and taxonomic rank');
+end
+[headers,~]=fastaread(pathtotrainingdatabase);%read in the headers of the training database
+
+for i=1:length(headers)
+ if not(length(regexp(headers{2},'\[\d\]'))==9)
+ error(sprintf('The header in entry %d in the training database is not in the Tax Collector format: eg ">[1]kingdom[2]phylum[3]class[4]order[5]family[6]genus[7]species[8]strain|accession"',i));
+ end
+end
+
+
+seqnames=headers(find(xstar)); %get the names of the sequences that are given nonzero concentration in the reconstruction xstar
+basis=find(xstar); %the indicies that have nonzero concentrations
+ %seqnamesnew=cell(length(basis),1);
+for i=1:length(basis)
+ seqnamesnew{i}=strcat(sprintf('%f[0] ',xstar(basis(i))),seqnames{i}); %format is concentration, and then the name as it is in the training database
+end
+
+for i=1:length(seqnamesnew)
+ splitup(i,:)=regexp(seqnamesnew{i},'\[\d\]','split'); %split everything up on the token [%d] as per the TaxCollector format
+end
+%xlswrite(fullfile(directory,[filename '.xls']),splitup);
+%so there's a problem: trainset7_112011 is not in the tax collector format.
+%So one thing I could do is use the accessions or something like that to
+%get the corresponding headers from TaxCollectorRDP10_28 and create a new
+%trainset7_112011 (or at least, one with new headers that then work
+%according to TaxCollector).
+
+%So it does look like that for each header in trainset7_112011, after the >
+%and before the | that "accession" also appears at the end of some header in
+%TaxCollectorRDP10_28 (after the | on the end). So I'd like to make a new
+%trainset7_112011 in which the headers are simply replaced.
+
+
+%Now I'm going to try to unique at the given taxanomic rank, and then tally
+%up the concentrations of all the corresponding entries in splitup
+ranknames=unique(splitup(:,taxonomicrank+2)); %get the unique names at the given taxonomic rank
+concentrations=str2double(splitup(:,1)); %pull off the concentrations
+
+for i=1:length(ranknames) %for each of the names
+ selectlist=strcmp(splitup(:,taxonomicrank+2),ranknames(i)); %get the rows that have the given name in the proper taxonomic rank
+ %newsplitup(i,:)={num2str(sum(concentrations(selectlist)),'%f'),ranknames(i)}; %sum up the concentrations in all these rows and put that next to the names just down to the given taxonomic rank
+ newsplitup(i,:)={num2str(sum(concentrations(selectlist)),'%f'),splitup{find(selectlist,1),3:taxonomicrank+2}};
+end
+%xlswrite(fullfile(directory,[filename '.xls']),newsplitup); %write this as the final output
+
+fid=fopen(fullfile(directory,[filename '.csv']),'w');
+if fid==-1
+ error(sprintf('The file %s is currently write-protected (e.g. open or in use in a different program)', fullfile(directory,[filename '.csv'])));
+end
+[nrows,ncols]=size(newsplitup);
+mystr=['%s,'];
+for cols=1:ncols-2
+ mystr=[mystr ' ' '%s,'];
+end
+%mystr=[mystr '\n'];
+for row=1:nrows
+ fprintf(fid,[mystr '%s\n'],newsplitup{row,:});
+end
+fclose(fid);
+
+
+
+
+
+
+
+
+