From 4ca6f92ceb4b2f8c504431cf56f8a6135187a61c Mon Sep 17 00:00:00 2001 From: Calvin Date: Mon, 11 Mar 2013 12:36:42 -0400 Subject: adding matlab code --- src/matlab/quikrWriteOutput.m | 89 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 src/matlab/quikrWriteOutput.m (limited to 'src/matlab/quikrWriteOutput.m') diff --git a/src/matlab/quikrWriteOutput.m b/src/matlab/quikrWriteOutput.m new file mode 100644 index 0000000..7dd5c3b --- /dev/null +++ b/src/matlab/quikrWriteOutput.m @@ -0,0 +1,89 @@ +function quikrWriteOutput(directory,filename,xstar,pathtotrainingdatabase,taxonomicrank) +%taxonomic rank: 1=kindom, 2=phylum, 3=class, 4=order, 5=family, +%6=genus, 7=species, 8=strain +%quikrWriteOutput(directory,filename,xstar,trainingdatabase,taxonomicrank) +%will output a Comma Seperated (CSV) file with the file name 'filename' in the directory +%'directory' that gives the summary of concentrations of the reconstruction +%at the taxonomic rank specified by 'taxonomicrank'. The database located +%at'pathtotrainingdatabase' must be in the fasta format with the headers +%in the format output by TaxCollector. +%(see Giongo et al, 2010 TaxCollector: Modifying Current 16S rRNA +%Databases for the Rapid Classification at Six Taxonomic Levels, Diversity. +%doi:10.3390/d2071015). +%The output excel file is suitable for use in visualiztion software such as +%Krona (Ondov, Bergman, and Phillippy, 2011, Interactive Metagenomic +%Visualiztion in a Web Browser, BMC bioinformatics, +%doi:10.1186/1471-2105-12-385). +if nargin>5 + error('too many input arguments. Needs: directory, output filename, xstar, path to training database, and taxonomic rank'); +end +if nargin<5 + error('too few input arguments. Needs: directory, output filename, xstar, path to training database, and taxonomic rank'); +end +[headers,~]=fastaread(pathtotrainingdatabase);%read in the headers of the training database + +for i=1:length(headers) + if not(length(regexp(headers{2},'\[\d\]'))==9) + error(sprintf('The header in entry %d in the training database is not in the Tax Collector format: eg ">[1]kingdom[2]phylum[3]class[4]order[5]family[6]genus[7]species[8]strain|accession"',i)); + end +end + + +seqnames=headers(find(xstar)); %get the names of the sequences that are given nonzero concentration in the reconstruction xstar +basis=find(xstar); %the indicies that have nonzero concentrations + %seqnamesnew=cell(length(basis),1); +for i=1:length(basis) + seqnamesnew{i}=strcat(sprintf('%f[0] ',xstar(basis(i))),seqnames{i}); %format is concentration, and then the name as it is in the training database +end + +for i=1:length(seqnamesnew) + splitup(i,:)=regexp(seqnamesnew{i},'\[\d\]','split'); %split everything up on the token [%d] as per the TaxCollector format +end +%xlswrite(fullfile(directory,[filename '.xls']),splitup); +%so there's a problem: trainset7_112011 is not in the tax collector format. +%So one thing I could do is use the accessions or something like that to +%get the corresponding headers from TaxCollectorRDP10_28 and create a new +%trainset7_112011 (or at least, one with new headers that then work +%according to TaxCollector). + +%So it does look like that for each header in trainset7_112011, after the > +%and before the | that "accession" also appears at the end of some header in +%TaxCollectorRDP10_28 (after the | on the end). So I'd like to make a new +%trainset7_112011 in which the headers are simply replaced. + + +%Now I'm going to try to unique at the given taxanomic rank, and then tally +%up the concentrations of all the corresponding entries in splitup +ranknames=unique(splitup(:,taxonomicrank+2)); %get the unique names at the given taxonomic rank +concentrations=str2double(splitup(:,1)); %pull off the concentrations + +for i=1:length(ranknames) %for each of the names + selectlist=strcmp(splitup(:,taxonomicrank+2),ranknames(i)); %get the rows that have the given name in the proper taxonomic rank + %newsplitup(i,:)={num2str(sum(concentrations(selectlist)),'%f'),ranknames(i)}; %sum up the concentrations in all these rows and put that next to the names just down to the given taxonomic rank + newsplitup(i,:)={num2str(sum(concentrations(selectlist)),'%f'),splitup{find(selectlist,1),3:taxonomicrank+2}}; +end +%xlswrite(fullfile(directory,[filename '.xls']),newsplitup); %write this as the final output + +fid=fopen(fullfile(directory,[filename '.csv']),'w'); +if fid==-1 + error(sprintf('The file %s is currently write-protected (e.g. open or in use in a different program)', fullfile(directory,[filename '.csv']))); +end +[nrows,ncols]=size(newsplitup); +mystr=['%s,']; +for cols=1:ncols-2 + mystr=[mystr ' ' '%s,']; +end +%mystr=[mystr '\n']; +for row=1:nrows + fprintf(fid,[mystr '%s\n'],newsplitup{row,:}); +end +fclose(fid); + + + + + + + + + -- cgit v1.2.3