aboutsummaryrefslogtreecommitdiff
path: root/src/matlab/quikrWriteOutput.m
diff options
context:
space:
mode:
authorCalvin <calvin@EESI>2013-03-11 12:36:42 -0400
committerCalvin <calvin@EESI>2013-03-11 12:36:42 -0400
commit4ca6f92ceb4b2f8c504431cf56f8a6135187a61c (patch)
treebda30f1f0710d19660c04c57f31be37cf4865c50 /src/matlab/quikrWriteOutput.m
parentc18793c210ea44cef63db3568791cbad537bdb79 (diff)
adding matlab code
Diffstat (limited to 'src/matlab/quikrWriteOutput.m')
-rw-r--r--src/matlab/quikrWriteOutput.m89
1 files changed, 89 insertions, 0 deletions
diff --git a/src/matlab/quikrWriteOutput.m b/src/matlab/quikrWriteOutput.m
new file mode 100644
index 0000000..7dd5c3b
--- /dev/null
+++ b/src/matlab/quikrWriteOutput.m
@@ -0,0 +1,89 @@
+function quikrWriteOutput(directory,filename,xstar,pathtotrainingdatabase,taxonomicrank)
+%taxonomic rank: 1=kindom, 2=phylum, 3=class, 4=order, 5=family,
+%6=genus, 7=species, 8=strain
+%quikrWriteOutput(directory,filename,xstar,trainingdatabase,taxonomicrank)
+%will output a Comma Seperated (CSV) file with the file name 'filename' in the directory
+%'directory' that gives the summary of concentrations of the reconstruction
+%at the taxonomic rank specified by 'taxonomicrank'. The database located
+%at'pathtotrainingdatabase' must be in the fasta format with the headers
+%in the format output by TaxCollector.
+%(see Giongo et al, 2010 TaxCollector: Modifying Current 16S rRNA
+%Databases for the Rapid Classification at Six Taxonomic Levels, Diversity.
+%doi:10.3390/d2071015).
+%The output excel file is suitable for use in visualiztion software such as
+%Krona (Ondov, Bergman, and Phillippy, 2011, Interactive Metagenomic
+%Visualiztion in a Web Browser, BMC bioinformatics,
+%doi:10.1186/1471-2105-12-385).
+if nargin>5
+ error('too many input arguments. Needs: directory, output filename, xstar, path to training database, and taxonomic rank');
+end
+if nargin<5
+ error('too few input arguments. Needs: directory, output filename, xstar, path to training database, and taxonomic rank');
+end
+[headers,~]=fastaread(pathtotrainingdatabase);%read in the headers of the training database
+
+for i=1:length(headers)
+ if not(length(regexp(headers{2},'\[\d\]'))==9)
+ error(sprintf('The header in entry %d in the training database is not in the Tax Collector format: eg ">[1]kingdom[2]phylum[3]class[4]order[5]family[6]genus[7]species[8]strain|accession"',i));
+ end
+end
+
+
+seqnames=headers(find(xstar)); %get the names of the sequences that are given nonzero concentration in the reconstruction xstar
+basis=find(xstar); %the indicies that have nonzero concentrations
+ %seqnamesnew=cell(length(basis),1);
+for i=1:length(basis)
+ seqnamesnew{i}=strcat(sprintf('%f[0] ',xstar(basis(i))),seqnames{i}); %format is concentration, and then the name as it is in the training database
+end
+
+for i=1:length(seqnamesnew)
+ splitup(i,:)=regexp(seqnamesnew{i},'\[\d\]','split'); %split everything up on the token [%d] as per the TaxCollector format
+end
+%xlswrite(fullfile(directory,[filename '.xls']),splitup);
+%so there's a problem: trainset7_112011 is not in the tax collector format.
+%So one thing I could do is use the accessions or something like that to
+%get the corresponding headers from TaxCollectorRDP10_28 and create a new
+%trainset7_112011 (or at least, one with new headers that then work
+%according to TaxCollector).
+
+%So it does look like that for each header in trainset7_112011, after the >
+%and before the | that "accession" also appears at the end of some header in
+%TaxCollectorRDP10_28 (after the | on the end). So I'd like to make a new
+%trainset7_112011 in which the headers are simply replaced.
+
+
+%Now I'm going to try to unique at the given taxanomic rank, and then tally
+%up the concentrations of all the corresponding entries in splitup
+ranknames=unique(splitup(:,taxonomicrank+2)); %get the unique names at the given taxonomic rank
+concentrations=str2double(splitup(:,1)); %pull off the concentrations
+
+for i=1:length(ranknames) %for each of the names
+ selectlist=strcmp(splitup(:,taxonomicrank+2),ranknames(i)); %get the rows that have the given name in the proper taxonomic rank
+ %newsplitup(i,:)={num2str(sum(concentrations(selectlist)),'%f'),ranknames(i)}; %sum up the concentrations in all these rows and put that next to the names just down to the given taxonomic rank
+ newsplitup(i,:)={num2str(sum(concentrations(selectlist)),'%f'),splitup{find(selectlist,1),3:taxonomicrank+2}};
+end
+%xlswrite(fullfile(directory,[filename '.xls']),newsplitup); %write this as the final output
+
+fid=fopen(fullfile(directory,[filename '.csv']),'w');
+if fid==-1
+ error(sprintf('The file %s is currently write-protected (e.g. open or in use in a different program)', fullfile(directory,[filename '.csv'])));
+end
+[nrows,ncols]=size(newsplitup);
+mystr=['%s,'];
+for cols=1:ncols-2
+ mystr=[mystr ' ' '%s,'];
+end
+%mystr=[mystr '\n'];
+for row=1:nrows
+ fprintf(fid,[mystr '%s\n'],newsplitup{row,:});
+end
+fclose(fid);
+
+
+
+
+
+
+
+
+