From 60ee644d6f1add6722eb8444385f203c6110975a Mon Sep 17 00:00:00 2001
From: Calvin Morrison <mutantturkey@gmail.com>
Date: Wed, 12 Jun 2013 09:10:25 -0400
Subject: Matlab: move multifasta2otu into the main quikr folder

---
 src/matlab/README_for_multifasta2otu               |  27 +++
 src/matlab/multifasta2otu/README                   |  27 ---
 .../multifasta2otu/multifasta2otutable_gg94_2011.m | 219 ---------------------
 .../multifasta2otu/multifasta2otutable_rdp7.m      | 212 --------------------
 src/matlab/multifasta2otutable_gg94_2011.m         | 219 +++++++++++++++++++++
 src/matlab/multifasta2otutable_rdp7.m              | 212 ++++++++++++++++++++
 6 files changed, 458 insertions(+), 458 deletions(-)
 create mode 100644 src/matlab/README_for_multifasta2otu
 delete mode 100644 src/matlab/multifasta2otu/README
 delete mode 100644 src/matlab/multifasta2otu/multifasta2otutable_gg94_2011.m
 delete mode 100644 src/matlab/multifasta2otu/multifasta2otutable_rdp7.m
 create mode 100644 src/matlab/multifasta2otutable_gg94_2011.m
 create mode 100644 src/matlab/multifasta2otutable_rdp7.m

(limited to 'src/matlab')
diff --git a/src/matlab/README_for_multifasta2otu b/src/matlab/README_for_multifasta2otu
new file mode 100644
index 0000000..05a26c6
--- /dev/null
+++ b/src/matlab/README_for_multifasta2otu
@@ -0,0 +1,27 @@
+* Quikr multifasta->otu_table_(for_qiime_use) wrapper code written by Gail Rosen -- 2/1/2013
+
+Usage tips:
+* Please name fasta files of sample reads with <sample id>.fa<*> and place them into one directory without any other file in that directory (for example, no hidden files that the operating system may generate, are allowed in that directory)
+* Note: When making your QIIME Metadata file, the sample id's must match the fasta file prefix names
+* Fasta files of reads must have a suffix that starts with .fa (e.g.: .fasta and .fa are valid while .fna is NOT)
+* Modify the top of the Matlab/Octave scripts for <input_directory>, <output_directory>, <output_filename>, and <training_database_filename>
+
+To use with QIIME, one must run the QIIME conversion tool on our OTU table output:
+convert_biom.py -i <quikr_otu_table.txt> -o <quikr_otu>.biom --biom_table_type="otu table"
+
+---------------------------
+
+4-step QIIME procedure after using Quikr to obtain 3D PCoA graphs:
+(Note: Our code works much better with WEIGHTED Unifrac as opposed to 
+Unweighted.)
+
+Pre-requisites:
+1. <quikr_otu_table.txt>
+2. the tree of the database sequences that were used (e.g. rdp7_mafft.fasttree, gg_94_otus_4feb2011.tre. they are in the data directory)
+3. your-defined <qiime_metadata_file.txt>
+
+1. convert_biom.py -i <quikr_otu_table.txt> -o <quikr_otu>.biom --biom_table_type="otu table"
+2. beta_diversity.py -i <quikr_otu>.biom -m weighted_unifrac -o beta_div -t <tree file (example: rdp7_mafft.fasttree)>
+3. principal_coordinates.py -i beta_div/weighted_unifrac_<quikr_otu>.txt -o <quikr_otu_project_name>_weighted.txt
+4. make_3d_plots.py -i <quikr_otu_project_name>_weighted.txt -o <3d_pcoa_plotdirectory> -m <qiime_metadata_file>
+
diff --git a/src/matlab/multifasta2otu/README b/src/matlab/multifasta2otu/README
deleted file mode 100644
index 05a26c6..0000000
--- a/src/matlab/multifasta2otu/README
+++ /dev/null
@@ -1,27 +0,0 @@
-* Quikr multifasta->otu_table_(for_qiime_use) wrapper code written by Gail Rosen -- 2/1/2013
-
-Usage tips:
-* Please name fasta files of sample reads with <sample id>.fa<*> and place them into one directory without any other file in that directory (for example, no hidden files that the operating system may generate, are allowed in that directory)
-* Note: When making your QIIME Metadata file, the sample id's must match the fasta file prefix names
-* Fasta files of reads must have a suffix that starts with .fa (e.g.: .fasta and .fa are valid while .fna is NOT)
-* Modify the top of the Matlab/Octave scripts for <input_directory>, <output_directory>, <output_filename>, and <training_database_filename>
-
-To use with QIIME, one must run the QIIME conversion tool on our OTU table output:
-convert_biom.py -i <quikr_otu_table.txt> -o <quikr_otu>.biom --biom_table_type="otu table"
-
----------------------------
-
-4-step QIIME procedure after using Quikr to obtain 3D PCoA graphs:
-(Note: Our code works much better with WEIGHTED Unifrac as opposed to 
-Unweighted.)
-
-Pre-requisites:
-1. <quikr_otu_table.txt>
-2. the tree of the database sequences that were used (e.g. rdp7_mafft.fasttree, gg_94_otus_4feb2011.tre. they are in the data directory)
-3. your-defined <qiime_metadata_file.txt>
-
-1. convert_biom.py -i <quikr_otu_table.txt> -o <quikr_otu>.biom --biom_table_type="otu table"
-2. beta_diversity.py -i <quikr_otu>.biom -m weighted_unifrac -o beta_div -t <tree file (example: rdp7_mafft.fasttree)>
-3. principal_coordinates.py -i beta_div/weighted_unifrac_<quikr_otu>.txt -o <quikr_otu_project_name>_weighted.txt
-4. make_3d_plots.py -i <quikr_otu_project_name>_weighted.txt -o <3d_pcoa_plotdirectory> -m <qiime_metadata_file>
-
diff --git a/src/matlab/multifasta2otu/multifasta2otutable_gg94_2011.m b/src/matlab/multifasta2otu/multifasta2otutable_gg94_2011.m
deleted file mode 100644
index 9078487..0000000
--- a/src/matlab/multifasta2otu/multifasta2otutable_gg94_2011.m
+++ /dev/null
@@ -1,219 +0,0 @@
-% Quikr multifasta->otu_table_(for_qiime_use) wrapper code written by Gail Rosen -- 2/1/2013
-%This is an example of how to run Multifasta Quikr with a custom 
-%training database (in this case Greengenes OTU's within 94% identity)
-
-%make sure Matlab/Octave is in your path
-%cd /path/to/Quikr
-
-%User-defined variables
-input_directory='../separated_samples'; %path to input directory of samples
-output_directory='quikr_results'; %path to where want output files to go
-otu_table_name='gg1194_otu_octave.txt'; %name of output otu_table filename
-trainingdatabasefilename='../../../data/gg_94_otus_4feb2011.fasta'; %full path to the FASTA file you wish to use as a training database
-
-
-mkdir([output_directory]);
-thedirs=dir([input_directory]);
-thetime=zeros(numel(thedirs)-1,1);
-names={};
-
-if(exist('OCTAVE_VERSION')) %check to see if running Octave or Matlab
-
-%This is Octave Version
-
-tic();
-k=6; %pick a k-mer size
-trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database
-disp('Training time:')
-[headers,~]=fastaread(trainingdatabasefilename); %read in the training database
-lambda=10000; 
-training_time=toc()
-
-species=struct();
-keys={};
-
-tic();
-
-
-i=0;
-%for numdirs=3:5
-for numdirs=3:numel(thedirs)
-i=i+1;
-disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)])
-fastafilename=[input_directory '/' thedirs(numdirs).name];
-[loadfasta,~]=fastaread(fastafilename);
-numreads=numel(loadfasta);
-xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda);
-
-nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
-proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
-namescell=headers(nonzeroentries); %Get the names of the sequences
-namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
-
-[a cols]=size(namesandproportions);
-amount=zeros(cols,1);
-for j=1:cols
-  names{j}=['s' namesandproportions{1,j}];
-  amount(j)=namesandproportions{2,j};
-  if strcmp(keys,names{j})
-	temp=species.(names{j});
-      	temp(i)=round(amount(j).*numreads);
-      	species.(names{j})=temp;
-  else
-      temp=zeros(numel(thedirs)-3+1,1);
-      temp(i)=round(amount(j).*numreads);
-      if temp(i)==0
-         %insignificant counts, do nothing
-      else
-         species.(names{j})=temp;
-         keys{end+1}=names{j};
-      end
-  end
-end
-
-thefa=strfind(thedirs(numdirs).name,'.fa');
-
-if ~isempty(thedirs(numdirs).name(1:thefa-1))
-	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
-else
-	sampleid{i}='empty_sampleid';
-end
-
-thetime(i+1)=toc();
-thetime(i+1)
-
-end
-
-disp('Total time to compute Quikr:')
-toc()
-disp('Quikr Average time per file:')
-mean(diff(thetime(1:i+1)))
-
-tic()
-numits=i;
-
-fid=fopen([output_directory '/' otu_table_name],'w');
-fprintf(fid,'# QIIME vGail OTU table\n');
-fprintf(fid,'#OTU_ID\t');
-for i=1:numits
-if i<numits
-fprintf(fid,'%s\t',sampleid{i});
-else
-fprintf(fid,'%s',sampleid{i});
-end
-end
-fprintf(fid,'\n');
-
-for k=1:numel(keys)
- fprintf(fid,'%s',keys{k}(2:end))
- temp(:,k)=species.(keys{k});
-        for i=1:numits
-                fprintf(fid,'\t%d',temp(i,k));
-        end
-fprintf(fid,'\n');
-end
-fclose(fid);
-
-disp('Time to output OTU Table:')
-toc()
-
-else
-
-%This is Matlab Version
-
-tic()
-k=6; %pick a k-mer size
-trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database
-'Training time:'
-[headers,~]=fastaread(trainingdatabasefilename); %read in the training database
-lambda=10000; 
-training_time=toc()
-
-species=containers.Map;
-
-tic()
-
-
-i=0;
-%for numdirs=3:5
-for numdirs=3:numel(thedirs)
-i=i+1;
-[num2str(i) ' out of ' num2str(numel(thedirs)-2)]
-fastafilename=[input_directory '/' thedirs(numdirs).name];
-[loadfasta,~]=fastaread(fastafilename);
-numreads=numel(loadfasta);
-xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda);
-
-nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
-proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
-namescell=headers(nonzeroentries); %Get the names of the sequences
-namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
-
-[a cols]=size(namesandproportions);
-amount=zeros(cols,1);
-for j=1:cols
-  names{j}=namesandproportions{1,j};
-  amount(j)=namesandproportions{2,j};
-  if isKey(species,names{j})
-	temp=species(names{j});
-      	temp(i)=round(amount(j).*numreads);
-      	species(names{j})=temp;
-  else
-      temp=zeros(numel(thedirs)-3+1,1);
-      temp(i)=round(amount(j).*numreads);
-      if temp(i)==0
-	%insigficant counts, do nothing
-      else
-	species(names{j})=temp;
-      end
-  end
-end
-
-thefa=strfind(thedirs(numdirs).name,'.fa');
-
-if ~isempty(thedirs(numdirs).name(1:thefa-1))
-	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
-else
-	sampleid{i}='empty_sampleid';
-end
-
-thetime(i+1)=toc();
-thetime(i+1)
-
-end
-
-'Total time to compute Quikr:'
-toc()
-'Quikr Average time per file:'
-mean(diff(thetime(1:i+1)))
-
-tic
-numits=i;
-
-fid=fopen([output_directory '/' otu_table_name],'w');
-fprintf(fid,'# QIIME vGail OTU table\n');
-fprintf(fid,'#OTU_ID\t');
-for i=1:numits
-if i<numits
-fprintf(fid,'%s\t',sampleid{i});
-else
-fprintf(fid,'%s',sampleid{i});
-end
-end
-fprintf(fid,'\n');
-
-thekeys=species.keys;
-for k=1:species.Count
- fprintf(fid,'%s',thekeys{k})
- temp(:,k)=species(thekeys{k});
-        for i=1:numits
-                fprintf(fid,'\t%d',temp(i,k));
-        end
-fprintf(fid,'\n');
-end
-fclose(fid);
-
-'Time to output OTU Table:'
-toc
-
-end
diff --git a/src/matlab/multifasta2otu/multifasta2otutable_rdp7.m b/src/matlab/multifasta2otu/multifasta2otutable_rdp7.m
deleted file mode 100644
index 9c83ea9..0000000
--- a/src/matlab/multifasta2otu/multifasta2otutable_rdp7.m
+++ /dev/null
@@ -1,212 +0,0 @@
-% Quikr multifasta->otu_table_(for_qiime_use) wrapper code written by Gail Rosen -- 2/1/2013
-%This is an example of how to run Quikr on the default RDP_7 training set
-
-%make sure Matlab/Octave is in your path
-%cd /path/to/Quikr
-
-%User-defined variables
-input_directory='../../separated_samples'; %path to input directory of samples
-output_directory='quikr_results'; %path to where want output files to go
-otu_table_name='rdp_otu_table.txt'; %name of output otu_table filename
-%Do not have to define trainingdatabase file here
-[headers,~]=fastaread('../../../data/trainset7_112011.fa'); %read in the training database
-
-mkdir([output_directory])
-thedirs=dir([input_directory]);
-thetime=zeros(numel(thedirs)-1,1);
-names={};
-
-
-if(exist('OCTAVE_VERSION')) %check to see if running Octave or Matlab
-
-%This is Octave Version
-
-
-species=struct();
-keys={};
-
-tic()
-
-i=0;
-%for numdirs=3:5
-for numdirs=3:numel(thedirs)
-i=i+1;
-disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)])
-fastafilename=[input_directory '/' thedirs(numdirs).name];
-[loadfasta,~]=fastaread(fastafilename);
-numreads=numel(loadfasta);
-xstar=quikr(fastafilename); %this will give the predicted reconstruction frequencies
-
-nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
-proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
-namescell=headers(nonzeroentries); %Get the names of the sequences
-namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
-
-[a cols]=size(namesandproportions);
-amount=zeros(cols,1);
-for j=1:cols
-  names{j}=namesandproportions{1,j}(1:strfind(namesandproportions{1,j},' ')-1);
-  names{j}=strrep(names{j},'|','_');
-  amount(j)=namesandproportions{2,j};
-if strcmp(keys,names{j})
-	temp=species.(names{j});
-      	temp(i)=round(amount(j).*numreads);
-      	species.(names{j})=temp;
-  else
-      temp=zeros(numel(thedirs)-3+1,1);
-      temp(i)=round(amount(j).*numreads);
-      if temp(i)==0
-	% do not make a key -- has insignificant counts
-      else
-      species.(names{j})=temp;
-      keys{end+1}=names{j};
-      end
-  end
-end
-
-thefa=strfind(thedirs(numdirs).name,'.fa');
-
-if ~isempty(thedirs(numdirs).name(1:thefa-1))
-	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
-else
-	sampleid{i}='empty_sampleid';
-end
-thetime(i+1)=toc();
-thetime(i+1)
-
-end
-
-disp('Total time to compute Quikr:')
-toc()
-disp('Quikr Average time per file:')
-mean(diff(thetime(1:i+1)))
-
-tic();
-numits=i;
-
-
-fid=fopen([output_directory '/' otu_table_name],'w');
-fprintf(fid,'# QIIME vGail OTU table\n');
-fprintf(fid,'#OTU_ID\t');
-for i=1:numits
-if i<numits
-fprintf(fid,'%s\t',sampleid{i});
-else
-fprintf(fid,'%s',sampleid{i});
-end
-end
-fprintf(fid,'\n');
-
-for k=1:numel(keys)
-truncname=strrep(keys{k},'_','|');
-  fprintf(fid,'%s',truncname);
-
-  temp(:,k)=species.(keys{k});
-        for i=1:numits
-		fprintf(fid,'\t%d',temp(i,k));
-	end
-fprintf(fid,'\n');
-end
-fclose(fid);
-
-disp('Time to output OTU Table:')
-toc()
-
-else
-
-%This is the Matlab version
-
-
-species=containers.Map;
-
-tic()
-
-i=0;
-%for numdirs=3:6
-for numdirs=3:numel(thedirs)
-i=i+1;
-[num2str(i) ' out of ' num2str(numel(thedirs)-2)]
-fastafilename=[input_directory '/' thedirs(numdirs).name];
-[loadfasta,~]=fastaread(fastafilename);
-numreads=numel(loadfasta);
-xstar=quikr(fastafilename); %this will give the predicted reconstruction frequencies
-
-nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
-proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
-namescell=headers(nonzeroentries); %Get the names of the sequences
-namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
-
-[a cols]=size(namesandproportions);
-amount=zeros(cols,1);
-for j=1:cols
-  names{j}=namesandproportions{1,j};
-  amount(j)=namesandproportions{2,j};
-  if isKey(species,names{j})
-      temp=species(names{j});
-      temp(i)=round(amount(j).*numreads);
-      species(names{j})=temp;
-  else
-      temp=zeros(numel(thedirs)-3+1,1);
-      temp(i)=round(amount(j).*numreads);
-      if temp(i)==0
-	% insignificant counts -- do not put
-      else
-        species(names{j})=temp;
-      end
-  end
-end
-
-thefa=strfind(thedirs(numdirs).name,'.fa');
-
-if ~isempty(thedirs(numdirs).name(1:thefa-1))
-	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
-else
-	sampleid{i}='empty_sampleid';
-end
-thetime(i+1)=toc();
-thetime(i+1)
-
-end
-
-'Total time to compute Quikr:'
-toc()
-'Quikr Average time per file:'
-mean(diff(thetime(1:i+1)))
-
-tic
-numits=i;
-
-
-fid=fopen([output_directory '/' otu_table_name],'w');
-fprintf(fid,'# QIIME vGail OTU table\n');
-fprintf(fid,'#OTU_ID\t');
-for i=1:numits
-if i<numits
-fprintf(fid,'%s\t',sampleid{i});
-else
-fprintf(fid,'%s',sampleid{i});
-end
-end
-fprintf(fid,'\n');
-
-thekeys=species.keys;
-for k=1:species.Count
-if k==1
-  space=thekeys{k}(20);
-end
-delimit =strfind(thekeys{k},space);
-truncname=thekeys{k}(1:delimit-1);
-  fprintf(fid,'%s',truncname);
-
-  temp(:,k)=species(thekeys{k});
-        for i=1:numits
-		fprintf(fid,'\t%d',temp(i,k));
-	end
-fprintf(fid,'\n');
-end
-fclose(fid);
-
-'Time to output OTU Table:'
-toc
-
-end
diff --git a/src/matlab/multifasta2otutable_gg94_2011.m b/src/matlab/multifasta2otutable_gg94_2011.m
new file mode 100644
index 0000000..9078487
--- /dev/null
+++ b/src/matlab/multifasta2otutable_gg94_2011.m
@@ -0,0 +1,219 @@
+% Quikr multifasta->otu_table_(for_qiime_use) wrapper code written by Gail Rosen -- 2/1/2013
+%This is an example of how to run Multifasta Quikr with a custom 
+%training database (in this case Greengenes OTU's within 94% identity)
+
+%make sure Matlab/Octave is in your path
+%cd /path/to/Quikr
+
+%User-defined variables
+input_directory='../separated_samples'; %path to input directory of samples
+output_directory='quikr_results'; %path to where want output files to go
+otu_table_name='gg1194_otu_octave.txt'; %name of output otu_table filename
+trainingdatabasefilename='../../../data/gg_94_otus_4feb2011.fasta'; %full path to the FASTA file you wish to use as a training database
+
+
+mkdir([output_directory]);
+thedirs=dir([input_directory]);
+thetime=zeros(numel(thedirs)-1,1);
+names={};
+
+if(exist('OCTAVE_VERSION')) %check to see if running Octave or Matlab
+
+%This is Octave Version
+
+tic();
+k=6; %pick a k-mer size
+trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database
+disp('Training time:')
+[headers,~]=fastaread(trainingdatabasefilename); %read in the training database
+lambda=10000; 
+training_time=toc()
+
+species=struct();
+keys={};
+
+tic();
+
+
+i=0;
+%for numdirs=3:5
+for numdirs=3:numel(thedirs)
+i=i+1;
+disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)])
+fastafilename=[input_directory '/' thedirs(numdirs).name];
+[loadfasta,~]=fastaread(fastafilename);
+numreads=numel(loadfasta);
+xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda);
+
+nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
+proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
+namescell=headers(nonzeroentries); %Get the names of the sequences
+namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
+
+[a cols]=size(namesandproportions);
+amount=zeros(cols,1);
+for j=1:cols
+  names{j}=['s' namesandproportions{1,j}];
+  amount(j)=namesandproportions{2,j};
+  if strcmp(keys,names{j})
+	temp=species.(names{j});
+      	temp(i)=round(amount(j).*numreads);
+      	species.(names{j})=temp;
+  else
+      temp=zeros(numel(thedirs)-3+1,1);
+      temp(i)=round(amount(j).*numreads);
+      if temp(i)==0
+         %insignificant counts, do nothing
+      else
+         species.(names{j})=temp;
+         keys{end+1}=names{j};
+      end
+  end
+end
+
+thefa=strfind(thedirs(numdirs).name,'.fa');
+
+if ~isempty(thedirs(numdirs).name(1:thefa-1))
+	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
+else
+	sampleid{i}='empty_sampleid';
+end
+
+thetime(i+1)=toc();
+thetime(i+1)
+
+end
+
+disp('Total time to compute Quikr:')
+toc()
+disp('Quikr Average time per file:')
+mean(diff(thetime(1:i+1)))
+
+tic()
+numits=i;
+
+fid=fopen([output_directory '/' otu_table_name],'w');
+fprintf(fid,'# QIIME vGail OTU table\n');
+fprintf(fid,'#OTU_ID\t');
+for i=1:numits
+if i<numits
+fprintf(fid,'%s\t',sampleid{i});
+else
+fprintf(fid,'%s',sampleid{i});
+end
+end
+fprintf(fid,'\n');
+
+for k=1:numel(keys)
+ fprintf(fid,'%s',keys{k}(2:end))
+ temp(:,k)=species.(keys{k});
+        for i=1:numits
+                fprintf(fid,'\t%d',temp(i,k));
+        end
+fprintf(fid,'\n');
+end
+fclose(fid);
+
+disp('Time to output OTU Table:')
+toc()
+
+else
+
+%This is Matlab Version
+
+tic()
+k=6; %pick a k-mer size
+trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database
+'Training time:'
+[headers,~]=fastaread(trainingdatabasefilename); %read in the training database
+lambda=10000; 
+training_time=toc()
+
+species=containers.Map;
+
+tic()
+
+
+i=0;
+%for numdirs=3:5
+for numdirs=3:numel(thedirs)
+i=i+1;
+[num2str(i) ' out of ' num2str(numel(thedirs)-2)]
+fastafilename=[input_directory '/' thedirs(numdirs).name];
+[loadfasta,~]=fastaread(fastafilename);
+numreads=numel(loadfasta);
+xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda);
+
+nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
+proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
+namescell=headers(nonzeroentries); %Get the names of the sequences
+namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
+
+[a cols]=size(namesandproportions);
+amount=zeros(cols,1);
+for j=1:cols
+  names{j}=namesandproportions{1,j};
+  amount(j)=namesandproportions{2,j};
+  if isKey(species,names{j})
+	temp=species(names{j});
+      	temp(i)=round(amount(j).*numreads);
+      	species(names{j})=temp;
+  else
+      temp=zeros(numel(thedirs)-3+1,1);
+      temp(i)=round(amount(j).*numreads);
+      if temp(i)==0
+	%insigficant counts, do nothing
+      else
+	species(names{j})=temp;
+      end
+  end
+end
+
+thefa=strfind(thedirs(numdirs).name,'.fa');
+
+if ~isempty(thedirs(numdirs).name(1:thefa-1))
+	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
+else
+	sampleid{i}='empty_sampleid';
+end
+
+thetime(i+1)=toc();
+thetime(i+1)
+
+end
+
+'Total time to compute Quikr:'
+toc()
+'Quikr Average time per file:'
+mean(diff(thetime(1:i+1)))
+
+tic
+numits=i;
+
+fid=fopen([output_directory '/' otu_table_name],'w');
+fprintf(fid,'# QIIME vGail OTU table\n');
+fprintf(fid,'#OTU_ID\t');
+for i=1:numits
+if i<numits
+fprintf(fid,'%s\t',sampleid{i});
+else
+fprintf(fid,'%s',sampleid{i});
+end
+end
+fprintf(fid,'\n');
+
+thekeys=species.keys;
+for k=1:species.Count
+ fprintf(fid,'%s',thekeys{k})
+ temp(:,k)=species(thekeys{k});
+        for i=1:numits
+                fprintf(fid,'\t%d',temp(i,k));
+        end
+fprintf(fid,'\n');
+end
+fclose(fid);
+
+'Time to output OTU Table:'
+toc
+
+end
diff --git a/src/matlab/multifasta2otutable_rdp7.m b/src/matlab/multifasta2otutable_rdp7.m
new file mode 100644
index 0000000..9c83ea9
--- /dev/null
+++ b/src/matlab/multifasta2otutable_rdp7.m
@@ -0,0 +1,212 @@
+% Quikr multifasta->otu_table_(for_qiime_use) wrapper code written by Gail Rosen -- 2/1/2013
+%This is an example of how to run Quikr on the default RDP_7 training set
+
+%make sure Matlab/Octave is in your path
+%cd /path/to/Quikr
+
+%User-defined variables
+input_directory='../../separated_samples'; %path to input directory of samples
+output_directory='quikr_results'; %path to where want output files to go
+otu_table_name='rdp_otu_table.txt'; %name of output otu_table filename
+%Do not have to define trainingdatabase file here
+[headers,~]=fastaread('../../../data/trainset7_112011.fa'); %read in the training database
+
+mkdir([output_directory])
+thedirs=dir([input_directory]);
+thetime=zeros(numel(thedirs)-1,1);
+names={};
+
+
+if(exist('OCTAVE_VERSION')) %check to see if running Octave or Matlab
+
+%This is Octave Version
+
+
+species=struct();
+keys={};
+
+tic()
+
+i=0;
+%for numdirs=3:5
+for numdirs=3:numel(thedirs)
+i=i+1;
+disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)])
+fastafilename=[input_directory '/' thedirs(numdirs).name];
+[loadfasta,~]=fastaread(fastafilename);
+numreads=numel(loadfasta);
+xstar=quikr(fastafilename); %this will give the predicted reconstruction frequencies
+
+nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
+proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
+namescell=headers(nonzeroentries); %Get the names of the sequences
+namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
+
+[a cols]=size(namesandproportions);
+amount=zeros(cols,1);
+for j=1:cols
+  names{j}=namesandproportions{1,j}(1:strfind(namesandproportions{1,j},' ')-1);
+  names{j}=strrep(names{j},'|','_');
+  amount(j)=namesandproportions{2,j};
+if strcmp(keys,names{j})
+	temp=species.(names{j});
+      	temp(i)=round(amount(j).*numreads);
+      	species.(names{j})=temp;
+  else
+      temp=zeros(numel(thedirs)-3+1,1);
+      temp(i)=round(amount(j).*numreads);
+      if temp(i)==0
+	% do not make a key -- has insignificant counts
+      else
+      species.(names{j})=temp;
+      keys{end+1}=names{j};
+      end
+  end
+end
+
+thefa=strfind(thedirs(numdirs).name,'.fa');
+
+if ~isempty(thedirs(numdirs).name(1:thefa-1))
+	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
+else
+	sampleid{i}='empty_sampleid';
+end
+thetime(i+1)=toc();
+thetime(i+1)
+
+end
+
+disp('Total time to compute Quikr:')
+toc()
+disp('Quikr Average time per file:')
+mean(diff(thetime(1:i+1)))
+
+tic();
+numits=i;
+
+
+fid=fopen([output_directory '/' otu_table_name],'w');
+fprintf(fid,'# QIIME vGail OTU table\n');
+fprintf(fid,'#OTU_ID\t');
+for i=1:numits
+if i<numits
+fprintf(fid,'%s\t',sampleid{i});
+else
+fprintf(fid,'%s',sampleid{i});
+end
+end
+fprintf(fid,'\n');
+
+for k=1:numel(keys)
+truncname=strrep(keys{k},'_','|');
+  fprintf(fid,'%s',truncname);
+
+  temp(:,k)=species.(keys{k});
+        for i=1:numits
+		fprintf(fid,'\t%d',temp(i,k));
+	end
+fprintf(fid,'\n');
+end
+fclose(fid);
+
+disp('Time to output OTU Table:')
+toc()
+
+else
+
+%This is the Matlab version
+
+
+species=containers.Map;
+
+tic()
+
+i=0;
+%for numdirs=3:6
+for numdirs=3:numel(thedirs)
+i=i+1;
+[num2str(i) ' out of ' num2str(numel(thedirs)-2)]
+fastafilename=[input_directory '/' thedirs(numdirs).name];
+[loadfasta,~]=fastaread(fastafilename);
+numreads=numel(loadfasta);
+xstar=quikr(fastafilename); %this will give the predicted reconstruction frequencies
+
+nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
+proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
+namescell=headers(nonzeroentries); %Get the names of the sequences
+namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
+
+[a cols]=size(namesandproportions);
+amount=zeros(cols,1);
+for j=1:cols
+  names{j}=namesandproportions{1,j};
+  amount(j)=namesandproportions{2,j};
+  if isKey(species,names{j})
+      temp=species(names{j});
+      temp(i)=round(amount(j).*numreads);
+      species(names{j})=temp;
+  else
+      temp=zeros(numel(thedirs)-3+1,1);
+      temp(i)=round(amount(j).*numreads);
+      if temp(i)==0
+	% insignificant counts -- do not put
+      else
+        species(names{j})=temp;
+      end
+  end
+end
+
+thefa=strfind(thedirs(numdirs).name,'.fa');
+
+if ~isempty(thedirs(numdirs).name(1:thefa-1))
+	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
+else
+	sampleid{i}='empty_sampleid';
+end
+thetime(i+1)=toc();
+thetime(i+1)
+
+end
+
+'Total time to compute Quikr:'
+toc()
+'Quikr Average time per file:'
+mean(diff(thetime(1:i+1)))
+
+tic
+numits=i;
+
+
+fid=fopen([output_directory '/' otu_table_name],'w');
+fprintf(fid,'# QIIME vGail OTU table\n');
+fprintf(fid,'#OTU_ID\t');
+for i=1:numits
+if i<numits
+fprintf(fid,'%s\t',sampleid{i});
+else
+fprintf(fid,'%s',sampleid{i});
+end
+end
+fprintf(fid,'\n');
+
+thekeys=species.keys;
+for k=1:species.Count
+if k==1
+  space=thekeys{k}(20);
+end
+delimit =strfind(thekeys{k},space);
+truncname=thekeys{k}(1:delimit-1);
+  fprintf(fid,'%s',truncname);
+
+  temp(:,k)=species(thekeys{k});
+        for i=1:numits
+		fprintf(fid,'\t%d',temp(i,k));
+	end
+fprintf(fid,'\n');
+end
+fclose(fid);
+
+'Time to output OTU Table:'
+toc
+
+end
-- 
cgit v1.2.3