summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/cli.markdown2
-rw-r--r--doc/matlab.markdown45
-rw-r--r--src/matlab/multifasta2otu/README22
-rw-r--r--src/matlab/multifasta2otu/multi_gg_final.m210
4 files changed, 273 insertions, 6 deletions
diff --git a/doc/cli.markdown b/doc/cli.markdown
index 87df41b..d4a1bb9 100644
--- a/doc/cli.markdown
+++ b/doc/cli.markdown
@@ -8,7 +8,7 @@ module.
## Quikr\_train ##
The quikr\_train is a tool to train a database for use with the quikr tool.
-Before running the quikr utility, you need to generate the trained matrix or
+Before running the quikr utility, you need to generate the sensing matrix or
download a pretrained matrix from our database\_download.html.
### Usage ###
diff --git a/doc/matlab.markdown b/doc/matlab.markdown
index ca701b9..1e990d9 100644
--- a/doc/matlab.markdown
+++ b/doc/matlab.markdown
@@ -1,5 +1,4 @@
# Quikr's Matlab Implementation #
-
The Quikr implementation works in Matlab and also works in Octave, but the
Octave version will run much slower
@@ -9,16 +8,15 @@ make sure that you are in the quikr's matlab directory (src/matlab/):
cd quikr/src/matlab
-
### Using Quikr with the default databse ###
This is the full path name to your data file:
fastafilename='/path/to/quikr-code/testfastafile.fa';
This will give the predicted reconstruction frequencies using the default
-training database trainset7\_112011.fa from RDP version 2.4
-Xstar will be on the same basis as trainset7\_112011.fa, so to get the sequences
-that are predicted to be present in your sample:
+training database trainset7\_112011.fa from RDP version 2.4 Xstar will be on the
+same basis as trainset7\_112011.fa, so to get the sequences that are predicted
+to be present in your sample:
xstar=quikr(fastafilename);
@@ -110,3 +108,40 @@ This cell array contains the (unsorted) names of the reconstructed sequences and
their concentrations (in the first and second columns respectively)
namesandproportions={namescell{:}; proportionscell{:}};
+
+### Using Multifasta2otu.m ###
+
+Usage tips:
+* Please name fasta files of sample reads with <sample id>.fa<*> and place them
+ into one directory without any other f ile in that directory (for example, no
+ hidden files that the operating system may generate, are allowed in that
+ direct ory)
+* Note: When making your QIIME Metadata file, the sample id's must match the
+ fasta file prefix names
+* Fasta files of reads must have a suffix that starts with .fa (e.g.: .fasta and
+ .fa are valid while .fna is NOT)
+* Modify the top of the Matlab/Octave scripts for <input_directory>,
+ <output_directory>, <output_filename>, and <train ing_database_filename>
+
+To use with QIIME, one must run the QIIME conversion tool on our OTU table
+output:
+
+ convert_biom.py -i <quikr_otu_table.txt> -o <quikr_otu>.biom
+ --biom_table_type="otu table"
+
+
+4-step QIIME procedure after using Quikr to obtain 3D PCoA graphs:
+(Note: Our code works much better with WEIGHTED Unifrac as opposed to
+Unweighted.)
+
+Pre-requisites:
+1. <quikr_otu_table.txt>
+2. the tree of the database sequences that were used (e.g. dp7\_mafft.fasttree,
+ gg\_94\_otus\_4feb2011.tre, etc.)
+3. your-defined <qiime_metadata_file.txt>
+
+The QIIME procedue:
+ convert_biom.py -i <quikr_otu_table.txt> -o <quikr_otu>.biom --biom_table_type="otu table"
+ beta_diversity.py -i <quikr_otu>.biom -m weighted_unifrac -o beta_div -t <tree file> (example: rdp7_mafft.fasttree)>
+ principal_coordinates.py -i beta_div/weighted_unifrac_<quikr_otu>.txt -o <quikr_otu_project_name>_weighted.txt
+ make_3d_plots.py -i <quikr_otu_project_name>_weighted.txt -o <3d_pcoa_plotdirectory> -m <qiime_metadata_file>
diff --git a/src/matlab/multifasta2otu/README b/src/matlab/multifasta2otu/README
new file mode 100644
index 0000000..40031e6
--- /dev/null
+++ b/src/matlab/multifasta2otu/README
@@ -0,0 +1,22 @@
+* Please name fasta files of sample reads with <sample id>.fa<*> and place them into one directory without any other file in that directory (for example, no hidden files that the operating system may generate, are allowed in that directory)
+* Note: When making your QIIME Metadata file, the sample id's must match the fasta file prefix names
+* Fasta files of reads must have a suffix that starts with .fa (e.g.: .fasta and .fa are valid while .fna is NOT)
+* Modify the top of the Matlab/Octave scripts for <input_directory>, <output_directory>, <output_filename>, and <training_database_filename>
+
+To use with QIIME, one must run the QIIME conversion tool on our OTU table output:
+convert_biom.py -i <quikr_otu_table.txt> -o <quikr_otu>.biom --biom_table_type="otu table"
+
+---------------------------
+
+4-step QIIME procedure after using Quikr to obtain 3D PCoA graphs:
+(Note: Our code works much better with WEIGHTED Unifrac as opposed to
+Unweighted.)
+
+Pre-requisites: 1) <quikr_otu_table.txt>, 2) the tree of the database sequences that were used (e.g.
+rdp7_mafft.fasttree, gg_94_otus_4feb2011.tre, etc.), and 3) your-defined <qiime_metadata_file.txt>
+
+1. convert_biom.py -i <quikr_otu_table.txt> -o <quikr_otu>.biom --biom_table_type="otu table"
+2. beta_diversity.py -i <quikr_otu>.biom -m weighted_unifrac -o beta_div -t <tree file (example: rdp7_mafft.fasttree)>
+3. principal_coordinates.py -i beta_div/weighted_unifrac_<quikr_otu>.txt -o <quikr_otu_project_name>_weighted.txt
+4. make_3d_plots.py -i <quikr_otu_project_name>_weighted.txt -o <3d_pcoa_plotdirectory> -m <qiime_metadata_file>
+
diff --git a/src/matlab/multifasta2otu/multi_gg_final.m b/src/matlab/multifasta2otu/multi_gg_final.m
new file mode 100644
index 0000000..9b66274
--- /dev/null
+++ b/src/matlab/multifasta2otu/multi_gg_final.m
@@ -0,0 +1,210 @@
+%This is an example of how to run Multifasta Quikr with a custom
+%training database (in this case Greengenes OTU's within 94% identity)
+
+%make sure Matlab/Octave is in your path
+%cd /path/to/Quikr
+
+%User-defined variables
+input_directory='../separated_samples'; %path to input directory of samples
+output_directory='quikr_results'; %path to where want output files to go
+otu_table_name='gg1194_otu_octave.txt'; %name of output otu_table filename
+trainingdatabasefilename='gg_94_otus_4feb2011.fasta'; %full path to the FASTA file you wish to use as a training database
+
+
+mkdir([output_directory]);
+thedirs=dir([input_directory]);
+thetime=zeros(numel(thedirs)-1,1);
+names={};
+
+if(exist('OCTAVE_VERSION')) %check to see if running Octave or Matlab
+
+%This is Octave Version
+
+tic();
+k=6; %pick a k-mer size
+trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database
+disp('Training time:')
+[headers,~]=fastaread(trainingdatabasefilename); %read in the training database
+lambda=10000;
+training_time=toc()
+
+species=struct();
+keys={};
+
+tic();
+
+
+i=0;
+%for numdirs=3:5
+for numdirs=3:numel(thedirs)
+i=i+1;
+disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)])
+fastafilename=[input_directory '/' thedirs(numdirs).name];
+[loadfasta,~]=fastaread(fastafilename);
+numreads=numel(loadfasta);
+xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda);
+
+nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
+proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
+namescell=headers(nonzeroentries); %Get the names of the sequences
+namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
+
+[a cols]=size(namesandproportions);
+amount=zeros(cols,1);
+for j=1:cols
+ names{j}=['s' namesandproportions{1,j}];
+ amount(j)=namesandproportions{2,j};
+ if strcmp(keys,names{j})
+ temp=species.(names{j});
+ temp(i)=round(amount(j).*numreads);
+ species.(names{j})=temp;
+ else
+ temp=zeros(numel(thedirs)-3+1,1);
+ temp(i)=round(amount(j).*numreads);
+ species.(names{j})=temp;
+ keys{end+1}=names{j};
+ end
+end
+
+thefa=strfind(thedirs(numdirs).name,'.fa');
+
+if ~isempty(thedirs(numdirs).name(1:thefa-1))
+ sampleid{i}=thedirs(numdirs).name(1:thefa-1);
+else
+ sampleid{i}='empty_sampleid';
+end
+
+thetime(i+1)=toc();
+thetime(i+1)
+
+end
+
+disp('Total time to compute Quikr:')
+toc()
+disp('Quickr Average time per file:')
+mean(diff(thetime(1:i+1)))
+
+tic()
+numits=i;
+
+fid=fopen([output_directory '/' otu_table_name],'w');
+fprintf(fid,'# QIIME vGail OTU table\n');
+fprintf(fid,'#OTU_ID\t');
+for i=1:numits
+if i<numits
+fprintf(fid,'%s\t',sampleid{i});
+else
+fprintf(fid,'%s',sampleid{i});
+end
+end
+fprintf(fid,'\n');
+
+for k=1:numel(keys)
+ fprintf(fid,'%s',keys{k}(2:end))
+ temp(:,k)=species.(keys{k});
+ for i=1:numits
+ fprintf(fid,'\t%d',temp(i,k));
+ end
+fprintf(fid,'\n');
+end
+fclose(fid);
+
+disp('Time to output OTU Table:')
+toc()
+
+else
+
+%This is Matlab Version
+
+tic()
+k=6; %pick a k-mer size
+trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database
+'Training time:'
+[headers,~]=fastaread(trainingdatabasefilename); %read in the training database
+lambda=10000;
+training_time=toc()
+
+species=containers.Map;
+
+tic()
+
+
+i=0;
+%for numdirs=3:5
+for numdirs=3:numel(thedirs)
+i=i+1;
+[num2str(i) ' out of ' num2str(numel(thedirs)-2)]
+fastafilename=[input_directory '/' thedirs(numdirs).name];
+[loadfasta,~]=fastaread(fastafilename);
+numreads=numel(loadfasta);
+xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda);
+
+nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
+proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
+namescell=headers(nonzeroentries); %Get the names of the sequences
+namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
+
+[a cols]=size(namesandproportions);
+amount=zeros(cols,1);
+for j=1:cols
+ names{j}=namesandproportions{1,j};
+ amount(j)=namesandproportions{2,j};
+ if isKey(species,names{j})
+ temp=species(names{j});
+ temp(i)=round(amount(j).*numreads);
+ species(names{j})=temp;
+ else
+ temp=zeros(numel(thedirs)-3+1,1);
+ temp(i)=round(amount(j).*numreads);
+ species(names{j})=temp;
+ end
+end
+
+thefa=strfind(thedirs(numdirs).name,'.fa');
+
+if ~isempty(thedirs(numdirs).name(1:thefa-1))
+ sampleid{i}=thedirs(numdirs).name(1:thefa-1);
+else
+ sampleid{i}='empty_sampleid';
+end
+
+thetime(i+1)=toc();
+thetime(i+1)
+
+end
+
+'Total time to compute Quikr:'
+toc()
+'Quickr Average time per file:'
+mean(diff(thetime(1:i+1)))
+
+tic
+numits=i;
+
+fid=fopen([output_directory '/' otu_table_name],'w');
+fprintf(fid,'# QIIME vGail OTU table\n');
+fprintf(fid,'#OTU_ID\t');
+for i=1:numits
+if i<numits
+fprintf(fid,'%s\t',sampleid{i});
+else
+fprintf(fid,'%s',sampleid{i});
+end
+end
+fprintf(fid,'\n');
+
+thekeys=species.keys;
+for k=1:species.Count
+ fprintf(fid,'%s',thekeys{k})
+ temp(:,k)=species(thekeys{k});
+ for i=1:numits
+ fprintf(fid,'\t%d',temp(i,k));
+ end
+fprintf(fid,'\n');
+end
+fclose(fid);
+
+'Time to output OTU Table:'
+toc
+
+end