From dd20b8e4efb4a6c5090d76459f3fdb0885367477 Mon Sep 17 00:00:00 2001 From: Calvin Date: Fri, 3 May 2013 16:56:11 -0400 Subject: updated documentation , added multi_gg_final.m --- doc/cli.markdown | 2 +- doc/matlab.markdown | 45 ++++++- src/matlab/multifasta2otu/README | 22 +++ src/matlab/multifasta2otu/multi_gg_final.m | 210 +++++++++++++++++++++++++++++ 4 files changed, 273 insertions(+), 6 deletions(-) create mode 100644 src/matlab/multifasta2otu/README create mode 100644 src/matlab/multifasta2otu/multi_gg_final.m diff --git a/doc/cli.markdown b/doc/cli.markdown index 87df41b..d4a1bb9 100644 --- a/doc/cli.markdown +++ b/doc/cli.markdown @@ -8,7 +8,7 @@ module. ## Quikr\_train ## The quikr\_train is a tool to train a database for use with the quikr tool. -Before running the quikr utility, you need to generate the trained matrix or +Before running the quikr utility, you need to generate the sensing matrix or download a pretrained matrix from our database\_download.html. ### Usage ### diff --git a/doc/matlab.markdown b/doc/matlab.markdown index ca701b9..1e990d9 100644 --- a/doc/matlab.markdown +++ b/doc/matlab.markdown @@ -1,5 +1,4 @@ # Quikr's Matlab Implementation # - The Quikr implementation works in Matlab and also works in Octave, but the Octave version will run much slower @@ -9,16 +8,15 @@ make sure that you are in the quikr's matlab directory (src/matlab/): cd quikr/src/matlab - ### Using Quikr with the default databse ### This is the full path name to your data file: fastafilename='/path/to/quikr-code/testfastafile.fa'; This will give the predicted reconstruction frequencies using the default -training database trainset7\_112011.fa from RDP version 2.4 -Xstar will be on the same basis as trainset7\_112011.fa, so to get the sequences -that are predicted to be present in your sample: +training database trainset7\_112011.fa from RDP version 2.4 Xstar will be on the +same basis as trainset7\_112011.fa, so to get the sequences that are predicted +to be present in your sample: xstar=quikr(fastafilename); @@ -110,3 +108,40 @@ This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively) namesandproportions={namescell{:}; proportionscell{:}}; + +### Using Multifasta2otu.m ### + +Usage tips: +* Please name fasta files of sample reads with .fa<*> and place them + into one directory without any other f ile in that directory (for example, no + hidden files that the operating system may generate, are allowed in that + direct ory) +* Note: When making your QIIME Metadata file, the sample id's must match the + fasta file prefix names +* Fasta files of reads must have a suffix that starts with .fa (e.g.: .fasta and + .fa are valid while .fna is NOT) +* Modify the top of the Matlab/Octave scripts for , + , , and + +To use with QIIME, one must run the QIIME conversion tool on our OTU table +output: + + convert_biom.py -i -o .biom + --biom_table_type="otu table" + + +4-step QIIME procedure after using Quikr to obtain 3D PCoA graphs: +(Note: Our code works much better with WEIGHTED Unifrac as opposed to +Unweighted.) + +Pre-requisites: +1. +2. the tree of the database sequences that were used (e.g. dp7\_mafft.fasttree, + gg\_94\_otus\_4feb2011.tre, etc.) +3. your-defined + +The QIIME procedue: + convert_biom.py -i -o .biom --biom_table_type="otu table" + beta_diversity.py -i .biom -m weighted_unifrac -o beta_div -t (example: rdp7_mafft.fasttree)> + principal_coordinates.py -i beta_div/weighted_unifrac_.txt -o _weighted.txt + make_3d_plots.py -i _weighted.txt -o <3d_pcoa_plotdirectory> -m diff --git a/src/matlab/multifasta2otu/README b/src/matlab/multifasta2otu/README new file mode 100644 index 0000000..40031e6 --- /dev/null +++ b/src/matlab/multifasta2otu/README @@ -0,0 +1,22 @@ +* Please name fasta files of sample reads with .fa<*> and place them into one directory without any other file in that directory (for example, no hidden files that the operating system may generate, are allowed in that directory) +* Note: When making your QIIME Metadata file, the sample id's must match the fasta file prefix names +* Fasta files of reads must have a suffix that starts with .fa (e.g.: .fasta and .fa are valid while .fna is NOT) +* Modify the top of the Matlab/Octave scripts for , , , and + +To use with QIIME, one must run the QIIME conversion tool on our OTU table output: +convert_biom.py -i -o .biom --biom_table_type="otu table" + +--------------------------- + +4-step QIIME procedure after using Quikr to obtain 3D PCoA graphs: +(Note: Our code works much better with WEIGHTED Unifrac as opposed to +Unweighted.) + +Pre-requisites: 1) , 2) the tree of the database sequences that were used (e.g. +rdp7_mafft.fasttree, gg_94_otus_4feb2011.tre, etc.), and 3) your-defined + +1. convert_biom.py -i -o .biom --biom_table_type="otu table" +2. beta_diversity.py -i .biom -m weighted_unifrac -o beta_div -t +3. principal_coordinates.py -i beta_div/weighted_unifrac_.txt -o _weighted.txt +4. make_3d_plots.py -i _weighted.txt -o <3d_pcoa_plotdirectory> -m + diff --git a/src/matlab/multifasta2otu/multi_gg_final.m b/src/matlab/multifasta2otu/multi_gg_final.m new file mode 100644 index 0000000..9b66274 --- /dev/null +++ b/src/matlab/multifasta2otu/multi_gg_final.m @@ -0,0 +1,210 @@ +%This is an example of how to run Multifasta Quikr with a custom +%training database (in this case Greengenes OTU's within 94% identity) + +%make sure Matlab/Octave is in your path +%cd /path/to/Quikr + +%User-defined variables +input_directory='../separated_samples'; %path to input directory of samples +output_directory='quikr_results'; %path to where want output files to go +otu_table_name='gg1194_otu_octave.txt'; %name of output otu_table filename +trainingdatabasefilename='gg_94_otus_4feb2011.fasta'; %full path to the FASTA file you wish to use as a training database + + +mkdir([output_directory]); +thedirs=dir([input_directory]); +thetime=zeros(numel(thedirs)-1,1); +names={}; + +if(exist('OCTAVE_VERSION')) %check to see if running Octave or Matlab + +%This is Octave Version + +tic(); +k=6; %pick a k-mer size +trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database +disp('Training time:') +[headers,~]=fastaread(trainingdatabasefilename); %read in the training database +lambda=10000; +training_time=toc() + +species=struct(); +keys={}; + +tic(); + + +i=0; +%for numdirs=3:5 +for numdirs=3:numel(thedirs) +i=i+1; +disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)]) +fastafilename=[input_directory '/' thedirs(numdirs).name]; +[loadfasta,~]=fastaread(fastafilename); +numreads=numel(loadfasta); +xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda); + +nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample +proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array +namescell=headers(nonzeroentries); %Get the names of the sequences +namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively) + +[a cols]=size(namesandproportions); +amount=zeros(cols,1); +for j=1:cols + names{j}=['s' namesandproportions{1,j}]; + amount(j)=namesandproportions{2,j}; + if strcmp(keys,names{j}) + temp=species.(names{j}); + temp(i)=round(amount(j).*numreads); + species.(names{j})=temp; + else + temp=zeros(numel(thedirs)-3+1,1); + temp(i)=round(amount(j).*numreads); + species.(names{j})=temp; + keys{end+1}=names{j}; + end +end + +thefa=strfind(thedirs(numdirs).name,'.fa'); + +if ~isempty(thedirs(numdirs).name(1:thefa-1)) + sampleid{i}=thedirs(numdirs).name(1:thefa-1); +else + sampleid{i}='empty_sampleid'; +end + +thetime(i+1)=toc(); +thetime(i+1) + +end + +disp('Total time to compute Quikr:') +toc() +disp('Quickr Average time per file:') +mean(diff(thetime(1:i+1))) + +tic() +numits=i; + +fid=fopen([output_directory '/' otu_table_name],'w'); +fprintf(fid,'# QIIME vGail OTU table\n'); +fprintf(fid,'#OTU_ID\t'); +for i=1:numits +if i