From 60ee644d6f1add6722eb8444385f203c6110975a Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Wed, 12 Jun 2013 09:10:25 -0400 Subject: Matlab: move multifasta2otu into the main quikr folder --- src/matlab/README_for_multifasta2otu | 27 +++ src/matlab/multifasta2otu/README | 27 --- .../multifasta2otu/multifasta2otutable_gg94_2011.m | 219 --------------------- .../multifasta2otu/multifasta2otutable_rdp7.m | 212 -------------------- src/matlab/multifasta2otutable_gg94_2011.m | 219 +++++++++++++++++++++ src/matlab/multifasta2otutable_rdp7.m | 212 ++++++++++++++++++++ 6 files changed, 458 insertions(+), 458 deletions(-) create mode 100644 src/matlab/README_for_multifasta2otu delete mode 100644 src/matlab/multifasta2otu/README delete mode 100644 src/matlab/multifasta2otu/multifasta2otutable_gg94_2011.m delete mode 100644 src/matlab/multifasta2otu/multifasta2otutable_rdp7.m create mode 100644 src/matlab/multifasta2otutable_gg94_2011.m create mode 100644 src/matlab/multifasta2otutable_rdp7.m (limited to 'src/matlab') diff --git a/src/matlab/README_for_multifasta2otu b/src/matlab/README_for_multifasta2otu new file mode 100644 index 0000000..05a26c6 --- /dev/null +++ b/src/matlab/README_for_multifasta2otu @@ -0,0 +1,27 @@ +* Quikr multifasta->otu_table_(for_qiime_use) wrapper code written by Gail Rosen -- 2/1/2013 + +Usage tips: +* Please name fasta files of sample reads with .fa<*> and place them into one directory without any other file in that directory (for example, no hidden files that the operating system may generate, are allowed in that directory) +* Note: When making your QIIME Metadata file, the sample id's must match the fasta file prefix names +* Fasta files of reads must have a suffix that starts with .fa (e.g.: .fasta and .fa are valid while .fna is NOT) +* Modify the top of the Matlab/Octave scripts for , , , and + +To use with QIIME, one must run the QIIME conversion tool on our OTU table output: +convert_biom.py -i -o .biom --biom_table_type="otu table" + +--------------------------- + +4-step QIIME procedure after using Quikr to obtain 3D PCoA graphs: +(Note: Our code works much better with WEIGHTED Unifrac as opposed to +Unweighted.) + +Pre-requisites: +1. +2. the tree of the database sequences that were used (e.g. rdp7_mafft.fasttree, gg_94_otus_4feb2011.tre. they are in the data directory) +3. your-defined + +1. convert_biom.py -i -o .biom --biom_table_type="otu table" +2. beta_diversity.py -i .biom -m weighted_unifrac -o beta_div -t +3. principal_coordinates.py -i beta_div/weighted_unifrac_.txt -o _weighted.txt +4. make_3d_plots.py -i _weighted.txt -o <3d_pcoa_plotdirectory> -m + diff --git a/src/matlab/multifasta2otu/README b/src/matlab/multifasta2otu/README deleted file mode 100644 index 05a26c6..0000000 --- a/src/matlab/multifasta2otu/README +++ /dev/null @@ -1,27 +0,0 @@ -* Quikr multifasta->otu_table_(for_qiime_use) wrapper code written by Gail Rosen -- 2/1/2013 - -Usage tips: -* Please name fasta files of sample reads with .fa<*> and place them into one directory without any other file in that directory (for example, no hidden files that the operating system may generate, are allowed in that directory) -* Note: When making your QIIME Metadata file, the sample id's must match the fasta file prefix names -* Fasta files of reads must have a suffix that starts with .fa (e.g.: .fasta and .fa are valid while .fna is NOT) -* Modify the top of the Matlab/Octave scripts for , , , and - -To use with QIIME, one must run the QIIME conversion tool on our OTU table output: -convert_biom.py -i -o .biom --biom_table_type="otu table" - ---------------------------- - -4-step QIIME procedure after using Quikr to obtain 3D PCoA graphs: -(Note: Our code works much better with WEIGHTED Unifrac as opposed to -Unweighted.) - -Pre-requisites: -1. -2. the tree of the database sequences that were used (e.g. rdp7_mafft.fasttree, gg_94_otus_4feb2011.tre. they are in the data directory) -3. your-defined - -1. convert_biom.py -i -o .biom --biom_table_type="otu table" -2. beta_diversity.py -i .biom -m weighted_unifrac -o beta_div -t -3. principal_coordinates.py -i beta_div/weighted_unifrac_.txt -o _weighted.txt -4. make_3d_plots.py -i _weighted.txt -o <3d_pcoa_plotdirectory> -m - diff --git a/src/matlab/multifasta2otu/multifasta2otutable_gg94_2011.m b/src/matlab/multifasta2otu/multifasta2otutable_gg94_2011.m deleted file mode 100644 index 9078487..0000000 --- a/src/matlab/multifasta2otu/multifasta2otutable_gg94_2011.m +++ /dev/null @@ -1,219 +0,0 @@ -% Quikr multifasta->otu_table_(for_qiime_use) wrapper code written by Gail Rosen -- 2/1/2013 -%This is an example of how to run Multifasta Quikr with a custom -%training database (in this case Greengenes OTU's within 94% identity) - -%make sure Matlab/Octave is in your path -%cd /path/to/Quikr - -%User-defined variables -input_directory='../separated_samples'; %path to input directory of samples -output_directory='quikr_results'; %path to where want output files to go -otu_table_name='gg1194_otu_octave.txt'; %name of output otu_table filename -trainingdatabasefilename='../../../data/gg_94_otus_4feb2011.fasta'; %full path to the FASTA file you wish to use as a training database - - -mkdir([output_directory]); -thedirs=dir([input_directory]); -thetime=zeros(numel(thedirs)-1,1); -names={}; - -if(exist('OCTAVE_VERSION')) %check to see if running Octave or Matlab - -%This is Octave Version - -tic(); -k=6; %pick a k-mer size -trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database -disp('Training time:') -[headers,~]=fastaread(trainingdatabasefilename); %read in the training database -lambda=10000; -training_time=toc() - -species=struct(); -keys={}; - -tic(); - - -i=0; -%for numdirs=3:5 -for numdirs=3:numel(thedirs) -i=i+1; -disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)]) -fastafilename=[input_directory '/' thedirs(numdirs).name]; -[loadfasta,~]=fastaread(fastafilename); -numreads=numel(loadfasta); -xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda); - -nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample -proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array -namescell=headers(nonzeroentries); %Get the names of the sequences -namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively) - -[a cols]=size(namesandproportions); -amount=zeros(cols,1); -for j=1:cols - names{j}=['s' namesandproportions{1,j}]; - amount(j)=namesandproportions{2,j}; - if strcmp(keys,names{j}) - temp=species.(names{j}); - temp(i)=round(amount(j).*numreads); - species.(names{j})=temp; - else - temp=zeros(numel(thedirs)-3+1,1); - temp(i)=round(amount(j).*numreads); - if temp(i)==0 - %insignificant counts, do nothing - else - species.(names{j})=temp; - keys{end+1}=names{j}; - end - end -end - -thefa=strfind(thedirs(numdirs).name,'.fa'); - -if ~isempty(thedirs(numdirs).name(1:thefa-1)) - sampleid{i}=thedirs(numdirs).name(1:thefa-1); -else - sampleid{i}='empty_sampleid'; -end - -thetime(i+1)=toc(); -thetime(i+1) - -end - -disp('Total time to compute Quikr:') -toc() -disp('Quikr Average time per file:') -mean(diff(thetime(1:i+1))) - -tic() -numits=i; - -fid=fopen([output_directory '/' otu_table_name],'w'); -fprintf(fid,'# QIIME vGail OTU table\n'); -fprintf(fid,'#OTU_ID\t'); -for i=1:numits -if iotu_table_(for_qiime_use) wrapper code written by Gail Rosen -- 2/1/2013 -%This is an example of how to run Quikr on the default RDP_7 training set - -%make sure Matlab/Octave is in your path -%cd /path/to/Quikr - -%User-defined variables -input_directory='../../separated_samples'; %path to input directory of samples -output_directory='quikr_results'; %path to where want output files to go -otu_table_name='rdp_otu_table.txt'; %name of output otu_table filename -%Do not have to define trainingdatabase file here -[headers,~]=fastaread('../../../data/trainset7_112011.fa'); %read in the training database - -mkdir([output_directory]) -thedirs=dir([input_directory]); -thetime=zeros(numel(thedirs)-1,1); -names={}; - - -if(exist('OCTAVE_VERSION')) %check to see if running Octave or Matlab - -%This is Octave Version - - -species=struct(); -keys={}; - -tic() - -i=0; -%for numdirs=3:5 -for numdirs=3:numel(thedirs) -i=i+1; -disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)]) -fastafilename=[input_directory '/' thedirs(numdirs).name]; -[loadfasta,~]=fastaread(fastafilename); -numreads=numel(loadfasta); -xstar=quikr(fastafilename); %this will give the predicted reconstruction frequencies - -nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample -proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array -namescell=headers(nonzeroentries); %Get the names of the sequences -namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively) - -[a cols]=size(namesandproportions); -amount=zeros(cols,1); -for j=1:cols - names{j}=namesandproportions{1,j}(1:strfind(namesandproportions{1,j},' ')-1); - names{j}=strrep(names{j},'|','_'); - amount(j)=namesandproportions{2,j}; -if strcmp(keys,names{j}) - temp=species.(names{j}); - temp(i)=round(amount(j).*numreads); - species.(names{j})=temp; - else - temp=zeros(numel(thedirs)-3+1,1); - temp(i)=round(amount(j).*numreads); - if temp(i)==0 - % do not make a key -- has insignificant counts - else - species.(names{j})=temp; - keys{end+1}=names{j}; - end - end -end - -thefa=strfind(thedirs(numdirs).name,'.fa'); - -if ~isempty(thedirs(numdirs).name(1:thefa-1)) - sampleid{i}=thedirs(numdirs).name(1:thefa-1); -else - sampleid{i}='empty_sampleid'; -end -thetime(i+1)=toc(); -thetime(i+1) - -end - -disp('Total time to compute Quikr:') -toc() -disp('Quikr Average time per file:') -mean(diff(thetime(1:i+1))) - -tic(); -numits=i; - - -fid=fopen([output_directory '/' otu_table_name],'w'); -fprintf(fid,'# QIIME vGail OTU table\n'); -fprintf(fid,'#OTU_ID\t'); -for i=1:numits -if iotu_table_(for_qiime_use) wrapper code written by Gail Rosen -- 2/1/2013 +%This is an example of how to run Multifasta Quikr with a custom +%training database (in this case Greengenes OTU's within 94% identity) + +%make sure Matlab/Octave is in your path +%cd /path/to/Quikr + +%User-defined variables +input_directory='../separated_samples'; %path to input directory of samples +output_directory='quikr_results'; %path to where want output files to go +otu_table_name='gg1194_otu_octave.txt'; %name of output otu_table filename +trainingdatabasefilename='../../../data/gg_94_otus_4feb2011.fasta'; %full path to the FASTA file you wish to use as a training database + + +mkdir([output_directory]); +thedirs=dir([input_directory]); +thetime=zeros(numel(thedirs)-1,1); +names={}; + +if(exist('OCTAVE_VERSION')) %check to see if running Octave or Matlab + +%This is Octave Version + +tic(); +k=6; %pick a k-mer size +trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database +disp('Training time:') +[headers,~]=fastaread(trainingdatabasefilename); %read in the training database +lambda=10000; +training_time=toc() + +species=struct(); +keys={}; + +tic(); + + +i=0; +%for numdirs=3:5 +for numdirs=3:numel(thedirs) +i=i+1; +disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)]) +fastafilename=[input_directory '/' thedirs(numdirs).name]; +[loadfasta,~]=fastaread(fastafilename); +numreads=numel(loadfasta); +xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda); + +nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample +proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array +namescell=headers(nonzeroentries); %Get the names of the sequences +namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively) + +[a cols]=size(namesandproportions); +amount=zeros(cols,1); +for j=1:cols + names{j}=['s' namesandproportions{1,j}]; + amount(j)=namesandproportions{2,j}; + if strcmp(keys,names{j}) + temp=species.(names{j}); + temp(i)=round(amount(j).*numreads); + species.(names{j})=temp; + else + temp=zeros(numel(thedirs)-3+1,1); + temp(i)=round(amount(j).*numreads); + if temp(i)==0 + %insignificant counts, do nothing + else + species.(names{j})=temp; + keys{end+1}=names{j}; + end + end +end + +thefa=strfind(thedirs(numdirs).name,'.fa'); + +if ~isempty(thedirs(numdirs).name(1:thefa-1)) + sampleid{i}=thedirs(numdirs).name(1:thefa-1); +else + sampleid{i}='empty_sampleid'; +end + +thetime(i+1)=toc(); +thetime(i+1) + +end + +disp('Total time to compute Quikr:') +toc() +disp('Quikr Average time per file:') +mean(diff(thetime(1:i+1))) + +tic() +numits=i; + +fid=fopen([output_directory '/' otu_table_name],'w'); +fprintf(fid,'# QIIME vGail OTU table\n'); +fprintf(fid,'#OTU_ID\t'); +for i=1:numits +if iotu_table_(for_qiime_use) wrapper code written by Gail Rosen -- 2/1/2013 +%This is an example of how to run Quikr on the default RDP_7 training set + +%make sure Matlab/Octave is in your path +%cd /path/to/Quikr + +%User-defined variables +input_directory='../../separated_samples'; %path to input directory of samples +output_directory='quikr_results'; %path to where want output files to go +otu_table_name='rdp_otu_table.txt'; %name of output otu_table filename +%Do not have to define trainingdatabase file here +[headers,~]=fastaread('../../../data/trainset7_112011.fa'); %read in the training database + +mkdir([output_directory]) +thedirs=dir([input_directory]); +thetime=zeros(numel(thedirs)-1,1); +names={}; + + +if(exist('OCTAVE_VERSION')) %check to see if running Octave or Matlab + +%This is Octave Version + + +species=struct(); +keys={}; + +tic() + +i=0; +%for numdirs=3:5 +for numdirs=3:numel(thedirs) +i=i+1; +disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)]) +fastafilename=[input_directory '/' thedirs(numdirs).name]; +[loadfasta,~]=fastaread(fastafilename); +numreads=numel(loadfasta); +xstar=quikr(fastafilename); %this will give the predicted reconstruction frequencies + +nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample +proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array +namescell=headers(nonzeroentries); %Get the names of the sequences +namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively) + +[a cols]=size(namesandproportions); +amount=zeros(cols,1); +for j=1:cols + names{j}=namesandproportions{1,j}(1:strfind(namesandproportions{1,j},' ')-1); + names{j}=strrep(names{j},'|','_'); + amount(j)=namesandproportions{2,j}; +if strcmp(keys,names{j}) + temp=species.(names{j}); + temp(i)=round(amount(j).*numreads); + species.(names{j})=temp; + else + temp=zeros(numel(thedirs)-3+1,1); + temp(i)=round(amount(j).*numreads); + if temp(i)==0 + % do not make a key -- has insignificant counts + else + species.(names{j})=temp; + keys{end+1}=names{j}; + end + end +end + +thefa=strfind(thedirs(numdirs).name,'.fa'); + +if ~isempty(thedirs(numdirs).name(1:thefa-1)) + sampleid{i}=thedirs(numdirs).name(1:thefa-1); +else + sampleid{i}='empty_sampleid'; +end +thetime(i+1)=toc(); +thetime(i+1) + +end + +disp('Total time to compute Quikr:') +toc() +disp('Quikr Average time per file:') +mean(diff(thetime(1:i+1))) + +tic(); +numits=i; + + +fid=fopen([output_directory '/' otu_table_name],'w'); +fprintf(fid,'# QIIME vGail OTU table\n'); +fprintf(fid,'#OTU_ID\t'); +for i=1:numits +if i