diff options
| author | Calvin <calvin@EESI> | 2013-05-03 16:33:46 -0400 | 
|---|---|---|
| committer | Calvin <calvin@EESI> | 2013-05-03 16:33:46 -0400 | 
| commit | 0594196bd9f9980e78d5560f6b16a756ad462e79 (patch) | |
| tree | 2535da60b3453fd6f92c3eb855d7fde6bd5903dc /src/matlab | |
| parent | 4dd2e6e71e95fb9f673f7a6808c14de2fcc2a7e8 (diff) | |
added a contributor
Diffstat (limited to 'src/matlab')
| -rw-r--r-- | src/matlab/multifasta2otu/multi_rdp_final.m | 207 | ||||
| -rw-r--r-- | src/matlab/multifasta2otu/multifasta2otutable_gg1194.m | 109 | ||||
| -rw-r--r-- | src/matlab/multifasta2otu/multifasta2otutable_gg1194_octave.m | 109 | ||||
| -rw-r--r-- | src/matlab/multifasta2otu/multifasta2otutable_rdp.m | 105 | ||||
| -rw-r--r-- | src/matlab/multifasta2otu/multifasta2otutable_rdp7.m | 105 | ||||
| -rw-r--r-- | src/matlab/multifasta2otu/multifasta2otutable_rdp7_octave.m | 107 | 
6 files changed, 742 insertions, 0 deletions
| diff --git a/src/matlab/multifasta2otu/multi_rdp_final.m b/src/matlab/multifasta2otu/multi_rdp_final.m new file mode 100644 index 0000000..74deeff --- /dev/null +++ b/src/matlab/multifasta2otu/multi_rdp_final.m @@ -0,0 +1,207 @@ +%This is an example of how to run Quikr on the default RDP_7 training set
 +
 +%make sure Matlab/Octave is in your path
 +%cd /path/to/Quikr
 +
 +%User-defined variables
 +input_directory='../../separated_samples'; %path to input directory of samples
 +output_directory='quikr_results'; %path to where want output files to go
 +otu_table_name='rdp_otu_table.txt'; %name of output otu_table filename
 +%Do not have to define trainingdatabase file here
 +[headers,~]=fastaread('trainset7_112011.fa'); %read in the training database
 +
 +mkdir([output_directory])
 +thedirs=dir([input_directory]);
 +thetime=zeros(numel(thedirs)-1,1);
 +names={};
 +
 +
 +if(exist('OCTAVE_VERSION')) %check to see if running Octave or Matlab
 +
 +%This is Octave Version
 +
 +
 +species=struct();
 +keys={};
 +
 +tic()
 +
 +i=0;
 +%for numdirs=3:5
 +for numdirs=3:numel(thedirs)
 +i=i+1;
 +disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)])
 +fastafilename=[input_directory '/' thedirs(numdirs).name];
 +[loadfasta,~]=fastaread(fastafilename);
 +numreads=numel(loadfasta);
 +xstar=quikr(fastafilename); %this will give the predicted reconstruction frequencies
 +
 +nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
 +proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
 +namescell=headers(nonzeroentries); %Get the names of the sequences
 +namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
 +
 +[a cols]=size(namesandproportions);
 +amount=zeros(cols,1);
 +for j=1:cols
 +  names{j}=namesandproportions{1,j}(1:strfind(namesandproportions{1,j},' ')-1);
 +  names{j}=strrep(names{j},'|','_');
 +  amount(j)=namesandproportions{2,j};
 +if strcmp(keys,names{j})
 +	temp=species.(names{j});
 +      	temp(i)=round(amount(j).*numreads);
 +      	species.(names{j})=temp;
 +  else
 +      temp=zeros(numel(thedirs)-3+1,1);
 +      temp(i)=round(amount(j).*numreads);
 +      if temp(i)==0
 +	% do not make a key -- has insignificant counts
 +      else
 +      species.(names{j})=temp;
 +      keys{end+1}=names{j};
 +      end
 +  end
 +end
 +
 +thefa=strfind(thedirs(numdirs).name,'.fa');
 +
 +if ~isempty(thedirs(numdirs).name(1:thefa-1))
 +	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
 +else
 +	sampleid{i}='empty_sampleid';
 +end
 +thetime(i+1)=toc();
 +thetime(i+1)
 +
 +end
 +
 +disp('Total time to compute Quikr:')
 +toc()
 +disp('Quickr Average time per file:')
 +mean(diff(thetime(1:i+1)))
 +
 +tic();
 +numits=i;
 +
 +
 +fid=fopen([output_directory '/' otu_table_name],'w');
 +fprintf(fid,'# QIIME vGail OTU table\n');
 +fprintf(fid,'#OTU_ID\t');
 +for i=1:numits
 +if i<numits
 +fprintf(fid,'%s\t',sampleid{i});
 +else
 +fprintf(fid,'%s',sampleid{i});
 +end
 +end
 +fprintf(fid,'\n');
 +
 +for k=1:numel(keys)
 +truncname=strrep(keys{k},'_','|');
 +  fprintf(fid,'%s',truncname);
 +
 +  temp(:,k)=species.(keys{k});
 +        for i=1:numits
 +		fprintf(fid,'\t%d',temp(i,k));
 +	end
 +fprintf(fid,'\n');
 +end
 +fclose(fid);
 +
 +disp('Time to output OTU Table:')
 +toc()
 +
 +else
 +
 +%This is the Matlab version
 +
 +
 +species=containers.Map;
 +
 +tic()
 +
 +i=0;
 +%for numdirs=3:6
 +for numdirs=3:numel(thedirs)
 +i=i+1;
 +[num2str(i) ' out of ' num2str(numel(thedirs)-2)]
 +fastafilename=[input_directory '/' thedirs(numdirs).name];
 +[loadfasta,~]=fastaread(fastafilename);
 +numreads=numel(loadfasta);
 +xstar=quikr(fastafilename); %this will give the predicted reconstruction frequencies
 +
 +nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
 +proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
 +namescell=headers(nonzeroentries); %Get the names of the sequences
 +namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
 +
 +[a cols]=size(namesandproportions);
 +amount=zeros(cols,1);
 +for j=1:cols
 +  names{j}=namesandproportions{1,j};
 +  amount(j)=namesandproportions{2,j};
 +  if isKey(species,names{j})
 +      temp=species(names{j});
 +      temp(i)=round(amount(j).*numreads);
 +      species(names{j})=temp;
 +  else
 +      temp=zeros(numel(thedirs)-3+1,1);
 +      temp(i)=round(amount(j).*numreads);
 +      species(names{j})=temp;
 +  end
 +end
 +
 +thefa=strfind(thedirs(numdirs).name,'.fa');
 +
 +if ~isempty(thedirs(numdirs).name(1:thefa-1))
 +	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
 +else
 +	sampleid{i}='empty_sampleid';
 +end
 +thetime(i+1)=toc();
 +thetime(i+1)
 +
 +end
 +
 +'Total time to compute Quikr:'
 +toc()
 +'Quickr Average time per file:'
 +mean(diff(thetime(1:i+1)))
 +
 +tic
 +numits=i;
 +
 +
 +fid=fopen([output_directory '/' otu_table_name],'w');
 +fprintf(fid,'# QIIME vGail OTU table\n');
 +fprintf(fid,'#OTU_ID\t');
 +for i=1:numits
 +if i<numits
 +fprintf(fid,'%s\t',sampleid{i});
 +else
 +fprintf(fid,'%s',sampleid{i});
 +end
 +end
 +fprintf(fid,'\n');
 +
 +thekeys=species.keys;
 +for k=1:species.Count
 +if k==1
 +  space=thekeys{k}(20);
 +end
 +delimit =strfind(thekeys{k},space);
 +truncname=thekeys{k}(1:delimit-1);
 +  fprintf(fid,'%s',truncname);
 +
 +  temp(:,k)=species(thekeys{k});
 +        for i=1:numits
 +		fprintf(fid,'\t%d',temp(i,k));
 +	end
 +fprintf(fid,'\n');
 +end
 +fclose(fid);
 +
 +'Time to output OTU Table:'
 +toc
 +
 +end
 diff --git a/src/matlab/multifasta2otu/multifasta2otutable_gg1194.m b/src/matlab/multifasta2otu/multifasta2otutable_gg1194.m new file mode 100644 index 0000000..a2872a4 --- /dev/null +++ b/src/matlab/multifasta2otu/multifasta2otutable_gg1194.m @@ -0,0 +1,109 @@ +%This is an example of how to run Multifasta Quikr with a custom 
 +%training database (in this case Greengenes OTU's within 94% identity)
 +
 +%make sure Matlab/Octave is in your path
 +%cd /path/to/Quikr
 +
 +%User-defined variables
 +input_directory='../../separated_samples'; %path to input directory of samples
 +output_directory='quikr_results'; %path to where want output files to go
 +otu_table_name='gg1194_otu_table.txt'; %name of output otu_table filename
 +trainingdatabasefilename='../gg_94_otus_4feb2011.fasta'; %full path to the FASTA file you wish to use as a training database
 +
 +
 +mkdir([output_directory])
 +thedirs=dir([input_directory]);
 +thetime=zeros(numel(thedirs)-1,1);
 +names={};
 +
 +
 +tic()
 +k=6; %pick a k-mer size
 +trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database
 +'Training time:'
 +[headers,~]=fastaread(trainingdatabasefilename); %read in the training database
 +lambda=10000; 
 +training_time=toc()
 +
 +species=containers.Map;
 +
 +tic()
 +
 +
 +i=0;
 +%for numdirs=3:5
 +for numdirs=3:numel(thedirs)
 +i=i+1;
 +[num2str(i) ' out of ' num2str(numel(thedirs)-2)]
 +fastafilename=[input_directory '/' thedirs(numdirs).name];
 +[loadfasta,~]=fastaread(fastafilename);
 +numreads=numel(loadfasta);
 +xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda);
 +
 +nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
 +proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
 +namescell=headers(nonzeroentries); %Get the names of the sequences
 +namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
 +
 +[a cols]=size(namesandproportions);
 +amount=zeros(cols,1);
 +for j=1:cols
 +  names{j}=namesandproportions{1,j};
 +  amount(j)=namesandproportions{2,j};
 +  if isKey(species,names{j})
 +	 temp=species(names{j});
 +      temp(i)=round(amount(j).*numreads);
 +      species(names{j})=temp;
 +  else
 +      temp=zeros(numel(thedirs)-3+1,1);
 +      temp(i)=round(amount(j).*numreads);
 +      species(names{j})=temp;
 +  end
 +end
 +
 +thefa=strfind(thedirs(numdirs).name,'.fa');
 +
 +if ~isempty(thedirs(numdirs).name(1:thefa-1))
 +	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
 +else
 +	sampleid{i}='empty_sampleid';
 +end
 +
 +thetime(i+1)=toc();
 +thetime(i+1)
 +
 +end
 +
 +'Total time to compute Quikr'
 +toc()
 +'Quickr Average time per file'
 +mean(diff(thetime(1:i+1)))
 +
 +tic
 +numits=i;
 +
 +fid=fopen([output_directory '/' otu_table_name],'w');
 +fprintf(fid,'# QIIME vGail OTU table\n');
 +fprintf(fid,'#OTU_ID\t');
 +for i=1:numits
 +if i<numits
 +fprintf(fid,'%s\t',sampleid{i});
 +else
 +fprintf(fid,'%s',sampleid{i});
 +end
 +end
 +fprintf(fid,'\n');
 +
 +thekeys=species.keys;
 +for k=1:species.Count
 + fprintf(fid,'%s',thekeys{k})
 + temp(:,k)=species(thekeys{k});
 +        for i=1:numits
 +                fprintf(fid,'\t%d',temp(i,k));
 +        end
 +fprintf(fid,'\n');
 +end
 +fclose(fid);
 +
 +'Time to output OTU Table'
 +toc
 diff --git a/src/matlab/multifasta2otu/multifasta2otutable_gg1194_octave.m b/src/matlab/multifasta2otu/multifasta2otutable_gg1194_octave.m new file mode 100644 index 0000000..c035d48 --- /dev/null +++ b/src/matlab/multifasta2otu/multifasta2otutable_gg1194_octave.m @@ -0,0 +1,109 @@ +%This is an example of how to run Multifasta Quikr with a custom 
 +%training database (in this case Greengenes OTU's within 94% identity)
 +
 +%make sure Matlab/Octave is in your path
 +%cd /path/to/Quikr
 +
 +%User-defined variables
 +input_directory='../separated_samples'; %path to input directory of samples
 +output_directory='quikr_results'; %path to where want output files to go
 +otu_table_name='gg1194_otu_octave.txt'; %name of output otu_table filename
 +trainingdatabasefilename='gg_94_otus_4feb2011.fasta'; %full path to the FASTA file you wish to use as a training database
 +
 +
 +mkdir([output_directory]);
 +thedirs=dir([input_directory]);
 +thetime=zeros(numel(thedirs)-1,1);
 +names={};
 +
 +tic();
 +k=6; %pick a k-mer size
 +trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database
 +disp('Training time:')
 +[headers,~]=fastaread(trainingdatabasefilename); %read in the training database
 +lambda=10000; 
 +training_time=toc()
 +
 +species=struct();
 +keys={};
 +
 +tic();
 +
 +
 +i=0;
 +%for numdirs=3:5
 +for numdirs=3:numel(thedirs)
 +i=i+1;
 +disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)])
 +fastafilename=[input_directory '/' thedirs(numdirs).name];
 +[loadfasta,~]=fastaread(fastafilename);
 +numreads=numel(loadfasta);
 +xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda);
 +
 +nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
 +proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
 +namescell=headers(nonzeroentries); %Get the names of the sequences
 +namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
 +
 +[a cols]=size(namesandproportions);
 +amount=zeros(cols,1);
 +for j=1:cols
 +  names{j}=['s' namesandproportions{1,j}];
 +  amount(j)=namesandproportions{2,j};
 +  if strcmp(keys,names{j})
 +	temp=species.(names{j});
 +      	temp(i)=round(amount(j).*numreads);
 +      	species.(names{j})=temp;
 +  else
 +      temp=zeros(numel(thedirs)-3+1,1);
 +      temp(i)=round(amount(j).*numreads);
 +      species.(names{j})=temp;
 +      keys{end+1}=names{j};
 +  end
 +end
 +
 +thefa=strfind(thedirs(numdirs).name,'.fa');
 +
 +if ~isempty(thedirs(numdirs).name(1:thefa-1))
 +	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
 +else
 +	sampleid{i}='empty_sampleid';
 +end
 +
 +thetime(i+1)=toc();
 +thetime(i+1)
 +
 +end
 +
 +disp('Total time to compute Quikr:')
 +toc()
 +disp('Quickr Average time per file:')
 +mean(diff(thetime(1:i+1)))
 +
 +tic()
 +numits=i;
 +
 +fid=fopen([output_directory '/' otu_table_name],'w');
 +fprintf(fid,'# QIIME vGail OTU table\n');
 +fprintf(fid,'#OTU_ID\t');
 +for i=1:numits
 +if i<numits
 +fprintf(fid,'%s\t',sampleid{i});
 +else
 +fprintf(fid,'%s',sampleid{i});
 +end
 +end
 +fprintf(fid,'\n');
 +
 +for k=1:numel(keys)
 + fprintf(fid,'%s',keys{k}(2:end))
 + temp(:,k)=species.(keys{k});
 +        for i=1:numits
 +                fprintf(fid,'\t%d',temp(i,k));
 +        end
 +fprintf(fid,'\n');
 +end
 +fclose(fid);
 +
 +disp('Time to output OTU Table')
 +toc()
 diff --git a/src/matlab/multifasta2otu/multifasta2otutable_rdp.m b/src/matlab/multifasta2otu/multifasta2otutable_rdp.m new file mode 100644 index 0000000..f3225f7 --- /dev/null +++ b/src/matlab/multifasta2otu/multifasta2otutable_rdp.m @@ -0,0 +1,105 @@ +%This is an example of how to run Quikr on the default RDP_7 training set, trainset7_112011.fa
 +
 +%make sure Matlab/Octave is in your path
 +%cd /path/to/Quikr
 +
 +%User-defined variables
 +input_directory='../../separated_samples'; %path to input directory of samples
 +output_directory='quikr_results_32'; %path to where want output files to go
 +otu_table_name='rdp_otu_table.txt'; %name of output otu_table filename
 +%Do not have to define trainingdatabasename here
 +
 +mkdir([output_directory])
 +thedirs=dir([input_directory]);
 +thetime=zeros(numel(thedirs),1);
 +names={};
 +
 +species=containers.Map;
 +
 +tic()
 +
 +[headers,~]=fastaread('trainset7_112011.fa'); %read in the training database
 +
 +i=0;
 +%for numdirs=3:6
 +for numdirs=3:numel(thedirs)
 +i=i+1;
 +[num2str(i) ' out of ' num2str(numel(thedirs)-2)]
 +fastafilename=[input_directory '/' thedirs(numdirs).name];
 +[loadfasta,~]=fastaread(fastafilename);
 +numreads=numel(loadfasta);
 +xstar=quikr(fastafilename); %this will give the predicted reconstruction frequencies
 +
 +nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
 +proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
 +namescell=headers(nonzeroentries); %Get the names of the sequences
 +namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
 +
 +[a cols]=size(namesandproportions);
 +amount=zeros(cols,1);
 +for j=1:cols
 +  names{j}=namesandproportions{1,j};
 +  amount(j)=namesandproportions{2,j};
 +  if isKey(species,names{j})
 +      temp=species(names{j});
 +      temp(i)=round(amount(j).*numreads);
 +      species(names{j})=temp;
 +  else
 +      temp=zeros(numel(thedirs)-3+1,1);
 +      temp(i)=round(amount(j).*numreads);
 +      species(names{j})=temp;
 +  end
 +end
 +
 +thefa=strfind(fastafilename,'.fa');
 +
 +if ~isempty(fastafilename(1:thefa-1))
 +	sampleid{i}=fastafilename(1:thefa-1);
 +else
 +	sampleid{i}='empty_sampleid';
 +end
 +thetime(i)=toc();
 +thetime(i)
 +
 +end
 +
 +'Total time to compute Quikr'
 +toc()
 +'Quickr Average time per file'
 +mean(diff(thetime))
 +
 +tic
 +numits=i;
 +
 +
 +fid=fopen([output_directory '/' otu_table_name],'w');
 +fprintf(fid,'# QIIME vGail OTU table\n');
 +fprintf(fid,'#OTU_ID\t');
 +for i=1:numits
 +if i<numits
 +fprintf(fid,'%s\t',sampleid{i});
 +else
 +fprintf(fid,'%s',sampleid{i});
 +end
 +end
 +fprintf(fid,'\n');
 +
 +thekeys=species.keys;
 +for k=1:species.Count
 +if k==1
 +  space=thekeys{k}(20);
 +end
 +delimit =strfind(thekeys{k},space);
 +truncname=thekeys{k}(1:delimit-1);
 +  fprintf(fid,'%s',truncname);
 +
 +  temp(:,k)=species(thekeys{k});
 +        for i=1:numits
 +		fprintf(fid,'\t%d',temp(i,k));
 +	end
 +fprintf(fid,'\n');
 +end
 +fclose(fid);
 +
 +'Time to output OTU Table'
 +toc
 diff --git a/src/matlab/multifasta2otu/multifasta2otutable_rdp7.m b/src/matlab/multifasta2otu/multifasta2otutable_rdp7.m new file mode 100644 index 0000000..63e9508 --- /dev/null +++ b/src/matlab/multifasta2otu/multifasta2otutable_rdp7.m @@ -0,0 +1,105 @@ +%This is an example of how to run Quikr on the default RDP_7 training set, trainset7_112011.fa
 +
 +%make sure Matlab/Octave is in your path
 +%cd /path/to/Quikr
 +
 +%User-defined variables
 +input_directory='../../separated_samples'; %path to input directory of samples
 +output_directory='quikr_results'; %path to where want output files to go
 +otu_table_name='rdp_otu_table.txt'; %name of output otu_table filename
 +%Do not have to define trainingdatabasename here
 +
 +mkdir([output_directory])
 +thedirs=dir([input_directory]);
 +thetime=zeros(numel(thedirs)-1,1);
 +names={};
 +
 +species=containers.Map;
 +
 +tic()
 +
 +[headers,~]=fastaread('trainset7_112011.fa'); %read in the training database
 +
 +i=0;
 +%for numdirs=3:6
 +for numdirs=3:numel(thedirs)
 +i=i+1;
 +[num2str(i) ' out of ' num2str(numel(thedirs)-2)]
 +fastafilename=[input_directory '/' thedirs(numdirs).name];
 +[loadfasta,~]=fastaread(fastafilename);
 +numreads=numel(loadfasta);
 +xstar=quikr(fastafilename); %this will give the predicted reconstruction frequencies
 +
 +nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
 +proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
 +namescell=headers(nonzeroentries); %Get the names of the sequences
 +namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
 +
 +[a cols]=size(namesandproportions);
 +amount=zeros(cols,1);
 +for j=1:cols
 +  names{j}=namesandproportions{1,j};
 +  amount(j)=namesandproportions{2,j};
 +  if isKey(species,names{j})
 +      temp=species(names{j});
 +      temp(i)=round(amount(j).*numreads);
 +      species(names{j})=temp;
 +  else
 +      temp=zeros(numel(thedirs)-3+1,1);
 +      temp(i)=round(amount(j).*numreads);
 +      species(names{j})=temp;
 +  end
 +end
 +
 +thefa=strfind(thedirs(numdirs).name,'.fa');
 +
 +if ~isempty(thedirs(numdirs).name(1:thefa-1))
 +	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
 +else
 +	sampleid{i}='empty_sampleid';
 +end
 +thetime(i+1)=toc();
 +thetime(i+1)
 +
 +end
 +
 +'Total time to compute Quikr'
 +toc()
 +'Quickr Average time per file'
 +mean(diff(thetime(1:i+1)))
 +
 +tic
 +numits=i;
 +
 +
 +fid=fopen([output_directory '/' otu_table_name],'w');
 +fprintf(fid,'# QIIME vGail OTU table\n');
 +fprintf(fid,'#OTU_ID\t');
 +for i=1:numits
 +if i<numits
 +fprintf(fid,'%s\t',sampleid{i});
 +else
 +fprintf(fid,'%s',sampleid{i});
 +end
 +end
 +fprintf(fid,'\n');
 +
 +thekeys=species.keys;
 +for k=1:species.Count
 +if k==1
 +  space=thekeys{k}(20);
 +end
 +delimit =strfind(thekeys{k},space);
 +truncname=thekeys{k}(1:delimit-1);
 +  fprintf(fid,'%s',truncname);
 +
 +  temp(:,k)=species(thekeys{k});
 +        for i=1:numits
 +		fprintf(fid,'\t%d',temp(i,k));
 +	end
 +fprintf(fid,'\n');
 +end
 +fclose(fid);
 +
 +'Time to output OTU Table'
 +toc
 diff --git a/src/matlab/multifasta2otu/multifasta2otutable_rdp7_octave.m b/src/matlab/multifasta2otu/multifasta2otutable_rdp7_octave.m new file mode 100644 index 0000000..f27b24b --- /dev/null +++ b/src/matlab/multifasta2otu/multifasta2otutable_rdp7_octave.m @@ -0,0 +1,107 @@ +%This is an example of how to run Quikr on the default RDP_7 training set
 +
 +%make sure Matlab/Octave is in your path
 +%cd /path/to/Quikr
 +
 +%User-defined variables
 +input_directory='../../separated_samples'; %path to input directory of samples
 +output_directory='quikr_results'; %path to where want output files to go
 +otu_table_name='rdp_otu_table.txt'; %name of output otu_table filename
 +%Do not have to define trainingdatabase file here
 +
 +mkdir([output_directory])
 +thedirs=dir([input_directory]);
 +thetime=zeros(numel(thedirs)-1,1);
 +names={};
 +
 +species=struct();
 +keys={};
 +
 +tic()
 +
 +[headers,~]=fastaread('trainset7_112011.fa'); %read in the training database
 +
 +i=0;
 +%for numdirs=3:5
 +for numdirs=3:numel(thedirs)
 +i=i+1;
 +disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)])
 +fastafilename=[input_directory '/' thedirs(numdirs).name];
 +[loadfasta,~]=fastaread(fastafilename);
 +numreads=numel(loadfasta);
 +xstar=quikr(fastafilename); %this will give the predicted reconstruction frequencies
 +
 +nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
 +proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
 +namescell=headers(nonzeroentries); %Get the names of the sequences
 +namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
 +
 +[a cols]=size(namesandproportions);
 +amount=zeros(cols,1);
 +for j=1:cols
 +  names{j}=namesandproportions{1,j}(1:strfind(namesandproportions{1,j},' ')-1);
 +  names{j}=strrep(names{j},'|','_');
 +  amount(j)=namesandproportions{2,j};
 +if strcmp(keys,names{j})
 +	temp=species.(names{j});
 +      	temp(i)=round(amount(j).*numreads);
 +      	species.(names{j})=temp;
 +  else
 +      temp=zeros(numel(thedirs)-3+1,1);
 +      temp(i)=round(amount(j).*numreads);
 +      if temp(i)==0
 +	% do not make a key -- has insignificant counts
 +      else
 +      species.(names{j})=temp;
 +      keys{end+1}=names{j};
 +      end
 +  end
 +end
 +
 +thefa=strfind(thedirs(numdirs).name,'.fa');
 +
 +if ~isempty(thedirs(numdirs).name(1:thefa-1))
 +	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
 +else
 +	sampleid{i}='empty_sampleid';
 +end
 +thetime(i+1)=toc();
 +thetime(i+1)
 +
 +end
 +
 +disp('Total time to compute Quikr:')
 +toc()
 +disp('Quickr Average time per file:')
 +mean(diff(thetime(1:i+1)))
 +
 +tic();
 +numits=i;
 +
 +
 +fid=fopen([output_directory '/' otu_table_name],'w');
 +fprintf(fid,'# QIIME vGail OTU table\n');
 +fprintf(fid,'#OTU_ID\t');
 +for i=1:numits
 +if i<numits
 +fprintf(fid,'%s\t',sampleid{i});
 +else
 +fprintf(fid,'%s',sampleid{i});
 +end
 +end
 +fprintf(fid,'\n');
 +
 +for k=1:numel(keys)
 +truncname=strrep(keys{k},'_','|');
 +  fprintf(fid,'%s',truncname);
 +
 +  temp(:,k)=species.(keys{k});
 +        for i=1:numits
 +		fprintf(fid,'\t%d',temp(i,k));
 +	end
 +fprintf(fid,'\n');
 +end
 +fclose(fid);
 +
 +disp('Time to output OTU Table')
 +toc()
 | 
