multifasta_documented.m


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129

% Quikr multifasta->otu_table_(for_qiime_use) wrapper code written by Gail Rosen -- 2/1/2013
%This is an example of how to run Multifasta Quikr with a custom 
%training database (in this case Greengenes OTU's within 94% identity)

%make sure Matlab/Octave is in your path
%cd /path/to/Quikr

%User-defined variables
input_directory='py-quikr/input/'; %path to input directory of samples
output_directory='quikr_results'; %path to where want output files to go
otu_table_name='gg1194_otu_octave.txt'; %name of output otu_table filename
trainingdatabasefilename='gg_94_otus_4feb2011.fasta'; %full path to the FASTA file you wish to use as a training database
k=6; %pick a k-mer size
addpath('quikr-code');

% make output directory
mkdir([output_directory]);

% get a list of the input directory

thedirs=dir([input_directory]);
thetime=zeros(numel(thedirs)-1,1);
names={};

tic();

trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database
disp('Training time:')
[headers,~]=fastaread(trainingdatabasefilename); %read in the training database
lambda=10000; 
training_time=toc()

species=struct();
keys={};

tic();


i=0;
%for numdirs=3:5
for numdirs=3:numel(thedirs)
i=i+1;
disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)])
fastafilename=[input_directory '/' thedirs(numdirs).name];
[loadfasta,~]=fastaread(fastafilename);
numreads=numel(loadfasta);
xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda);

nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
namescell=headers(nonzeroentries); %Get the names of the sequences
namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)

[a cols]=size(namesandproportions);
amount=zeros(cols,1);
for j=1:cols
  names{j}=['s' namesandproportions{1,j}];
  amount(j)=namesandproportions{2,j};
  if strcmp(keys,names{j})
	temp=species.(names{j});
      	temp(i)=round(amount(j).*numreads);
      	species.(names{j})=temp;
  else
      temp=zeros(numel(thedirs)-3+1,1);
      temp(i)=round(amount(j).*numreads);
      if temp(i)==0
         %insignificant counts, do nothing
      else
         species.(names{j})=temp;
         keys{end+1}=names{j};
      end
  end
end

thefa=strfind(thedirs(numdirs).name,'.fa');

if ~isempty(thedirs(numdirs).name(1:thefa-1))
	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
else
	sampleid{i}='empty_sampleid';
end

thetime(i+1)=toc();
thetime(i+1)

end

disp('Total time to compute Quikr:')
toc()
disp('Quikr Average time per file:')
mean(diff(thetime(1:i+1)))

tic()
numits=i;

fid=fopen([output_directory '/' otu_table_name],'w');

% insert our header
fprintf(fid,'# QIIME vGail OTU table\n');
fprintf(fid,'#OTU_ID\t');
for i=1:numits
  if i<numits
    fprintf(fid,'%s\t',sampleid{i});
  else
    fprintf(fid,'%s',sampleid{i});
  end
end

fprintf(fid,'\n');
%


% print our key and our values for each keys
for k=1:numel(keys)
  fprintf(fid,'%s',keys{k}(2:end))
  temp(:,k)=species.(keys{k});
 
  for i=1:numits
    fprintf(fid,'\t%d',temp(i,k));
  end
  
  fprintf(fid,'\n');
end

fclose(fid);

disp('Time to output OTU Table:')
toc()