1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
% Quikr multifasta->otu_table_(for_qiime_use) wrapper code written by Gail Rosen -- 2/1/2013
%This is an example of how to run Multifasta Quikr with a custom
%training database (in this case Greengenes OTU's within 94% identity)
%make sure Matlab/Octave is in your path
%cd /path/to/Quikr
%User-defined variables
input_directory='py-quikr/input/'; %path to input directory of samples
output_directory='quikr_results'; %path to where want output files to go
otu_table_name='gg1194_otu_octave.txt'; %name of output otu_table filename
trainingdatabasefilename='gg_94_otus_4feb2011.fasta'; %full path to the FASTA file you wish to use as a training database
k=6; %pick a k-mer size
addpath('quikr-code');
% make output directory
mkdir([output_directory]);
% get a list of the input directory
thedirs=dir([input_directory]);
thetime=zeros(numel(thedirs)-1,1);
names={};
tic();
trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database
disp('Training time:')
[headers,~]=fastaread(trainingdatabasefilename); %read in the training database
lambda=10000;
training_time=toc()
species=struct();
keys={};
tic();
i=0;
%for numdirs=3:5
for numdirs=3:numel(thedirs)
i=i+1;
disp([num2str(i) ' out of ' num2str(numel(thedirs)-2)])
fastafilename=[input_directory '/' thedirs(numdirs).name];
[loadfasta,~]=fastaread(fastafilename);
numreads=numel(loadfasta);
xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda);
nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
namescell=headers(nonzeroentries); %Get the names of the sequences
namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)
[a cols]=size(namesandproportions);
amount=zeros(cols,1);
for j=1:cols
names{j}=['s' namesandproportions{1,j}];
amount(j)=namesandproportions{2,j};
if strcmp(keys,names{j})
temp=species.(names{j});
temp(i)=round(amount(j).*numreads);
species.(names{j})=temp;
else
temp=zeros(numel(thedirs)-3+1,1);
temp(i)=round(amount(j).*numreads);
if temp(i)==0
%insignificant counts, do nothing
else
species.(names{j})=temp;
keys{end+1}=names{j};
end
end
end
thefa=strfind(thedirs(numdirs).name,'.fa');
if ~isempty(thedirs(numdirs).name(1:thefa-1))
sampleid{i}=thedirs(numdirs).name(1:thefa-1);
else
sampleid{i}='empty_sampleid';
end
thetime(i+1)=toc();
thetime(i+1)
end
disp('Total time to compute Quikr:')
toc()
disp('Quikr Average time per file:')
mean(diff(thetime(1:i+1)))
tic()
numits=i;
fid=fopen([output_directory '/' otu_table_name],'w');
% insert our header
fprintf(fid,'# QIIME vGail OTU table\n');
fprintf(fid,'#OTU_ID\t');
for i=1:numits
if i<numits
fprintf(fid,'%s\t',sampleid{i});
else
fprintf(fid,'%s',sampleid{i});
end
end
fprintf(fid,'\n');
%
% print our key and our values for each keys
for k=1:numel(keys)
fprintf(fid,'%s',keys{k}(2:end))
temp(:,k)=species.(keys{k});
for i=1:numits
fprintf(fid,'\t%d',temp(i,k));
end
fprintf(fid,'\n');
end
fclose(fid);
disp('Time to output OTU Table:')
toc()
|