aboutsummaryrefslogtreecommitdiff
path: root/src/matlab/multifasta2otu/multifasta2otutable_gg1194.m
blob: a2872a40e795c1c4d999e28e4d20e746e61eccb4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
%This is an example of how to run Multifasta Quikr with a custom 
%training database (in this case Greengenes OTU's within 94% identity)

%make sure Matlab/Octave is in your path
%cd /path/to/Quikr

%User-defined variables
input_directory='../../separated_samples'; %path to input directory of samples
output_directory='quikr_results'; %path to where want output files to go
otu_table_name='gg1194_otu_table.txt'; %name of output otu_table filename
trainingdatabasefilename='../gg_94_otus_4feb2011.fasta'; %full path to the FASTA file you wish to use as a training database


mkdir([output_directory])
thedirs=dir([input_directory]);
thetime=zeros(numel(thedirs)-1,1);
names={};


tic()
k=6; %pick a k-mer size
trainingmatrix=quikrTrain(trainingdatabasefilename,k); %this will return the training database
'Training time:'
[headers,~]=fastaread(trainingdatabasefilename); %read in the training database
lambda=10000; 
training_time=toc()

species=containers.Map;

tic()


i=0;
%for numdirs=3:5
for numdirs=3:numel(thedirs)
i=i+1;
[num2str(i) ' out of ' num2str(numel(thedirs)-2)]
fastafilename=[input_directory '/' thedirs(numdirs).name];
[loadfasta,~]=fastaread(fastafilename);
numreads=numel(loadfasta);
xstar=quikrCustomTrained(trainingmatrix,fastafilename,k,lambda);

nonzeroentries=find(xstar); %get the indicies of the sequences quikr predicts are in your sample
proportionscell=num2cell(xstar(nonzeroentries)); %convert the concentrations into a cell array
namescell=headers(nonzeroentries); %Get the names of the sequences
namesandproportions={namescell{:}; proportionscell{:}}; %This cell array contains the (unsorted) names of the reconstructed sequences and their concentrations (in the first and second columns respectively)

[a cols]=size(namesandproportions);
amount=zeros(cols,1);
for j=1:cols
  names{j}=namesandproportions{1,j};
  amount(j)=namesandproportions{2,j};
  if isKey(species,names{j})
	 temp=species(names{j});
      temp(i)=round(amount(j).*numreads);
      species(names{j})=temp;
  else
      temp=zeros(numel(thedirs)-3+1,1);
      temp(i)=round(amount(j).*numreads);
      species(names{j})=temp;
  end
end

thefa=strfind(thedirs(numdirs).name,'.fa');

if ~isempty(thedirs(numdirs).name(1:thefa-1))
	sampleid{i}=thedirs(numdirs).name(1:thefa-1);
else
	sampleid{i}='empty_sampleid';
end

thetime(i+1)=toc();
thetime(i+1)

end

'Total time to compute Quikr'
toc()
'Quickr Average time per file'
mean(diff(thetime(1:i+1)))

tic
numits=i;

fid=fopen([output_directory '/' otu_table_name],'w');
fprintf(fid,'# QIIME vGail OTU table\n');
fprintf(fid,'#OTU_ID\t');
for i=1:numits
if i<numits
fprintf(fid,'%s\t',sampleid{i});
else
fprintf(fid,'%s',sampleid{i});
end
end
fprintf(fid,'\n');

thekeys=species.keys;
for k=1:species.Count
 fprintf(fid,'%s',thekeys{k})
 temp(:,k)=species(thekeys{k});
        for i=1:numits
                fprintf(fid,'\t%d',temp(i,k));
        end
fprintf(fid,'\n');
end
fclose(fid);

'Time to output OTU Table'
toc