1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
|
function [data, seq] = fastaread(filename,varargin)
%FASTAREAD reads FASTA format file.
%
% S = FASTAREAD(FILENAME) reads a FASTA format file FILENAME, returning
% the data in the file as a structure. FILENAME can also be a URL or
% MATLAB character array that contains the text of a FASTA format file.
% S.Header is the header information. S.Sequence is the sequence stored
% as a string of characters.
%
% [HEADER, SEQ] = FASTAREAD(FILENAME) reads the file into separate
% variables HEADER and SEQ. If the file contains more than one sequence,
% then HEADER and SEQ are cell arrays of header and sequence information.
%
% FASTAREAD(...,'IGNOREGAPS',TF) removes any gap symbol ('-' or '.')
% from the sequence(s) when TF is true. Default is false.
%
% FASTAREAD(...,'BLOCKREAD', M) allows you to read in a single entry or
% block of entries from a file containing multiple sequences. If M is a
% scalar then the M'th entry in the file is read. If M is a two element
% vector then the block of entries starting at entry M(1) and ending at
% entry M(2) will be read. Use Inf for M(2) to read all entries in the
% file starting at position M(1).
%
% FASTAREAD(...,'TRIMHEADERS',TF) trims the header after the first
% whitespace when TF is true. White space characters include a horizontal
% tab (char(9)) and a space (char(32)). Default is false.
%
% FASTA format specified here:
% http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
%
% Examples:
%
% % Read the sequence for the human p53 tumor gene.
% p53nt = fastaread('p53nt.txt')
%
% % Read the sequence for the human p53 tumor protein.
% p53aa = fastaread('p53aa.txt')
%
% % Read a block of entries from a file
% pf2_5_10 = fastaread('pf00002.fa','blockread',[ 5 10], ...
% 'ignoregaps',true)
%
% % Read the human mitochondrion genome in FASTA format.
% entrezSite = 'http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?'
% textOptions = '&txt=on&view=fasta'
% genbankID = '&list_uids=NC_012920'
% mitochondrion = fastaread([entrezSite textOptions genbankID])
%
% See also EMBLREAD, FASTAINFO, FASTAWRITE, FASTQINFO, FASTQREAD,
% FASTQWRITE, GENBANKREAD, GENPEPTREAD, HMMPROFDEMO, MULTIALIGNREAD,
% MULTIALIGNVIEWER, SEQPROFILE, SEQTOOL, SFFINFO, SFFREAD.
% Copyright 2002-2010 The MathWorks, Inc.
% $Revision: 1.15.4.21 $ $Date: 2010/06/26 04:54:38 $
% check input is char
% in a future version we may accept also cells
if ~ischar(filename)
error('Bioinfo:fastaread:InvalidInput','Input must be a character array')
end
% default
ignoreGaps = false;
trimh = false;
blockRead = false;
% get input arguments
if nargin > 1
if rem(nargin,2) ~= 1
error('Bioinfo:fastaread:IncorrectNumberOfArguments',...
'Incorrect number of arguments to %s.',mfilename);
end
okargs = {'ignoregaps','blockread','trimheaders'};
for j=1:2:nargin-1
pname = varargin{j};
pval = varargin{j+1};
k = find(strncmpi(pname,okargs,numel(pname)));
if isempty(k)
error('Bioinfo:fastaread:UnknownParameterName',...
'Unknown parameter name: %s.',pname);
%elseif length(k)>1
% error('Bioinfo:fastaread:AmbiguousParameterName',...
% 'Ambiguous parameter name: %s.',pname);
else
switch(k)
case 1 % ignore gaps
ignoreGaps = opttf(pval);
if isempty(ignoreGaps)
error('Bioinfo:fastaread:IgnoreGapsNotLogical',...
'%s must be a logical value, true or false.',...
upper(char(okargs(k))));
end
case 2 % range
range = pval;
if ~isnumeric(range) || numel(range)> 2 || isempty(range)
error('Bioinfo:fastaread:BadBlockRange',...
'BlockRange must be numeric scalar or vector with two values.')
end
blockRead = true;
range = sort(range);
case 3 % trimheaders
trimh = opttf(pval, okargs{k}, mfilename);
end
end
end
end
if size(filename,1)>1 % is padded string
if blockRead
warning('Bioinfo:fastaread:IgnoredRange',...
'BLOCKREAD is only allowed when parsing a file.')
end
ftext = cell(size(filename,1),1);
for i=1:size(filename,1)
ftext(i,1)=strtrim(strread(filename(i,:),'%s','whitespace','','delimiter','\n'));
end
% try then if it is an url
elseif (strfind(filename(1:min(10,end)), '://'))
if blockRead
warning('Bioinfo:fastaread:IgnoredRange',...
'BLOCKREAD is only allowed when parsing a file.')
end
if (~usejava('jvm'))
error('Bioinfo:fastaread:NoJava',...
'Reading from a URL requires Java.')
end
try
ftext = urlread(filename);
catch allExceptions
error('Bioinfo:fastaread:CannotReadURL',...
'Cannot read URL "%s".', filename);
end
ftext = strread(ftext,'%s','delimiter','\n');
% try then if it is a valid filename
elseif (exist(filename,'file') || exist(fullfile(pwd,filename),'file'))
if blockRead
blockText = getfileblock(filename,range,'>');
try
ftext = strread(blockText,'%s','delimiter','\n');
catch theErr
if strcmpi(theErr.identifier,'MATLAB:nomem')
error( 'Bioinfo:fastaread:BlockTooBig',...
'The block is too large to fit in memory. Try a smaller range.');
else
rethrow(theErr);
end
end
else %=== read entire file
fid = fopen(filename);
c = onCleanup(@()fclose(fid));
try
ftext = textscan(fid,'%s','delimiter','\n','bufsize',10000000);
ftext = ftext{:};
catch theErr
if strcmpi(theErr.identifier,'MATLAB:nomem')
error( 'Bioinfo:fastaread:FileTooBig',...
'The file is too large to fit in memory. Try using the BLOCKREAD option to read the file in blocks.');
else
rethrow(theErr);
end
end
end
else % must be a string with '\n', convert to cell
if blockRead
warning('Bioinfo:fastaread:IgnoredRange',...
'BLOCKREAD is only allowed when parsing a file.')
end
ftext = strread(filename,'%s','delimiter','\n');
end
% it is possible that there will be multiple sequences
commentLines = strncmp(ftext,'>',1);
if ~any(commentLines)
error('Bioinfo:fastaread:FastaNotValid',...
'Input does not exist or is not a valid FASTA file.')
end
numSeqs = sum(commentLines);
seqStarts = [find(commentLines); size(ftext,1)+1];
data(numSeqs,1).Header = '';
try
for theSeq = 1:numSeqs
% Check for > symbol ?
data(theSeq).Header = ftext{seqStarts(theSeq)}(2:end);
firstRow = seqStarts(theSeq)+1;
lastRow = seqStarts(theSeq+1)-1;
numChars = cellfun('length',ftext(firstRow:lastRow));
numSymbols = sum(numChars);
data(theSeq).Sequence = repmat(' ',1,numSymbols);
pos = 1;
for i=firstRow:lastRow,
str = strtrim(ftext{i});
len = length(str);
if len == 0
break
end
data(theSeq).Sequence(pos:pos+len-1) = str;
pos = pos+len;
end
data(theSeq).Sequence = strtrim(data(theSeq).Sequence);
if ignoreGaps
data(theSeq).Sequence = strrep(data(theSeq).Sequence,'-','');
data(theSeq).Sequence = strrep(data(theSeq).Sequence,'.','');
end
end
% trim headers
if trimh
for i = 1:numSeqs
data(i).Header = sscanf(data(i).Header,'%s',1);
end
end
% in case of two outputs
if nargout == 2
if numSeqs == 1
seq = data.Sequence;
data = data.Header;
else
seq = {data(:).Sequence};
data = {data(:).Header};
end
end
catch allExceptions
error('Bioinfo:fastaread:IncorrectDataFormat',...
'Incorrect data format in fasta file')
end
|