summaryrefslogtreecommitdiff
path: root/src/matlab/fastaread.m
blob: 37d9b7149a25b97b8ae624f0fe241f340f8ae904 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
function [data, seq] = fastaread(filename,varargin)
%FASTAREAD reads FASTA format file.
%
%   S = FASTAREAD(FILENAME) reads a FASTA format file FILENAME, returning
%   the data in the file as a structure. FILENAME can also be a URL or
%   MATLAB character array that contains the text of a FASTA format file.
%   S.Header is the header information. S.Sequence is the sequence stored
%   as a string of characters.
%
%   [HEADER, SEQ] = FASTAREAD(FILENAME) reads the file into separate
%   variables HEADER and SEQ. If the file contains more than one sequence,
%   then HEADER and SEQ are cell arrays of header and sequence information.
%
%   FASTAREAD(...,'IGNOREGAPS',TF) removes any gap symbol ('-' or '.')
%   from the sequence(s) when TF is true. Default is false.
%
%   FASTAREAD(...,'BLOCKREAD', M) allows you to read in a single entry or
%   block of entries from a file containing multiple sequences. If M is a
%   scalar then the M'th entry in the file is read. If M is a two element
%   vector then the block of entries starting at entry M(1) and ending at
%   entry M(2) will be read.  Use Inf for M(2) to read all entries in the
%   file starting at position M(1). 
%
%   FASTAREAD(...,'TRIMHEADERS',TF) trims the header after the first
%   whitespace when TF is true. White space characters include a horizontal
%   tab (char(9)) and a space (char(32)). Default is false. 
%
%   FASTA format specified here:
%   http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
%
%   Examples:
%
%       % Read the sequence for the human p53 tumor gene.
%       p53nt = fastaread('p53nt.txt')
%
%       % Read the sequence for the human p53 tumor protein.
%       p53aa = fastaread('p53aa.txt')
%
%       % Read a block of entries from a file
%       pf2_5_10 = fastaread('pf00002.fa','blockread',[ 5 10], ...
%                            'ignoregaps',true)
%
%       % Read the human mitochondrion genome in FASTA format.
%       entrezSite = 'http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?'
%       textOptions = '&txt=on&view=fasta'
%       genbankID = '&list_uids=NC_012920'
%       mitochondrion = fastaread([entrezSite textOptions genbankID])
%
%   See also EMBLREAD, FASTAINFO, FASTAWRITE, FASTQINFO, FASTQREAD,
%   FASTQWRITE, GENBANKREAD, GENPEPTREAD, HMMPROFDEMO, MULTIALIGNREAD,
%   MULTIALIGNVIEWER, SEQPROFILE, SEQTOOL, SFFINFO, SFFREAD.

%   Copyright 2002-2010 The MathWorks, Inc.
%   $Revision: 1.15.4.21 $  $Date: 2010/06/26 04:54:38 $

% check input is char
% in a future version we may accept also cells
if ~ischar(filename)
    error('Bioinfo:fastaread:InvalidInput','Input must be a character array')
end

% default
ignoreGaps = false;
trimh = false;
blockRead = false;
% get input arguments
if  nargin > 1
    if rem(nargin,2) ~= 1
        error('Bioinfo:fastaread:IncorrectNumberOfArguments',...
            'Incorrect number of arguments to %s.',mfilename);
    end
    okargs = {'ignoregaps','blockread','trimheaders'};
    for j=1:2:nargin-1
        pname = varargin{j};
        pval = varargin{j+1};
        k = find(strncmpi(pname,okargs,numel(pname)));
        if isempty(k)
            error('Bioinfo:fastaread:UnknownParameterName',...
                'Unknown parameter name: %s.',pname);
            %elseif length(k)>1
            %    error('Bioinfo:fastaread:AmbiguousParameterName',...
            %        'Ambiguous parameter name: %s.',pname);
        else
            switch(k)
                case 1  % ignore gaps
                    ignoreGaps = opttf(pval);
                    if isempty(ignoreGaps)
                        error('Bioinfo:fastaread:IgnoreGapsNotLogical',...
                            '%s must be a logical value, true or false.',...
                            upper(char(okargs(k))));
                    end
                case 2  % range
                    range = pval;
                    if ~isnumeric(range) || numel(range)> 2 || isempty(range)
                        error('Bioinfo:fastaread:BadBlockRange',...
                            'BlockRange must be numeric scalar or vector with two values.')
                    end
                    blockRead = true;
                    range = sort(range);
                case 3 % trimheaders
			        trimh = opttf(pval, okargs{k}, mfilename);    
            end
        end
    end
end


if size(filename,1)>1  % is padded string
    if blockRead
        warning('Bioinfo:fastaread:IgnoredRange',...
            'BLOCKREAD is only allowed when parsing a file.')
    end
    ftext = cell(size(filename,1),1);
    for i=1:size(filename,1)
        ftext(i,1)=strtrim(strread(filename(i,:),'%s','whitespace','','delimiter','\n'));
    end
    % try then if it is an url
elseif (strfind(filename(1:min(10,end)), '://'))
    if blockRead
        warning('Bioinfo:fastaread:IgnoredRange',...
            'BLOCKREAD is only allowed when parsing a file.')
    end
    if (~usejava('jvm'))
        error('Bioinfo:fastaread:NoJava',...
            'Reading from a URL requires Java.')
    end
    try
        ftext = urlread(filename);
    catch allExceptions
        error('Bioinfo:fastaread:CannotReadURL',...
            'Cannot read URL "%s".', filename);
    end
    ftext = strread(ftext,'%s','delimiter','\n');

    % try then if it is a valid filename
elseif  (exist(filename,'file') || exist(fullfile(pwd,filename),'file'))
    if blockRead
        blockText = getfileblock(filename,range,'>');
        try
           ftext = strread(blockText,'%s','delimiter','\n');
        catch theErr
           if strcmpi(theErr.identifier,'MATLAB:nomem')
                error( 'Bioinfo:fastaread:BlockTooBig',...
                  'The block is too large to fit in memory. Try a smaller range.');
           else
                 rethrow(theErr);
           end
        end
    else %=== read entire file
        fid = fopen(filename);
		c = onCleanup(@()fclose(fid));
        try
            ftext = textscan(fid,'%s','delimiter','\n','bufsize',10000000);
            ftext = ftext{:}; 
        catch theErr
            if strcmpi(theErr.identifier,'MATLAB:nomem')
                error( 'Bioinfo:fastaread:FileTooBig',...
                    'The file is too large to fit in memory. Try using the BLOCKREAD option to read the file in blocks.');
            else 
                rethrow(theErr);
            end
        end
    end
else  % must be a string with '\n', convert to cell
    if blockRead
        warning('Bioinfo:fastaread:IgnoredRange',...
            'BLOCKREAD is only allowed when parsing a file.')
    end
    ftext = strread(filename,'%s','delimiter','\n');
end

% it is possible that there will be multiple sequences
commentLines = strncmp(ftext,'>',1);

if ~any(commentLines)
    error('Bioinfo:fastaread:FastaNotValid',...
        'Input does not exist or is not a valid FASTA file.')
end

numSeqs = sum(commentLines);
seqStarts = [find(commentLines); size(ftext,1)+1];
data(numSeqs,1).Header = '';

try
    for theSeq = 1:numSeqs
        % Check for > symbol ?
        data(theSeq).Header = ftext{seqStarts(theSeq)}(2:end);
        firstRow = seqStarts(theSeq)+1;
        lastRow = seqStarts(theSeq+1)-1;
        numChars = cellfun('length',ftext(firstRow:lastRow));
        numSymbols = sum(numChars);
        data(theSeq).Sequence = repmat(' ',1,numSymbols);
        pos = 1;
        for i=firstRow:lastRow,
            str = strtrim(ftext{i});
            len =  length(str);
            if len == 0
                break
            end
            data(theSeq).Sequence(pos:pos+len-1) = str;
            pos = pos+len;
        end
        data(theSeq).Sequence = strtrim(data(theSeq).Sequence);
        if ignoreGaps
            data(theSeq).Sequence = strrep(data(theSeq).Sequence,'-','');
            data(theSeq).Sequence = strrep(data(theSeq).Sequence,'.','');
        end
    end

    % trim headers
    if trimh
       for i = 1:numSeqs
          data(i).Header = sscanf(data(i).Header,'%s',1);
       end
    end
    
    % in case of two outputs
    if nargout == 2
        if numSeqs == 1
            seq = data.Sequence;
            data = data.Header;
        else
            seq = {data(:).Sequence};
            data = {data(:).Header};
        end
    end

catch allExceptions
    error('Bioinfo:fastaread:IncorrectDataFormat',...
        'Incorrect data format in fasta file')
end