adding matlab code

author: Calvin <calvin@EESI> 2013-03-11 12:36:42 -0400
committer: Calvin <calvin@EESI> 2013-03-11 12:36:42 -0400
commit: 4ca6f92ceb4b2f8c504431cf56f8a6135187a61c (patch)
tree: bda30f1f0710d19660c04c57f31be37cf4865c50 /src/matlab/fastaread.m
parent: c18793c210ea44cef63db3568791cbad537bdb79 (diff)
1 files changed, 232 insertions, 0 deletions
diff --git a/src/matlab/fastaread.m b/src/matlab/fastaread.m
new file mode 100644
index 0000000..37d9b71
--- /dev/null
+++ b/src/matlab/fastaread.m
@@ -0,0 +1,232 @@
+function [data, seq] = fastaread(filename,varargin)
+%FASTAREAD reads FASTA format file.
+%
+%   S = FASTAREAD(FILENAME) reads a FASTA format file FILENAME, returning
+%   the data in the file as a structure. FILENAME can also be a URL or
+%   MATLAB character array that contains the text of a FASTA format file.
+%   S.Header is the header information. S.Sequence is the sequence stored
+%   as a string of characters.
+%
+%   [HEADER, SEQ] = FASTAREAD(FILENAME) reads the file into separate
+%   variables HEADER and SEQ. If the file contains more than one sequence,
+%   then HEADER and SEQ are cell arrays of header and sequence information.
+%
+%   FASTAREAD(...,'IGNOREGAPS',TF) removes any gap symbol ('-' or '.')
+%   from the sequence(s) when TF is true. Default is false.
+%
+%   FASTAREAD(...,'BLOCKREAD', M) allows you to read in a single entry or
+%   block of entries from a file containing multiple sequences. If M is a
+%   scalar then the M'th entry in the file is read. If M is a two element
+%   vector then the block of entries starting at entry M(1) and ending at
+%   entry M(2) will be read.  Use Inf for M(2) to read all entries in the
+%   file starting at position M(1). 
+%
+%   FASTAREAD(...,'TRIMHEADERS',TF) trims the header after the first
+%   whitespace when TF is true. White space characters include a horizontal
+%   tab (char(9)) and a space (char(32)). Default is false. 
+%
+%   FASTA format specified here:
+%   http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
+%
+%   Examples:
+%
+%       % Read the sequence for the human p53 tumor gene.
+%       p53nt = fastaread('p53nt.txt')
+%
+%       % Read the sequence for the human p53 tumor protein.
+%       p53aa = fastaread('p53aa.txt')
+%
+%       % Read a block of entries from a file
+%       pf2_5_10 = fastaread('pf00002.fa','blockread',[ 5 10], ...
+%                            'ignoregaps',true)
+%
+%       % Read the human mitochondrion genome in FASTA format.
+%       entrezSite = 'http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?'
+%       textOptions = '&txt=on&view=fasta'
+%       genbankID = '&list_uids=NC_012920'
+%       mitochondrion = fastaread([entrezSite textOptions genbankID])
+%
+%   See also EMBLREAD, FASTAINFO, FASTAWRITE, FASTQINFO, FASTQREAD,
+%   FASTQWRITE, GENBANKREAD, GENPEPTREAD, HMMPROFDEMO, MULTIALIGNREAD,
+%   MULTIALIGNVIEWER, SEQPROFILE, SEQTOOL, SFFINFO, SFFREAD.
+
+%   Copyright 2002-2010 The MathWorks, Inc.
+%   $Revision: 1.15.4.21 $  $Date: 2010/06/26 04:54:38 $
+
+% check input is char
+% in a future version we may accept also cells
+if ~ischar(filename)
+    error('Bioinfo:fastaread:InvalidInput','Input must be a character array')
+end
+
+% default
+ignoreGaps = false;
+trimh = false;
+blockRead = false;
+% get input arguments
+if  nargin > 1
+    if rem(nargin,2) ~= 1
+        error('Bioinfo:fastaread:IncorrectNumberOfArguments',...
+            'Incorrect number of arguments to %s.',mfilename);
+    end
+    okargs = {'ignoregaps','blockread','trimheaders'};
+    for j=1:2:nargin-1
+        pname = varargin{j};
+        pval = varargin{j+1};
+        k = find(strncmpi(pname,okargs,numel(pname)));
+        if isempty(k)
+            error('Bioinfo:fastaread:UnknownParameterName',...
+                'Unknown parameter name: %s.',pname);
+            %elseif length(k)>1
+            %    error('Bioinfo:fastaread:AmbiguousParameterName',...
+            %        'Ambiguous parameter name: %s.',pname);
+        else
+            switch(k)
+                case 1  % ignore gaps
+                    ignoreGaps = opttf(pval);
+                    if isempty(ignoreGaps)
+                        error('Bioinfo:fastaread:IgnoreGapsNotLogical',...
+                            '%s must be a logical value, true or false.',...
+                            upper(char(okargs(k))));
+                    end
+                case 2  % range
+                    range = pval;
+                    if ~isnumeric(range) || numel(range)> 2 || isempty(range)
+                        error('Bioinfo:fastaread:BadBlockRange',...
+                            'BlockRange must be numeric scalar or vector with two values.')
+                    end
+                    blockRead = true;
+                    range = sort(range);
+                case 3 % trimheaders
+			        trimh = opttf(pval, okargs{k}, mfilename);    
+            end
+        end
+    end
+end
+
+
+if size(filename,1)>1  % is padded string
+    if blockRead
+        warning('Bioinfo:fastaread:IgnoredRange',...
+            'BLOCKREAD is only allowed when parsing a file.')
+    end
+    ftext = cell(size(filename,1),1);
+    for i=1:size(filename,1)
+        ftext(i,1)=strtrim(strread(filename(i,:),'%s','whitespace','','delimiter','\n'));
+    end
+    % try then if it is an url
+elseif (strfind(filename(1:min(10,end)), '://'))
+    if blockRead
+        warning('Bioinfo:fastaread:IgnoredRange',...
+            'BLOCKREAD is only allowed when parsing a file.')
+    end
+    if (~usejava('jvm'))
+        error('Bioinfo:fastaread:NoJava',...
+            'Reading from a URL requires Java.')
+    end
+    try
+        ftext = urlread(filename);
+    catch allExceptions
+        error('Bioinfo:fastaread:CannotReadURL',...
+            'Cannot read URL "%s".', filename);
+    end
+    ftext = strread(ftext,'%s','delimiter','\n');
+
+    % try then if it is a valid filename
+elseif  (exist(filename,'file') || exist(fullfile(pwd,filename),'file'))
+    if blockRead
+        blockText = getfileblock(filename,range,'>');
+        try
+           ftext = strread(blockText,'%s','delimiter','\n');
+        catch theErr
+           if strcmpi(theErr.identifier,'MATLAB:nomem')
+                error( 'Bioinfo:fastaread:BlockTooBig',...
+                  'The block is too large to fit in memory. Try a smaller range.');
+           else
+                 rethrow(theErr);
+           end
+        end
+    else %=== read entire file
+        fid = fopen(filename);
+		c = onCleanup(@()fclose(fid));
+        try
+            ftext = textscan(fid,'%s','delimiter','\n','bufsize',10000000);
+            ftext = ftext{:}; 
+        catch theErr
+            if strcmpi(theErr.identifier,'MATLAB:nomem')
+                error( 'Bioinfo:fastaread:FileTooBig',...
+                    'The file is too large to fit in memory. Try using the BLOCKREAD option to read the file in blocks.');
+            else 
+                rethrow(theErr);
+            end
+        end
+    end
+else  % must be a string with '\n', convert to cell
+    if blockRead
+        warning('Bioinfo:fastaread:IgnoredRange',...
+            'BLOCKREAD is only allowed when parsing a file.')
+    end
+    ftext = strread(filename,'%s','delimiter','\n');
+end
+
+% it is possible that there will be multiple sequences
+commentLines = strncmp(ftext,'>',1);
+
+if ~any(commentLines)
+    error('Bioinfo:fastaread:FastaNotValid',...
+        'Input does not exist or is not a valid FASTA file.')
+end
+
+numSeqs = sum(commentLines);
+seqStarts = [find(commentLines); size(ftext,1)+1];
+data(numSeqs,1).Header = '';
+
+try
+    for theSeq = 1:numSeqs
+        % Check for > symbol ?
+        data(theSeq).Header = ftext{seqStarts(theSeq)}(2:end);
+        firstRow = seqStarts(theSeq)+1;
+        lastRow = seqStarts(theSeq+1)-1;
+        numChars = cellfun('length',ftext(firstRow:lastRow));
+        numSymbols = sum(numChars);
+        data(theSeq).Sequence = repmat(' ',1,numSymbols);
+        pos = 1;
+        for i=firstRow:lastRow,
+            str = strtrim(ftext{i});
+            len =  length(str);
+            if len == 0
+                break
+            end
+            data(theSeq).Sequence(pos:pos+len-1) = str;
+            pos = pos+len;
+        end
+        data(theSeq).Sequence = strtrim(data(theSeq).Sequence);
+        if ignoreGaps
+            data(theSeq).Sequence = strrep(data(theSeq).Sequence,'-','');
+            data(theSeq).Sequence = strrep(data(theSeq).Sequence,'.','');
+        end
+    end
+
+    % trim headers
+    if trimh
+       for i = 1:numSeqs
+          data(i).Header = sscanf(data(i).Header,'%s',1);
+       end
+    end
+    
+    % in case of two outputs
+    if nargout == 2
+        if numSeqs == 1
+            seq = data.Sequence;
+            data = data.Header;
+        else
+            seq = {data(:).Sequence};
+            data = {data(:).Header};
+        end
+    end
+
+catch allExceptions
+    error('Bioinfo:fastaread:IncorrectDataFormat',...
+        'Incorrect data format in fasta file')
+end
+
author	Calvin <calvin@EESI>	2013-03-11 12:36:42 -0400
committer	Calvin <calvin@EESI>	2013-03-11 12:36:42 -0400
commit	4ca6f92ceb4b2f8c504431cf56f8a6135187a61c (patch)
tree	bda30f1f0710d19660c04c57f31be37cf4865c50 /src/matlab/fastaread.m
parent	c18793c210ea44cef63db3568791cbad537bdb79 (diff)