From 4ca6f92ceb4b2f8c504431cf56f8a6135187a61c Mon Sep 17 00:00:00 2001 From: Calvin Date: Mon, 11 Mar 2013 12:36:42 -0400 Subject: adding matlab code --- src/matlab/fastaread.m | 232 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 src/matlab/fastaread.m (limited to 'src/matlab/fastaread.m') diff --git a/src/matlab/fastaread.m b/src/matlab/fastaread.m new file mode 100644 index 0000000..37d9b71 --- /dev/null +++ b/src/matlab/fastaread.m @@ -0,0 +1,232 @@ +function [data, seq] = fastaread(filename,varargin) +%FASTAREAD reads FASTA format file. +% +% S = FASTAREAD(FILENAME) reads a FASTA format file FILENAME, returning +% the data in the file as a structure. FILENAME can also be a URL or +% MATLAB character array that contains the text of a FASTA format file. +% S.Header is the header information. S.Sequence is the sequence stored +% as a string of characters. +% +% [HEADER, SEQ] = FASTAREAD(FILENAME) reads the file into separate +% variables HEADER and SEQ. If the file contains more than one sequence, +% then HEADER and SEQ are cell arrays of header and sequence information. +% +% FASTAREAD(...,'IGNOREGAPS',TF) removes any gap symbol ('-' or '.') +% from the sequence(s) when TF is true. Default is false. +% +% FASTAREAD(...,'BLOCKREAD', M) allows you to read in a single entry or +% block of entries from a file containing multiple sequences. If M is a +% scalar then the M'th entry in the file is read. If M is a two element +% vector then the block of entries starting at entry M(1) and ending at +% entry M(2) will be read. Use Inf for M(2) to read all entries in the +% file starting at position M(1). +% +% FASTAREAD(...,'TRIMHEADERS',TF) trims the header after the first +% whitespace when TF is true. White space characters include a horizontal +% tab (char(9)) and a space (char(32)). Default is false. +% +% FASTA format specified here: +% http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml +% +% Examples: +% +% % Read the sequence for the human p53 tumor gene. +% p53nt = fastaread('p53nt.txt') +% +% % Read the sequence for the human p53 tumor protein. +% p53aa = fastaread('p53aa.txt') +% +% % Read a block of entries from a file +% pf2_5_10 = fastaread('pf00002.fa','blockread',[ 5 10], ... +% 'ignoregaps',true) +% +% % Read the human mitochondrion genome in FASTA format. +% entrezSite = 'http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?' +% textOptions = '&txt=on&view=fasta' +% genbankID = '&list_uids=NC_012920' +% mitochondrion = fastaread([entrezSite textOptions genbankID]) +% +% See also EMBLREAD, FASTAINFO, FASTAWRITE, FASTQINFO, FASTQREAD, +% FASTQWRITE, GENBANKREAD, GENPEPTREAD, HMMPROFDEMO, MULTIALIGNREAD, +% MULTIALIGNVIEWER, SEQPROFILE, SEQTOOL, SFFINFO, SFFREAD. + +% Copyright 2002-2010 The MathWorks, Inc. +% $Revision: 1.15.4.21 $ $Date: 2010/06/26 04:54:38 $ + +% check input is char +% in a future version we may accept also cells +if ~ischar(filename) + error('Bioinfo:fastaread:InvalidInput','Input must be a character array') +end + +% default +ignoreGaps = false; +trimh = false; +blockRead = false; +% get input arguments +if nargin > 1 + if rem(nargin,2) ~= 1 + error('Bioinfo:fastaread:IncorrectNumberOfArguments',... + 'Incorrect number of arguments to %s.',mfilename); + end + okargs = {'ignoregaps','blockread','trimheaders'}; + for j=1:2:nargin-1 + pname = varargin{j}; + pval = varargin{j+1}; + k = find(strncmpi(pname,okargs,numel(pname))); + if isempty(k) + error('Bioinfo:fastaread:UnknownParameterName',... + 'Unknown parameter name: %s.',pname); + %elseif length(k)>1 + % error('Bioinfo:fastaread:AmbiguousParameterName',... + % 'Ambiguous parameter name: %s.',pname); + else + switch(k) + case 1 % ignore gaps + ignoreGaps = opttf(pval); + if isempty(ignoreGaps) + error('Bioinfo:fastaread:IgnoreGapsNotLogical',... + '%s must be a logical value, true or false.',... + upper(char(okargs(k)))); + end + case 2 % range + range = pval; + if ~isnumeric(range) || numel(range)> 2 || isempty(range) + error('Bioinfo:fastaread:BadBlockRange',... + 'BlockRange must be numeric scalar or vector with two values.') + end + blockRead = true; + range = sort(range); + case 3 % trimheaders + trimh = opttf(pval, okargs{k}, mfilename); + end + end + end +end + + +if size(filename,1)>1 % is padded string + if blockRead + warning('Bioinfo:fastaread:IgnoredRange',... + 'BLOCKREAD is only allowed when parsing a file.') + end + ftext = cell(size(filename,1),1); + for i=1:size(filename,1) + ftext(i,1)=strtrim(strread(filename(i,:),'%s','whitespace','','delimiter','\n')); + end + % try then if it is an url +elseif (strfind(filename(1:min(10,end)), '://')) + if blockRead + warning('Bioinfo:fastaread:IgnoredRange',... + 'BLOCKREAD is only allowed when parsing a file.') + end + if (~usejava('jvm')) + error('Bioinfo:fastaread:NoJava',... + 'Reading from a URL requires Java.') + end + try + ftext = urlread(filename); + catch allExceptions + error('Bioinfo:fastaread:CannotReadURL',... + 'Cannot read URL "%s".', filename); + end + ftext = strread(ftext,'%s','delimiter','\n'); + + % try then if it is a valid filename +elseif (exist(filename,'file') || exist(fullfile(pwd,filename),'file')) + if blockRead + blockText = getfileblock(filename,range,'>'); + try + ftext = strread(blockText,'%s','delimiter','\n'); + catch theErr + if strcmpi(theErr.identifier,'MATLAB:nomem') + error( 'Bioinfo:fastaread:BlockTooBig',... + 'The block is too large to fit in memory. Try a smaller range.'); + else + rethrow(theErr); + end + end + else %=== read entire file + fid = fopen(filename); + c = onCleanup(@()fclose(fid)); + try + ftext = textscan(fid,'%s','delimiter','\n','bufsize',10000000); + ftext = ftext{:}; + catch theErr + if strcmpi(theErr.identifier,'MATLAB:nomem') + error( 'Bioinfo:fastaread:FileTooBig',... + 'The file is too large to fit in memory. Try using the BLOCKREAD option to read the file in blocks.'); + else + rethrow(theErr); + end + end + end +else % must be a string with '\n', convert to cell + if blockRead + warning('Bioinfo:fastaread:IgnoredRange',... + 'BLOCKREAD is only allowed when parsing a file.') + end + ftext = strread(filename,'%s','delimiter','\n'); +end + +% it is possible that there will be multiple sequences +commentLines = strncmp(ftext,'>',1); + +if ~any(commentLines) + error('Bioinfo:fastaread:FastaNotValid',... + 'Input does not exist or is not a valid FASTA file.') +end + +numSeqs = sum(commentLines); +seqStarts = [find(commentLines); size(ftext,1)+1]; +data(numSeqs,1).Header = ''; + +try + for theSeq = 1:numSeqs + % Check for > symbol ? + data(theSeq).Header = ftext{seqStarts(theSeq)}(2:end); + firstRow = seqStarts(theSeq)+1; + lastRow = seqStarts(theSeq+1)-1; + numChars = cellfun('length',ftext(firstRow:lastRow)); + numSymbols = sum(numChars); + data(theSeq).Sequence = repmat(' ',1,numSymbols); + pos = 1; + for i=firstRow:lastRow, + str = strtrim(ftext{i}); + len = length(str); + if len == 0 + break + end + data(theSeq).Sequence(pos:pos+len-1) = str; + pos = pos+len; + end + data(theSeq).Sequence = strtrim(data(theSeq).Sequence); + if ignoreGaps + data(theSeq).Sequence = strrep(data(theSeq).Sequence,'-',''); + data(theSeq).Sequence = strrep(data(theSeq).Sequence,'.',''); + end + end + + % trim headers + if trimh + for i = 1:numSeqs + data(i).Header = sscanf(data(i).Header,'%s',1); + end + end + + % in case of two outputs + if nargout == 2 + if numSeqs == 1 + seq = data.Sequence; + data = data.Header; + else + seq = {data(:).Sequence}; + data = {data(:).Header}; + end + end + +catch allExceptions + error('Bioinfo:fastaread:IncorrectDataFormat',... + 'Incorrect data format in fasta file') +end + -- cgit v1.2.3