From b632667ce57af89691407bb8668e1512775278ae Mon Sep 17 00:00:00 2001 From: Calvin Date: Fri, 15 Mar 2013 15:26:20 -0400 Subject: nbc added --- src/nbc/fasta.sml | 326 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 326 insertions(+) create mode 100644 src/nbc/fasta.sml (limited to 'src/nbc/fasta.sml') diff --git a/src/nbc/fasta.sml b/src/nbc/fasta.sml new file mode 100644 index 0000000..8089025 --- /dev/null +++ b/src/nbc/fasta.sml @@ -0,0 +1,326 @@ +signature FILE = sig + type argument + type file + type read + type nmer + type result + val startFile: argument -> file + val startRead: file * string -> read + val nmer: file * read * nmer -> unit + val stopRead: file * read -> unit + val stopFile: file -> result + val invalidFormat: file -> result +end + +signature FASTA = sig + type argument + type result + val process: argument * TextIO.instream -> result +end + +functor AgnosticFasta ( + structure Nmer: NMER + structure File: FILE + sharing type Nmer.nmer = File.nmer + structure Sides: sig + include NMER_SIDES + type file + type read + val process: file * read * sides -> unit + end + sharing type Nmer.base = Sides.sidesBase + sharing type Nmer.nmer = Sides.sidesNmer + sharing type File.read = Sides.read + sharing type File.file = Sides.file +) :> FASTA + where type argument = File.argument + where type result = File.result += struct + type argument = File.argument + type result = File.result + + val beforeHeaderBeginningOfLine = ParseState.create () + val beforeHeaderMiddleOfLine = ParseState.create () + val afterHeaderBeginningOfLine = ParseState.create () + val afterHeaderMiddleOfLine = ParseState.create () + + fun inputLineButDiscardNewline instream = + Option.map (fn line => + String.extract (line, 0, SOME (size line - 1)) + ) (TextIO.inputLine instream) + datatype z = datatype ParseState.whichCharacters (* This | Any *) + + local + fun header (instream, (file, sides)) = + case inputLineButDiscardNewline instream of + NONE => File.invalidFormat file + | SOME header => ParseState.enter ( + afterHeaderBeginningOfLine + , instream + , ( + file + , File.startRead ( + file + , header + ), sides + ) + ) + fun space (instream, (file, sides)) = ParseState.enter ( + beforeHeaderMiddleOfLine + , instream + , (file, sides) + ) + fun newline (instream, (file, sides)) = ParseState.enter ( + beforeHeaderBeginningOfLine + , instream + , (file, sides) + ) + fun invalidFormat (_, (file, _)) = File.invalidFormat file + in + val () = ParseState.build { + state = beforeHeaderBeginningOfLine + , characters = [ + (These [#">"], header) + , (These [#"\n"], newline) + , (These [#" ", #"\t", #"\r"], space) + , (Any, invalidFormat) + ], endOfFile = invalidFormat + } + val () = ParseState.build { + state = beforeHeaderMiddleOfLine + , characters = [ + (These [#"\n"], newline) + , (These [#" ", #"\t", #"\r"], space) + , (Any, invalidFormat) + ], endOfFile = invalidFormat + } + end + local + fun base base (instream, (file, read, sides)) = ( + Sides.put (sides, base) + ; + if Sides.isFull sides then + Sides.process (file, read, sides) + else () + ; ParseState.enter ( + afterHeaderMiddleOfLine + , instream + , (file, read, sides) + ) + ) + fun space (instream, (file, read, sides)) = ( + ParseState.enter ( + afterHeaderMiddleOfLine + , instream + , (file, read, sides) + ) + ) + fun other (instream, (file, read, sides)) = ( + Sides.clear sides + ; ParseState.enter ( + afterHeaderMiddleOfLine + , instream + , (file, read, sides) + ) + ) + fun newline (instream, (file, read, sides)) = + ParseState.enter ( + afterHeaderBeginningOfLine + , instream + , (file, read, sides) + ) + fun header (instream, (file, read, sides)) = ( + File.stopRead (file, read) + ; Sides.clear sides + ; case inputLineButDiscardNewline instream of + NONE => File.invalidFormat file + | SOME header => ParseState.enter ( + afterHeaderBeginningOfLine + , instream + , ( + file + , File.startRead ( + file + , header + ), sides + ) + ) + ) + fun success (_, (file, read, _)) = ( + File.stopRead (file, read) + ; File.stopFile file + ) + in + val () = ParseState.build { + state = afterHeaderBeginningOfLine + , characters = [ + (These [#"A", #"a"], base Nmer.a) + , (These [#"C", #"c"], base Nmer.c) + , (These [#"G", #"g"], base Nmer.g) + , (These [#"T", #"t"], base Nmer.t) + , (These [#">"], header) + , (These [#"\n"], newline) + , (These [#" ", #"\t", #"\r"], space) + , (Any, other) + ], endOfFile = success + } + val () = ParseState.build { + state = afterHeaderMiddleOfLine + , characters = [ + (These [#"A", #"a"], base Nmer.a) + , (These [#"C", #"c"], base Nmer.c) + , (These [#"G", #"g"], base Nmer.g) + , (These [#"T", #"t"], base Nmer.t) + , (These [#" ", #"\t", #"\r"], space) + , (These [#"\n"], newline) + , (Any, other) + ], endOfFile = success + } + end + fun process (argument, instream) = ParseState.enter ( + beforeHeaderBeginningOfLine + , instream + , (File.startFile argument, Sides.create ()) + ) +end + +functor SingleSidedFasta ( + structure Nmer: NMER + structure File: FILE + sharing type Nmer.nmer = File.nmer +) = AgnosticFasta ( + structure Nmer = Nmer + structure File = File + structure Sides = struct + type read = File.read + type file = File.file + open Nmer.Single + fun process (file, read, sides) = + File.nmer (file, read, forward sides) + end +) + +functor DoubleSidedFasta ( + structure Nmer: NMER + structure File: FILE + sharing type Nmer.nmer = File.nmer +) = AgnosticFasta ( + structure Nmer = Nmer + structure File = File + structure Sides = struct + type read = File.read + type file = File.file + open Nmer.Double + fun process (file, read, sides) = ( + File.nmer (file, read, forward sides) + ; File.nmer (file, read, reverse sides) + ) + end +) + +functor TestFile (Nmer: NMER) = struct + type argument = unit + type nmer = Nmer.nmer + type read = {header: string, nmers: nmer list ref} + type file = {header: string, nmers: string list} list ref + type result = string + fun startFile () = ref nil + fun stopFile file = String.concatWith ";" ( + map (fn {header, nmers} => + header + ^ ":" + ^ String.concatWith "," (rev nmers) + ) (rev (!file)) + ) + fun startRead (_, header) = + {header = header, nmers = ref nil} + fun nmer (_, {header = _, nmers}, nmer) = + nmers := nmer :: !nmers + fun stopRead (file, {header, nmers = ref nmers}) = + file := { + header = header + , nmers = map Nmer.toString nmers + } :: !file + fun invalidFormat _ = "invalid format" +end + +functor Test () = struct + structure Nmer1 = Nmer ( + val order = 1 + structure Word = Word32 + ) + structure File1 = TestFile (Nmer1) + structure SingleFasta1 = SingleSidedFasta ( + structure Nmer = Nmer1 + structure File = File1 + ) + fun test process input () = process ((), TextIO.openString input) + val single1 = test SingleFasta1.process + structure Nmer2 = Nmer ( + val order = 2 + structure Word = Word32 + ) + structure File2 = TestFile (Nmer2) + structure SingleFasta2 = SingleSidedFasta ( + structure Nmer = Nmer2 + structure File = File2 + ) + val single2 = test SingleFasta2.process + structure DoubleFasta1 = DoubleSidedFasta ( + structure Nmer = Nmer1 + structure File = File1 + ) + val double1 = test DoubleFasta1.process + structure DoubleFasta2 = DoubleSidedFasta ( + structure Nmer = Nmer2 + structure File = File2 + ) + val double2 = test DoubleFasta2.process + val () = Test.list [ + { + description = "single 1: A" + , function = single1 ">foo\nA\n" + , expectedResult = "foo:A" + }, { + description = "single 1: AG" + , function = single1 ">foo\nAG\n" + , expectedResult = "foo:A,G" + }, { + description = "single 2: A" + , function = single2 ">foo\nA\n" + , expectedResult = "foo:" + }, { + description = "single 2: CTGAG" + , function = single2 ">foo\nCTGAG\n" + , expectedResult = "foo:CT,TG,GA,AG" + }, { + description = "double 1: C" + , function = double1 ">bar\nC\n" + , expectedResult = "bar:C,G" + }, { + description = "double 2: T" + , function = double2 ">baz\nT\n" + , expectedResult = "baz:" + }, { + description = "double 2: GC" + , function = double2 ">quux\nGC\n" + , expectedResult = "quux:GC,GC" + }, { + description = "double 2: CCC\\nC\\nCT" + , function = double2 ">goo\nCCC\nC\nCT\n" + , expectedResult = "goo:CC,GG,CC,GG,CC,GG,CC,GG,CT,AG" + }, { + description = "double 2: CC\\nC*\\nT" + , function = double2 ">goo\nCC\nC*\nT\n" + , expectedResult = "goo:CC,GG,CC,GG" + }, { + description = "double 2: foo CATGAC goo TACCAG" + , function = double2 + ">foo\nCATGAC\n>goo\nTACCAG\n" + , expectedResult = ( + "foo:CA,TG,AT,AT,TG,CA,GA,TC,AC,GT" + ^ ";goo:TA,TA,AC,GT,CC,GG,CA,TG,AG,CT" + ) + } + ] +end -- cgit v1.2.3