summaryrefslogtreecommitdiff
path: root/src/nbc/fasta.sml
diff options
context:
space:
mode:
Diffstat (limited to 'src/nbc/fasta.sml')
-rw-r--r--src/nbc/fasta.sml326
1 files changed, 326 insertions, 0 deletions
diff --git a/src/nbc/fasta.sml b/src/nbc/fasta.sml
new file mode 100644
index 0000000..8089025
--- /dev/null
+++ b/src/nbc/fasta.sml
@@ -0,0 +1,326 @@
+signature FILE = sig
+ type argument
+ type file
+ type read
+ type nmer
+ type result
+ val startFile: argument -> file
+ val startRead: file * string -> read
+ val nmer: file * read * nmer -> unit
+ val stopRead: file * read -> unit
+ val stopFile: file -> result
+ val invalidFormat: file -> result
+end
+
+signature FASTA = sig
+ type argument
+ type result
+ val process: argument * TextIO.instream -> result
+end
+
+functor AgnosticFasta (
+ structure Nmer: NMER
+ structure File: FILE
+ sharing type Nmer.nmer = File.nmer
+ structure Sides: sig
+ include NMER_SIDES
+ type file
+ type read
+ val process: file * read * sides -> unit
+ end
+ sharing type Nmer.base = Sides.sidesBase
+ sharing type Nmer.nmer = Sides.sidesNmer
+ sharing type File.read = Sides.read
+ sharing type File.file = Sides.file
+) :> FASTA
+ where type argument = File.argument
+ where type result = File.result
+= struct
+ type argument = File.argument
+ type result = File.result
+
+ val beforeHeaderBeginningOfLine = ParseState.create ()
+ val beforeHeaderMiddleOfLine = ParseState.create ()
+ val afterHeaderBeginningOfLine = ParseState.create ()
+ val afterHeaderMiddleOfLine = ParseState.create ()
+
+ fun inputLineButDiscardNewline instream =
+ Option.map (fn line =>
+ String.extract (line, 0, SOME (size line - 1))
+ ) (TextIO.inputLine instream)
+ datatype z = datatype ParseState.whichCharacters (* This | Any *)
+
+ local
+ fun header (instream, (file, sides)) =
+ case inputLineButDiscardNewline instream of
+ NONE => File.invalidFormat file
+ | SOME header => ParseState.enter (
+ afterHeaderBeginningOfLine
+ , instream
+ , (
+ file
+ , File.startRead (
+ file
+ , header
+ ), sides
+ )
+ )
+ fun space (instream, (file, sides)) = ParseState.enter (
+ beforeHeaderMiddleOfLine
+ , instream
+ , (file, sides)
+ )
+ fun newline (instream, (file, sides)) = ParseState.enter (
+ beforeHeaderBeginningOfLine
+ , instream
+ , (file, sides)
+ )
+ fun invalidFormat (_, (file, _)) = File.invalidFormat file
+ in
+ val () = ParseState.build {
+ state = beforeHeaderBeginningOfLine
+ , characters = [
+ (These [#">"], header)
+ , (These [#"\n"], newline)
+ , (These [#" ", #"\t", #"\r"], space)
+ , (Any, invalidFormat)
+ ], endOfFile = invalidFormat
+ }
+ val () = ParseState.build {
+ state = beforeHeaderMiddleOfLine
+ , characters = [
+ (These [#"\n"], newline)
+ , (These [#" ", #"\t", #"\r"], space)
+ , (Any, invalidFormat)
+ ], endOfFile = invalidFormat
+ }
+ end
+ local
+ fun base base (instream, (file, read, sides)) = (
+ Sides.put (sides, base)
+ ;
+ if Sides.isFull sides then
+ Sides.process (file, read, sides)
+ else ()
+ ; ParseState.enter (
+ afterHeaderMiddleOfLine
+ , instream
+ , (file, read, sides)
+ )
+ )
+ fun space (instream, (file, read, sides)) = (
+ ParseState.enter (
+ afterHeaderMiddleOfLine
+ , instream
+ , (file, read, sides)
+ )
+ )
+ fun other (instream, (file, read, sides)) = (
+ Sides.clear sides
+ ; ParseState.enter (
+ afterHeaderMiddleOfLine
+ , instream
+ , (file, read, sides)
+ )
+ )
+ fun newline (instream, (file, read, sides)) =
+ ParseState.enter (
+ afterHeaderBeginningOfLine
+ , instream
+ , (file, read, sides)
+ )
+ fun header (instream, (file, read, sides)) = (
+ File.stopRead (file, read)
+ ; Sides.clear sides
+ ; case inputLineButDiscardNewline instream of
+ NONE => File.invalidFormat file
+ | SOME header => ParseState.enter (
+ afterHeaderBeginningOfLine
+ , instream
+ , (
+ file
+ , File.startRead (
+ file
+ , header
+ ), sides
+ )
+ )
+ )
+ fun success (_, (file, read, _)) = (
+ File.stopRead (file, read)
+ ; File.stopFile file
+ )
+ in
+ val () = ParseState.build {
+ state = afterHeaderBeginningOfLine
+ , characters = [
+ (These [#"A", #"a"], base Nmer.a)
+ , (These [#"C", #"c"], base Nmer.c)
+ , (These [#"G", #"g"], base Nmer.g)
+ , (These [#"T", #"t"], base Nmer.t)
+ , (These [#">"], header)
+ , (These [#"\n"], newline)
+ , (These [#" ", #"\t", #"\r"], space)
+ , (Any, other)
+ ], endOfFile = success
+ }
+ val () = ParseState.build {
+ state = afterHeaderMiddleOfLine
+ , characters = [
+ (These [#"A", #"a"], base Nmer.a)
+ , (These [#"C", #"c"], base Nmer.c)
+ , (These [#"G", #"g"], base Nmer.g)
+ , (These [#"T", #"t"], base Nmer.t)
+ , (These [#" ", #"\t", #"\r"], space)
+ , (These [#"\n"], newline)
+ , (Any, other)
+ ], endOfFile = success
+ }
+ end
+ fun process (argument, instream) = ParseState.enter (
+ beforeHeaderBeginningOfLine
+ , instream
+ , (File.startFile argument, Sides.create ())
+ )
+end
+
+functor SingleSidedFasta (
+ structure Nmer: NMER
+ structure File: FILE
+ sharing type Nmer.nmer = File.nmer
+) = AgnosticFasta (
+ structure Nmer = Nmer
+ structure File = File
+ structure Sides = struct
+ type read = File.read
+ type file = File.file
+ open Nmer.Single
+ fun process (file, read, sides) =
+ File.nmer (file, read, forward sides)
+ end
+)
+
+functor DoubleSidedFasta (
+ structure Nmer: NMER
+ structure File: FILE
+ sharing type Nmer.nmer = File.nmer
+) = AgnosticFasta (
+ structure Nmer = Nmer
+ structure File = File
+ structure Sides = struct
+ type read = File.read
+ type file = File.file
+ open Nmer.Double
+ fun process (file, read, sides) = (
+ File.nmer (file, read, forward sides)
+ ; File.nmer (file, read, reverse sides)
+ )
+ end
+)
+
+functor TestFile (Nmer: NMER) = struct
+ type argument = unit
+ type nmer = Nmer.nmer
+ type read = {header: string, nmers: nmer list ref}
+ type file = {header: string, nmers: string list} list ref
+ type result = string
+ fun startFile () = ref nil
+ fun stopFile file = String.concatWith ";" (
+ map (fn {header, nmers} =>
+ header
+ ^ ":"
+ ^ String.concatWith "," (rev nmers)
+ ) (rev (!file))
+ )
+ fun startRead (_, header) =
+ {header = header, nmers = ref nil}
+ fun nmer (_, {header = _, nmers}, nmer) =
+ nmers := nmer :: !nmers
+ fun stopRead (file, {header, nmers = ref nmers}) =
+ file := {
+ header = header
+ , nmers = map Nmer.toString nmers
+ } :: !file
+ fun invalidFormat _ = "invalid format"
+end
+
+functor Test () = struct
+ structure Nmer1 = Nmer (
+ val order = 1
+ structure Word = Word32
+ )
+ structure File1 = TestFile (Nmer1)
+ structure SingleFasta1 = SingleSidedFasta (
+ structure Nmer = Nmer1
+ structure File = File1
+ )
+ fun test process input () = process ((), TextIO.openString input)
+ val single1 = test SingleFasta1.process
+ structure Nmer2 = Nmer (
+ val order = 2
+ structure Word = Word32
+ )
+ structure File2 = TestFile (Nmer2)
+ structure SingleFasta2 = SingleSidedFasta (
+ structure Nmer = Nmer2
+ structure File = File2
+ )
+ val single2 = test SingleFasta2.process
+ structure DoubleFasta1 = DoubleSidedFasta (
+ structure Nmer = Nmer1
+ structure File = File1
+ )
+ val double1 = test DoubleFasta1.process
+ structure DoubleFasta2 = DoubleSidedFasta (
+ structure Nmer = Nmer2
+ structure File = File2
+ )
+ val double2 = test DoubleFasta2.process
+ val () = Test.list [
+ {
+ description = "single 1: A"
+ , function = single1 ">foo\nA\n"
+ , expectedResult = "foo:A"
+ }, {
+ description = "single 1: AG"
+ , function = single1 ">foo\nAG\n"
+ , expectedResult = "foo:A,G"
+ }, {
+ description = "single 2: A"
+ , function = single2 ">foo\nA\n"
+ , expectedResult = "foo:"
+ }, {
+ description = "single 2: CTGAG"
+ , function = single2 ">foo\nCTGAG\n"
+ , expectedResult = "foo:CT,TG,GA,AG"
+ }, {
+ description = "double 1: C"
+ , function = double1 ">bar\nC\n"
+ , expectedResult = "bar:C,G"
+ }, {
+ description = "double 2: T"
+ , function = double2 ">baz\nT\n"
+ , expectedResult = "baz:"
+ }, {
+ description = "double 2: GC"
+ , function = double2 ">quux\nGC\n"
+ , expectedResult = "quux:GC,GC"
+ }, {
+ description = "double 2: CCC\\nC\\nCT"
+ , function = double2 ">goo\nCCC\nC\nCT\n"
+ , expectedResult = "goo:CC,GG,CC,GG,CC,GG,CC,GG,CT,AG"
+ }, {
+ description = "double 2: CC\\nC*\\nT"
+ , function = double2 ">goo\nCC\nC*\nT\n"
+ , expectedResult = "goo:CC,GG,CC,GG"
+ }, {
+ description = "double 2: foo CATGAC goo TACCAG"
+ , function = double2
+ ">foo\nCATGAC\n>goo\nTACCAG\n"
+ , expectedResult = (
+ "foo:CA,TG,AT,AT,TG,CA,GA,TC,AC,GT"
+ ^ ";goo:TA,TA,AC,GT,CC,GG,CA,TG,AG,CT"
+ )
+ }
+ ]
+end