aboutsummaryrefslogtreecommitdiff
path: root/src/nbc/fasta.sml
blob: 808902512403bf37d74f74124158bc4e5a862848 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
signature FILE = sig
	type argument
	type file
	type read
	type nmer
	type result
	val startFile: argument -> file
	val startRead: file * string -> read
	val nmer: file * read * nmer -> unit
	val stopRead: file * read -> unit
	val stopFile: file -> result
	val invalidFormat: file -> result
end

signature FASTA = sig
	type argument
	type result
	val process: argument * TextIO.instream -> result
end

functor AgnosticFasta (
	structure Nmer: NMER
	structure File: FILE
		sharing type Nmer.nmer = File.nmer
	structure Sides: sig
		include NMER_SIDES
		type file
		type read
		val process: file * read * sides -> unit
	end
		sharing type Nmer.base = Sides.sidesBase
		sharing type Nmer.nmer = Sides.sidesNmer
		sharing type File.read = Sides.read
		sharing type File.file = Sides.file
) :> FASTA
	where type argument = File.argument
	where type result = File.result
= struct
	type argument = File.argument
	type result = File.result

	val beforeHeaderBeginningOfLine = ParseState.create ()
	val beforeHeaderMiddleOfLine = ParseState.create ()
	val afterHeaderBeginningOfLine = ParseState.create ()
	val afterHeaderMiddleOfLine = ParseState.create ()

	fun inputLineButDiscardNewline instream =
		Option.map (fn line =>
			String.extract (line, 0, SOME (size line - 1))
		) (TextIO.inputLine instream)
	datatype z = datatype ParseState.whichCharacters (* This | Any *)

	local
		fun header (instream, (file, sides)) =
			case inputLineButDiscardNewline instream of
				NONE => File.invalidFormat file
				| SOME header => ParseState.enter (
					afterHeaderBeginningOfLine
					, instream
					, (
						file
						, File.startRead (
							file
							, header
						), sides
					)
				)
		fun space (instream, (file, sides)) = ParseState.enter (
			beforeHeaderMiddleOfLine
			, instream
			, (file, sides)
		)
		fun newline (instream, (file, sides)) = ParseState.enter (
			beforeHeaderBeginningOfLine
			, instream
			, (file, sides)
		)
		fun invalidFormat (_, (file, _)) = File.invalidFormat file
	in
		val () = ParseState.build {
			state = beforeHeaderBeginningOfLine
			, characters = [
				(These [#">"], header)
				, (These [#"\n"], newline)
				, (These [#" ", #"\t", #"\r"], space)
				, (Any, invalidFormat)
			], endOfFile = invalidFormat
		}
		val () = ParseState.build {
			state = beforeHeaderMiddleOfLine
			, characters = [
				(These [#"\n"], newline)
				, (These [#" ", #"\t", #"\r"], space)
				, (Any, invalidFormat)
			], endOfFile = invalidFormat
		}
	end
	local
		fun base base (instream, (file, read, sides)) = (
			Sides.put (sides, base)
			;
				if Sides.isFull sides then
					Sides.process (file, read, sides)
				else ()
			; ParseState.enter (
				afterHeaderMiddleOfLine
				, instream
				, (file, read, sides)
			)
		)
		fun space (instream, (file, read, sides)) = (
			ParseState.enter (
				afterHeaderMiddleOfLine
				, instream
				, (file, read, sides)
			)
		)
		fun other (instream, (file, read, sides)) = (
			Sides.clear sides
			; ParseState.enter (
				afterHeaderMiddleOfLine
				, instream
				, (file, read, sides)
			)
		)
		fun newline (instream, (file, read, sides)) =
			ParseState.enter (
				afterHeaderBeginningOfLine
				, instream
				, (file, read, sides)
			)
		fun header (instream, (file, read, sides)) = (
			File.stopRead (file, read)
			; Sides.clear sides
			; case inputLineButDiscardNewline instream of
				NONE => File.invalidFormat file
				| SOME header => ParseState.enter (
					afterHeaderBeginningOfLine
					, instream
					, (
						file
						, File.startRead (
							file
							, header
						), sides
					)
				)
		)
		fun success (_, (file, read, _)) = (
			File.stopRead (file, read)
			; File.stopFile file
		)
	in
		val () = ParseState.build {
			state = afterHeaderBeginningOfLine
			, characters = [
				(These [#"A", #"a"], base Nmer.a)
				, (These [#"C", #"c"], base Nmer.c)
				, (These [#"G", #"g"], base Nmer.g)
				, (These [#"T", #"t"], base Nmer.t)
				, (These [#">"], header)
				, (These [#"\n"], newline)
				, (These [#" ", #"\t", #"\r"], space)
				, (Any, other)
			], endOfFile = success
		}
		val () = ParseState.build {
			state = afterHeaderMiddleOfLine
			, characters = [
				(These [#"A", #"a"], base Nmer.a)
				, (These [#"C", #"c"], base Nmer.c)
				, (These [#"G", #"g"], base Nmer.g)
				, (These [#"T", #"t"], base Nmer.t)
				, (These [#" ", #"\t", #"\r"], space)
				, (These [#"\n"], newline)
				, (Any, other)
			], endOfFile = success
		}
	end
	fun process (argument, instream) = ParseState.enter (
		beforeHeaderBeginningOfLine
		, instream
		, (File.startFile argument, Sides.create ())
	)
end

functor SingleSidedFasta (
	structure Nmer: NMER
	structure File: FILE
		sharing type Nmer.nmer = File.nmer
) = AgnosticFasta (
	structure Nmer = Nmer
	structure File = File
	structure Sides = struct
		type read = File.read
		type file = File.file
		open Nmer.Single
		fun process (file, read, sides) =
			File.nmer (file, read, forward sides)
	end
)

functor DoubleSidedFasta (
	structure Nmer: NMER
	structure File: FILE
		sharing type Nmer.nmer = File.nmer
) = AgnosticFasta (
	structure Nmer = Nmer
	structure File = File
	structure Sides = struct
		type read = File.read
		type file = File.file
		open Nmer.Double
		fun process (file, read, sides) = (
			File.nmer (file, read, forward sides)
			; File.nmer (file, read, reverse sides)
		)
	end
)

functor TestFile (Nmer: NMER) = struct
	type argument = unit
	type nmer = Nmer.nmer
	type read = {header: string, nmers: nmer list ref}
	type file = {header: string, nmers: string list} list ref
	type result = string
	fun startFile () = ref nil
	fun stopFile file = String.concatWith ";" (
		map (fn {header, nmers} =>
			header
			^ ":"
			^ String.concatWith "," (rev nmers)
		) (rev (!file))
	)
	fun startRead (_, header) =
		{header = header, nmers = ref nil}
	fun nmer (_, {header = _, nmers}, nmer) =
		nmers := nmer :: !nmers
	fun stopRead (file, {header, nmers = ref nmers}) =
		file := {
			header = header
			, nmers = map Nmer.toString nmers
		} :: !file
	fun invalidFormat _ = "invalid format"
end

functor Test () = struct
	structure Nmer1 = Nmer (
		val order = 1
		structure Word = Word32
	)
	structure File1 = TestFile (Nmer1)
	structure SingleFasta1 = SingleSidedFasta (
		structure Nmer = Nmer1
		structure File = File1
	)
	fun test process input () = process ((), TextIO.openString input)
	val single1 = test SingleFasta1.process
	structure Nmer2 = Nmer (
		val order = 2
		structure Word = Word32
	)
	structure File2 = TestFile (Nmer2)
	structure SingleFasta2 = SingleSidedFasta (
		structure Nmer = Nmer2
		structure File = File2
	)
	val single2 = test SingleFasta2.process
	structure DoubleFasta1 = DoubleSidedFasta (
		structure Nmer = Nmer1
		structure File = File1
	)
	val double1 = test DoubleFasta1.process
	structure DoubleFasta2 = DoubleSidedFasta (
		structure Nmer = Nmer2
		structure File = File2
	)
	val double2 = test DoubleFasta2.process
	val () = Test.list [
		{
			description = "single 1: A"
			, function = single1 ">foo\nA\n"
			, expectedResult = "foo:A"
		}, {
			description = "single 1: AG"
			, function = single1 ">foo\nAG\n"
			, expectedResult = "foo:A,G"
		}, {
			description = "single 2: A"
			, function = single2 ">foo\nA\n"
			, expectedResult = "foo:"
		}, {
			description = "single 2: CTGAG"
			, function = single2 ">foo\nCTGAG\n"
			, expectedResult = "foo:CT,TG,GA,AG"
		}, {
			description = "double 1: C"
			, function = double1 ">bar\nC\n"
			, expectedResult = "bar:C,G"
		}, {
			description = "double 2: T"
			, function = double2 ">baz\nT\n"
			, expectedResult = "baz:"
		}, {
			description = "double 2: GC"
			, function = double2 ">quux\nGC\n"
			, expectedResult = "quux:GC,GC"
		}, {
			description = "double 2: CCC\\nC\\nCT"
			, function = double2 ">goo\nCCC\nC\nCT\n"
			, expectedResult = "goo:CC,GG,CC,GG,CC,GG,CC,GG,CT,AG"
		}, {
			description = "double 2: CC\\nC*\\nT"
			, function = double2 ">goo\nCC\nC*\nT\n"
			, expectedResult = "goo:CC,GG,CC,GG"
		}, {
			description = "double 2: foo CATGAC goo TACCAG"
			, function = double2
				">foo\nCATGAC\n>goo\nTACCAG\n"
			, expectedResult = (
				"foo:CA,TG,AT,AT,TG,CA,GA,TC,AC,GT"
				^ ";goo:TA,TA,AC,GT,CC,GG,CA,TG,AG,CT"
			)
		}
	]
end