aboutsummaryrefslogtreecommitdiff
path: root/src/nbc/count.sml
blob: d0360501099bc8c61d110d207d19c2e00f95a133 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
(* How many sides to count: Single is selected by the "single" option
 * ("only count one side"); Double is the default. *)
datatype sides = Single | Double
(* Output format: Unlabeled is selected by the "unlabeled" option;
 * Labeled is the default. *)
datatype labeled = Labeled | Unlabeled
(* Command-line handling.
 *
 * Parses CommandLine.arguments () with GetOpt and exposes four top-level
 * values:
 *   files   - the non-option arguments (input file names)
 *   order   - the word size, a strictly positive integer (default 15)
 *   sides   - Single or Double (default Double)
 *   labeled - Labeled or Unlabeled (default Labeled)
 *
 * Any parse error, an unparsable word size or a non-positive word size
 * prints a message plus the usage text to stderr and exits with failure.
 * "--help" prints the usage text to stderr and exits with success.
 *)
local
	(*
	val perWord = ref NONE
	val total = ref NONE
	*)
	(* Mutable option state, updated by the GetOpt callbacks below. *)
	val order = ref (SOME 15)
	val sides = ref Double
	val labeled = ref Labeled
	val optionsWithoutHelp = [
		(* {
			short = "w", long = ["per-word"]
			, desc = GetOpt.ReqArg (
				fn file => perWord := SOME file
				, "file"
			), help = "file to store per-word counts in"
		}, {
			short = "t", long = ["total"]
			, desc = GetOpt.ReqArg (
				fn file => total := SOME file
				, "file"
			), help = "file to store total count in"
		}, *) {
			short = "r", long = ["order"]
			, desc = GetOpt.ReqArg (
				(* NONE is stored when the text is not a number; it is
				 * reported as "invalid order" once parsing is done. *)
				fn size => order := Int.fromString size
				, "size"
			), help = "word size"
		}, {
			short = "1", long = ["single"]
			, desc = GetOpt.NoArg (fn () => sides := Single)
			, help = "only count one side"
		}, {
			short = "u", long = ["unlabeled"]
			, desc = GetOpt.NoArg (fn () => labeled := Unlabeled)
			, help = "emit counts for every possible nmer, without labels"
		}
	]
	(* The usage text lists every option except "help" itself. *)
	fun usageString () = GetOpt.usageInfo {
		header = CommandLine.name () ^ " <options> <input FASTA file> ..."
		, options = optionsWithoutHelp
	} ^ "\n"
	datatype status = Success | Failure
	(* Print the usage text to stderr and terminate with the given status;
	 * never returns. *)
	fun displayHelpAndExit status = (
		TextIO.output (
			TextIO.stdErr
			, usageString ()
		); OS.Process.exit (case status of
			Success => OS.Process.success
			| Failure => OS.Process.failure
		)
	)
	(* Report an unusable word size and terminate; never returns. *)
	fun rejectOrder () = (
		TextIO.output (
			TextIO.stdErr
			, "invalid order\n"
		); displayHelpAndExit Failure
	)
	val options = {
		short = "h", long = ["help"]
		, desc = GetOpt.NoArg (fn () => displayHelpAndExit Success)
		, help = "display help"
	} :: optionsWithoutHelp
in
	val (_, files) = GetOpt.getOpt {
		argOrder = GetOpt.Permute
		, options = options
		, errFn = fn errorMessage => (
			TextIO.output (TextIO.stdErr, errorMessage ^ "\n")
			; displayHelpAndExit Failure
		)
	} (CommandLine.arguments ())
	(*
	val perWordFileName = case !perWord of
		NONE => (
			TextIO.output (
				stdErr
				, "per-word file name required but not provided\n"
			); displayHelpAndExit Failure
		) | SOME fileName => fileName
	val totalFileName = case !total of
		NONE => (
			TextIO.output (
				stdErr
				, "total file name required but not provided\n"
			); displayHelpAndExit Failure
		) | SOME fileName => fileName
	*)
	(* Int.fromString happily parses "0" and "~3"/"-3"; a word size below
	 * one is meaningless, so it is rejected like unparsable input. *)
	val order = case !order of
		NONE => rejectOrder ()
		| SOME integer =>
			if integer > 0 then integer
			else rejectOrder ()
	val sides = !sides
	val labeled = !labeled
end

(* A mutable multiset of nmers: each nmer is associated with an
 * occurrence count. *)
signature COLLECTION = sig
	type collection
	type nmer
	(* Create a fresh collection containing no nmers. *)
	val empty: unit -> collection
	(* Record one more occurrence of the nmer (updates the collection in
	 * place). *)
	val add: collection * nmer -> unit
	(* Occurrence count of the nmer; the Collection functor returns 0 for
	 * an nmer that was never added. *)
	val get: collection * nmer -> int
	(* Call the function once per stored (nmer, count) pair. *)
	val app: (nmer * int -> unit) -> collection -> unit
end

(* Hash-table backed implementation of COLLECTION: every nmer seen so far
 * maps to a mutable counter cell. *)
functor Collection (Nmer: NMER)
:> COLLECTION where type nmer = Nmer.nmer = struct
	type nmer = Nmer.nmer
	structure Table = HashTableFn (
		type hash_key = nmer
		val hashVal = Nmer.hash
		val sameKey = Nmer.equal
	)
	type collection = int ref Table.hash_table
	exception NotFound
	(* Size hint handed to the hash table on creation. *)
	val sizeHint = 256 * 1024
	fun empty () = Table.mkTable (sizeHint, NotFound)
	(* Look up the counter cell of a key, if the key has been seen. *)
	fun cellOf (table, key) = Table.find table key
	(* Bump the existing counter, or start a new one at 1. *)
	fun add (table, key) = case cellOf (table, key) of
		SOME cell => cell := !cell + 1
		| NONE => Table.insert table (key, ref 1)
	(* Keys that were never added count as zero. *)
	fun get (table, key) = case cellOf (table, key) of
		SOME cell => !cell
		| NONE => 0
	(* Visit every stored key together with its current count. *)
	fun app execute table =
		let
			fun visit (key, cell) = execute (key, !cell)
		in
			Table.appi visit table
		end
end

(* Outcome of processing one input: the File functor returns Success from
 * stopFile and Failure from invalidFormat. *)
datatype result = Success | Failure

(* A way of emitting the counts held in a collection. *)
signature OUTPUT = sig
	type collection
	(* Emit the contents of the collection; the Unlabeled and Labeled
	 * functors both write to standard output. *)
	val output: collection -> unit
end

(* OUTPUT that prints one bare count per line for every nmer from
 * Nmer.minimum through Nmer.maximum (stepping with Nmer.next), whether or
 * not the nmer occurs in the collection. *)
functor Unlabeled (
	structure Nmer: NMER
	structure Collection: COLLECTION
		sharing type Collection.nmer = Nmer.nmer
) :> OUTPUT
	where type collection = Collection.collection
= struct
	type collection = Collection.collection
	fun output collection =
		let
			(* Print the count of a single nmer on its own line. *)
			fun emit nmer = TextIO.output (
				TextIO.stdOut
				, Int.toString (Collection.get (collection, nmer)) ^ "\n"
			)
			(* Emit the current nmer, then advance unless it was the
			 * last one. *)
			fun walk nmer =
				let
					val () = emit nmer
				in
					if nmer = Nmer.maximum then ()
					else walk (Nmer.next nmer)
				end
		in
			walk Nmer.minimum
		end
end

(* OUTPUT that prints "<nmer> <count>" lines, one per nmer stored in the
 * collection, in the order Collection.app visits them. *)
functor Labeled (
	structure Nmer: NMER
	structure Collection: COLLECTION
		sharing type Collection.nmer = Nmer.nmer
) :> OUTPUT
	where type collection = Collection.collection
= struct
	type collection = Collection.collection
	(* Render one entry as a complete output line. *)
	fun line (nmer, count) =
		Nmer.toString nmer ^ " " ^ Int.toString count ^ "\n"
	fun output collection = Collection.app (
		fn entry => TextIO.output (TextIO.stdOut, line entry)
	) collection
end

(* FILE callbacks that count nmers: a fresh collection is created per file,
 * every reported nmer is added to it, and the collection is written with
 * Output.output when the file ends. Reads carry no state of their own. *)
functor File (
	structure Collection: COLLECTION
	structure Output: OUTPUT
		sharing type Collection.collection = Output.collection
) :> FILE
	where type nmer = Collection.nmer
	where type result = result
	where type argument = unit
= struct
	type argument = unit
	type read = unit
	type nmer = Collection.nmer
	type file = Collection.collection
	type result = result
	(* Per-file state is an empty collection; the argument is ignored. *)
	fun startFile _ = Collection.empty ()
	fun startRead _ = ()
	(* Count one nmer occurrence in the file's collection. *)
	fun nmer (counts, (), word) = Collection.add (counts, word)
	fun stopRead (_, ()) = ()
	(* Write out the accumulated counts and report success. *)
	fun stopFile counts =
		let
			val () = Output.output counts
		in
			Success
		end
	(* Malformed input is reported as Failure; nothing is written. *)
	fun invalidFormat _ = Failure
end

(* Builds, for one Nmer representation, the four FASTA processors:
 * {single, double}-sided x {unlabeled, labeled} output. *)
functor Everything (Nmer: NMER) = struct
	structure Collection = Collection (Nmer)
	(* File callbacks that print bare counts. *)
	local
		structure Writer = Unlabeled (
			structure Nmer = Nmer
			structure Collection = Collection
		)
	in
		structure Unlabeled = File (
			structure Collection = Collection
			structure Output = Writer
		)
	end
	(* File callbacks that print "<nmer> <count>" lines. *)
	local
		structure Writer = Labeled (
			structure Nmer = Nmer
			structure Collection = Collection
		)
	in
		structure Labeled = File (
			structure Collection = Collection
			structure Output = Writer
		)
	end
	structure SingleSidedUnlabeled = SingleSidedFasta (
		structure Nmer = Nmer
		structure File = Unlabeled
	)
	structure DoubleSidedUnlabeled = DoubleSidedFasta (
		structure Nmer = Nmer
		structure File = Unlabeled
	)
	structure SingleSidedLabeled = SingleSidedFasta (
		structure Nmer = Nmer
		structure File = Labeled
	)
	structure DoubleSidedLabeled = DoubleSidedFasta (
		structure Nmer = Nmer
		structure File = Labeled
	)
end

(* The processors instantiated for the requested order, once over 32-bit
 * words and once over 64-bit words. *)
local
	structure Nmer32 = Nmer (
		val order = order
		structure Word = Word32
	)
in
	structure Everything32 = Everything (Nmer32)
end
local
	structure Nmer64 = Nmer (
		val order = order
		structure Word = Word64
	)
in
	structure Everything64 = Everything (Nmer64)
end

(* Pick the processor matching the command-line options: the Word32-based
 * structures for orders up to 32, the Word64-based ones otherwise, then
 * the sidedness and labeling variants.
 *
 * NOTE(review): whether a 32-bit word can really hold every nmer of order
 * up to 32 depends on the Nmer functor, which is not visible here; confirm
 * the threshold against its encoding. *)
val process =
	case (order <= 32, sides, labeled) of
		(true, Single, Unlabeled) =>
			Everything32.SingleSidedUnlabeled.process
		| (true, Single, Labeled) =>
			Everything32.SingleSidedLabeled.process
		| (true, Double, Unlabeled) =>
			Everything32.DoubleSidedUnlabeled.process
		| (true, Double, Labeled) =>
			Everything32.DoubleSidedLabeled.process
		| (false, Single, Unlabeled) =>
			Everything64.SingleSidedUnlabeled.process
		| (false, Single, Labeled) =>
			Everything64.SingleSidedLabeled.process
		| (false, Double, Unlabeled) =>
			Everything64.DoubleSidedUnlabeled.process
		| (false, Double, Labeled) =>
			Everything64.DoubleSidedLabeled.process

(* Program entry point: process each input file in order, stopping at the
 * first one that fails, and exit with failure status if any did.
 *
 * A file that cannot be opened is reported on stderr and treated as a
 * failure instead of escaping as an uncaught IO.Io exception. The input
 * stream is closed even when processing raises. *)
val () =
	let
		(* Print "<name>: <message>" on stderr. *)
		fun complain (name, message) = TextIO.output (
			TextIO.stdErr
			, name ^ ": " ^ message ^ "\n"
		)
		(* Open the file, or report why it could not be opened. *)
		fun openInput name = SOME (TextIO.openIn name)
			handle IO.Io {cause, ...} => (
				complain (name, exnMessage cause)
				; NONE
			)
		(* Process one file; true exactly when it was read successfully. *)
		fun one name = case openInput name of
			NONE => false
			| SOME instream =>
				let
					(* Do not leak the stream if processing raises. *)
					val result = process ((), instream)
						handle problem => (
							TextIO.closeIn instream
							; raise problem
						)
				in
					TextIO.closeIn instream
					; case result of
						Success => true
						| Failure => (
							complain (name, "invalid format")
							; false
						)
				end
		fun all names = List.all one names
	in
		if all files then ()
		else OS.Process.exit OS.Process.failure
	end