1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
|
(* How many sides of the input to count.  The default is Double; the
   -1/--single option ("only count one side") selects Single. *)
datatype sides = Single | Double
(* Output format.  The default is Labeled; the -u/--unlabeled option
   ("emit counts for every possible nmer, without labels") selects
   Unlabeled. *)
datatype labeled = Labeled | Unlabeled
(*
   Command-line handling.

   Everything between [local] and [in] is private to this block.  The block
   exports four top-level values:
     files   - the non-option arguments, in the order GetOpt returns them
     order   - the word size (defaults to 15)
     sides   - Single or Double (defaults to Double)
     labeled - Labeled or Unlabeled (defaults to Labeled)

   Any option error, or an unusable --order value, prints a message and the
   usage text on stderr and exits with a failure status.  -h/--help prints
   the usage text on stderr and exits successfully.
*)
local
   (*
   val perWord = ref NONE
   val total = ref NONE
   *)
   (* Mutable option state, pre-loaded with the defaults.  [order] holds
      NONE once the user has supplied a value that could not be accepted. *)
   val order = ref (SOME 15)
   val sides = ref Double
   val labeled = ref Labeled

   (* Strictly parse a word size: the whole argument must be a decimal
      integer and it must be positive.  Int.fromString alone would accept
      "15abc" as 15 and would let 0 or negative sizes through; a value too
      large for int raises Overflow, which is treated as invalid too. *)
   fun parseOrder text =
      (
         case Int.scan StringCvt.DEC Substring.getc (Substring.full text) of
            SOME (value, rest) =>
               if value > 0 andalso Substring.isEmpty rest then SOME value
               else NONE
          | NONE => NONE
      ) handle Overflow => NONE

   val optionsWithoutHelp = [
      (* {
      short = "w", long = ["per-word"]
      , desc = GetOpt.ReqArg (
      fn file => perWord := SOME file
      , "file"
      ), help = "file to store per-word counts in"
      }, {
      short = "t", long = ["total"]
      , desc = GetOpt.ReqArg (
      fn file => total := SOME file
      , "file"
      ), help = "file to store total count in"
      }, *) {
         short = "r", long = ["order"]
         , desc = GetOpt.ReqArg (
            fn size => order := parseOrder size
            , "size"
         ), help = "word size"
      }, {
         short = "1", long = ["single"]
         , desc = GetOpt.NoArg (fn () => sides := Single)
         , help = "only count one side"
      }, {
         short = "u", long = ["unlabeled"]
         , desc = GetOpt.NoArg (fn () => labeled := Unlabeled)
         , help = "emit counts for every possible nmer, without labels"
      }
   ]

   (* The usage text is built from the options without --help, so the help
      option itself is not listed in it. *)
   fun usageString () = GetOpt.usageInfo {
      header = CommandLine.name () ^ " <options> <input FASTA file> ..."
      , options = optionsWithoutHelp
   } ^ "\n"

   (* Private to this block; it is distinct from any top-level datatype that
      happens to use the same constructor names. *)
   datatype status = Success | Failure

   (* Print the usage text on stderr and terminate with the given status.
      Never returns, so its result type is fully polymorphic. *)
   fun displayHelpAndExit status = (
      TextIO.output (
         TextIO.stdErr
         , usageString ()
      ); OS.Process.exit (case status of
         Success => OS.Process.success
         | Failure => OS.Process.failure
      )
   )

   val options = {
      short = "h", long = ["help"]
      , desc = GetOpt.NoArg (fn () => displayHelpAndExit Success)
      , help = "display help"
   } :: optionsWithoutHelp
in
   (* Running getOpt performs the option actions above as a side effect;
      only the leftover (non-option) arguments are kept. *)
   val (_, files) = GetOpt.getOpt {
      argOrder = GetOpt.Permute
      , options = options
      , errFn = fn errorMessage => (
         TextIO.output (TextIO.stdErr, errorMessage ^ "\n")
         ; displayHelpAndExit Failure
      )
   } (CommandLine.arguments ())
   (*
   val perWordFileName = case !perWord of
   NONE => (
   TextIO.output (
   stdErr
   , "per-word file name required but not provided\n"
   ); displayHelpAndExit Failure
   ) | SOME fileName => fileName
   val totalFileName = case !total of
   NONE => (
   TextIO.output (
   stdErr
   , "total file name required but not provided\n"
   ); displayHelpAndExit Failure
   ) | SOME fileName => fileName
   *)
   (* From here on the names refer to plain values, shadowing the refs. *)
   val order = case !order of
      NONE => (
         TextIO.output (
            TextIO.stdErr
            , "invalid order\n"
         ); displayHelpAndExit Failure
      ) | SOME integer => integer
   val sides = !sides
   val labeled = !labeled
end
(* A mutable collection of per-nmer counts. *)
signature COLLECTION = sig
(* The container holding the counts. *)
type collection
(* The key type being counted. *)
type nmer
(* Create a fresh collection. *)
val empty: unit -> collection
(* Record one occurrence of the nmer; updates the collection in place. *)
val add: collection * nmer -> unit
(* Look up the count recorded for the nmer. *)
val get: collection * nmer -> int
(* Apply the function to each (nmer, count) pair held in the collection. *)
val app: (nmer * int -> unit) -> collection -> unit
end
(*
   Hash-table implementation of COLLECTION.  Each nmer that has been added
   maps to a mutable counter cell; nmers that were never added have no
   entry and are reported as 0 by [get].
*)
functor Collection (Nmer: NMER)
   :> COLLECTION where type nmer = Nmer.nmer =
struct
   type nmer = Nmer.nmer

   (* Hashing and equality on keys are delegated to the Nmer structure. *)
   structure Table = HashTableFn (
      type hash_key = nmer
      val hashVal = Nmer.hash
      val sameKey = Nmer.equal
   )

   type collection = int ref Table.hash_table

   (* Exception required by mkTable; this functor only uses the
      option-returning [find], and never handles NotFound itself. *)
   exception NotFound

   (* Start from a large size hint. *)
   fun empty () = Table.mkTable (256 * 1024, NotFound)

   (* Bump the existing cell, or create one holding 1 on first sight. *)
   fun add (table, key) =
      case Table.find table key of
         SOME cell => cell := !cell + 1
       | NONE => Table.insert table (key, ref 1)

   (* Absent keys count as zero. *)
   fun get (table, key) =
      case Table.find table key of
         SOME cell => !cell
       | NONE => 0

   (* Visit every stored entry, handing the visitor the plain count rather
      than the cell. *)
   fun app visit table =
      Table.appi (fn (key, cell) => visit (key, !cell)) table
end
(* Outcome of processing one input: File.stopFile yields Success and
   File.invalidFormat yields Failure. *)
datatype result = Success | Failure
(* A way of writing out a finished collection of counts. *)
signature OUTPUT = sig
(* The collection type this writer accepts. *)
type collection
(* Write the contents of the collection. *)
val output: collection -> unit
end
(*
   OUTPUT that prints one bare count per line on stdout, for every nmer
   from Nmer.minimum through Nmer.maximum (inclusive) in Nmer.next order,
   including nmers that were never seen (the collection's [get] supplies
   their count).
*)
functor Unlabeled (
   structure Nmer: NMER
   structure Collection: COLLECTION
   sharing type Collection.nmer = Nmer.nmer
) :> OUTPUT
   where type collection = Collection.collection
= struct
   type collection = Collection.collection

   (* Print a single count followed by a newline. *)
   fun emitCount count =
      TextIO.output (TextIO.stdOut, Int.toString count ^ "\n")

   fun output collection =
      let
         (* Emit the current nmer first, then advance; the maximum is
            therefore printed before the walk stops. *)
         fun walk current =
            let
               val () = emitCount (Collection.get (collection, current))
            in
               if current <> Nmer.maximum then walk (Nmer.next current)
               else ()
            end
      in
         walk Nmer.minimum
      end
end
(*
   OUTPUT that prints "<nmer> <count>" lines on stdout, one for each entry
   the collection's [app] visits, in whatever order [app] visits them.
*)
functor Labeled (
   structure Nmer: NMER
   structure Collection: COLLECTION
   sharing type Collection.nmer = Nmer.nmer
) :> OUTPUT
   where type collection = Collection.collection
= struct
   type collection = Collection.collection

   (* Format one entry as the nmer's text, a space, the count, a newline. *)
   fun emitEntry (nmer, count) =
      TextIO.output (
         TextIO.stdOut
         , String.concat [Nmer.toString nmer, " ", Int.toString count, "\n"]
      )

   fun output counts = Collection.app emitEntry counts
end
(*
   FILE implementation that counts nmers: a fresh collection is created
   when a file starts, every nmer reported is added to it, and the whole
   collection is written with Output.output when the file ends.  Reads
   carry no state of their own.
*)
functor File (
   structure Collection: COLLECTION
   structure Output: OUTPUT
   sharing type Collection.collection = Output.collection
) :> FILE
   where type nmer = Collection.nmer
   where type result = result
   where type argument = unit
= struct
   type nmer = Collection.nmer
   type argument = unit
   type read = unit
   (* The per-file state is simply the collection of counts. *)
   type file = Collection.collection
   type result = result

   fun startFile _ = Collection.empty ()

   fun startRead _ = ()

   fun nmer (counts, (), word) = Collection.add (counts, word)

   fun stopRead (_, ()) = ()

   (* Write the counts first, then report success. *)
   fun stopFile counts =
      let
         val () = Output.output counts
      in
         Success
      end

   (* Nothing is written for an input reported as malformed. *)
   fun invalidFormat _ = Failure
end
(*
Builds all four processing variants (single/double sided, labeled/unlabeled)
for one Nmer representation.  All variants share the same Collection
structure.  SingleSidedFasta and DoubleSidedFasta are not defined in this
part of the file -- presumably they are provided elsewhere in the project.
*)
functor Everything (Nmer: NMER) = struct
structure Collection = Collection (Nmer)
(* File handler that writes bare counts for every possible nmer. *)
structure Unlabeled = File (
structure Collection = Collection
structure Output = Unlabeled (
structure Nmer = Nmer
structure Collection = Collection
)
)
(* File handler that writes "<nmer> <count>" lines. *)
structure Labeled = File (
structure Collection = Collection
structure Output = Labeled (
structure Nmer = Nmer
structure Collection = Collection
)
)
(* The four sided-ness x output-format combinations. *)
structure SingleSidedUnlabeled = SingleSidedFasta (
structure Nmer = Nmer
structure File = Unlabeled
)
structure DoubleSidedUnlabeled = DoubleSidedFasta (
structure Nmer = Nmer
structure File = Unlabeled
)
structure SingleSidedLabeled = SingleSidedFasta (
structure Nmer = Nmer
structure File = Labeled
)
structure DoubleSidedLabeled = DoubleSidedFasta (
structure Nmer = Nmer
structure File = Labeled
)
end
(* All processing variants for the command-line order, with Word32 passed
   to Nmer as its Word structure. *)
structure Everything32 = Everything (
Nmer (
val order = order
structure Word = Word32
)
)
(* The same variants with Word64 passed to Nmer instead.  Both structures
   are always instantiated; which one is used is decided where [process]
   is chosen. *)
structure Everything64 = Everything (
Nmer (
val order = order
structure Word = Word64
)
)
(*
   Pick the processing function matching the command-line settings:
   the Word32-based structures when order <= 32 and the Word64-based ones
   otherwise, then single/double sided, then labeled/unlabeled output.

   NOTE(review): if Nmer packs two bits per base, a Word32 only has room
   for 16 bases -- confirm that 32 is the intended cutoff here.
*)
val process =
   case (order <= 32, sides, labeled) of
      (true, Single, Unlabeled) => Everything32.SingleSidedUnlabeled.process
    | (true, Single, Labeled) => Everything32.SingleSidedLabeled.process
    | (true, Double, Unlabeled) => Everything32.DoubleSidedUnlabeled.process
    | (true, Double, Labeled) => Everything32.DoubleSidedLabeled.process
    | (false, Single, Unlabeled) => Everything64.SingleSidedUnlabeled.process
    | (false, Single, Labeled) => Everything64.SingleSidedLabeled.process
    | (false, Double, Unlabeled) => Everything64.DoubleSidedUnlabeled.process
    | (false, Double, Labeled) => Everything64.DoubleSidedLabeled.process
(*
   Program entry point: run [process] on each input file named on the
   command line, in order.

   A file that cannot be opened, or that [process] reports as Failure,
   produces a "<name>: <reason>" line on stderr.  List.all stops at the
   first such file, so later files are not processed, and the program then
   exits with a failure status.
*)
val () =
   let
      (* Report a per-file problem on stderr. *)
      fun complain (name, message) =
         TextIO.output (TextIO.stdErr, name ^ ": " ^ message ^ "\n")

      (* Open the file, turning an I/O error into a reported failure
         instead of an uncaught exception. *)
      fun openInput name =
         SOME (TextIO.openIn name)
         handle IO.Io {cause, ...} => (
            complain (name, exnMessage cause)
            ; NONE
         )

      (* Process one file; true means it was handled successfully. *)
      fun one name =
         case openInput name of
            NONE => false
          | SOME instream =>
               let
                  (* Make sure the stream is closed even when processing
                     raises; the exception itself is passed on unchanged. *)
                  val result =
                     process ((), instream)
                     handle problem => (
                        TextIO.closeIn instream
                        ; raise problem
                     )
               in
                  TextIO.closeIn instream
                  ; case result of
                       Success => true
                     | Failure => (
                          complain (name, "invalid format")
                          ; false
                       )
               end

      fun all names = List.all one names
   in
      if all files then ()
      else OS.Process.exit OS.Process.failure
   end
|