1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
|
local
fun usage result = (
TextIO.output (
TextIO.stdErr
, CommandLine.name ()
^ " <order> <input FASTA> <file of nmers to count>\n"
); OS.Process.exit result
)
in
val (order, input, toCount) = case CommandLine.arguments () of
[order, input, toCount] => (case Int.fromString order of
NONE => usage OS.Process.failure
| SOME order => (order, input, toCount)
) | ["--help"] => usage OS.Process.success
| _ => usage OS.Process.failure
end
datatype result = Success | Failure
fun warn message = TextIO.output (
TextIO.stdErr
, (
"warning: "
^ message
^ "\n"
)
)
signature NMER_TABLE = sig
type nmer
type table
(*
Create the table. Only the provided nmers will be counted.
*)
val create: nmer list -> table
(*
Increment the count for a given nmer. If the nmer was
not provided at table creation, do nothing.
*)
val bump: table * nmer -> unit
(*
Reset all counts to zero. Do not change the list of
nmers to count.
*)
val clear: table -> unit
(*
Apply a function to all nmers and their counts, in
lexicographic order.
*)
val app: (nmer * int -> unit) -> table -> unit
end
functor NmerTable (Nmer: NMER)
:> NMER_TABLE where type nmer = Nmer.nmer
= struct
structure HashTable = HashTableFn (
type hash_key = Nmer.nmer
val hashVal = Nmer.hash
val sameKey = Nmer.equal
)
exception NotFound
type nmer = Nmer.nmer
type table = {
indexes: int HashTable.hash_table
, counts: int array
, nmers: nmer vector
}
fun create list =
let
val indexes = HashTable.mkTable (1024, NotFound)
val nmers =
let
val array = Array.fromList list
in
ArrayQSort.sort Nmer.compare array
; Array.vector array
end
in
Vector.appi (fn (index, nmer) =>
HashTable.insert indexes (nmer, index)
) nmers
; {
indexes = indexes
, nmers = nmers
, counts = Array.array (
Vector.length nmers
, 0
)
}
end
fun bump ({indexes, nmers = _, counts}, nmer) =
case HashTable.find indexes nmer of
NONE => ()
| SOME index => Array.update (
counts
, index
, Array.sub (counts, index) + 1
)
fun clear {indexes = _, nmers = _, counts} =
Array.modify (fn _ => 0) counts
fun app execute {indexes = _, nmers, counts} =
Vector.appi (fn (index, nmer) =>
execute (nmer, Array.sub (counts, index))
) nmers
end
functor Input (
structure Nmer: NMER
structure NmerTable: NMER_TABLE
sharing type Nmer.nmer = NmerTable.nmer
) = SingleSidedFasta (
structure Nmer = Nmer
structure File = struct
type argument = NmerTable.table
type file = argument
type read = Int64.int ref
type nmer = Nmer.nmer
datatype result = datatype result
exception NotFound
fun startFile table = table
fun startRead (table, _) = ref (0: Int64.int)
fun nmer (table, (total: Int64.int ref), nmer) = (
total := !total + 1
; NmerTable.bump (table, nmer)
)
fun put string = TextIO.output (TextIO.stdOut, string)
fun stopRead (table, total) =
let
val realFromInt64 =
Real.fromLargeInt o Int64.toLarge
val realTotal = realFromInt64 (!total)
val toString = Real.fmt (
StringCvt.FIX (SOME 17)
)
fun probability count = real count / realTotal
infix |>
fun argument |> function = function argument
val first = ref true
in
NmerTable.app (fn (nmer, count) => (
if !first then first := false
else put "\t"
; count
|> probability
|> toString
|> put
)) table
; put "\n"
; NmerTable.clear table
; total := 0
end
fun stopFile _ = Success
fun invalidFormat _ = Failure
end
)
structure Nmer = Nmer (
val order = order
structure Word = Word64
)
structure NmerTable = NmerTable (Nmer)
structure Input = Input (
structure Nmer = Nmer
structure NmerTable = NmerTable
)
val table =
let
fun build collect goose = case collect goose of
NONE => nil
| SOME egg =>
egg :: build collect goose
fun chopEnd string = String.extract (
string
, 0
, SOME (size string - (
if String.isSuffix "\r\n" string
then 2
else 1
))
)
val line = Option.map chopEnd o TextIO.inputLine
val lines = build line
val instream = TextIO.openIn toCount
fun fromStringWithWarning string =
case Nmer.fromString string of
NONE => (
warn (
string
^ " is not a valid nmer"
); NONE
) | someNmer => someNmer
val nmers = List.mapPartial
fromStringWithWarning
(lines instream)
in
TextIO.closeIn instream
; NmerTable.create nmers
end
val () = case Input.process (table, TextIO.openIn input) of
Failure => (
TextIO.output (
TextIO.stdErr
, "input is not valid FASTA\n"
); OS.Process.exit OS.Process.failure
) | Success => ()
|