summaryrefslogtreecommitdiff
path: root/src/nbc/probabilities-by-read.sml
blob: acd80411ea9651a70c60c28167c13ea7b105ffe9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
local
	fun usage result = (
		TextIO.output (
			TextIO.stdErr
			, CommandLine.name ()
		^ " <order> <input FASTA> <file of nmers to count>\n"
		); OS.Process.exit result
	)
in
	val (order, input, toCount) = case CommandLine.arguments () of
		[order, input, toCount] => (case Int.fromString order of
			NONE => usage OS.Process.failure
			| SOME order => (order, input, toCount)
		) | ["--help"] => usage OS.Process.success
		| _ => usage OS.Process.failure
end

datatype result = Success | Failure
fun warn message = TextIO.output (
	TextIO.stdErr
	, (
		"warning: "
		^ message
		^ "\n"
	)
)

signature NMER_TABLE = sig
	type nmer
	type table

	(*
		Create the table. Only the provided nmers will be counted.
	*)
	val create: nmer list -> table

	(*
		Increment the count for a given nmer. If the nmer was
		not provided at table creation, do nothing.
	*)
	val bump: table * nmer -> unit

	(*
		Reset all counts to zero. Do not change the list of
		nmers to count.
	*)
	val clear: table -> unit

	(*
		Apply a function to all nmers and their counts, in
		lexicographic order.
	*)
	val app: (nmer * int -> unit) -> table -> unit
end

functor NmerTable (Nmer: NMER)
:> NMER_TABLE where type nmer = Nmer.nmer
= struct
	structure HashTable = HashTableFn (
		type hash_key = Nmer.nmer
		val hashVal = Nmer.hash
		val sameKey = Nmer.equal
	)
	exception NotFound
	type nmer = Nmer.nmer
	type table = {
		indexes: int HashTable.hash_table
		, counts: int array
		, nmers: nmer vector
	}
	fun create list =
		let
			val indexes = HashTable.mkTable (1024, NotFound)
			val nmers =
				let
					val array = Array.fromList list
				in
					ArrayQSort.sort Nmer.compare array
					; Array.vector array
				end
		in
			Vector.appi (fn (index, nmer) =>
				HashTable.insert indexes (nmer, index)
			) nmers
			; {
				indexes = indexes
				, nmers = nmers
				, counts = Array.array (
					Vector.length nmers
					, 0
				)
			}
		end
	fun bump ({indexes, nmers = _, counts}, nmer) =
		case HashTable.find indexes nmer of
			NONE => ()
			| SOME index => Array.update (
				counts
				, index
				, Array.sub (counts, index) + 1
			)
	fun clear {indexes = _, nmers = _, counts} =
		Array.modify (fn _ => 0) counts
	fun app execute {indexes = _, nmers, counts} =
		Vector.appi (fn (index, nmer) =>
			execute (nmer, Array.sub (counts, index))
		) nmers
end

functor Input (
	structure Nmer: NMER
	structure NmerTable: NMER_TABLE
	sharing type Nmer.nmer = NmerTable.nmer
) = SingleSidedFasta (
	structure Nmer = Nmer
	structure File = struct
		type argument = NmerTable.table
		type file = argument
		type read = Int64.int ref
		type nmer = Nmer.nmer
		datatype result = datatype result
		exception NotFound
		fun startFile table = table
		fun startRead (table, _) = ref (0: Int64.int)
		fun nmer (table, (total: Int64.int ref), nmer) = (
			total := !total + 1
			; NmerTable.bump (table, nmer)
		)
		fun put string = TextIO.output (TextIO.stdOut, string)
		fun stopRead (table, total) =
			let
				val realFromInt64 =
					Real.fromLargeInt o Int64.toLarge
				val realTotal = realFromInt64 (!total)
				val toString = Real.fmt (
					StringCvt.FIX (SOME 17)
				)
				fun probability count = real count / realTotal
				infix |>
				fun argument |> function = function argument
				val first = ref true
			in
				NmerTable.app (fn (nmer, count) => (
					if !first then first := false
					else put "\t"
					; count
						|> probability
						|> toString
						|> put
				)) table
				; put "\n"
				; NmerTable.clear table
				; total := 0
			end
		fun stopFile _ = Success
		fun invalidFormat _ = Failure
	end
)

structure Nmer = Nmer (
	val order = order
	structure Word = Word64
)
structure NmerTable = NmerTable (Nmer)
structure Input = Input (
	structure Nmer = Nmer
	structure NmerTable = NmerTable
)

val table =
	let
		fun build collect goose = case collect goose of
			NONE => nil
			| SOME egg =>
				egg :: build collect goose
		fun chopEnd string = String.extract (
			string
			, 0
			, SOME (size string - (
				if String.isSuffix "\r\n" string
				then 2
				else 1
			))
		)
		val line = Option.map chopEnd o TextIO.inputLine
		val lines = build line
		val instream = TextIO.openIn toCount
		fun fromStringWithWarning string =
			case Nmer.fromString string of
				NONE => (
					warn (
						string
						^ " is not a valid nmer"
					); NONE
				) | someNmer => someNmer
		val nmers = List.mapPartial
			fromStringWithWarning
			(lines instream)
	in
		TextIO.closeIn instream
		; NmerTable.create nmers
	end

val () = case Input.process (table, TextIO.openIn input) of
	Failure => (
		TextIO.output (
			TextIO.stdErr
			, "input is not valid FASTA\n"
		); OS.Process.exit OS.Process.failure
	) | Success => ()