summaryrefslogtreecommitdiff
path: root/src/nbc/probabilities-by-read.sml
diff options
context:
space:
mode:
Diffstat (limited to 'src/nbc/probabilities-by-read.sml')
-rw-r--r--src/nbc/probabilities-by-read.sml210
1 files changed, 210 insertions, 0 deletions
diff --git a/src/nbc/probabilities-by-read.sml b/src/nbc/probabilities-by-read.sml
new file mode 100644
index 0000000..acd8041
--- /dev/null
+++ b/src/nbc/probabilities-by-read.sml
@@ -0,0 +1,210 @@
+local
+ fun usage result = (
+ TextIO.output (
+ TextIO.stdErr
+ , CommandLine.name ()
+ ^ " <order> <input FASTA> <file of nmers to count>\n"
+ ); OS.Process.exit result
+ )
+in
+ val (order, input, toCount) = case CommandLine.arguments () of
+ [order, input, toCount] => (case Int.fromString order of
+ NONE => usage OS.Process.failure
+ | SOME order => (order, input, toCount)
+ ) | ["--help"] => usage OS.Process.success
+ | _ => usage OS.Process.failure
+end
+
+datatype result = Success | Failure
+fun warn message = TextIO.output (
+ TextIO.stdErr
+ , (
+ "warning: "
+ ^ message
+ ^ "\n"
+ )
+)
+
+signature NMER_TABLE = sig
+ type nmer
+ type table
+
+ (*
+ Create the table. Only the provided nmers will be counted.
+ *)
+ val create: nmer list -> table
+
+ (*
+ Increment the count for a given nmer. If the nmer was
+ not provided at table creation, do nothing.
+ *)
+ val bump: table * nmer -> unit
+
+ (*
+ Reset all counts to zero. Do not change the list of
+ nmers to count.
+ *)
+ val clear: table -> unit
+
+ (*
+ Apply a function to all nmers and their counts, in
+ lexicographic order.
+ *)
+ val app: (nmer * int -> unit) -> table -> unit
+end
+
+functor NmerTable (Nmer: NMER)
+:> NMER_TABLE where type nmer = Nmer.nmer
+= struct
+ structure HashTable = HashTableFn (
+ type hash_key = Nmer.nmer
+ val hashVal = Nmer.hash
+ val sameKey = Nmer.equal
+ )
+ exception NotFound
+ type nmer = Nmer.nmer
+ type table = {
+ indexes: int HashTable.hash_table
+ , counts: int array
+ , nmers: nmer vector
+ }
+ fun create list =
+ let
+ val indexes = HashTable.mkTable (1024, NotFound)
+ val nmers =
+ let
+ val array = Array.fromList list
+ in
+ ArrayQSort.sort Nmer.compare array
+ ; Array.vector array
+ end
+ in
+ Vector.appi (fn (index, nmer) =>
+ HashTable.insert indexes (nmer, index)
+ ) nmers
+ ; {
+ indexes = indexes
+ , nmers = nmers
+ , counts = Array.array (
+ Vector.length nmers
+ , 0
+ )
+ }
+ end
+ fun bump ({indexes, nmers = _, counts}, nmer) =
+ case HashTable.find indexes nmer of
+ NONE => ()
+ | SOME index => Array.update (
+ counts
+ , index
+ , Array.sub (counts, index) + 1
+ )
+ fun clear {indexes = _, nmers = _, counts} =
+ Array.modify (fn _ => 0) counts
+ fun app execute {indexes = _, nmers, counts} =
+ Vector.appi (fn (index, nmer) =>
+ execute (nmer, Array.sub (counts, index))
+ ) nmers
+end
+
+functor Input (
+ structure Nmer: NMER
+ structure NmerTable: NMER_TABLE
+ sharing type Nmer.nmer = NmerTable.nmer
+) = SingleSidedFasta (
+ structure Nmer = Nmer
+ structure File = struct
+ type argument = NmerTable.table
+ type file = argument
+ type read = Int64.int ref
+ type nmer = Nmer.nmer
+ datatype result = datatype result
+ exception NotFound
+ fun startFile table = table
+ fun startRead (table, _) = ref (0: Int64.int)
+ fun nmer (table, (total: Int64.int ref), nmer) = (
+ total := !total + 1
+ ; NmerTable.bump (table, nmer)
+ )
+ fun put string = TextIO.output (TextIO.stdOut, string)
+ fun stopRead (table, total) =
+ let
+ val realFromInt64 =
+ Real.fromLargeInt o Int64.toLarge
+ val realTotal = realFromInt64 (!total)
+ val toString = Real.fmt (
+ StringCvt.FIX (SOME 17)
+ )
+ fun probability count = real count / realTotal
+ infix |>
+ fun argument |> function = function argument
+ val first = ref true
+ in
+ NmerTable.app (fn (nmer, count) => (
+ if !first then first := false
+ else put "\t"
+ ; count
+ |> probability
+ |> toString
+ |> put
+ )) table
+ ; put "\n"
+ ; NmerTable.clear table
+ ; total := 0
+ end
+ fun stopFile _ = Success
+ fun invalidFormat _ = Failure
+ end
+)
+
+structure Nmer = Nmer (
+ val order = order
+ structure Word = Word64
+)
+structure NmerTable = NmerTable (Nmer)
+structure Input = Input (
+ structure Nmer = Nmer
+ structure NmerTable = NmerTable
+)
+
+val table =
+ let
+ fun build collect goose = case collect goose of
+ NONE => nil
+ | SOME egg =>
+ egg :: build collect goose
+ fun chopEnd string = String.extract (
+ string
+ , 0
+ , SOME (size string - (
+ if String.isSuffix "\r\n" string
+ then 2
+ else 1
+ ))
+ )
+ val line = Option.map chopEnd o TextIO.inputLine
+ val lines = build line
+ val instream = TextIO.openIn toCount
+ fun fromStringWithWarning string =
+ case Nmer.fromString string of
+ NONE => (
+ warn (
+ string
+ ^ " is not a valid nmer"
+ ); NONE
+ ) | someNmer => someNmer
+ val nmers = List.mapPartial
+ fromStringWithWarning
+ (lines instream)
+ in
+ TextIO.closeIn instream
+ ; NmerTable.create nmers
+ end
+
+val () = case Input.process (table, TextIO.openIn input) of
+ Failure => (
+ TextIO.output (
+ TextIO.stdErr
+ , "input is not valid FASTA\n"
+ ); OS.Process.exit OS.Process.failure
+ ) | Success => ()