(* Provenance: src/nbc/count.ml (new file, blob bd97b7b), added by commit
   b632667ce57af89691407bb8668e1512775278ae, "nbc added", by Calvin on
   Fri, 15 Mar 2013 15:26:20 -0400. Recovered from a cgit v1.2.3 patch view.

   Word counter: reads the input files named on the command line, counts
   every word produced by [Fasta.enum_words] together with its
   [Gene.reverse] counterpart, then writes the grand total to one file and
   the per-word counts (gzip-compressed) to another. *)

module ExtArray = ExtArray.Array

(* Reverse application, [x |> f = f x]; defined locally so the file does not
   depend on the standard library providing it. *)
let (|>) a b = b a

(** Command-line options. Evaluating this module registers the options,
    parses [Sys.argv], and exits with status 1 if a required option is
    missing. *)
module Options = struct
  let parser = OptParse.OptParser.make ~version:"1.0" ()

  (* [required_file ~short_name ~long_name ~help] registers a string option
     (metavar "file") on [parser] and returns a thunk that yields its value.
     The thunk must only be forced after the command line has been parsed;
     if the option was not supplied it reports which one is missing, prints
     the usage text on stderr and exits with status 1. *)
  let required_file ~short_name ~long_name ~help =
    let option = OptParse.StdOpt.str_option ~metavar:"file" () in
    OptParse.OptParser.add parser ~short_name ~long_name ~help option;
    (fun () ->
      match OptParse.Opt.opt option with
      | Some x -> x
      | None ->
        prerr_endline ("missing required option --" ^ long_name);
        OptParse.OptParser.usage parser ~chn:stderr ();
        exit 1)

  (* The three bindings below are thunks; they are shadowed by the parsed
     values once [parse_argv] has run. Registration order (w, t, r) is the
     order in which the options are added to the parser. *)
  let per_word =
    required_file ~short_name:'w' ~long_name:"per-word"
      ~help:"file to store per-word counts in"

  let total =
    required_file ~short_name:'t' ~long_name:"total"
      ~help:"file to store total count in"

  let order =
    let option = OptParse.StdOpt.int_option ~default:15 () in
    OptParse.OptParser.add parser ~short_name:'r' ~long_name:"order"
      ~help:"order of the words to count (default: 15)" option;
    (fun () -> OptParse.Opt.get option)

  (** Positional arguments left over after option parsing: the input files. *)
  let files = OptParse.OptParser.parse_argv parser

  (** Path of the file that receives the total count ([-t]/[--total]). *)
  let total = total ()

  (** Path of the gzip file that receives per-word counts
      ([-w]/[--per-word]). *)
  let per_word = per_word ()

  (** Value of [-r]/[--order]; 15 when not given. *)
  let order = order ()
end

(** [load_file order judy total name] opens [name] through
    [Misc.io_of_gzip], enumerates its words with [Fasta.enum_words order],
    and for every word bumps both the word and [Gene.reverse word] in
    [judy]. Returns [total] increased by 2 for each word enumerated.
    NOTE(review): [Gene.reverse] is presumably the reverse complement, and
    [order] presumably the word length -- confirm against those modules. *)
let load_file order judy total name =
  Misc.io_of_gzip name
  |> Fasta.enum_words order
  |> Enum.fold (fun word total ->
      Judy.bump judy word;
      Judy.bump judy (Gene.reverse word);
      total + 2
    ) total

(** [load_files order names] counts the words of every file in [names] into
    one freshly created Judy structure. Returns [(total, judy)] where
    [total] is the number of bumps performed (0 for an empty list). *)
let load_files order names =
  let judy = Judy.create () in
  let total = List.fold_left (load_file order judy) 0 names in
  total, judy

(** [gzip_output_string c s] writes the whole of [s] to the gzip channel [c].
    NOTE(review): newer camlzip releases appear to type [Gzip.output] over
    [bytes] and offer [Gzip.output_substring] for strings -- confirm against
    the camlzip version this project builds with before upgrading. *)
let gzip_output_string c s = Gzip.output c s 0 (String.length s)

(* [with_resource ~close resource f] runs [f resource] and always calls
   [close resource] afterwards. If [f] raises, the resource is still closed
   and the original exception is re-raised; a secondary failure of [close]
   on that error path is deliberately dropped so that it cannot mask the
   exception that actually caused the failure. *)
let with_resource ~close resource f =
  let result =
    try f resource
    with e ->
      (try close resource with _ -> ());
      raise e
  in
  close resource;
  result

(* Entry point: count everything, then write the two output files. *)
let () =
  let total_words, judy = load_files Options.order Options.files in
  (* Total count: a single decimal number followed by a newline. *)
  with_resource ~close:close_out (open_out Options.total) (fun c ->
    output_string c (string_of_int total_words ^ "\n"));
  (* Per-word counts: one "<count> <word>" line per entry, gzip-compressed. *)
  with_resource ~close:Gzip.close_out (Gzip.open_out Options.per_word)
    (fun c ->
      Judy.iter (fun word count ->
        gzip_output_string c (Printf.sprintf "%d %s\n" count word)
      ) judy)