From b632667ce57af89691407bb8668e1512775278ae Mon Sep 17 00:00:00 2001 From: Calvin Date: Fri, 15 Mar 2013 15:26:20 -0400 Subject: nbc added --- src/nbc/tabulate.ml | 190 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 src/nbc/tabulate.ml (limited to 'src/nbc/tabulate.ml') diff --git a/src/nbc/tabulate.ml b/src/nbc/tabulate.ml new file mode 100644 index 0000000..dbfbe43 --- /dev/null +++ b/src/nbc/tabulate.ml @@ -0,0 +1,190 @@ +let (|>) a b = b a +let car, cdr = fst, snd +let odd n = n land 1 = 1 +let even n = not (odd n) +module ExtArray = ExtArray.Array +module ExtList = ExtList.List +module ExtString = ExtString.String +module Program = struct + let name = "Naive Bayes Classifier - Tabulate" + let version = "1.0" +end +let chop_extra s = ExtString.strip ~chars:"_- " s +let any_alphanumeric s = + let e = String.length s in + let rec loop i = + if i = e then false + else match s.[i] with + 'a'..'z' | 'A'..'Z' | '0'..'9' -> true + | _ -> loop (i + 1) + in loop 0 +let guess_prefix filenames = + let s = + List.map Misc.basename_without_extension filenames + |> Misc.longest_common_substring |> chop_extra + in + if any_alphanumeric s then s + else "" +module Options = struct + let parser = OptParse.OptParser.make ~version:Program.version () + let genomes = + let option = OptParse.StdOpt.str_option ~metavar:"file" () in + OptParse.OptParser.add parser ~short_name:'g' ~long_name:"genome-list" option; + (fun () -> match OptParse.Opt.opt option with + Some x -> x + | None -> + OptParse.OptParser.usage parser (); + exit 1 + ) + let columns = + let option = OptParse.StdOpt.int_option ~default:200 () in + OptParse.OptParser.add parser ~short_name:'c' ~long_name:"columns" option; + (fun () -> OptParse.Opt.get option) + let template = + let option = OptParse.StdOpt.str_option ~metavar:"template" + ~default:"$prefix-$n.csv.gz" () in + OptParse.OptParser.add parser ~short_name:'t' ~long_name:"output-template" option; + (fun () -> OptParse.Opt.get option) + let given_prefix = + let x = ref None in + OptParse.StdOpt.str_callback ~metavar:"prefix" (fun s -> x := Some s) + |> OptParse.OptParser.add parser ~short_name:'p' + ~long_name:"output-prefix"; + (fun () -> !x) + let files = OptParse.OptParser.parse_argv parser + let prefix = match given_prefix () with + None -> ( + match guess_prefix files with + "" -> "table" + | s -> s + ) | Some x -> x + let string_of_int n = Printf.sprintf "%08u" n + let output n = template () |> Misc.substitute ["prefix", prefix; "n", string_of_int n] +end + +exception Collide of string * string * string +let match_files_to_genomes genomes files = + let g = Array.of_list genomes in + let gl = Array.length g in + let gi = Array.init gl (fun i -> i) in + Array.sort (fun ai bi -> compare (String.length g.(bi)) (String.length g.(ai))) gi; + let gf = Array.make gl None in + List.iter (fun file -> + try ( + let i = ExtArray.find (fun i -> ExtString.exists file g.(i)) gi in + match gf.(i) with + None -> gf.(i) <- Some file + | Some other_file -> raise (Collide (g.(i), other_file, file)) + ) with Not_found -> () (* file does not match any genomes *) + ) files; + let r = ref [] in + for i = gl - 1 downto 0 do + match gf.(i) with + None -> () (* no file matches a given genome *) + | Some file -> r := (g.(i), file) :: !r + done; + !r + +let columns = Options.columns () +let genomes, files = + let genomes = Misc.io_of_file (Options.genomes ()) |> Misc.enum_of_lines + |> ExtList.of_enum in + let g, f = match_files_to_genomes genomes Options.files |> List.split in + Array.of_list g, + f |> List.map (fun x -> x, x |> open_in |> IO.input_channel) |> Array.of_list + +let newfile, newline, cell, finish = + let i = ref 0 in + let open_file () = + i := !i + 1; + let filename = Options.output !i in + Gzip.open_out filename + in + let c = ref None in + let force_out () = + match !c with + None -> + let d = open_file () in + c := Some d; + d + | Some d -> d + in + let start_of_line = ref true in + (* newfile *) (fun () -> + (match !c with + Some c -> Gzip.close_out c + | None -> ()); + i := !i + 1; + c := None + ), + (* newline *) (fun () -> + let c = force_out () in + Gzip.output_char c '\n'; + start_of_line := true + ), + (* cell *) (fun s -> + let c = force_out () in + if not !start_of_line then Gzip.output_char c ','; + Gzip.output c s 0 (String.length s); + start_of_line := false + ), + (* finish *) (fun () -> + match !c with + Some c -> Gzip.close_out c + | None -> () + ) + +let read_two_fields (filename, c) = + let line = IO.read_line c in + match Misc.split2 line with + Some x -> x + | None -> + Printf.eprintf "\ + There is something wrong with the file %s. \ + The offending line is:\n%s\n" filename line; + exit 1 + +let output_file () = + newfile (); + let yes_we_have_input = + let x = ref false in + (fun () -> + if !x then () + else ( + cell "names"; + x := true + ) + ) + in + let a = Array.make columns "" in + let rec loop i = + if i < columns then ( + try ( + let name, datum = read_two_fields files.(0) in + yes_we_have_input (); + cell name; + a.(i) <- datum; + loop (i + 1) + ) with IO.No_more_input -> + if i > 0 then Array.sub a 0 i + else raise End_of_file + ) else a + in + let a = loop 0 in + let these_columns = Array.length a in + newline (); + cell genomes.(0); + for i = 0 to these_columns - 1 do cell a.(i) done; + newline (); + for i = 1 to Array.length genomes - 1 do + cell genomes.(i); + for j = 0 to these_columns - 1 do + let _, datum = read_two_fields files.(i) in + cell datum + done; + newline () + done + +let () = + try while true do output_file () done + with End_of_file -> finish () -- cgit v1.2.3