1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
|
let (|>) a b = b a
let car, cdr = fst, snd
let odd n = n land 1 = 1
let even n = not (odd n)
module ExtArray = ExtArray.Array
module ExtList = ExtList.List
module ExtString = ExtString.String
module Program = struct
let name = "Naive Bayes Classifier - Tabulate"
let version = "1.0"
end
let chop_extra s = ExtString.strip ~chars:"_- " s
let any_alphanumeric s =
let e = String.length s in
let rec loop i =
if i = e then false
else match s.[i] with
'a'..'z' | 'A'..'Z' | '0'..'9' -> true
| _ -> loop (i + 1)
in loop 0
let guess_prefix filenames =
let s =
List.map Misc.basename_without_extension filenames
|> Misc.longest_common_substring |> chop_extra
in
if any_alphanumeric s then s
else ""
module Options = struct
let parser = OptParse.OptParser.make ~version:Program.version ()
let genomes =
let option = OptParse.StdOpt.str_option ~metavar:"file" () in
OptParse.OptParser.add parser ~short_name:'g' ~long_name:"genome-list" option;
(fun () -> match OptParse.Opt.opt option with
Some x -> x
| None ->
OptParse.OptParser.usage parser ();
exit 1
)
let columns =
let option = OptParse.StdOpt.int_option ~default:200 () in
OptParse.OptParser.add parser ~short_name:'c' ~long_name:"columns" option;
(fun () -> OptParse.Opt.get option)
let template =
let option = OptParse.StdOpt.str_option ~metavar:"template"
~default:"$prefix-$n.csv.gz" () in
OptParse.OptParser.add parser ~short_name:'t' ~long_name:"output-template" option;
(fun () -> OptParse.Opt.get option)
let given_prefix =
let x = ref None in
OptParse.StdOpt.str_callback ~metavar:"prefix" (fun s -> x := Some s)
|> OptParse.OptParser.add parser ~short_name:'p'
~long_name:"output-prefix";
(fun () -> !x)
let files = OptParse.OptParser.parse_argv parser
let prefix = match given_prefix () with
None -> (
match guess_prefix files with
"" -> "table"
| s -> s
) | Some x -> x
let string_of_int n = Printf.sprintf "%08u" n
let output n = template () |> Misc.substitute ["prefix", prefix; "n", string_of_int n]
end
exception Collide of string * string * string
let match_files_to_genomes genomes files =
let g = Array.of_list genomes in
let gl = Array.length g in
let gi = Array.init gl (fun i -> i) in
Array.sort (fun ai bi -> compare (String.length g.(bi)) (String.length g.(ai))) gi;
let gf = Array.make gl None in
List.iter (fun file ->
try (
let i = ExtArray.find (fun i -> ExtString.exists file g.(i)) gi in
match gf.(i) with
None -> gf.(i) <- Some file
| Some other_file -> raise (Collide (g.(i), other_file, file))
) with Not_found -> () (* file does not match any genomes *)
) files;
let r = ref [] in
for i = gl - 1 downto 0 do
match gf.(i) with
None -> () (* no file matches a given genome *)
| Some file -> r := (g.(i), file) :: !r
done;
!r
let columns = Options.columns ()
let genomes, files =
let genomes = Misc.io_of_file (Options.genomes ()) |> Misc.enum_of_lines
|> ExtList.of_enum in
let g, f = match_files_to_genomes genomes Options.files |> List.split in
Array.of_list g,
f |> List.map (fun x -> x, x |> open_in |> IO.input_channel) |> Array.of_list
let newfile, newline, cell, finish =
let i = ref 0 in
let open_file () =
i := !i + 1;
let filename = Options.output !i in
Gzip.open_out filename
in
let c = ref None in
let force_out () =
match !c with
None ->
let d = open_file () in
c := Some d;
d
| Some d -> d
in
let start_of_line = ref true in
(* newfile *) (fun () ->
(match !c with
Some c -> Gzip.close_out c
| None -> ());
i := !i + 1;
c := None
),
(* newline *) (fun () ->
let c = force_out () in
Gzip.output_char c '\n';
start_of_line := true
),
(* cell *) (fun s ->
let c = force_out () in
if not !start_of_line then Gzip.output_char c ',';
Gzip.output c s 0 (String.length s);
start_of_line := false
),
(* finish *) (fun () ->
match !c with
Some c -> Gzip.close_out c
| None -> ()
)
let read_two_fields (filename, c) =
let line = IO.read_line c in
match Misc.split2 line with
Some x -> x
| None ->
Printf.eprintf "\
There is something wrong with the file %s. \
The offending line is:\n%s\n" filename line;
exit 1
let output_file () =
newfile ();
let yes_we_have_input =
let x = ref false in
(fun () ->
if !x then ()
else (
cell "names";
x := true
)
)
in
let a = Array.make columns "" in
let rec loop i =
if i < columns then (
try (
let name, datum = read_two_fields files.(0) in
yes_we_have_input ();
cell name;
a.(i) <- datum;
loop (i + 1)
) with IO.No_more_input ->
if i > 0 then Array.sub a 0 i
else raise End_of_file
) else a
in
let a = loop 0 in
let these_columns = Array.length a in
newline ();
cell genomes.(0);
for i = 0 to these_columns - 1 do cell a.(i) done;
newline ();
for i = 1 to Array.length genomes - 1 do
cell genomes.(i);
for j = 0 to these_columns - 1 do
let _, datum = read_two_fields files.(i) in
cell datum
done;
newline ()
done
let () =
try while true do output_file () done
with End_of_file -> finish ()
|