summaryrefslogtreecommitdiff
path: root/src/nbc/tabulate.ml
blob: dbfbe4355bb0c9ad01036f0cf40754b06d2aa43c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
let (|>) a b = b a
let car, cdr = fst, snd
let odd n = n land 1 = 1
let even n = not (odd n)
module ExtArray = ExtArray.Array
module ExtList = ExtList.List
module ExtString = ExtString.String
module Program = struct
	let name = "Naive Bayes Classifier - Tabulate"
	let version = "1.0"
end
let chop_extra s = ExtString.strip ~chars:"_- " s
let any_alphanumeric s =
	let e = String.length s in
	let rec loop i =
		if i = e then false
		else match s.[i] with
			'a'..'z' | 'A'..'Z' | '0'..'9' -> true
			| _ -> loop (i + 1)
	in loop 0
let guess_prefix filenames =
	let s =
		List.map Misc.basename_without_extension filenames
			|> Misc.longest_common_substring |> chop_extra
	in
	if any_alphanumeric s then s
	else ""
module Options = struct
	let parser = OptParse.OptParser.make ~version:Program.version ()
	let genomes =
		let option = OptParse.StdOpt.str_option ~metavar:"file" () in
		OptParse.OptParser.add parser ~short_name:'g' ~long_name:"genome-list" option;
		(fun () -> match OptParse.Opt.opt option with
			Some x -> x
			| None ->
				OptParse.OptParser.usage parser ();
				exit 1
		)
	let columns =
		let option = OptParse.StdOpt.int_option ~default:200 () in
		OptParse.OptParser.add parser ~short_name:'c' ~long_name:"columns" option;
		(fun () -> OptParse.Opt.get option)
	let template =
		let option = OptParse.StdOpt.str_option ~metavar:"template"
			~default:"$prefix-$n.csv.gz" () in
		OptParse.OptParser.add parser ~short_name:'t' ~long_name:"output-template" option;
		(fun () -> OptParse.Opt.get option)
	let given_prefix =
		let x = ref None in
		OptParse.StdOpt.str_callback ~metavar:"prefix" (fun s -> x := Some s)
			|> OptParse.OptParser.add parser ~short_name:'p'
				~long_name:"output-prefix";
		(fun () -> !x)
	let files = OptParse.OptParser.parse_argv parser
	let prefix = match given_prefix () with
		None -> (
			match guess_prefix files with
				"" -> "table"
				| s -> s
		) | Some x -> x
	let string_of_int n = Printf.sprintf "%08u" n
	let output n = template () |> Misc.substitute ["prefix", prefix; "n", string_of_int n]
end

exception Collide of string * string * string
let match_files_to_genomes genomes files =
	let g = Array.of_list genomes in
	let gl = Array.length g in
	let gi = Array.init gl (fun i -> i) in
	Array.sort (fun ai bi -> compare (String.length g.(bi)) (String.length g.(ai))) gi;
	let gf = Array.make gl None in
	List.iter (fun file ->
		try (
			let i = ExtArray.find (fun i -> ExtString.exists file g.(i)) gi in
			match gf.(i) with
				None -> gf.(i) <- Some file
				| Some other_file -> raise (Collide (g.(i), other_file, file))
		) with Not_found -> () (* file does not match any genomes *)
	) files;
	let r = ref [] in
	for i = gl - 1 downto 0 do
		match gf.(i) with
			None -> () (* no file matches a given genome *)
			| Some file -> r := (g.(i), file) :: !r
	done;
	!r

let columns = Options.columns ()
let genomes, files =
	let genomes = Misc.io_of_file (Options.genomes ()) |> Misc.enum_of_lines
		|> ExtList.of_enum in
	let g, f = match_files_to_genomes genomes Options.files |> List.split in
	Array.of_list g,
	f |> List.map (fun x -> x, x |> open_in |> IO.input_channel) |> Array.of_list

let newfile, newline, cell, finish = 
	let i = ref 0 in
	let open_file () =
		i := !i + 1;
		let filename = Options.output !i in
		Gzip.open_out filename
	in
	let c = ref None in
	let force_out () =
		match !c with
			None ->
				let d = open_file () in
				c := Some d;
				d
			| Some d -> d
	in
	let start_of_line = ref true in
	(* newfile *) (fun () ->
		(match !c with
			Some c -> Gzip.close_out c
			| None -> ());
		i := !i + 1;
		c := None
	),
	(* newline *) (fun () ->
		let c = force_out () in
		Gzip.output_char c '\n';
		start_of_line := true
	),
	(* cell *) (fun s ->
		let c = force_out () in
		if not !start_of_line then Gzip.output_char c ',';
		Gzip.output c s 0 (String.length s);
		start_of_line := false
	),
	(* finish *) (fun () ->
		match !c with
			Some c -> Gzip.close_out c
			| None -> ()
	)

let read_two_fields (filename, c) =
		let line = IO.read_line c in
		match Misc.split2 line with
			Some x -> x
			| None ->
				Printf.eprintf "\
					There is something wrong with the file %s. \
					The offending line is:\n%s\n" filename line;
				exit 1

let output_file () =
	newfile ();
	let yes_we_have_input =
		let x = ref false in
		(fun () ->
			if !x then ()
			else (
				cell "names";
				x := true
			)
		)
	in
	let a = Array.make columns "" in
	let rec loop i =
		if i < columns then (
			try (
				let name, datum = read_two_fields files.(0) in
				yes_we_have_input ();
				cell name;
				a.(i) <- datum;
				loop (i + 1)
			) with IO.No_more_input ->
				if i > 0 then Array.sub a 0 i
				else raise End_of_file
		) else a
	in
	let a = loop 0 in
	let these_columns = Array.length a in
	newline ();
	cell genomes.(0);
	for i = 0 to these_columns - 1 do cell a.(i) done;
	newline ();
	for i = 1 to Array.length genomes - 1 do
		cell genomes.(i);
		for j = 0 to these_columns - 1 do
			let _, datum = read_two_fields files.(i) in
			cell datum
		done;
		newline ()
	done

let () =
	try while true do output_file () done
	with End_of_file -> finish ()