'''
The FEAST module provides an interface between the FEAST C library
for feature selection and Python.

References:
1) G. Brown, A. Pocock, M.-J. Zhao, and M. Lujan, "Conditional
   likelihood maximization: A unifying framework for information
   theoretic feature selection," Journal of Machine Learning
   Research, vol. 13, pp. 27-66, 2012.
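
Example (an illustrative sketch, not part of the original API docs;
it assumes this module is importable as `feast`, that libFSToolbox.so
is on the loader's search path, and that the features are discrete or
pre-binned, as FEAST expects):

    >>> import numpy as np
    >>> import feast
    >>> data = np.random.randint(0, 5, (100, 20)).astype(np.float64)
    >>> labels = np.random.randint(0, 2, 100).astype(np.float64)
    >>> feast.JMI(data, labels, 5)   # indices of the 5 selected features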
'''
__author__ = "Calvin Morrison"
__copyright__ = "Copyright 2013, EESI Laboratory"
__credits__ = ["Calvin Morrison", "Gregory Ditzler"]
__license__ = "GPL"
__version__ = "0.2.0"
__maintainer__ = "Calvin Morrison"
__email__ = "mutantturkey@gmail.com"
__status__ = "Release"

import numpy as np
import ctypes as c

# libFSToolbox.so must be on the dynamic loader's search path
# (e.g., via LD_LIBRARY_PATH) for the load to succeed.
try:
    libFSToolbox = c.CDLL("libFSToolbox.so")
except OSError:
    raise Exception("Error: could not load libFSToolbox.so")


def BetaGamma(data, labels, n_select, beta=1.0, gamma=1.0):
    '''
    This algorithm implements conditional mutual information
    feature selection, where beta and gamma control the weights
    attached to the redundant mutual information and the
    conditional mutual information, respectively.

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
        (REQUIRED)
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
        (REQUIRED)
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @param beta: penalty attached to I(X_j;X_k)
    @type beta: float between 0 and 1.0
    @param gamma: positive weight attached to the conditional
        redundancy term I(X_k;X_j|Y)
    @type gamma: float between 0 and 1.0
    @return: features in the order they were selected.
    @rtype: list
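
    Example (illustrative; `data` and `labels` as described above):

        >>> # penalize redundancy only (MIFS-like behavior)
        >>> BetaGamma(data, labels, 5, beta=1.0, gamma=0.0)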
    '''
    data, labels = check_data(data, labels)

    n_observations, n_features = data.shape
    output = np.zeros(n_select)

    # cast the parameters to the C types the library expects
    c_n_observations = c.c_int(n_observations)
    c_n_select = c.c_int(n_select)
    c_n_features = c.c_int(n_features)
    c_beta = c.c_double(beta)
    c_gamma = c.c_double(gamma)

    # the C function returns a pointer to an array of n_select doubles
    libFSToolbox.BetaGamma.restype = c.POINTER(c.c_double * n_select)
    features = libFSToolbox.BetaGamma(c_n_select,
                                      c_n_observations,
                                      c_n_features,
                                      data.ctypes.data_as(c.POINTER(c.c_double)),
                                      labels.ctypes.data_as(c.POINTER(c.c_double)),
                                      output.ctypes.data_as(c.POINTER(c.c_double)),
                                      c_beta,
                                      c_gamma
                                      )

    # FEAST returns 1-based feature indices (a holdover from its
    # MATLAB origins); shift them to 0-based indices for Python.
    selected_features = []
    for i in features.contents:
        selected_features.append(i - 1)

    return selected_features


def CIFE(data, labels, n_select):
    '''
    This function implements the CIFE feature selection algorithm.
    beta = 1; gamma = 1;

    @param data: A Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select.
    @type n_select: integer
    @return: features in the order they were selected.
    @rtype: list
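
    Example (illustrative; the call below is equivalent to
    BetaGamma(data, labels, 5, beta=1.0, gamma=1.0)):

        >>> CIFE(data, labels, 5)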
    '''
    return BetaGamma(data, labels, n_select, beta=1.0, gamma=1.0)


def CMIM(data, labels, n_select):
    '''
    This function implements the conditional mutual information
    maximization (CMIM) feature selection algorithm. Note that this
    implementation does not allow the weighting of the redundancy
    terms that BetaGamma allows.

    @param data: A Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select.
    @type n_select: integer
    @return: features in the order that they were selected.
    @rtype: list
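
    Example (illustrative; the returned indices can be used to subset
    the data, though note they are returned as floats):

        >>> selected = CMIM(data, labels, 5)
        >>> reduced = data[:, [int(i) for i in selected]]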
    '''
    data, labels = check_data(data, labels)

    n_observations, n_features = data.shape
    output = np.zeros(n_select)

    # cast the parameters to the C types the library expects
    c_n_observations = c.c_int(n_observations)
    c_n_select = c.c_int(n_select)
    c_n_features = c.c_int(n_features)

    # the C function returns a pointer to an array of n_select doubles
    libFSToolbox.CMIM.restype = c.POINTER(c.c_double * n_select)
    features = libFSToolbox.CMIM(c_n_select,
                                 c_n_observations,
                                 c_n_features,
                                 data.ctypes.data_as(c.POINTER(c.c_double)),
                                 labels.ctypes.data_as(c.POINTER(c.c_double)),
                                 output.ctypes.data_as(c.POINTER(c.c_double))
                                 )

    # shift FEAST's 1-based indices to 0-based indices for Python
    selected_features = []
    for i in features.contents:
        selected_features.append(i - 1)

    return selected_features


def CondMI(data, labels, n_select):
    '''
    This function implements the conditional mutual information
    (CondMI) feature selection algorithm.

    @param data: data in a Numpy array such that len(data) = n_observations,
        and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select.
    @type n_select: integer
    @return: features in the order they were selected.
    @rtype: list
    '''
    data, labels = check_data(data, labels)

    n_observations, n_features = data.shape
    output = np.zeros(n_select)

    # cast the parameters to the C types the library expects
    c_n_observations = c.c_int(n_observations)
    c_n_select = c.c_int(n_select)
    c_n_features = c.c_int(n_features)

    # the C function returns a pointer to an array of n_select doubles
    libFSToolbox.CondMI.restype = c.POINTER(c.c_double * n_select)
    features = libFSToolbox.CondMI(c_n_select,
                                   c_n_observations,
                                   c_n_features,
                                   data.ctypes.data_as(c.POINTER(c.c_double)),
                                   labels.ctypes.data_as(c.POINTER(c.c_double)),
                                   output.ctypes.data_as(c.POINTER(c.c_double))
                                   )

    # shift FEAST's 1-based indices to 0-based indices for Python
    selected_features = []
    for i in features.contents:
        selected_features.append(i - 1)

    return selected_features


def Condred(data, labels, n_select):
    '''
    This function implements the Condred feature selection algorithm.
    beta = 0; gamma = 1;

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select.
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
    '''
    # BetaGamma validates the inputs, so no separate check is needed
    return BetaGamma(data, labels, n_select, beta=0.0, gamma=1.0)


def DISR(data, labels, n_select):
    '''
    This function implements the double input symmetrical relevance
    (DISR) feature selection algorithm.

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
    '''
    data, labels = check_data(data, labels)

    n_observations, n_features = data.shape
    output = np.zeros(n_select)

    # cast the parameters to the C types the library expects
    c_n_observations = c.c_int(n_observations)
    c_n_select = c.c_int(n_select)
    c_n_features = c.c_int(n_features)

    # the C function returns a pointer to an array of n_select doubles
    libFSToolbox.DISR.restype = c.POINTER(c.c_double * n_select)
    features = libFSToolbox.DISR(c_n_select,
                                 c_n_observations,
                                 c_n_features,
                                 data.ctypes.data_as(c.POINTER(c.c_double)),
                                 labels.ctypes.data_as(c.POINTER(c.c_double)),
                                 output.ctypes.data_as(c.POINTER(c.c_double))
                                 )

    # shift FEAST's 1-based indices to 0-based indices for Python
    selected_features = []
    for i in features.contents:
        selected_features.append(i - 1)

    return selected_features


def ICAP(data, labels, n_select):
    '''
    This function implements the interaction capping (ICAP) feature
    selection algorithm.

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
    '''
    data, labels = check_data(data, labels)

    n_observations, n_features = data.shape
    output = np.zeros(n_select)

    # cast the parameters to the C types the library expects
    c_n_observations = c.c_int(n_observations)
    c_n_select = c.c_int(n_select)
    c_n_features = c.c_int(n_features)

    # the C function returns a pointer to an array of n_select doubles
    libFSToolbox.ICAP.restype = c.POINTER(c.c_double * n_select)
    features = libFSToolbox.ICAP(c_n_select,
                                 c_n_observations,
                                 c_n_features,
                                 data.ctypes.data_as(c.POINTER(c.c_double)),
                                 labels.ctypes.data_as(c.POINTER(c.c_double)),
                                 output.ctypes.data_as(c.POINTER(c.c_double))
                                 )

    # shift FEAST's 1-based indices to 0-based indices for Python
    selected_features = []
    for i in features.contents:
        selected_features.append(i - 1)

    return selected_features


def JMI(data, labels, n_select):
    '''
    This function implements the joint mutual information (JMI)
    feature selection algorithm.

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
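
    Example (illustrative; in the evaluation of Brown et al. (2012),
    JMI offered a good tradeoff of accuracy and stability, so it is
    often a reasonable default criterion):

        >>> top5 = JMI(data, labels, 5)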
    '''
    data, labels = check_data(data, labels)

    n_observations, n_features = data.shape
    output = np.zeros(n_select)

    # cast the parameters to the C types the library expects
    c_n_observations = c.c_int(n_observations)
    c_n_select = c.c_int(n_select)
    c_n_features = c.c_int(n_features)

    # the C function returns a pointer to an array of n_select doubles
    libFSToolbox.JMI.restype = c.POINTER(c.c_double * n_select)
    features = libFSToolbox.JMI(c_n_select,
                                c_n_observations,
                                c_n_features,
                                data.ctypes.data_as(c.POINTER(c.c_double)),
                                labels.ctypes.data_as(c.POINTER(c.c_double)),
                                output.ctypes.data_as(c.POINTER(c.c_double))
                                )

    # shift FEAST's 1-based indices to 0-based indices for Python
    selected_features = []
    for i in features.contents:
        selected_features.append(i - 1)

    return selected_features


def MIFS(data, labels, n_select):
    '''
    This function implements the MIFS algorithm.
    beta = 1; gamma = 0;

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
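
    Example (illustrative; the call below is equivalent to
    BetaGamma(data, labels, 5, beta=1.0, gamma=0.0)):

        >>> MIFS(data, labels, 5)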
    '''
    return BetaGamma(data, labels, n_select, beta=1.0, gamma=0.0)


def MIM(data, labels, n_select):
    '''
    This function implements the MIM algorithm.
    beta = 0; gamma = 0;

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
    '''
    # BetaGamma validates the inputs, so no separate check is needed
    return BetaGamma(data, labels, n_select, beta=0.0, gamma=0.0)


def mRMR(data, labels, n_select):
    '''
    This function implements the max-relevance min-redundancy (mRMR)
    feature selection algorithm.

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
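
    Example (illustrative; this wraps the library's mRMR_D routine):

        >>> top10 = mRMR(data, labels, 10)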
    '''
    data, labels = check_data(data, labels)

    n_observations, n_features = data.shape
    output = np.zeros(n_select)

    # cast the parameters to the C types the library expects
    c_n_observations = c.c_int(n_observations)
    c_n_select = c.c_int(n_select)
    c_n_features = c.c_int(n_features)

    # the C function returns a pointer to an array of n_select doubles
    libFSToolbox.mRMR_D.restype = c.POINTER(c.c_double * n_select)
    features = libFSToolbox.mRMR_D(c_n_select,
                                   c_n_observations,
                                   c_n_features,
                                   data.ctypes.data_as(c.POINTER(c.c_double)),
                                   labels.ctypes.data_as(c.POINTER(c.c_double)),
                                   output.ctypes.data_as(c.POINTER(c.c_double))
                                   )

    # shift FEAST's 1-based indices to 0-based indices for Python
    selected_features = []
    for i in features.contents:
        selected_features.append(i - 1)

    return selected_features


def check_data(data, labels):
    '''
    Check the dimensions of the data and the labels, and raise an
    exception if there is a problem.

    Data and labels are automatically cast to doubles before being
    passed to the feature selection functions.

    @param data: the data
    @param labels: the labels
    @return (data, labels): ndarray of floats
    @rtype: tuple
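
    Example (illustrative; integer inputs are promoted to floats):

        >>> d, l = check_data(np.array([[1, 2], [3, 4]]), np.array([0, 1]))
        >>> d.dtype
        dtype('float64')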
    '''
    if not isinstance(data, np.ndarray):
        raise Exception("data must be a numpy ndarray.")
    if not isinstance(labels, np.ndarray):
        raise Exception("labels must be a numpy ndarray.")

    if len(data) != len(labels):
        raise Exception("data and labels must be the same length")

    # multiplying by 1.0 promotes integer arrays to float64, which is
    # the type the C library expects
    return 1.0 * data, 1.0 * labels