From 1e857f0420c6423fb7453ed3cbc6a1d062e97bf3 Mon Sep 17 00:00:00 2001 From: Calvin Date: Fri, 5 Apr 2013 13:51:26 -0400 Subject: added basic docs generated with epydocs, and stripped down --- feast-pysrc.html | 620 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 620 insertions(+) create mode 100644 feast-pysrc.html (limited to 'feast-pysrc.html') diff --git a/feast-pysrc.html b/feast-pysrc.html new file mode 100644 index 0000000..d5f5dc1 --- /dev/null +++ b/feast-pysrc.html @@ -0,0 +1,620 @@ + + + + + feast + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + Module feast + + + + +
+
+

Source Code for Module feast

+
+  1  ''' 
+  2    The FEAST module provides an interface between the C-library 
+  3    for feature selection and Python.  
+  4   
+  5    References:  
+  6    1) G. Brown, A. Pocock, M.-J. Zhao, and M. Lujan, "Conditional 
+  7        likelihood maximization: A unifying framework for information 
+  8        theoretic feature selection," Journal of Machine Learning  
+  9        Research, vol. 13, pp. 27-66, 2012. 
+ 10   
+ 11  ''' 
+ 12  __author__ = "Calvin Morrison" 
+ 13  __copyright__ = "Copyright 2013, EESI Laboratory" 
+ 14  __credits__ = ["Calvin Morrison", "Gregory Ditzler"] 
+ 15  __license__ = "GPL" 
+ 16  __version__ = "0.2.0" 
+ 17  __maintainer__ = "Calvin Morrison" 
+ 18  __email__ = "mutantturkey@gmail.com" 
+ 19  __status__ = "Release" 
+ 20   
+ 21  import numpy as np 
+ 22  import ctypes as c 
+ 23   
# Load the FEAST C library.  ctypes.CDLL raises OSError when the shared
# object cannot be found or loaded; catch only that (the original bare
# "except:" would also have swallowed KeyboardInterrupt/SystemExit) and
# re-raise with a message pointing at the missing library.
try:
  libFSToolbox = c.CDLL("libFSToolbox.so")
except OSError:
  raise Exception("Error: could not load libFSToolbox.so")
+ 28   
+ 29   
+
def BetaGamma(data, labels, n_select, beta=1.0, gamma=1.0):
  '''
    This algorithm implements conditional mutual information
    feature selection, such that beta and gamma control the
    weight attached to the redundant mutual and conditional
    mutual information, respectively.

    @param data: data in a Numpy array such that len(data) =
      n_observations, and len(data.transpose()) = n_features
      (REQUIRED)
    @type data: ndarray
    @param labels: labels represented in a numpy list with
      n_observations as the number of elements. That is
      len(labels) = len(data) = n_observations.
      (REQUIRED)
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @param beta: penalty attached to I(X_j;X_k)
    @type beta: float between 0 and 1.0
    @param gamma: positive weight attached to the conditional
      redundancy term I(X_k;X_j|Y)
    @type gamma: float between 0 and 1.0
    @return: features in the order they were selected.
    @rtype: list
  '''
  data, labels = check_data(data, labels)

  # python values
  n_observations, n_features = data.shape
  output = np.zeros(n_select)

  # cast as C types
  c_n_observations = c.c_int(n_observations)
  c_n_select = c.c_int(n_select)
  c_n_features = c.c_int(n_features)
  c_beta = c.c_double(beta)
  c_gamma = c.c_double(gamma)

  libFSToolbox.BetaGamma.restype = c.POINTER(c.c_double * n_select)
  features = libFSToolbox.BetaGamma(c_n_select,
                                    c_n_observations,
                                    c_n_features,
                                    data.ctypes.data_as(c.POINTER(c.c_double)),
                                    labels.ctypes.data_as(c.POINTER(c.c_double)),
                                    output.ctypes.data_as(c.POINTER(c.c_double)),
                                    c_beta,
                                    c_gamma
                                    )

  # FEAST was implemented with Matlab in mind, so the C library returns
  # one-based feature indices; shift each down by one for Python's
  # zero-based indexing.
  # NOTE(review): the array behind `features` is allocated by the C
  # library and is never freed here -- confirm whether libFSToolbox
  # exposes a matching free routine.
  selected_features = [i - 1 for i in features.contents]

  return selected_features
89 + 90 + 91 +
def CIFE(data, labels, n_select):
  '''
    This function implements the CIFE (conditional infomax feature
    extraction) feature selection algorithm: BetaGamma with
    beta = 1; gamma = 1.
    (The original docstring said "Condred", but Condred is the separate
    beta = 0; gamma = 1 variant implemented below.)

    @param data: A Numpy array such that len(data) =
      n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy list with
      n_observations as the number of elements. That is
      len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select.
    @type n_select: integer
    @return: features in the order they were selected.
    @rtype: list
  '''

  return BetaGamma(data, labels, n_select, beta=1.0, gamma=1.0)
111 +112 +113 +114 +
def CMIM(data, labels, n_select):
  '''
    Conditional mutual information maximization feature selection.
    Note that, unlike BetaGamma, this implementation does not allow
    the redundancy terms to be weighted.

    @param data: A Numpy array such that len(data) =
      n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
      n_observations as the number of elements. That is
      len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select.
    @type n_select: integer
    @return: features in the order that they were selected.
    @rtype: list
  '''
  data, labels = check_data(data, labels)

  num_obs, num_feat = data.shape
  scratch = np.zeros(n_select)

  # the C function fills and returns an array of n_select doubles
  libFSToolbox.CMIM.restype = c.POINTER(c.c_double * n_select)
  picked = libFSToolbox.CMIM(c.c_int(n_select),
                             c.c_int(num_obs),
                             c.c_int(num_feat),
                             data.ctypes.data_as(c.POINTER(c.c_double)),
                             labels.ctypes.data_as(c.POINTER(c.c_double)),
                             scratch.ctypes.data_as(c.POINTER(c.c_double)))

  # FEAST uses Matlab-style one-based indices; convert to zero-based.
  result = []
  for idx in picked.contents:
    result.append(idx - 1)

  return result
164 +165 +166 +
def CondMI(data, labels, n_select):
  '''
    Conditional mutual information maximization feature selection
    algorithm (CondMI entry point of libFSToolbox).

    @param data: data in a Numpy array such that len(data) = n_observations,
      and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: represented in a numpy list with
      n_observations as the number of elements. That is
      len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select.
    @type n_select: integer
    @return: features in the order they were selected.
    @rtype: list
  '''
  data, labels = check_data(data, labels)

  rows, cols = data.shape
  out_buf = np.zeros(n_select)

  dbl_ptr = c.POINTER(c.c_double)
  libFSToolbox.CondMI.restype = c.POINTER(c.c_double * n_select)
  chosen = libFSToolbox.CondMI(c.c_int(n_select),
                               c.c_int(rows),
                               c.c_int(cols),
                               data.ctypes.data_as(dbl_ptr),
                               labels.ctypes.data_as(dbl_ptr),
                               out_buf.ctypes.data_as(dbl_ptr))

  # shift FEAST's Matlab-style one-based indices to Python's zero-based
  return [f - 1 for f in chosen.contents]
214 +215 +
def Condred(data, labels, n_select):
  '''
    This function implements the Condred feature selection algorithm:
    BetaGamma with beta = 0; gamma = 1 (only the conditional
    redundancy term is used).

    @param data: data in a Numpy array such that len(data) =
      n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy list with
      n_observations as the number of elements. That is
      len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select.
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
  '''
  # No separate check_data call here: BetaGamma validates (and casts)
  # its inputs itself, so the extra call was redundant work -- this also
  # matches how CIFE and MIFS delegate.
  return BetaGamma(data, labels, n_select, beta=0.0, gamma=1.0)
236 +237 +238 +
def DISR(data, labels, n_select):
  '''
    Double input symmetrical relevance (DISR) feature selection.

    @param data: data in a Numpy array such that len(data) =
      n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy list with
      n_observations as the number of elements. That is
      len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
  '''
  data, labels = check_data(data, labels)

  observations, feature_count = data.shape
  workspace = np.zeros(n_select)

  libFSToolbox.DISR.restype = c.POINTER(c.c_double * n_select)
  ranked = libFSToolbox.DISR(
      c.c_int(n_select),
      c.c_int(observations),
      c.c_int(feature_count),
      data.ctypes.data_as(c.POINTER(c.c_double)),
      labels.ctypes.data_as(c.POINTER(c.c_double)),
      workspace.ctypes.data_as(c.POINTER(c.c_double)))

  # convert FEAST's Matlab-style one-based indices to zero-based
  return [value - 1 for value in ranked.contents]
286 +287 +288 +289 +
def ICAP(data, labels, n_select):
  '''
    Interaction capping (ICAP) feature selection.

    @param data: data in a Numpy array such that len(data) =
      n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy list with
      n_observations as the number of elements. That is
      len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
  '''
  data, labels = check_data(data, labels)

  sample_count, feature_count = data.shape
  out_array = np.zeros(n_select)

  as_double_ptr = c.POINTER(c.c_double)
  libFSToolbox.ICAP.restype = c.POINTER(c.c_double * n_select)
  selection = libFSToolbox.ICAP(c.c_int(n_select),
                                c.c_int(sample_count),
                                c.c_int(feature_count),
                                data.ctypes.data_as(as_double_ptr),
                                labels.ctypes.data_as(as_double_ptr),
                                out_array.ctypes.data_as(as_double_ptr))

  # FEAST returns Matlab-style one-based indices; re-base to zero.
  zero_based = []
  for entry in selection.contents:
    zero_based.append(entry - 1)
  return zero_based
337 +338 +339 +340 +341 +
def JMI(data, labels, n_select):
  '''
    This function implements the joint mutual information feature
    selection algorithm.

    @param data: data in a Numpy array such that len(data) =
      n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy list with
      n_observations as the number of elements. That is
      len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
  '''
  # validate and cast inputs to float64 ndarrays
  data, labels = check_data(data, labels)

  # python values
  n_observations, n_features = data.shape
  output = np.zeros(n_select)

  # cast as C types
  c_n_observations = c.c_int(n_observations)
  c_n_select = c.c_int(n_select)
  c_n_features = c.c_int(n_features)

  # the C function returns a pointer to an array of n_select doubles
  libFSToolbox.JMI.restype = c.POINTER(c.c_double * n_select)
  features = libFSToolbox.JMI(c_n_select,
     c_n_observations,
     c_n_features,
     data.ctypes.data_as(c.POINTER(c.c_double)),
     labels.ctypes.data_as(c.POINTER(c.c_double)),
     output.ctypes.data_as(c.POINTER(c.c_double))
     )


  # turn our output into a list
  selected_features = []
  for i in features.contents:
    # recall that feast was implemented with Matlab in mind, so the
    # authors assumed the indexing started at one; however, in Python
    # the indexing starts at zero.
    selected_features.append(i - 1)

  return selected_features
389 +390 +391 +
def MIFS(data, labels, n_select):
  '''
    This function implements the MIFS (mutual information feature
    selection) algorithm: BetaGamma with beta = 1; gamma = 0.

    @param data: data in a Numpy array such that len(data) =
      n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy list with
      n_observations as the number of elements. That is
      len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
  '''

  # BUG FIX: beta was previously passed as 0.0, which made MIFS
  # identical to MIM.  Both this function's documented parameters
  # ("beta = 1; gamma = 0") and the MIFS definition in Brown et al.
  # (JMLR 2012) require a unit redundancy penalty.
  return BetaGamma(data, labels, n_select, beta=1.0, gamma=0.0)
411 +412 +
def MIM(data, labels, n_select):
  '''
    This function implements the MIM (mutual information maximization)
    algorithm: BetaGamma with beta = 0; gamma = 0, i.e. features are
    ranked purely by relevance with no redundancy penalty.

    @param data: data in a Numpy array such that len(data) =
      n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy list with
      n_observations as the number of elements. That is
      len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
  '''
  # No separate check_data call here: BetaGamma validates (and casts)
  # its inputs itself, so the extra call was redundant work -- this also
  # matches how CIFE and MIFS delegate.
  return BetaGamma(data, labels, n_select, beta=0.0, gamma=0.0)
433 +434 +435 +
def mRMR(data, labels, n_select):
  '''
    Max-relevance min-redundancy (mRMR) feature selection, backed by
    the mRMR_D entry point of libFSToolbox.

    @param data: data in a Numpy array such that len(data) =
      n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy list with
      n_observations as the number of elements. That is
      len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
  '''
  data, labels = check_data(data, labels)

  n_rows, n_cols = data.shape
  result_buf = np.zeros(n_select)

  double_ptr = c.POINTER(c.c_double)
  libFSToolbox.mRMR_D.restype = c.POINTER(c.c_double * n_select)
  ordering = libFSToolbox.mRMR_D(c.c_int(n_select),
                                 c.c_int(n_rows),
                                 c.c_int(n_cols),
                                 data.ctypes.data_as(double_ptr),
                                 labels.ctypes.data_as(double_ptr),
                                 result_buf.ctypes.data_as(double_ptr))

  # re-base FEAST's Matlab-style one-based indices to zero-based
  return [pos - 1 for pos in ordering.contents]
483 +
def check_data(data, labels):
  '''
    Check the dimensions of the data and the labels and raise an
    exception if there is a problem.

    Data and labels are automatically cast as doubles before calling
    the feature selection functions.

    @param data: the data
    @param labels: the labels
    @return: (data, labels) as ndarrays of floats
    @rtype: tuple
    @raise Exception: if either argument is not an ndarray, or their
      lengths differ
  '''

  if not isinstance(data, np.ndarray):
    raise Exception("data must be an numpy ndarray.")
  if not isinstance(labels, np.ndarray):
    raise Exception("labels must be an numpy ndarray.")

  if len(data) != len(labels):
    raise Exception("data and labels must be the same length")

  # multiplying by 1.0 yields float64 copies, which is the element type
  # the libFSToolbox entry points expect (arrays are passed as double*)
  return 1.0 * data, 1.0 * labels
507 +
+
+ + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + -- cgit v1.2.3