'''
The FEAST module provides an interface between the FEAST C library
for feature selection and Python.

References:
1) G. Brown, A. Pocock, M.-J. Zhao, and M. Lujan, "Conditional
   likelihood maximization: A unifying framework for information
   theoretic feature selection," Journal of Machine Learning
   Research, vol. 13, pp. 27-66, 2012.
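
Example (an illustrative sketch, not part of the original API docs;
it assumes this module is importable as `feast`, that libFSToolbox.so
is on the loader's search path, and that the features are discrete or
pre-binned, as FEAST expects):

    >>> import numpy as np
    >>> import feast
    >>> data = np.random.randint(0, 5, (100, 20)).astype(np.float64)
    >>> labels = np.random.randint(0, 2, 100).astype(np.float64)
    >>> feast.JMI(data, labels, 5)   # indices of the 5 selected features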
'''
__author__ = "Calvin Morrison"
__copyright__ = "Copyright 2013, EESI Laboratory"
__credits__ = ["Calvin Morrison", "Gregory Ditzler"]
__license__ = "GPL"
__version__ = "0.2.0"
__maintainer__ = "Calvin Morrison"
__email__ = "mutantturkey@gmail.com"
__status__ = "Release"

import numpy as np
import ctypes as c

# libFSToolbox.so must be on the dynamic loader's search path
# (e.g., via LD_LIBRARY_PATH) for the load to succeed.
try:
    libFSToolbox = c.CDLL("libFSToolbox.so")
except OSError:
    raise Exception("Error: could not load libFSToolbox.so")


def BetaGamma(data, labels, n_select, beta=1.0, gamma=1.0):
    '''
    This algorithm implements conditional mutual information
    feature selection, where beta and gamma control the weights
    attached to the redundant mutual information and the
    conditional mutual information, respectively.

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
        (REQUIRED)
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
        (REQUIRED)
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @param beta: penalty attached to I(X_j;X_k)
    @type beta: float between 0 and 1.0
    @param gamma: positive weight attached to the conditional
        redundancy term I(X_k;X_j|Y)
    @type gamma: float between 0 and 1.0
    @return: features in the order they were selected.
    @rtype: list
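
    Example (illustrative; `data` and `labels` as described above):

        >>> # penalize redundancy only (MIFS-like behavior)
        >>> BetaGamma(data, labels, 5, beta=1.0, gamma=0.0)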
    '''
    data, labels = check_data(data, labels)

    n_observations, n_features = data.shape
    output = np.zeros(n_select)

    # cast the parameters to the C types the library expects
    c_n_observations = c.c_int(n_observations)
    c_n_select = c.c_int(n_select)
    c_n_features = c.c_int(n_features)
    c_beta = c.c_double(beta)
    c_gamma = c.c_double(gamma)

    # the C function returns a pointer to an array of n_select doubles
    libFSToolbox.BetaGamma.restype = c.POINTER(c.c_double * n_select)
    features = libFSToolbox.BetaGamma(c_n_select,
                                      c_n_observations,
                                      c_n_features,
                                      data.ctypes.data_as(c.POINTER(c.c_double)),
                                      labels.ctypes.data_as(c.POINTER(c.c_double)),
                                      output.ctypes.data_as(c.POINTER(c.c_double)),
                                      c_beta,
                                      c_gamma
                                      )

    # FEAST returns 1-based feature indices (a holdover from its
    # MATLAB origins); shift them to 0-based indices for Python.
    selected_features = []
    for i in features.contents:
        selected_features.append(i - 1)

    return selected_features


def CIFE(data, labels, n_select):
    '''
    This function implements the CIFE feature selection algorithm.
    beta = 1; gamma = 1;

    @param data: A Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select.
    @type n_select: integer
    @return: features in the order they were selected.
    @rtype: list
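
    Example (illustrative; the call below is equivalent to
    BetaGamma(data, labels, 5, beta=1.0, gamma=1.0)):

        >>> CIFE(data, labels, 5)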
    '''
    return BetaGamma(data, labels, n_select, beta=1.0, gamma=1.0)


def CMIM(data, labels, n_select):
    '''
    This function implements the conditional mutual information
    maximization (CMIM) feature selection algorithm. Note that this
    implementation does not allow the weighting of the redundancy
    terms that BetaGamma allows.

    @param data: A Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select.
    @type n_select: integer
    @return: features in the order that they were selected.
    @rtype: list
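
    Example (illustrative; the returned indices can be used to subset
    the data, though note they are returned as floats):

        >>> selected = CMIM(data, labels, 5)
        >>> reduced = data[:, [int(i) for i in selected]]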
    '''
    data, labels = check_data(data, labels)

    n_observations, n_features = data.shape
    output = np.zeros(n_select)

    # cast the parameters to the C types the library expects
    c_n_observations = c.c_int(n_observations)
    c_n_select = c.c_int(n_select)
    c_n_features = c.c_int(n_features)

    # the C function returns a pointer to an array of n_select doubles
    libFSToolbox.CMIM.restype = c.POINTER(c.c_double * n_select)
    features = libFSToolbox.CMIM(c_n_select,
                                 c_n_observations,
                                 c_n_features,
                                 data.ctypes.data_as(c.POINTER(c.c_double)),
                                 labels.ctypes.data_as(c.POINTER(c.c_double)),
                                 output.ctypes.data_as(c.POINTER(c.c_double))
                                 )

    # shift FEAST's 1-based indices to 0-based indices for Python
    selected_features = []
    for i in features.contents:
        selected_features.append(i - 1)

    return selected_features


def CondMI(data, labels, n_select):
    '''
    This function implements the conditional mutual information
    (CondMI) feature selection algorithm.

    @param data: data in a Numpy array such that len(data) = n_observations,
        and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select.
    @type n_select: integer
    @return: features in the order they were selected.
    @rtype: list
    '''
    data, labels = check_data(data, labels)

    n_observations, n_features = data.shape
    output = np.zeros(n_select)

    # cast the parameters to the C types the library expects
    c_n_observations = c.c_int(n_observations)
    c_n_select = c.c_int(n_select)
    c_n_features = c.c_int(n_features)

    # the C function returns a pointer to an array of n_select doubles
    libFSToolbox.CondMI.restype = c.POINTER(c.c_double * n_select)
    features = libFSToolbox.CondMI(c_n_select,
                                   c_n_observations,
                                   c_n_features,
                                   data.ctypes.data_as(c.POINTER(c.c_double)),
                                   labels.ctypes.data_as(c.POINTER(c.c_double)),
                                   output.ctypes.data_as(c.POINTER(c.c_double))
                                   )

    # shift FEAST's 1-based indices to 0-based indices for Python
    selected_features = []
    for i in features.contents:
        selected_features.append(i - 1)

    return selected_features


def Condred(data, labels, n_select):
    '''
    This function implements the Condred feature selection algorithm.
    beta = 0; gamma = 1;

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select.
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
    '''
    # BetaGamma validates the inputs, so no separate check is needed
    return BetaGamma(data, labels, n_select, beta=0.0, gamma=1.0)


def DISR(data, labels, n_select):
    '''
    This function implements the double input symmetrical relevance
    (DISR) feature selection algorithm.

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
    '''
    data, labels = check_data(data, labels)

    n_observations, n_features = data.shape
    output = np.zeros(n_select)

    # cast the parameters to the C types the library expects
    c_n_observations = c.c_int(n_observations)
    c_n_select = c.c_int(n_select)
    c_n_features = c.c_int(n_features)

    # the C function returns a pointer to an array of n_select doubles
    libFSToolbox.DISR.restype = c.POINTER(c.c_double * n_select)
    features = libFSToolbox.DISR(c_n_select,
                                 c_n_observations,
                                 c_n_features,
                                 data.ctypes.data_as(c.POINTER(c.c_double)),
                                 labels.ctypes.data_as(c.POINTER(c.c_double)),
                                 output.ctypes.data_as(c.POINTER(c.c_double))
                                 )

    # shift FEAST's 1-based indices to 0-based indices for Python
    selected_features = []
    for i in features.contents:
        selected_features.append(i - 1)

    return selected_features


def ICAP(data, labels, n_select):
    '''
    This function implements the interaction capping (ICAP) feature
    selection algorithm.

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
    '''
    data, labels = check_data(data, labels)

    n_observations, n_features = data.shape
    output = np.zeros(n_select)

    # cast the parameters to the C types the library expects
    c_n_observations = c.c_int(n_observations)
    c_n_select = c.c_int(n_select)
    c_n_features = c.c_int(n_features)

    # the C function returns a pointer to an array of n_select doubles
    libFSToolbox.ICAP.restype = c.POINTER(c.c_double * n_select)
    features = libFSToolbox.ICAP(c_n_select,
                                 c_n_observations,
                                 c_n_features,
                                 data.ctypes.data_as(c.POINTER(c.c_double)),
                                 labels.ctypes.data_as(c.POINTER(c.c_double)),
                                 output.ctypes.data_as(c.POINTER(c.c_double))
                                 )

    # shift FEAST's 1-based indices to 0-based indices for Python
    selected_features = []
    for i in features.contents:
        selected_features.append(i - 1)

    return selected_features


def JMI(data, labels, n_select):
    '''
    This function implements the joint mutual information (JMI)
    feature selection algorithm.

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
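
    Example (illustrative; in the evaluation of Brown et al. (2012),
    JMI offered a good tradeoff of accuracy and stability, so it is
    often a reasonable default criterion):

        >>> top5 = JMI(data, labels, 5)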
    '''
    data, labels = check_data(data, labels)

    n_observations, n_features = data.shape
    output = np.zeros(n_select)

    # cast the parameters to the C types the library expects
    c_n_observations = c.c_int(n_observations)
    c_n_select = c.c_int(n_select)
    c_n_features = c.c_int(n_features)

    # the C function returns a pointer to an array of n_select doubles
    libFSToolbox.JMI.restype = c.POINTER(c.c_double * n_select)
    features = libFSToolbox.JMI(c_n_select,
                                c_n_observations,
                                c_n_features,
                                data.ctypes.data_as(c.POINTER(c.c_double)),
                                labels.ctypes.data_as(c.POINTER(c.c_double)),
                                output.ctypes.data_as(c.POINTER(c.c_double))
                                )

    # shift FEAST's 1-based indices to 0-based indices for Python
    selected_features = []
    for i in features.contents:
        selected_features.append(i - 1)

    return selected_features


def MIFS(data, labels, n_select):
    '''
    This function implements the MIFS algorithm.
    beta = 1; gamma = 0;

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
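
    Example (illustrative; the call below is equivalent to
    BetaGamma(data, labels, 5, beta=1.0, gamma=0.0)):

        >>> MIFS(data, labels, 5)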
    '''
    return BetaGamma(data, labels, n_select, beta=1.0, gamma=0.0)


def MIM(data, labels, n_select):
    '''
    This function implements the MIM algorithm.
    beta = 0; gamma = 0;

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
    '''
    # BetaGamma validates the inputs, so no separate check is needed
    return BetaGamma(data, labels, n_select, beta=0.0, gamma=0.0)


def mRMR(data, labels, n_select):
    '''
    This function implements the max-relevance min-redundancy (mRMR)
    feature selection algorithm.

    @param data: data in a Numpy array such that len(data) =
        n_observations, and len(data.transpose()) = n_features
    @type data: ndarray
    @param labels: labels represented in a numpy array with
        n_observations as the number of elements. That is
        len(labels) = len(data) = n_observations.
    @type labels: ndarray
    @param n_select: number of features to select. (REQUIRED)
    @type n_select: integer
    @return: the features in the order they were selected.
    @rtype: list
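
    Example (illustrative; this wraps the library's mRMR_D routine):

        >>> top10 = mRMR(data, labels, 10)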
    '''
    data, labels = check_data(data, labels)

    n_observations, n_features = data.shape
    output = np.zeros(n_select)

    # cast the parameters to the C types the library expects
    c_n_observations = c.c_int(n_observations)
    c_n_select = c.c_int(n_select)
    c_n_features = c.c_int(n_features)

    # the C function returns a pointer to an array of n_select doubles
    libFSToolbox.mRMR_D.restype = c.POINTER(c.c_double * n_select)
    features = libFSToolbox.mRMR_D(c_n_select,
                                   c_n_observations,
                                   c_n_features,
                                   data.ctypes.data_as(c.POINTER(c.c_double)),
                                   labels.ctypes.data_as(c.POINTER(c.c_double)),
                                   output.ctypes.data_as(c.POINTER(c.c_double))
                                   )

    # shift FEAST's 1-based indices to 0-based indices for Python
    selected_features = []
    for i in features.contents:
        selected_features.append(i - 1)

    return selected_features


def check_data(data, labels):
    '''
    Check the dimensions of the data and the labels, and raise an
    exception if there is a problem.

    Data and labels are automatically cast to doubles before being
    passed to the feature selection functions.

    @param data: the data
    @param labels: the labels
    @return (data, labels): ndarray of floats
    @rtype: tuple
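
    Example (illustrative; integer inputs are promoted to floats):

        >>> d, l = check_data(np.array([[1, 2], [3, 4]]), np.array([0, 1]))
        >>> d.dtype
        dtype('float64')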
    '''
    if not isinstance(data, np.ndarray):
        raise Exception("data must be a numpy ndarray.")
    if not isinstance(labels, np.ndarray):
        raise Exception("labels must be a numpy ndarray.")

    if len(data) != len(labels):
        raise Exception("data and labels must be the same length")

    # multiplying by 1.0 promotes integer arrays to float64, which is
    # the type the C library expects
    return 1.0 * data, 1.0 * labels