Search Machine Learning Repository: @inproceedings{icml2014c2_jawanpuria14,
    Publisher = {JMLR Workshop and Conference Proceedings},
    Title = {On p-norm Path Following in Multiple Kernel Learning for Non-linear Feature Selection},
    Url = {http://jmlr.org/proceedings/papers/v32/jawanpuria14.pdf},
    Abstract = {Our objective is to develop formulations and algorithms for efficiently computing the feature selection path -- i.e. the variation in classification accuracy as the fraction of selected features is varied from null to unity. Multiple Kernel Learning subject to $l_{p\geq1}$ regularization ($l_{p}$-MKL) has been demonstrated to be one of the most effective techniques for non-linear feature selection. However, state-of-the-art $l_p$-MKL algorithms are too computationally expensive to be invoked thousands of times to determine the entire path. We propose a novel conjecture which states that, for certain $l_p$-MKL formulations, the number of features selected in the optimal solution monotonically decreases as $p$ is decreased from an initial value to unity. We prove the conjecture, for a generic family of kernel target alignment based formulations, and show that the feature weights themselves decay (grow) monotonically once they are below (above) a certain threshold at optimality. This allows us to develop a path following algorithm that systematically generates optimal feature sets of decreasing size. The proposed algorithm sets certain feature weights directly to zero for potentially large intervals of $p$ thereby reducing optimization costs while simultaneously providing approximation guarantees. We empirically demonstrate that our formulation can lead to classification accuracies which are as much as 10\% higher on benchmark data sets not only as compared to other $l_p$-MKL formulations and uniform kernel baselines but also leading feature selection methods. We further demonstrate that our algorithm reduces training time significantly over other path following algorithms and state-of-the-art $l_p$-MKL optimizers such as SMO-MKL. In particular, we generate the entire feature selection path for data sets with a hundred thousand features in approximately half an hour on standard hardware.},
    Author = {Pratik Jawanpuria and Manik Varma and Saketha Nath},
    Editor = {Tony Jebara and Eric P. Xing},
    Year = {2014},
    Booktitle = {Proceedings of the 31st International Conference on Machine Learning (ICML-14)},
    Pages = {118-126}
   }