Publisher = {JMLR Workshop and Conference Proceedings},

Title = {Coding for Random Projections},

Url = {http://jmlr.org/proceedings/papers/v32/lie14.pdf},

Abstract = {The method of random projections has become popular for large-scale applications in statistical learning, information retrieval, bio-informatics and other applications. Using a well-designed coding scheme for the projected data, which determines the number of bits needed for each projected value and how to allocate these bits, can significantly improve the effectiveness of the algorithm, in storage cost as well as computational speed. In this paper, we study a number of simple coding schemes, focusing on the task of similarity estimation and on an application to training linear classifiers. We demonstrate that uniform quantization outperforms the standard and influential method (Datar et al., 2004), which used a window-and-random offset scheme. Indeed, we argue that in many cases coding with just a small number of bits suffices. Furthermore, we also develop a non-uniform 2-bit coding scheme that generally performs well in practice, as confirmed by our experiments on training linear support vector machines (SVM). Proofs and additional experiments are available at arXiv:1308.2218. In the context of using coded random projections for approximate near neighbor search by building hash tables (arXiv:1403.8144), we show that the step of random offset in Datar et al. (2004) is again not needed and may hurt the performance. Furthermore, we show that, unless the target similarity level is high, it usually suffices to use only 1 or 2 bits to code each hashed value for this task. The paper also presents some experimental results for LSH.},

Author = {Li, Ping and Mitzenmacher, Michael and Shrivastava, Anshumali},

Editor = {Jebara, Tony and Xing, Eric P.},

Year = {2014},

Booktitle = {Proceedings of the 31st International Conference on Machine Learning ({ICML-14})},

Pages = {676--684}

}