Publisher = {JMLR Workshop and Conference Proceedings},

Title = {A new Q(lambda) with interim forward view and Monte Carlo equivalence},

Url = {http://jmlr.org/proceedings/papers/v32/sutton14.pdf},

Abstract = {Q-learning, the most popular of reinforcement learning algorithms, has always included an extension to eligibility traces to enable more rapid learning and improved asymptotic performance on non-Markov problems. The lambda parameter smoothly shifts on-policy algorithms such as TD(lambda) and Sarsa(lambda) from a pure bootstrapping form (lambda=0) to a pure Monte Carlo form (lambda=1). In off-policy algorithms, including Q(lambda), GQ(lambda), and off-policy LSTD(lambda), the lambda parameter is intended to play the same role, but does not; on every exploratory action these algorithms bootstrap regardless of the value of lambda, and as a result they fail to approximate Monte Carlo learning when lambda=1. It may seem that this is inevitable for any online off-policy algorithm; if updates are made on each step on which the target policy is followed, then how could just the right updates be `un-made' upon deviation from the target policy? In this paper, we introduce a new version of Q(lambda) that does exactly that, without significantly increased algorithmic complexity. En route to our new Q(lambda), we introduce a new derivation technique based on the forward-view/backward-view analysis familiar from TD(lambda) but extended to apply at every time step rather than only at the end of episodes. We apply this technique to derive first a new off-policy version of TD(lambda), called PTD(lambda), and then our new Q(lambda), called PQ(lambda).},

Author = {Rich Sutton and Ashique R. Mahmood and Doina Precup and Hado V. Hasselt},

Editor = {Tony Jebara and Eric P. Xing},

Year = {2014},

Booktitle = {Proceedings of the 31st International Conference on Machine Learning (ICML-14)},

Pages = {568-576}

}