@article{TVP_2019_64_1_a3,
author = {D. B. Rokhlin},
title = {$Q$-learning in a stochastic {Stackelberg} game between an uninformed leader and a naive follower},
journal = {Teori\^{a} vero\^{a}tnostej i ee primeneni\^{a}},
pages = {53--74},
year = {2019},
volume = {64},
number = {1},
language = {ru},
url = {http://geodesic.mathdoc.fr/item/TVP_2019_64_1_a3/}
}
TY  - JOUR
AU  - D. B. Rokhlin
TI  - $Q$-learning in a stochastic Stackelberg game between an uninformed leader and a naive follower
JO  - Teoriâ veroâtnostej i ee primeneniâ
PY  - 2019
SP  - 53
EP  - 74
VL  - 64
IS  - 1
UR  - http://geodesic.mathdoc.fr/item/TVP_2019_64_1_a3/
LA  - ru
ID  - TVP_2019_64_1_a3
ER  - 
D. B. Rokhlin. $Q$-learning in a stochastic Stackelberg game between an uninformed leader and a naive follower. Teoriâ veroâtnostej i ee primeneniâ, Volume 64 (2019) no. 1, pp. 53–74. http://geodesic.mathdoc.fr/item/TVP_2019_64_1_a3/
[1] E. Behrends, Introduction to Markov chains. With special emphasis on rapid mixing, Adv. Lectures Math., Friedr. Vieweg & Sohn, Braunschweig, 2000, x+232 pp. | DOI | MR | Zbl
[2] D. P. Bertsekas, J. N. Tsitsiklis, Neuro-dynamic programming, Athena Scientific, Belmont, MA, 1996, xiii+491 pp. | Zbl
[3] M. Bowling, M. Veloso, “Multiagent learning using a variable learning rate”, Artificial Intelligence, 136:2 (2002), 215–250 | DOI | MR | Zbl
[4] M. Breton, A. Alj, A. Haurie, “Sequential Stackelberg equilibria in two-person games”, J. Optim. Theory Appl., 59:1 (1988), 71–97 | DOI | MR | Zbl
[5] L. Buşoniu, R. Babuška, B. De Schutter, “A comprehensive survey of multiagent reinforcement learning”, IEEE Trans. Systems Man Cybernet., Part C (Appl. Rev.), 38:2 (2008), 156–172 | DOI
[6] A. Gosavi, “Boundedness of iterates in $Q$-learning”, Systems Control Lett., 55:4 (2006), 347–349 | DOI | MR | Zbl
[7] A. Greenwald, K. Hall, “Correlated $Q$-learning”, Proceedings of the twentieth international conference on machine learning (ICML 2003) (Washington, DC, 2003), AAAI Press, Menlo Park, CA, 2004, 242–249
[8] J. Hu, M. P. Wellman, “Multiagent reinforcement learning: theoretical framework and an algorithm”, Proceedings of the fifteenth international conference on machine learning (ICML 1998) (Madison, 1998), Morgan Kaufmann Publishers Inc., San Francisco, CA, 1998, 242–250
[9] T. Jaakkola, M. I. Jordan, S. P. Singh, “Convergence of stochastic iterative dynamic programming algorithms”, Advances in neural information processing systems 6 (NIPS 1993) (Denver, 1993), Morgan Kaufmann Publishers Inc., San Francisco, CA, 1994, 703–710 https://papers.nips.cc/paper/764-convergence-of-stochastic-iterative-dynamic-programming-algorithms
[10] V. Könönen, “Asymmetric multiagent reinforcement learning”, Proceedings of the IEEE/WIC international conference on intelligent agent technology (IAT 2003) (Halifax, NS, 2003), IEEE Computer Soc., Los Alamitos, CA, 2003, 336–342 | DOI
[11] V. Könönen, “Asymmetric multiagent reinforcement learning”, Web Intelligence and Agent Systems (WIAS), 2:2 (2004), 105–121
[12] G. J. Laurent, L. Matignon, N. Le Fort-Piat, “The world of independent learners is not Markovian”, Internat. J. Knowledge-Based and Intelligent Engineering Systems, 15:1 (2011), 55–64 | DOI
[13] O. Hernández-Lerma, J. B. Lasserre, Discrete-time Markov control processes. Basic optimality criteria, Appl. Math. (N. Y.), 30, Springer-Verlag, New York, 1996, xiv+216 pp. | DOI | MR | Zbl
[14] M. L. Littman, “Markov games as a framework for multi-agent reinforcement learning”, Machine learning: Proceedings of the eleventh international conference (ICML 1994) (New Brunswick, NJ, 1994), Morgan Kaufmann Publishers Inc., San Francisco, CA, 1994, 157–163
[15] M. L. Littman, “Value-function reinforcement learning in Markov games”, Cognitive Systems Research, 2:1 (2001), 55–66 | DOI
[16] P.-A. Meyer, Martingales and stochastic integrals I, Lecture Notes in Math., 284, Springer-Verlag, Berlin–New York, 1972, vi+89 pp. | DOI | MR | Zbl
[17] D. B. Rokhlin, Robbins–Monro conditions for persistent exploration learning strategies, 2018, 9 pp., arXiv: 1808.00245
[18] R. S. Sutton, A. G. Barto, Reinforcement learning. An introduction, Adapt. Comput. Mach. Learn., MIT Press, Cambridge, MA, 1998, 344 pp.
[19] C. Szepesvári, M. L. Littman, “A unified analysis of value-function-based reinforcement-learning algorithms”, Neural Computation, 11:8 (1999), 2017–2060 | DOI
[20] J. N. Tsitsiklis, “Asynchronous stochastic approximation and $Q$-learning”, Mach. Learn., 16:3 (1994), 185–202 | DOI | Zbl
[21] C. J. C. H. Watkins, Learning from delayed rewards, PhD thesis, Univ. of Cambridge, Cambridge, UK, 1989, vi+234 pp. http://www.cs.rhul.ac.uk/~chrisw/thesis.html
[22] M. Weinberg, J. S. Rosenschein, “Best-response multiagent learning in non-stationary environments”, Proceedings of the third international joint conference on autonomous agents and multiagent systems (AAMAS 2004) (New York, 2004), v. 2, IEEE Computer Soc., Washington, DC, 2004, 506–513