Commit f3fab20d authored by tuhe's avatar tuhe

LaTeX draft

parent 2b947597
@InProceedings{liu22a,
title = {Distributionally Robust $Q$-Learning},
author = {Liu, Zijian and Bai, Qinxun and Blanchet, Jose and Dong, Perry and Xu, Wei and Zhou, Zhengqing and Zhou, Zhengyuan},
booktitle = {Proceedings of the 39th International Conference on Machine Learning},
pages = {13623--13643},
year = {2022},
editor = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},
volume = {162},
series = {Proceedings of Machine Learning Research},
month = {17--23 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v162/liu22a/liu22a.pdf},
url = {https://proceedings.mlr.press/v162/liu22a.html},
abstract = {Reinforcement learning (RL) has demonstrated remarkable achievements in simulated environments. However, carrying this success to real environments requires the important attribute of robustness, which the existing RL algorithms often lack as they assume that the future deployment environment is the same as the training environment (i.e. simulator) in which the policy is learned. This assumption often does not hold due to the discrepancy between the simulator and the real environment and, as a result, and hence renders the learned policy fragile when deployed. In this paper, we propose a novel distributionally robust $Q$-learning algorithm that learns the best policy in the worst distributional perturbation of the environment. Our algorithm first transforms the infinite-dimensional learning problem (since the environment MDP perturbation lies in an infinite-dimensional space) into a finite-dimensional dual problem and subsequently uses a multi-level Monte-Carlo scheme to approximate the dual value using samples from the simulator. Despite the complexity, we show that the resulting distributionally robust $Q$-learning algorithm asymptotically converges to optimal worst-case policy, thus making it robust to future environment changes. Simulation results further demonstrate its strong empirical robustness.}
}
\relax
\providecommand\hyper@newdestlabel[2]{}
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldcontentsline\contentsline
\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\contentsline\oldcontentsline
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand\HyField@AuxAddToFields[1]{}
\providecommand\HyField@AuxAddToCoFields[2]{}
\providecommand\babel@aux[2]{}
\@nameuse{bbl@beforestart}
\citation{liu22a}
\babel@aux{english}{}
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent }
\bibstyle{alpha}
\bibdata{pensum}
\bibcite{liu22a}{LBB{$^{+}$}22}
\gdef \@abspage@last{2}
\newcommand{\etalchar}[1]{$^{#1}$}
\begin{thebibliography}{LBB{\etalchar{+}}22}
\bibitem[LBB{\etalchar{+}}22]{liu22a}
Zijian Liu, Qinxun Bai, Jose Blanchet, Perry Dong, Wei Xu, Zhengqing Zhou, and
Zhengyuan Zhou.
\newblock Distributionally robust $q$-learning.
\newblock In Kamalika Chaudhuri, Stefanie Jegelka, Le~Song, Csaba Szepesvari,
Gang Niu, and Sivan Sabato, editors, {\em Proceedings of the 39th
International Conference on Machine Learning}, volume 162 of {\em Proceedings
of Machine Learning Research}, pages 13623--13643. PMLR, 17--23 Jul 2022.
\end{thebibliography}
This is BibTeX, Version 0.99d (TeX Live 2022/dev/Debian)
Capacity: max_strings=200000, hash_size=200000, hash_prime=170003
The top-level auxiliary file: robustq.aux
The style file: alpha.bst
Database file #1: pensum.bib
You've used 1 entry,
2543 wiz_defined-function locations,
569 strings with 4844 characters,
and the built_in function-call counts, 925 in all, are:
= -- 83
> -- 65
< -- 2
+ -- 23
- -- 23
* -- 84
:= -- 151
add.period$ -- 4
call.type$ -- 1
change.case$ -- 11
chr.to.int$ -- 1
cite$ -- 1
duplicate$ -- 31
empty$ -- 53
format.name$ -- 23
if$ -- 191
int.to.chr$ -- 1
int.to.str$ -- 0
missing$ -- 1
newline$ -- 9
num.names$ -- 5
pop$ -- 11
preamble$ -- 1
purify$ -- 12
quote$ -- 0
skip$ -- 29
stack$ -- 0
substring$ -- 55
swap$ -- 17
text.length$ -- 2
text.prefix$ -- 0
top$ -- 0
type$ -- 8
warning$ -- 0
while$ -- 7
width$ -- 2
write$ -- 18
\BOOKMARK [1][-]{section.1}{\376\377\000I\000n\000t\000r\000o\000d\000u\000c\000t\000i\000o\000n}{}% 1
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% DO NOT EDIT THIS FILE. IT WILL BE AUTOMATICALLY OVERWRITTEN
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% TeX-command-extra-options: "-shell-escape"
\documentclass[12pt,twoside]{article}
\usepackage[table]{xcolor} % important to avoid options clash.
%\input{book_preamble}
% This preamble file is supposed to be shared with the slides.
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{tcolorbox}
\usepackage{etoolbox}
\usepackage{hyperref}
\usepackage{cleveref}
\usepackage{url}
\usepackage{xspace}
\usepackage{graphics}
%\usepackage{fancybox}
\usepackage{multicol}
\usepackage{rotate}
%\usepackage{epsf}
\usepackage{rotating}
%\usepackage{color}
\usepackage{booktabs}
\usepackage{pifont}
\usepackage{latexsym}
\usepackage[english]{babel}
\usepackage{epstopdf}
%\usepackage{epsfig}
\usepackage{multirow}
%\usepackage{cite}
\usepackage{fancyhdr}
%\usepackage[most]{tcolorbox}
\definecolor{LightGray}{HTML}{EEEEEE}
\usepackage{todonotes}
\newcommand{\m}[1]{\boldsymbol{ #1}}
\newcommand{\EE}{\mathbb{E}}
\newcommand{\Var}{\operatorname{Var} }
\newcommand{\std}{\operatorname{std} }
\title{Robust $Q$-learning}
\author{Tue Herlau}
\begin{document}
\maketitle
\section{Introduction}
The work is an extension of~\cite{liu22a}.
The idea is fairly straightforward. The $Q$-update that is robust to all perturbations of the transition kernel within a KL distance of $\delta$ is given as a worst-case problem over transition kernels. The resulting implementation, however, is computationally unwieldy.
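For reference, and up to notational differences with~\cite{liu22a}, the KL-robust update replaces the expected next-state value with its worst case over all transition kernels within KL distance $\delta$ of the nominal kernel $p_0$; by the standard duality for KL-constrained worst cases this reduces to a one-dimensional problem in a dual variable $\beta \ge 0$:
\begin{align}
\inf_{p \,:\, \mathrm{KL}(p \,\|\, p_0) \le \delta} \EE_{p}\left[ V(s') \right]
= \sup_{\beta \ge 0} \left\{ -\beta \log \EE_{p_0}\left[ e^{-V(s')/\beta} \right] - \beta\delta \right\}.
\end{align}
The expectation $\EE_{p_0}\left[ e^{-V(s')/\beta} \right]$ is the quantity we compute in closed form below.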
We try to fix this by decomposing the value function. Let the temporal observations be:
\begin{align}
x_0, x_1, \cdots
\end{align}
We define a state as a block of $K$ of these:
\begin{align}
s_t = x_{t-K+1:t}
\end{align}
This means that the transition probability of the states is partly deterministic: the last $K-1$ elements of $s_t$ are simply copied into the first $K-1$ elements of $s_{t+1}$.
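As a concrete illustration (a minimal sketch of my own, not part of the draft; names are hypothetical and scalar observations are assumed), the block states can be maintained with a sliding window:
\begin{verbatim}
from collections import deque

def block_states(xs, K):
    # Turn a stream of observations x_0, x_1, ... into block states
    # s_t = (x_{t-K+1}, ..., x_t), emitted once K observations are available.
    window = deque(maxlen=K)   # keeps only the most recent K observations
    for x in xs:
        window.append(x)
        if len(window) == K:
            yield tuple(window)
\end{verbatim}
Consecutive states produced this way overlap in $K-1$ entries, which is exactly the deterministic copying described above.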
Let $\hat s_t = x_{t-K+1:t-1}$ and write $s_t = s = \begin{bmatrix} \hat s & x\end{bmatrix}$. We then parameterize the optimal value function $V(s)$ as a quadratic function of $x$:
\begin{align}
V(s) \approx \hat V(s) = a(\hat s) + \frac{1}{2\sigma^v(\hat s)^2 }\left(\mu^v(\hat s) - x\right)^2
\end{align}
We also assume that the transition dynamics are Gaussian in the new observation $x'$:
\begin{align}
p_0(s'| s, a) = \mathcal{N}(x' |\mu^f(s,a), \sigma^f(s,a)^2 )
\end{align}
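To make the next step explicit (this intermediate identity is used implicitly in the draft), note that under the quadratic parameterization $e^{-\hat V(s')/\beta}$ is an unnormalized Gaussian density in $x'$:
\begin{align}
e^{-\hat V(s')/\beta}
= e^{-a(\hat s')/\beta}\, \sqrt{2\pi\beta\,\sigma^v(\hat s')^2}\;
\mathcal{N}\!\left(x' \mid \mu^v(\hat s'),\, \beta\,\sigma^v(\hat s')^2\right),
\end{align}
and the integral of a product of two Gaussian densities is itself a Gaussian in the difference of their means, $\int \mathcal{N}(x' \mid \mu^f, (\sigma^f)^2)\, \mathcal{N}(x' \mid \mu^v, \beta(\sigma^v)^2)\, dx' = \mathcal{N}\!\left(\mu^f \mid \mu^v, (\sigma^f)^2 + \beta(\sigma^v)^2\right)$.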
This parameterization allows us to evaluate the expectation in closed form and obtain an explicit expression for the dual problem. The integral is a standard product of two Gaussians and, writing $a$, $\mu^v$, $\sigma^v$ for the value parameters at $\hat s'$ and $\mu^f$, $\sigma^f$ for the dynamics parameters of $p_0(\cdot \mid s,a)$, it becomes:
\begin{align}
\EE_{p_0}\left[ e^{ \frac{-V(s') }{\beta} } \right]
& = e^{-a/\beta} \sqrt{ 2\pi \beta \left( \sigma^v \right)^2 } \left[
\frac{ e^{\frac{-1}{2}\frac{ \left( \mu^f - \mu^v \right)^2}{S_\beta} } }{
\sqrt{ 2\pi S_\beta }
}
\right], \quad S_\beta = \left( \sigma^f \right)^2 + \beta \left( \sigma^v \right)^2
\end{align}
The dual objective $-\beta\log \EE_{p_0}\left[ e^{-V(s')/\beta} \right] - \beta\delta$ can then be maximized with respect to $\beta \ge 0$ using a simple bisection on its derivative, or a similar one-dimensional search.
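As an illustration of how cheap this one-dimensional step is, the following sketch (my own, not part of the draft; function names, parameter names, and the bracketing interval are hypothetical) maximizes the closed-form dual objective over $\beta$ with SciPy's bounded scalar optimizer:
\begin{verbatim}
import numpy as np
from scipy.optimize import minimize_scalar

def dual_objective(beta, mu_f, sig_f, mu_v, sig_v, a, delta):
    # -beta * log E_{p0}[exp(-V(s')/beta)] - beta*delta, using the
    # closed-form Gaussian expression derived above.
    S = sig_f**2 + beta * sig_v**2
    log_E = (-a / beta
             + 0.5 * np.log(2 * np.pi * beta * sig_v**2)
             - 0.5 * (mu_f - mu_v)**2 / S
             - 0.5 * np.log(2 * np.pi * S))
    return -beta * log_E - beta * delta

def robust_value(mu_f, sig_f, mu_v, sig_v, a, delta, beta_max=1e3):
    # Maximize over beta > 0 by minimizing the negated objective on a
    # bounded interval (the bounds here are placeholders).
    res = minimize_scalar(
        lambda b: -dual_objective(b, mu_f, sig_f, mu_v, sig_v, a, delta),
        bounds=(1e-6, beta_max), method="bounded")
    return dual_objective(res.x, mu_f, sig_f, mu_v, sig_v, a, delta)
\end{verbatim}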
\bibliographystyle{alpha}
\bibliography{pensum}
\end{document}