Commit f20897a3 authored by tuhe

minor tex tweaks + references

parent fc9fbc05
@@ -21,8 +21,11 @@
\citation{liu22a}
\babel@aux{english}{}
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {1.1}Bellman operator}{2}{subsection.1.1}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {2}Generalization:}{4}{section.2}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {3}Question: Can we use an embedding?}{5}{section.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.0.1}Defining objectives}{6}{subsubsection.3.0.1}\protected@file@percent }
\bibstyle{alpha}
\bibdata{pensum}
\bibcite{liu22a}{LBB{$^{+}$}22}
\gdef \@abspage@last{8}
This is pdfTeX, Version 3.141592653-2.6-1.40.22 (TeX Live 2022/dev/Debian) (preloaded format=pdflatex 2022.8.30) 16 NOV 2022 14:30
entering extended mode
 \write18 enabled.
 %&-line parsing enabled.
@@ -832,24 +832,24 @@ File: l3backend-pdftex.def 2022-01-12 L3 backend support: PDF output (pdfTeX)
(./robustq.aux)
\openout1 = `robustq.aux'.
LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 62.
LaTeX Font Info: ... okay on input line 62.
LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 62.
LaTeX Font Info: ... okay on input line 62.
LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 62.
LaTeX Font Info: ... okay on input line 62.
LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 62.
LaTeX Font Info: ... okay on input line 62.
LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 62.
LaTeX Font Info: ... okay on input line 62.
LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 62.
LaTeX Font Info: ... okay on input line 62.
LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 62.
LaTeX Font Info: ... okay on input line 62.
LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 62.
LaTeX Font Info: ... okay on input line 62.
LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 62.
LaTeX Font Info: ... okay on input line 62.
(/usr/share/texlive/texmf-dist/tex/context/base/mkii/supp-pdf.mkii
[Loading MPS to PDF converter (version 2006.09.02).]
@@ -865,7 +865,7 @@ LaTeX Font Info: ... okay on input line 59.
\makeMPintoPDFobject=\count336
\everyMPtoPDFconversion=\toks42
)
Package hyperref Info: Link coloring OFF on input line 62.
(/usr/share/texlive/texmf-dist/tex/latex/hyperref/nameref.sty
Package: nameref 2021-04-02 v2.47 Cross-referencing by name of section
@@ -877,52 +877,61 @@ Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO)
)
\c@section@level=\count337
)
LaTeX Info: Redefining \ref on input line 62.
LaTeX Info: Redefining \pageref on input line 62.
LaTeX Info: Redefining \nameref on input line 62.
(./robustq.out) (./robustq.out)
\@outlinefile=\write4
\openout4 = `robustq.out'.
LaTeX Font Info: Trying to load font information for U+msa on input line 64.
(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/umsa.fd
File: umsa.fd 2013/01/14 v3.01 AMS symbols A
)
LaTeX Font Info: Trying to load font information for U+msb on input line 64.
(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/umsb.fd
File: umsb.fd 2013/01/14 v3.01 AMS symbols B
)
LaTeX Font Info: Trying to load font information for U+lasy on input line 64.
(/usr/share/texlive/texmf-dist/tex/latex/base/ulasy.fd
File: ulasy.fd 1998/08/17 v2.2e LaTeX symbol font definitions
) [1
{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}] [2] [3] [4]
Package hyperref Warning: Difference (2) between bookmark levels is greater
(hyperref) than one, level fixed on input line 239.
[5]
Overfull \hbox (18.79576pt too wide) in paragraph at lines 251--253
\OT1/cmr/m/n/12 Re-writing this, and as-sum-ing for sim-plic-ity re-ward only d
e-pends on state/action,
 []
Overfull \hbox (12.91394pt too wide) detected at line 268
 []
 []
[6] (./robustq.bbl [7]) [8] (./robustq.aux)
Package rerunfilecheck Info: File `robustq.out' has not changed.
(rerunfilecheck) Checksum: 536FAB75290B38F04B1B63A7D8E9D0DF;749.
)
Here is how much of TeX's memory you used:
 26020 strings out of 479648
 503151 string characters out of 5886115
 774386 words of memory out of 5000000
 43472 multiletter control sequences out of 15000+600000
 479695 words of font info for 71 fonts, out of 8000000 for 9000
 475 hyphenation exceptions out of 8191
 99i,15n,104p,477b,461s stack positions out of 5000i,500n,10000p,200000b,80000s
</usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmbx12.pfb></us
r/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmex10.pfb></usr/shar
e/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi12.pfb></usr/share/texl
@@ -936,10 +945,10 @@ m/cmsy10.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy
6.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy8.pfb><
/usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmti12.pfb></usr/s
hare/texlive/texmf-dist/fonts/type1/public/amsfonts/symbols/msbm10.pfb>
Output written on robustq.pdf (8 pages, 184305 bytes).
PDF statistics:
 208 PDF objects out of 1000 (max. 8388607)
 168 compressed objects within 2 object streams
 69 named destinations out of 1000 (max. 500000)
 53 words of extra memory for PDF output out of 10000 (max. 10000000)
\BOOKMARK [1][-]{section.1}{\376\377\000I\000n\000t\000r\000o\000d\000u\000c\000t\000i\000o\000n}{}% 1
\BOOKMARK [2][-]{subsection.1.1}{\376\377\000B\000e\000l\000l\000m\000a\000n\000\040\000o\000p\000e\000r\000a\000t\000o\000r}{section.1}% 2
\BOOKMARK [1][-]{section.2}{\376\377\000G\000e\000n\000e\000r\000a\000l\000i\000z\000a\000t\000i\000o\000n\000:}{}% 3
\BOOKMARK [1][-]{section.3}{\376\377\000Q\000u\000e\000s\000t\000i\000o\000n\000:\000\040\000C\000a\000n\000\040\000w\000e\000\040\000u\000s\000e\000\040\000a\000n\000\040\000e\000m\000b\000e\000d\000d\000i\000n\000g\000?}{}% 4
\BOOKMARK [2][-]{subsubsection.3.0.1}{\376\377\000D\000e\000f\000i\000n\000i\000n\000g\000\040\000o\000b\000j\000e\000c\000t\000i\000v\000e\000s}{section.3}% 5
@@ -52,6 +52,9 @@
\newcommand{\EE}{\mathbb{E}}
\newcommand{\Var}{\operatorname{Var} }
\newcommand{\std}{\operatorname{std} }
\newcommand{\rob}{\mathrm{rob} }
\newcommand{\KL}{\operatorname{KL} }
\newcommand{\Trob}{\mathcal{T}^{\text{rob}}_\delta }
\title{Robust Q learning}
\author{Tue Herlau}
@@ -84,29 +87,231 @@ We also assume that the transition dynamics is jointly Gaussian, viz.:
p_0(s'| s, a) = \mathcal{N}(s' |\mu^f(s,a), \sigma^f(s,a)^2 )
\end{align}
\subsection{Bellman operator}
The Bellman operator defined in the main reference is:
\begin{align}
\Trob(Q)(s,a) = r(s,a) + \gamma \sup_{\beta \geq 0}\left\{
-\beta \log \left( \EE_{p^0_{s,a} } \left[ e^{ -\frac{1}{\beta}\max_b Q(s',b) } \right] \right) - \beta \delta
\right\}
\end{align}
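As a quick sanity check, the backup can be evaluated numerically by drawing samples $s'\sim p^0_{s,a}$ and maximizing over $\beta$ directly. The sketch below assumes Monte Carlo samples of $\max_b Q(s',b)$ are available; the function names and the value $\gamma = 0.99$ are illustrative, not taken from the reference.
\begin{verbatim}
import numpy as np
from scipy.optimize import minimize_scalar
from scipy.special import logsumexp

def robust_backup(r_sa, next_values, delta, gamma=0.99):
    # One application of the robust operator at (s,a);
    # next_values[i] = max_b Q(s'_i, b) for samples s'_i ~ p0(.|s,a).
    def neg_objective(log_beta):
        beta = np.exp(log_beta)  # optimize over log(beta) so beta > 0
        log_mean = logsumexp(-next_values / beta) - np.log(len(next_values))
        return -(-beta * log_mean - beta * delta)
    res = minimize_scalar(neg_objective, bounds=(-10, 10), method="bounded")
    return r_sa + gamma * (-res.fun)

rng = np.random.default_rng(0)
print(robust_backup(1.0, rng.normal(2.0, 1.0, size=10_000), delta=0.1))
\end{verbatim}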
This parametrization now allows us to solve the integral and get an explicit expression for the minimization problem. The integral is a standard product of two Gaussians and it becomes:
\begin{align}
\EE_{p_0}\left[ e^{ \frac{-V(s') }{\beta} } \right]
& = e^{-\frac{a}{\beta}} \sqrt{ 2\pi \beta \left( \sigma^v \right)^2 } \left[
\frac{ e^{\frac{-1}{2}\frac{ \left( \mu^f - \mu^v \right)^2}{S_\beta^2} } }{
\sqrt{ 2\pi S_\beta^2 }
}
\right], \quad S_\beta^2 = \left( \sigma^f \right)^2 + \beta \left( \sigma^v \right)^2
\end{align}
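The closed form is easy to check numerically. The sketch below assumes, as the closed form suggests, the parametrization $V(s') = a + (s'-\mu^v)^2 / (2 (\sigma^v)^2)$, and compares against direct quadrature; the constants are illustrative.
\begin{verbatim}
import numpy as np
from scipy import integrate

a, mu_v, sigma_v, mu_f, sigma_f, beta = 0.3, 2.0, 1.5, 1.0, 0.7, 4.0

def closed_form():
    S2 = sigma_f**2 + beta * sigma_v**2   # S_beta^2
    return (np.exp(-a / beta) * np.sqrt(2 * np.pi * beta * sigma_v**2)
            * np.exp(-0.5 * (mu_f - mu_v)**2 / S2) / np.sqrt(2 * np.pi * S2))

def by_quadrature():
    V = lambda s: a + (s - mu_v)**2 / (2 * sigma_v**2)
    p0 = lambda s: (np.exp(-(s - mu_f)**2 / (2 * sigma_f**2))
                    / np.sqrt(2 * np.pi * sigma_f**2))
    val, _ = integrate.quad(lambda s: np.exp(-V(s) / beta) * p0(s), -np.inf, np.inf)
    return val

print(closed_form(), by_quadrature())  # the two numbers should agree
\end{verbatim}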
We plug it in and get that the quantity to be optimized is:
\begin{align}
& a -\beta \log( \sqrt{2\pi} \sigma^v) - \beta \frac{1}{2}\log \beta
+ \beta \frac{1}{2}\frac{ \left( \mu^f - \mu^v \right)^2}{S_\beta^2}
+ \beta \log \sqrt{2\pi} + \beta \log S_\beta - \beta \delta \\
& =
a - \beta \frac{1}{2}\log \beta
+ \beta \frac{1}{2}\frac{ \left( \mu^f - \mu^v \right)^2}{S_\beta^2}
+ \beta \log S_\beta - \beta (\delta + \log \sigma^v) \\
& = a -\beta \log \left(\frac{ \sqrt{\beta} \sigma^v }{ S_\beta } \right)
+ \beta \frac{1}{2}\frac{ \left( \mu^f - \mu^v \right)^2}{S_\beta^2}
- \beta \delta \\
& = a -\beta \log \left(\frac{ \sqrt{\beta} \sigma^v }{ \sqrt{ (\sigma^f)^2 + \beta (\sigma^v)^2 } } \right)
+ \beta \frac{1}{2}\frac{ \left( \mu^f - \mu^v \right)^2}{S_\beta^2}
- \beta \delta \\
& =
a +\frac{1}{2}\beta \log \left( 1 + \frac{ (\sigma^f)^2 }{ \beta (\sigma^v)^2 } \right)
+ \beta \frac{1}{2}\frac{ \left( \mu^f - \mu^v \right)^2}{(\sigma^f)^2 + \beta (\sigma^v)^2 }
- \beta \delta
\end{align}
(TODO: simplify that ridiculous mess).
The question is whether we can help ourselves a bit in the optimization task by somehow showing this function has a single optimum or similar -- I am not sure that is possible, but you never know. A step towards this is to differentiate it. Viz., let us assume it has the form:
\begin{align}
F(x) & = x \log \left( 1+ \frac{\lambda}{x} \right) + \frac{A}{ \frac{\lambda}{x} +1 } - B x \\
& = x \log h(x) + \frac{A}{h(x)} - Bx \\
h(x) & = 1+ \frac{\lambda}{x}
\end{align}
We differentiate to get:
\begin{align}
F'(x) & = \log h(x) + x \frac{h'(x) }{h(x)} +
\frac{h'(x) }{h(x) }\left( \frac{-A}{h(x)} \right)
- B \\
& = \log h(x) + \frac{h'(x) }{h(x) }\left[\frac{h(x) x - A}{h(x) } \right] - B \\
\frac{h'(x) }{h(x)}& = \frac{
-\frac{\lambda}{x^2}
}{
1+ \frac{\lambda}{x}
} = \frac{1}{x}\frac{-\lambda}{x + \lambda}
%\log \left( 1+ \frac{\lambda}{x} \right) + \frac{A}{ \frac{\lambda}{x} +1 } - B x \\
%x \log h(x) + \frac{A}{h(x)} - Bx \\
%h(x) & = 1+ \frac{\lambda}{x}
\end{align} \end{align}
This objective can then be maximized with respect to $\beta$ using a simple bisection algorithm or similar. Therefore
\begin{align}
F'(x) & = \log h(x)-B - \frac{\lambda}{x + \lambda}
\left[ \frac{x h(x) - A}{x h(x)} \right] \\
& = \log h(x) - B - \frac{\lambda}{x + \lambda}
\left[ \frac{x + \lambda - A}{x + \lambda } \right]
\end{align}
So we can now start to bound this. We get that
\begin{align}
F'(x) \leq \frac{\lambda}{x} - B - \frac{\lambda}{x + \lambda}
\left[ \frac{x + \lambda - A}{x + \lambda } \right]
\end{align}
Solving for $F'(x) \leq 0$ can be done by reducing this to a third-degree polynomial and solving for the smallest root. Less exactly, we can also just ignore higher-order terms in $x$ and solve to get:
\begin{align}
0 & \geq \frac{\lambda}{x} - B - \frac{\lambda}{x + \lambda} + \frac{\lambda A}{(x + \lambda)x} \\
x & \geq \frac{1}{2B} \sqrt{ (\lambda B)^2 + 4 B (\lambda^2 + \lambda A) }-\frac{\lambda}{2}
\end{align}
This can be simplified obviously (TODO). The lower bound can be obtained by simply dropping terms to get:
\begin{align}
F'(x) & > \log h(x) - B - 1 \\
x & \leq \frac{\lambda }{ e^{B + 1}- 1 }
\end{align}
(TODO: A much better lower bound can be obtained by back-substituting the upper and lower bounds into the expression for $F'$ and solving again for the lower bound. This will give an explicit $A$-dependency.)
This gets us an interval for $x$ within which to search for a root using the bisection method:
\begin{align}
x \in \left[
\frac{\lambda }{ e^{B + 1}- 1 }\ ; \ \frac{1}{2B} \sqrt{ (\lambda B)^2 + 4 B (\lambda^2 + \lambda A) }-\frac{\lambda}{2} \right]
\end{align}
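This bracketing-plus-root-finding step is easy to sketch numerically. Below, Brent's method stands in for plain bisection, and the values of $\lambda$, $A$ and $B$ are purely illustrative.
\begin{verbatim}
import numpy as np
from scipy.optimize import brentq

lam, A, B = 0.5, 1.0, 2.0

def Fprime(x):
    h = 1 + lam / x
    return np.log(h) - B - lam / (x + lam) * (1 - A / (x + lam))

x_lo = lam / (np.exp(B + 1) - 1)                 # lower end of the bracket
x_up = (np.sqrt((lam * B)**2 + 4 * B * (lam**2 + lam * A)) / (2 * B)
        - lam / 2)                               # upper end of the bracket

if Fprime(x_lo) > 0 > Fprime(x_up):              # the bracket contains a sign change
    x_star = brentq(Fprime, x_lo, x_up)          # bisection-style root finding
    print("stationary point of F at approximately", x_star)
else:
    print("no sign change inside the bracket")
\end{verbatim}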
\section{Generalization:}
Suppose the dynamics of the model are given by:
\begin{align}
x_{k+1} & = f(x_k, a_k, L\epsilon_k) \\
w_k & \sim P_W(\cdot)
\end{align}
where $L$ is a linear operator (to be specified). We then define the state as:
\begin{align}
s_k \equiv \begin{bmatrix}
x_k \\ x_{k-1} \\ a_{k-1} \\ L\epsilon_k
\end{bmatrix}
\end{align}
This definition contains redundant information. The update rule is then
\begin{align}
s_{k+1} = \bar f(s_k, a_k, L\epsilon_k)
= \begin{bmatrix}
f(x_k, a_k, L\epsilon_k) \\ x_k \\ a_k \\ L \epsilon_k
\end{bmatrix}
\end{align}
Then the hard part of the update rule is:
\begin{align}
\EE_{p^0_{s,a} } \left[e^{-\frac{V(s')}{\beta} } \right]
\end{align}
where $p_0$ is the dynamics of the (unperturbed) version of the system.
To compute the expectation, we will assume a special form of the value function, namely:
\begin{align}
V(s_{k+1} ) \approx \hat V(s_{k+1} ) = V^{(1)}(x_k,a_k) + V^{(2)}(x_k,a_k)^{\top} w_k + \frac{1}{2}w_k^\top V^{(3)}(x_k,a_k)w_k
\end{align}
We will also assume that the transition dynamics $p^0_{s,a} \approx \hat p^0_{s,a}$ admits a similar second-order form in the noise $w_k$. Specifically, the transition function is assumed to be:
\begin{align}
x_{k+1} \approx \bar{f}(x_k, a_k, w_k) = f^{(1)}(x_k, a_k) +
f^{(2)}(x_k, a_k)^\top w_k
+ \frac{1}{2} w_k^\top f^{(3)}(x_k, a_k) w_k
\end{align}
The expectation can then still be computed exactly, and the coefficients above can be learned using regression (even local linear?).
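For reference, with $w_k \sim \mathcal N(0, I)$ and the quadratic form above (writing $c = V^{(1)}$, $b = V^{(2)}$ and $C = V^{(3)}$, a shorthand introduced here only for brevity), the expectation is a standard Gaussian integral:
\begin{align}
\EE_{w \sim \mathcal N(0,I)}\left[ e^{-\frac{1}{\beta}\left( c + b^\top w + \frac{1}{2} w^\top C w \right)} \right]
= \det\left( I + \tfrac{1}{\beta} C \right)^{-1/2}
\exp\left( -\tfrac{c}{\beta} + \tfrac{1}{2\beta^2} b^\top \left( I + \tfrac{1}{\beta} C \right)^{-1} b \right),
\end{align}
valid whenever $I + C/\beta$ is positive definite.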
\section{Question: Can we use an embedding?}
Assume a new generative model of the form:
\begin{align}
x_{k+1} = f(h(x_k, a_k) + w_k)
\end{align}
where $h$ is an embedding and $w_k$ is assumed to be $\mathcal N(0,I)$. Let $g = f^{-1}$, so that
\begin{align}
g(x_{k+1} ) = h(x_k, a_k) + w_k.
\end{align}
Then it holds that
\begin{align}
p(x_{k+1} | x_k, a_k) = \mathcal N\left(g(x_{k+1} )-h(x_k, a_k);\, 0, I \right) \left|J_g(x_{k+1} ) \right|
\end{align}
We also assume that $x_{k-1}$, $a_{k-1}$ and $w_{k-1}$ are part of the state $s_k$. Therefore we can introduce the approximation (assuming $W$ is a second degree polynomial):
\begin{align}
V(s_{k+1}) \approx \hat V(s_{k+1} ) = \hat V(x_k, a_k, w_k) = W_{h(x_k, a_k)}(w_k; \Phi).
\end{align}
\subsubsection{Defining objectives}
When learning the objectives, we train $Q$ by minimizing the specific objective given in the paper (viz., using the simulator) as:
\begin{align}
Q(s,a) = ...
\end{align}
So you could just as well train $V$ directly and use the simulator to define $Q$... but this way is more elegant (or is it?). Robust learning requires $V$ internally (to get the second-degree polynomial). Or does it?
The $Q$-function satisfies the robust Bellman equation
\begin{align}
Q(s_k,a_k) = r(s_{k}, a_{k} ) + \gamma \inf_{p :\, \KL(p\|p_0)<\delta }
\EE_{s_{k+1} \sim p}\left[ \max_b Q(s_{k+1}, b) \right]
\end{align}
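The rewrite below uses the standard dual (variational) representation of the KL-constrained infimum, which is also the identity behind the operator $\Trob$ defined above:
\begin{align}
\inf_{p :\, \KL(p\,\|\,p_0) \leq \delta} \EE_{s' \sim p}\left[ V(s') \right]
= \sup_{\beta \geq 0}\left\{ -\beta \log \EE_{s' \sim p_0}\left[ e^{-V(s')/\beta} \right] - \beta \delta \right\}.
\end{align}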
Re-writing this, and assuming for simplicity that the reward only depends on state/action, we get that for any constant $C$:
\begin{align}
Q^\rob(s_k,a_k) & = r(s_{k}, a_k) + \gamma \sup_{\beta >0}\left\{ -\beta \log \EE\left[ e^{- \frac{\max_b Q^\rob(s_{k+1}, b) }{\beta} }\right] - \delta \beta
\right\} \\
& = r(s_{k}, a_k) + \gamma C + \gamma \sup_{\beta >0}\left\{ -\beta \log \EE\left[ e^{- \frac{V^\rob(s_{k+1} ) - C }{\beta} }\right] - \delta \beta
\right\} \\
V^\rob(s_k) & = \max_b Q^{\rob}(s_k, b)
\end{align}
for any constant $C$ (probably a bad idea to have such a constant; note that $\EE[ e^{-(V-C)/\beta} ] = e^{C/\beta} \EE[ e^{-V/\beta} ]$, so the two forms agree). Then to proceed, we need to find a way to compute the above expectation exactly and update. Let's say we divide this into two parts. The first computes this expectation (and updates). The second considers how to fit $V^\rob$. For each $V^\rob$, we can compute the target easily. Then perform the update as a (batched) regression problem which is trained jointly with $o$. In other words, we get that:
\begin{align}
V^\rob(s_k) & = \max_b Q^\rob(s_k, b) \\
& \approx \hat V^\rob(s_k) \\
& = C_0( h(x_{k-1}, a_{k-1}) )
+ C_1( h(x_{k-1}, a_{k-1}) ) w_k
+ \frac{1}{2} w_k^\top C_2( h(x_{k-1}, a_{k-1}) ) w_k \\
w_k & = g(x_k) - h(x_{k-1}, a_{k-1} )
\end{align}
Train this using gradient descent, probably keeping $C_2$ diagonal.
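A minimal sketch of this parametrization, assuming a PyTorch implementation: the module name, the layer sizes and the softplus used to keep the diagonal of $C_2$ non-negative are all choices made here, not prescribed by the text.
\begin{verbatim}
import torch
import torch.nn as nn
import torch.nn.functional as Fnn

class RobustValueHead(nn.Module):
    # hat V^rob(s_k) = C_0(h) + C_1(h)^T w + 1/2 w^T diag(C_2(h)) w,
    # with h = h(x_{k-1}, a_{k-1}) and w = g(x_k) - h(x_{k-1}, a_{k-1}).
    def __init__(self, embed_dim, noise_dim, hidden=64):
        super().__init__()
        self.c0 = nn.Sequential(nn.Linear(embed_dim, hidden), nn.ReLU(), nn.Linear(hidden, 1))
        self.c1 = nn.Sequential(nn.Linear(embed_dim, hidden), nn.ReLU(), nn.Linear(hidden, noise_dim))
        self.c2 = nn.Sequential(nn.Linear(embed_dim, hidden), nn.ReLU(), nn.Linear(hidden, noise_dim))

    def forward(self, h_prev, w):
        c0 = self.c0(h_prev).squeeze(-1)
        c1 = self.c1(h_prev)
        c2 = Fnn.softplus(self.c2(h_prev))   # diagonal C_2, kept non-negative
        return c0 + (c1 * w).sum(-1) + 0.5 * (c2 * w * w).sum(-1)
\end{verbatim}
The output can then be regressed onto the robust targets with a squared loss and a stochastic gradient optimizer.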
What happens if you use the $Q$-learning function and try to integrate? Let each coordinate be a 2nd degree polynomial.
$V$ to minimize the recursion:
is a second-order approximation of $V$ only depending on the latent state $h(x_k, a_k)$. We can learn the latent embedding by minimizing a loss over both $V$ and over $g$ (issue: $g$ must be well-defined; possibly fix $h$ insofar as the second-order approximation is concerned and use that $\epsilon$ is automatically normal?). Alternatively, use that $V$ must satisfy, before optimization, that:
\begin{align}
V(s_{k}) = \max_{a_k} \EE\left[ r(x_k, a_k) + V(s_{k+1} ) \right]
\end{align}
Then it holds that
\begin{align}
J_g(x_{k+1})
\end{align}
that $h_k = h(x_k, a_k, w_k)$ is a latent embedding of some sort and the next state is given by:
\begin{align}
x_{k+1} = o(h_k)
\end{align}
Given embedding, we can write the value function as a quadratic in embedding parameter?
In other words, the expectation is with respect to
\begin{align}
0
\end{align}
However, we can re-write this to be:
$$
p(s' | s,a)
$$
We will then assume a special parametrization of the value function in the form that:
\begin{align}
V(s') \approx V^{(1)}(s) + V^{(2)}(s) L \epsilon + (L\epsilon)^\top V^{(3)} (s)(L\epsilon)
\end{align}
There are some questions about whether this is a rigorous construction or not. The three matrices need to be learned using a neural network; this is done by simply regressing
\bibliographystyle{alpha}
\bibliography{pensum}
import matplotlib.pyplot as plt
import numpy as np
import math


def normpdf(x, mean, sd):
    # Scalar normal pdf (math-based helper).
    var = float(sd)**2
    denom = (2*math.pi*var)**.5
    num = math.exp(-(float(x)-float(mean))**2/(2*var))
    return num/denom


def norm_pdf(x, mu, sigma):
    # Vectorized normal pdf (numpy-based helper).
    return np.exp(-(x-mu)**2/(2*sigma**2)) / np.sqrt(2*np.pi*sigma**2)


def gaussint(mu, sigma):
    # TODO: closed-form Gaussian integral helper (not implemented yet).
    pass


if __name__ == "__main__":
    # Parameters of the Gaussian model in the note (currently unused; kept for gaussint).
    sigma_v = 1
    sigma_f = 1
    mu_f = 1
    mu_v = 2

    # Constants of F(x) = x log(1 + lam/x) + A x/(x + lam) - B x.
    lam = .01
    A = 1
    B = 10

    # Bracket for the maximizer of F derived in the note.
    xup = -lam/2 + 1/(2*B)*np.sqrt((lam*B)**2 + 4*B*(lam**2 + lam*A))
    xlw = lam/(np.exp(B + 1) - 1)

    x = np.linspace(xlw, xup, 100)
    h = 1 + lam/x
    F = x * np.log(h) + A/h - B*x                              # the objective F(x)
    Fp = np.log(h) - B - lam/(x + lam) * (1 - A/(x + lam))     # its derivative F'(x)

    # Plot F and F' over the bracket; green/red lines mark the lower/upper bounds.
    fig, ax = plt.subplots(1, 2)
    ax[0].plot(x, F, 'k-')
    ax[0].axvline(xup, color='r')
    ax[0].axvline(xlw, color='g')
    ax[1].plot(x, Fp, 'k-')
    ax[1].plot(x, x*0, 'b-')
    ax[1].axvline(xup, color='r')
    ax[1].axvline(xlw, color='g')
    print([np.min(Fp), np.max(Fp)])
    plt.show()