diff --git a/doc/LectureNotes/chapteroptimization.ipynb b/doc/LectureNotes/chapteroptimization.ipynb index 5d83c5b50..fea66ed89 100644 --- a/doc/LectureNotes/chapteroptimization.ipynb +++ b/doc/LectureNotes/chapteroptimization.ipynb @@ -2727,7 +2727,7 @@ }, "source": [ "$$\n", - "\\boldsymbol{\\mathbf{m}}_t={\\mathbf{m}_t \\over 1-\\beta_1^t} \\nonumber\n", + "\\hat{\\mathbf{m}}_t={\\mathbf{m}_t \\over 1-\\beta_1^t} \\nonumber\n", "$$" ] }, @@ -2739,7 +2739,7 @@ }, "source": [ "$$\n", - "\\boldsymbol{\\mathbf{s}}_t ={\\mathbf{s}_t \\over1-\\beta_2^t} \\nonumber\n", + "\\hat{\\mathbf{s}}_t ={\\mathbf{s}_t \\over1-\\beta_2^t} \\nonumber\n", "$$" ] }, @@ -2751,7 +2751,7 @@ }, "source": [ "$$\n", - "\\boldsymbol{\\theta}_{t+1}=\\boldsymbol{\\theta}_t - \\eta_t { \\boldsymbol{\\mathbf{m}}_t \\over \\sqrt{\\boldsymbol{\\mathbf{s}}_t} +\\epsilon}, \\nonumber\n", + "\\boldsymbol{\\theta}_{t+1}=\\boldsymbol{\\theta}_t - \\eta_t { \\hat{\\mathbf{m}}_t \\over \\sqrt{\\hat{\\mathbf{s}}_t} +\\epsilon}, \\nonumber\n", "$$" ] }, @@ -2786,8 +2786,8 @@ "Like in RMSprop, the effective step size of a parameter depends on the\n", "magnitude of its gradient squared. To understand this better, let us\n", "rewrite this expression in terms of the variance\n", - "$\\boldsymbol{\\sigma}_t^2 = \\boldsymbol{\\mathbf{s}}_t -\n", - "(\\boldsymbol{\\mathbf{m}}_t)^2$. Consider a single parameter $\\theta_t$. The\n", + "$\\hat{\\boldsymbol{\\sigma}}_t^2 = \\hat{\\mathbf{s}}_t -\n", + "(\\hat{\\mathbf{m}}_t)^2$. Consider a single parameter $\\theta_t$. The\n", "update rule for this parameter is given by" ] }, @@ -2799,7 +2799,7 @@ }, "source": [ "$$\n", - "\\Delta \\theta_{t+1}= -\\eta_t { \\boldsymbol{m}_t \\over \\sqrt{\\sigma_t^2 + m_t^2 }+\\epsilon}.\n", + "\\Delta \\theta_{t+1}= -\\eta_t { \\hat{m}_t \\over \\sqrt{\\hat{\\sigma}_t^2 + \\hat{m}_t^2 }+\\epsilon}.\n", "$$" ] },