%-----------------------------------------------------------------------
% Beginning of chap5.tex
%-----------------------------------------------------------------------
%
% AMS-LaTeX 1.2 sample file for a monograph, based on amsbook.cls.
% This is a data file input by chapter.tex.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\part{This is a Part Title Sample}
\chapter{Nonlinear Programming}
A nonlinear programming problem has the form
$$\min f(x)$$
subject to the inequality constraints
$$g_i (x) \le 0$$
for $i=1, \dots, p$ and the equality constraints
$$h_i(x) = 0 $$
for $i = 1, \dots, q$.
\begin{example}
Given a fixed area of cardboard $A$ construct a box of maximum volume. The nonlinear
program for this is
$$\max xyz$$
subject to
$$2xy +2xz + 2yz = A$$
\end{example}
\begin{example}
\label{school-prog}
Consider the problem of determining locations for two new high schools in a set of
$P$ subdivisions $N_j$. Let $w_{1j}$ be the number of students going to school A
and $w_{2j}$ be the number of students going to school B from subdivision $N_j$.
Assume that the student capacity of school A is $c_1$ and the capacity of school
B is $c_2$ and that the total number of students in each subdivision is $r_j$.
We would like to minimize the total distance traveled by all the students
given that they may attend either school A or B. It is
possible to construct a nonlinear program to determine the
locations $(a,b)$ and $(c,d)$ of high schools A and B, respectively
assuming the location of each subdivision $N_j$ is modeled as a single point
denoted $(x_j, y_j)$.
$$\min \sum_{j=1}^P w_{1j}\biggl( (a-x_j)^2 + (b-y_j)^2 \biggr)^{\frac{1}{2}} +
w_{2j}\biggl( (c-x_j)^2 + (d-y_j)^2 \biggr)^{\frac{1}{2}}$$
subject to the constraints
$$\sum_j w_{ij} \le c_i, \quad i = 1, 2$$
$$w_{1j}+w_{2j} = r_j$$
for $j = 1, \dots, P$.
\end{example}
\begin{example}
Neural networks have provided a new tool for approximating
functions where the functional form is unknown. The approximation
takes on the form
$$f(x) = \sum_j b_j \sigma(a_jx -\alpha_j) - \beta$$
and the corresponding sum of squares error term is
$$E(a_j, b_j , \alpha_j, \beta) = \sum_i \bigl(y_i - f(x_i)\bigr)^2$$
The problem of minimizing the error function is, in this instance,
an unconstrained optimization problem. An efficient means for
computing the gradient of $E$ is known as the \emph{backpropagation}
algorithm.
\end{example}
\section{Unconstrained Optimization in One Dimension}
Here we begin by considering a significantly simplified (but
nonetheless important) nonlinear programming problem, i.e.,
the domain and range of the function to be minimized are
one-dimensional and there are no constraints. A necessary
condition for a minimum of a function was developed in calculus
and is simply
$$f'(x) = 0$$
Note that higher derivative tests can determine
whether the function is a max or a min, or the
value $f(x+\delta)$ may be compared to $f(x)$.
Note that if we let
$$g(x) = f'(x)$$
then we may convert the problem of finding a minimum or maximum
of a function to the problem of finding a zero.
\subsection{Bisection Algorithm}
Let $x^*$ be a root, or zero, of $g(x)$, i.e., $g(x^*) = 0$.
If an initial bracket $[a, b]$ is known such that $x^* \in [a,b]$,
then a simple and robust approach to determining the root
is to bisect this interval into two intervals $[a, c]$ and $[c, b]$
where $c$ is the midpoint, i.e.,
$$ c = \frac{a+b}{2}$$
If
$$g(a)g(c) < 0$$
then we conclude
$$x^* \in [a,c]$$
while if
$$g(b)g(c) < 0$$
then we conclude
$$x^* \in [c,b]$$
This process may now be iterated such that the size of the
bracket (as well as the actual error of the estimate)
is being divided by 2 every iteration.
\subsection{Newton's Method}
Note that in the bisection method the actual value of the function
$g(x)$ was only being used to determine the correct
bracket for the root. Root finding is accelerated considerably
by using this function information more effectively.
For example, imagine we were seeking the root of a function
that was a straight line, i.e., $g(x) = a x +b$ and our initial
guess for the root was $x_0$. If we extend this straight line
from the point $x_0$ it is easy to determine where it crosses the axis, i.e.,
$$ax_1+b=0$$
so $x_1 = -b/a$. Of course, if the function were truly
linear then no first guess would be required. So now consider the
case that $g(x)$ is nonlinear but may be approximated locally about the
point $x_0$ by a line. Then the point of intersection of this
line with the $x$-axis is an estimate, or second guess, for the root
$x^*$. The linear approximation comes from Taylor's theorem, i.e.,
$$g(x) = g(x_0) + g'(x_0) (x-x_0) + \frac{1}{2} g''(x_0) (x-x_0)^2+\dots$$
So the linear approximation to $g(x)$ about the point $x_0$ can be written
$$l(x) = g(x_0) + g'(x_0) (x-x_0)$$
If we take $x_1$ to be the root of the linear approximation we have
$$l(x_1) = 0 = g(x_0) + g'(x_0) (x_1-x_0)$$
Solving for $x_1$ gives
$$x_1 = x_0 - \frac{g(x_0)}{g'(x_0)}$$
or at the $n$th iteration
$$x_{n+1} = x_n - \frac{g(x_n)}{g'(x_n)}$$
The iteration above is for determining a zero of a function $g(x)$.
To determine a maximum or minimum value of a function $f$
we employ the condition that $f'(x)=0$. Now the iteration is
modified as
$$x_{n+1} = x_n - \frac{f'(x_n)}{f''(x_n)}$$
\section{Unconstrained Optimization in Higher Dimensions}
Now we consider the problem of minimizing (or maximizing) a scalar
function of many variables, i.e., defined on a vector space. We consider the
extension of Newton's method presented in the previous section
as well as a classical approach known as steepest descent.
\subsection{Taylor Series in Higher Dimensions}
Before we extend the search for extrema to higher dimensions we
present Taylor series for functions of more than one domain variable.
To begin, the Taylor series for a function of two variables is given by
\begin{align*}
g(x, y) = &g(x_0,y_0) +
\frac{ \partial g}{ \partial x} (x -x_0)+
\frac{ \partial g}{ \partial y} (y-y_0) \\
&+\frac{ \partial^2 g}{ \partial x^2} \frac{(x-x_0)^2}{2} +
\frac{ \partial^2 g}{ \partial y^2} \frac{(y-y_0)^2}{2} +
\frac{ \partial^2 g}{ \partial x \partial y} {(x-x_0)(y-y_0)}\\
&+\mbox{higher order terms}
\end{align*}
In $n$ variables $x = (x_1, \dots, x_n)$ the Taylor series expansion becomes
$$g(x) = g(x^{(0)}) + \nabla g(x^{(0)}) (x-x^{(0)}) + \frac{1}{2} (x-x^{(0)})^T Hg(x^{(0)}) (x-x^{(0)})+\cdots$$
where the Hessian matrix is defined as
$$\bigl(Hg(x^{(0)})\bigr)_{ij} = \frac{\partial^2 g}{\partial x_i \partial x_j}\biggr|_{x=x^{(0)}}$$
and the gradient is written as a row vector, i.e.,
$$\bigl(\nabla g(x^{(0)})\bigr)_i = \frac{\partial g}{\partial x_i}\biggr|_{x=x^{(0)}}$$
\subsection{Roots of a Nonlinear System}
We saw that Newton's method could be used to develop an iteration
for determining the zeros of a scalar function. We can extend those
ideas for determining roots of the nonlinear system
\begin{align*}
g_1(x_1, \dots, x_n) &= 0 \\
g_2(x_1, \dots, x_n) &= 0 \\
&\vdots \\
g_n(x_1, \dots, x_n) &= 0
\end{align*}
or, more compactly,
$$g(x) = 0.$$
Now we apply Taylor's theorem to each component $g_i(x_1, \dots, x_n)$
individually, i.e., retaining only the first order terms we have
the linear approximation to $g_i$ about the point $x^{(0)}$ as
$$l_i(x) = g_i(x^{(0)}) + \nabla g_i(x^{(0)}) (x-x^{(0)})$$
for $i=1, \dots, n$.
We can write these components together as a vector equation
$$l(x) = g(x^{(0)}) + \nabla g(x^{(0)}) (x-x^{(0)})$$
where now
$$\bigl(\nabla g(x^{(0)})\bigr)_{ij} = \frac{\partial g_i}{\partial x_j} \biggr|_{x=x^{(0)}}$$
As in the scalar case we base our iteration on the assumption that
$$l(x^{(k+1)}) = 0$$
Hence,
$$g(x^{(k)}) + \nabla g(x^{(k)}) (x^{(k+1)}-x^{(k)}) = 0$$
and given $x^{(k)}$ we may determine the next iterate $x^{(k+1)}$
by solving an $n \times n$ system of equations.
\subsection{Unconstrained Minimization with Newton's Method}
In this chapter we are interested in minimizing functions of several variables.
Analogously with the scalar variable case we may modify the above root finding method
to determine maximum (or minima) of a function $f(x_1, \dots, x_n)$.
To compute an extreme point we require that $\nabla f = 0$
hence we set
$$g(x) = \bigl(\frac{\partial f}{\partial x_1}, \dots, \frac{\partial f}{\partial x_n}\bigr)^T$$
Substituting
$$g_i(x) = \frac{\partial f}{\partial x_i}$$
into
$$g(x^{(k)}) + \nabla g(x^{(k)}) (x^{(k+1)}-x^{(k)}) = 0$$
produces
$$\nabla f(x^{(k)}) + Hf(x^{(k)}) (x^{(k+1)}-x^{(k)})=0$$
where
$$\bigl(Hf(x^{(k)})\bigr)_{ij} = \frac{\partial g_i}{\partial x_j} = \frac{\partial^2 f}{\partial x_i \partial x_j}$$
Again we have a linear system for $x^{(k+1)}$.
\section{Steepest Descent}
Another form for Taylor's formula in $n$-variables is given by
$$f(x+th) = f(x) + t \nabla f(x) h + \frac{t^2}{2} h^T Hf(x) h$$
where again $(\nabla f(x))_i = \partial f/\partial x_i$ and
$(Hf(x))_{ij} = \partial^2 f/\partial x_i \partial x_j$.
Now $t$ is a scalar and $x+th$ is a ray emanating from the point
$x$ in the direction $h$. We can compute the derivative of
the function $f$ w.r.t. $t$ as
$$\frac{df}{dt}(x+th) = \nabla f(x) h + t h^T Hf(x) h$$
Evaluating the derivative at the point $t=0$ gives
$$\frac{df}{dt}(x) = \nabla f(x) h$$
This quantity, known as the directional derivative, indicates how
the function is changing at the point $x$ in the direction $h$.
Recall from calculus that the direction of maximum increase (decrease)
of a function is in the direction of the gradient (negative gradient).
This is readily seen from the formula for the directional derivative
using the identity
$$ \nabla f(x) h = \|\nabla f(x)\| \| h \| \cos(\theta)$$
We can assume without loss of generality that $h$ is of unit
length, i.e., $\| h \|=1$. So the quantity on the right is a maximum
when the vectors $h$ and $\nabla f(x)$ point in the same direction
so $\theta = 0$.
This observation may be used to develop an algorithm for unconstrained
function minimization. With an appropriate choice of the scalar step-size $\alpha$,
the iterations
$$x^{(k+1)} = x^{(k)} - \alpha \nabla f(x^{(k)})$$
will converge (possibly slowly) to a minimum of the function $f(x)$.
%\end{document}  % premature end-of-document; it would truncate the section and problems below
\section{Lagrange Multipliers}
\newpage
\begin{problem}
Extend Example \ref{school-prog} for a collection of $S$ schools.
\end{problem}
\begin{problem}
Assume a farmer has $L$ feet of fencing for a rectangular area with lengths
$x$ and $y$. Determine these lengths such that the enclosed area is a maximum.
\end{problem}
\end{document}
%-----------------------------------------------------------------------
% End of chap5.tex
%-----------------------------------------------------------------------