% midterm test (kolos), 01.06.2016, at Matyasik's
% the original text can be found here:
% http://www.kernel-machines.org/publications/pdfs/0701907.pdf
\documentclass[leqno, a4paper,12pt]{article}
\usepackage[latin2]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amssymb} % math packages
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage[english]{babel}
\usepackage{times}
\usepackage{anysize}
\usepackage{titlesec}
\usepackage{fancyhdr} % headers and footers
% formatting of subsections and subsubsections
\titleformat{\subsection}[runin]
{\normalfont\fontsize{12}{17}\slshape}
{\thesubsection}
{1em}
{}
\titleformat{\subsubsection}[runin]
{\normalfont\fontsize{12}{17}\slshape}
{\thesubsubsection}
{1em}
{}
% section, page, and equation counters start from... (to match the original paper)
\setcounter{section}{4}
\setcounter{subsection}{1}
\setcounter{page}{37}
\setcounter{equation}{79}
% header and footer definitions
\pagestyle{fancy}
\fancyhf{}
\renewcommand{\headrulewidth}{0pt} % no rule under the header
\chead{KERNEL METHODS IN MACHINE LEARNING}
\rhead{\thepage}
\begin{document}
\noindent summary, $F(\mathcal{S}) \sim \mathcal{N}(0,K)$. This induces a predictive model via Bayesian model integration according to
\begin{equation}
\label{80}
p(y|x;\mathcal{S}) = \int p(y|F(x,\cdot))p(F|\mathcal{S})dF,
\end{equation}
\noindent where $x$ is a test point that has been included in the sample (transductive setting).
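% editorial note, not in the original excerpt: a bridging remark between (80) and the MAP estimate used below
In practice the integral in equation (\ref{80}) is rarely tractable, and a standard approximation replaces the posterior over $F$ by its mode,
\begin{equation*}
p(y|x;\mathcal{S}) \approx p\big(y|\hat{F}(\mathcal{S})(x,\cdot)\big),
\end{equation*}
which motivates the maximum a posteriori estimate derived next.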
For an i.i.d. sample, the log-posterior for $F$ can be written as
\begin{equation}
\label{81}
\ln p(F|\mathcal{S}) = - \frac{1}{2}F^T \boldsymbol{K}^{-1} F + \sum_{i=1}^{n}[F(x_i,y_i)-g(x_i,F)] + \mathrm{const}.
\end{equation}
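% editorial note, not in the original excerpt: recalling the log-partition function
Here $g(x_i,F)$ is the log-partition function of the conditional exponential family model of the preceding sections,
\begin{equation*}
g(x_i,F) = \ln \sum_{y\in\mathcal{Y}} \exp F(x_i,y),
\end{equation*}
which ensures that $p(y|x_i;F)$ is properly normalized.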
\noindent Invoking the representer theorem for $\hat{F}(\mathcal{S}) := \arg\max _F\ln p(F|\mathcal{S})$, we know
that
\begin{equation}
\label{82}
\hat{F}(\mathcal{S})_{iy} = \sum_{j=1}^{n}\sum_{y'\in \mathcal{Y}} \alpha_{jy'} K_{iy,jy'},
\end{equation}
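% editorial note, not in the original excerpt: the intermediate substitution step behind (83)
In matrix notation equation (\ref{82}) reads $\hat{F} = \boldsymbol{K}\alpha$, so that, by the symmetry of $\boldsymbol{K}$,
\begin{equation*}
F^T \boldsymbol{K}^{-1} F = \alpha^T \boldsymbol{K}\alpha
\qquad\text{and}\qquad
F(x_i,y) = \alpha^T \boldsymbol{K}e_{iy}.
\end{equation*}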
\noindent Plugging this expansion into equation (\ref{81}), we arrive at
\begin{equation}
\label{83}
\min\limits_\alpha \frac{1}{2}\alpha^T \boldsymbol{K}\alpha - \sum_{i=1}^{n}\bigg(\alpha^T \boldsymbol{K}e_{iy_i}-\log \sum_{y \in \mathcal{Y}} \exp\big[\alpha^T \boldsymbol{K}e_{iy}\big]\bigg),
\end{equation}
\noindent where $e_{iy}$ denotes the respective unit vector. Notice that for
$ f(\cdot) = \sum_{i,y} \alpha_{iy} k(\cdot,(x_i,y)) $
the first term is equivalent to the squared RKHS norm of $f \in \mathcal{H} $ since \\
$ \langle f,f \rangle _\mathcal{H} = \sum_{i,j} \sum_{y,y'} \alpha_{iy} \alpha_{jy'} \langle k(\cdot,(x_i,y)),k(\cdot,(x_j,y'))\rangle $.
The latter inner product reduces to $ k((x_i,y),(x_j,y')) $
due to the reproducing property. Again, the
key issue in solving (\ref{83}) is how to achieve sparseness in the expansion for $\hat{F}$.
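% editorial note, not in the original excerpt: a remark on the optimization problem
Observe that the objective in equation (\ref{83}) is convex in $\alpha$, since $\boldsymbol{K}$ is positive semidefinite and log-sum-exp is convex; standard unconstrained solvers therefore apply, but they generally return a dense $\alpha$, which is why sparseness has to be enforced separately.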
\subsection{Markov networks and kernels.} In Section 4.1, no assumptions were made about
the specific structure of the joint kernel defining the model in equation
(70). In the following, we will focus on a more specific
setting with multiple outputs, where dependencies are modeled by a
conditional independence graph. This approach is motivated by the fact
that independently predicting individual responses based on marginal
response models will often be suboptimal, and that explicitly modeling these
interactions can be of crucial importance.
\subsubsection{Markov networks and factorization theorem.} Denote predictor
variables by $X$, response variables by $Y$, and define $Z := (X,Y)$
with associated sample space $\mathcal{Z}$. We use Markov networks as the modeling formalism for representing dependencies between covariates and
response variables, as well as interdependencies among response variables.
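% editorial note, not in the original excerpt: the factorization theorem alluded to in the heading
For a Markov network with conditional independence graph $G$, the Hammersley--Clifford theorem states that every strictly positive density factorizes over the cliques $c$ of $G$,
\begin{equation*}
p(z) \propto \prod_{c} \psi_c(z_c),
\end{equation*}
with nonnegative clique potentials $\psi_c$.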
\end{document}