summaryrefslogtreecommitdiff
path: root/classification_finie/finit_classif.tex
diff options
context:
space:
mode:
authorJan Aalmoes <jan.aalmoes@inria.fr>2024-01-07 18:42:27 +0100
committerJan Aalmoes <jan.aalmoes@inria.fr>2024-01-07 18:42:27 +0100
commit103677f1a14fe1aec281a69e5d68bbc72335dd9e (patch)
tree91fed7e0b2549410163ff8106b410ee9b3a4a596 /classification_finie/finit_classif.tex
parent3ce895b1697e22fa325f962d3bcae0b78eca321a (diff)
classification finie en anglais
Diffstat (limited to 'classification_finie/finit_classif.tex')
-rw-r--r--classification_finie/finit_classif.tex458
1 files changed, 458 insertions, 0 deletions
diff --git a/classification_finie/finit_classif.tex b/classification_finie/finit_classif.tex
new file mode 100644
index 0000000..44aa173
--- /dev/null
+++ b/classification_finie/finit_classif.tex
@@ -0,0 +1,458 @@
+
+\subsection{Notations}
+\begin{itemize}
+ \item $(x_0,x_1,\cdots,x_n)\in X_0\times X_1 \times\cdots\times X_n$
+ \item If $A$ is a finite set, then $\# A$ denotes the cardinal number of $A$
+ \item $f:\left\{\begin{matrix}A&\rightarrow &B\\ a&\mapsto & f(b)\end{matrix}\right.$
+ Denotes a function from $A$ to $B$ mapping each element $a$ in $A$ to $f(a)$ in $B$.
+ \item $f\circ g$ is the composition of $f$ and $g$.
+ \item $f^{-1}$ can be either the inverse function of $f$ if $f$ is a bijection or its inverse image otherwise.
+\end{itemize}
+
+\subsection{Problem setup}
+We dispose of two finite sets, a feature space $E$ and a classification space $F$.
+The cardinal numbers of $E$ and $F$ are respectively $m\in\mathbb{N}^*$ and $n\in\mathbb{N}^*$.
+Let $\varphi$ be a bijection from $E$ to $[|0,m-1|]$ and $\psi$ be a bijection from $F$ to $[|0,n-1|]$.
+We also dispose of a $o$-tuple $d: [|0,o-1] \rightarrow E\times F$.
+In machine learning we would say that $d$ models a dataset.
+We build the "dataset of indices" as such :
+\begin{equation*}
+ d' : \left\{
+ \begin{matrix}
+ [|0,o-1|]&\longrightarrow&[|0,m-1|]\times[|0,n-1|]\\
+ i&\mapsto&\left(\varphi(d_0(i)),\psi(d_1(i))\right)
+ \end{matrix}
+ \right.
+\end{equation*}
+
+\begin{definition}
+ \label{def:BA}
+ The balanced accuracy of $f$ on the $o$-tuple $d$ relatively to the set $F$, $BA_F^d(f)$, is a number in $[0,1]$ such that
+ \begin{equation*}
+ BA_F^d(f) = \frac{1}{n}
+ \sum_{y\in F}
+ \frac{
+ \#\left\{j\in [|0,o-1|]\quad| f(d_0(j))=d_1(j)\text{ and }d_1(j) = y\right\}
+ }
+ {\#\{j\in [|0,o-1|]\quad| d_1(j)=y\}}
+ \end{equation*}
+
+\end{definition}
+\textbf{The problem consists in finding an application $f:E\rightarrow F$ such that the balanced accuracy of $f$ on $d$ is maximal.}
+
+\subsection{From a problem on elements to a problem on indices}
+We call the set of functions from $E$ to $F$, $B_{E\rightarrow F}$.
+To simplify notation, the set of function from $[|0,m-1|]$ to $[|0,n-1|]$ we call it $B_{m\rightarrow n}$.
+
+\begin{theorem}
+ \label{th:bij}
+ Let $E$ and $F$ be two finite sets of cardinal numbers $m$ and $n$. There exists a bijection from $B_{E\rightarrow F}$ to $B_{m\rightarrow n}$.
+\end{theorem}
+
+\begin{proof}
+ We proceed by expliciting such a bijection.
+ Let
+ \begin{equation}
+ \Phi :\left\{
+ \begin{matrix}
+ B_{E\rightarrow F} &\longrightarrow& B_{m\rightarrow n}&\\
+ f&\mapsto &\psi\circ f\circ\varphi^{-1}
+ \end{matrix}
+ \right.
+ \end{equation}
+
+ Let's show that $\Phi$ is an injection.
+ Let $(u,v)\in \left(B_{E\rightarrow F}\right)^2$ such that
+ $\Phi(u) = \Phi(v)$.
+ Then
+ \begin{align*}
+ & \psi\circ u\circ\varphi^{-1} = \psi\circ v\circ\varphi^{-1}\\
+ \Leftrightarrow& \psi^{-1}\circ\psi\circ u\circ\varphi^{-1} = \psi^{-1}\circ\psi\circ v\circ\varphi^{-1}\\
+ \Leftrightarrow&u\circ\varphi^{-1} = v\circ\varphi^{-1}\\
+ \Leftrightarrow&u\circ\varphi^{-1}\circ\varphi = v\circ\varphi^{-1}\circ\varphi\\
+ \Leftrightarrow&u = v\\
+ \end{align*}
+ Hence $\Phi$ is injective. Let's show that $\Phi$ is surjective.
+ Let $g\in B_{m\rightarrow n}$.
+ Then
+ $\Phi(\psi^{-1}\circ g\circ\varphi) =
+ \psi\circ\psi^{-1}\circ g\circ\varphi\circ\varphi^{-1} = g$
+ Hence $\Phi$ is surjective.
+ In conclusion $\Phi$ is both injective and surjective: it is a bijection.
+\end{proof}
+
+$\varphi$ and $\psi$ can be seen as indices on $E$ and $F$.
+For instance, each element $e$ in $E$ has a unique index $\varphi(e)$.
+This abstraction step allows us to build explicit functions from $E$ to $F$ without taking into consideration the type of mathematical object that contains those sets.
+ Indeed, theorem \ref{th:bij} gives us that for each function mapping indices of $E$ to indices on $F$ we can find a unique function from $E$ to $F$.
+ And the proof even gives us how to find it: by using $\Phi^{-1}$.
+
+ Let's now explore how the balanced accuracy behaves when composing with $\Phi$.
+\begin{theorem}
+ \label{th:BAphi=BA}
+ Let $E$ and $F$ two finite sets.
+ Let $d$ a tuple of $E\times F$.
+ We have the following equality
+ \begin{equation*}
+ BA^{d'}_{[|0,\#F-1|]}\circ\Phi = BA^d_F
+ \end{equation*}
+\end{theorem}
+
+\begin{proof}
+ Let $E$ and $F$ two finite sets.
+ We have two bijections :
+ $\varphi$ from E to $[|0,\#E-1|]$ and
+ $\psi$ from F to $[|0,\#F-1|]$.
+ With those two functions we build a third bijection
+ $\Phi$ from $B_{E\rightarrow F}$ to $B_{\#E\rightarrow \#F }$ similarly as in the proof of theorem \ref{th:bij}.
+ Let $o\in\mathbb{N}^*$ and $d$ a $o$-tuple of $E\times F$.
+
+ Let $f\in B_{E\rightarrow F}$
+ then
+ \begin{equation}
+ \label{eq:BAdp}
+ \left(BA^{d'}_{[|0,\#F-1|]}\circ\Phi\right)(f) =
+ \frac{1}{\#F}
+ \sum_{i=0}^{\#F-1}\frac{
+ \#\left\{j\in[|0,o-1|]\quad | \Phi(f)(d'_0(j))=d'_1(j)\text{ and }d'_1(j)=i\right\}}
+ {\#\left\{j\in[|0,o-1|]\quad | d'_1(j)=i\right\}}\\
+ \end{equation}
+
+ We also remark that
+ \begin{equation*}
+ \Phi(f)\circ d'_0= 
+ \psi\circ f\circ\varphi^{-1}\circ d'_0 =
+ \psi\circ f\circ\varphi^{-1}\circ \varphi\circ d_0=
+ \psi\circ f\circ d_0
+ \end{equation*}
+
+ Hence, let $j\in[|0,o-1|]$
+ \begin{align*}
+ &\left(\Phi(f)\circ d'_0\right)(j) = d'_1(j)\\
+ \Leftrightarrow &\left(\psi\circ f\circ d_0\right)(j)= d'_1(j)\\
+ \Leftrightarrow &\left(\psi\circ f\circ d_0\right)(j) = \psi\circ d_1(j)\\
+ \Leftrightarrow &\left(f\circ d_0\right)(j) = d_1(j)\\
+ \end{align*}
+
+ Which gives us the following assertion:
+ \begin{equation}
+ \label{eq:d1j}
+ \forall j\in[|0,o-1|]\quad
+ \left[
+ (\Phi(f)\circ d'_0)(j) = d'_1(j)
+ \Leftrightarrow
+ (f\circ d_0)(j) = d_1(j)\\
+ \right]
+ \end{equation}
+
+ Let's now do the same work of switching from indices to elements on "$d'_1(j) = i$".
+ Let $i\in[|0,\#F-1|]$ and $j\in[|0,o-1|]$.
+ \begin{align*}
+ &d'_1(j) = i\\
+ \Leftrightarrow & (\psi\circ d_1)(j) = i\\
+ \Leftrightarrow & d_1(j) = \psi^{-1}(i)
+ \end{align*}
+
+ Hence from equations \ref{eq:BAdp} and \ref{eq:d1j} we obtain
+
+ \begin{align*}
+ &\left(BA^{d'}_{[|0,\#F-1|]}\circ\Phi\right)(f) =
+ \frac{1}{\#F}
+ \sum_{y=0}^{\#F-1}\frac{
+ \#\left\{j\in[|0,o-1|]\quad | f(d_0(j))=d_1(j)\text{ and }d_1(j)=\psi^{-1}(i)\right\}}
+ {\#\left\{j\in[|0,o-1|]\quad | d_1(j)=\psi^{-1}(i)\right\}}\\
+ &=
+ \frac{1}{\#F}
+ \sum_{y=\psi^{-1}(0),\cdots,\psi^{-1}(\#F-1)}\frac{
+ \#\left\{j\in[|0,o-1|]\quad | f(d_0(j))=d_1(j)\text{ and }d_1(j)=y\right\}}
+ {\#\left\{j\in[|0,o-1|]\quad | d_1(j)=y\right\}}\\
+ &=
+ \frac{1}{\#F}
+ \sum_{y\in F}\frac{
+ \#\left\{j\in[|0,o-1|]\quad | f(d_0(j))=d_1(j)\text{ and }d_1(j)=y\right\}}
+ {\#\left\{j\in[|0,o-1|]\quad | d_1(j)=y\right\}}\\
+ \end{align*}
+
+ The final quantity in this sequence of equalities is equal to $BA_F^d(f)$ according to definition \ref{def:BA}.
+\end{proof}
+
+Using theorem \ref{th:BAphi=BA} we can deduce the following result which will be key to finding $\text{argmax}\left(BA_F^d\right)$.
+
+\begin{corollary}
+ \label{co:argmax}
+ \begin{equation*}
+ \text{argmax}\left(BA_F^d\right)
+ = \Phi^{-1}\left(\text{argmax}\left(BA_{[|0,\#F-1|]}^{d'}\right)\right)
+ \end{equation*}
+\end{corollary}
+
+\begin{proof}
+ Let $f' = \text{argmax}\left(BA_{[|0,\#F-1|]}^{d'}\right)$.
+ Then for all $g$ in $B_{E\rightarrow F}$,
+ $BA_F^d(g) = BA_{[|0,\#F-1|]}^{d'}(\Phi(g)) \leq
+ BA_{[|0,\#F-1|]}^{d'}(f') = BA_F^d(\Phi^{-1}(f'))$
+\end{proof}
+
+With corollary \ref{co:argmax} we have that, to solve the classification problem on any dataset, it is sufficient to solve on one of the corresponding indices dataset.
+The focus of the next section is on finding an algorithm to solve such a problem.
+
+\subsection{Building a classification algorithm on $B_{m\rightarrow n}$}
+Let $m$, $n$ and $o$ be natural numbers greater than zero.
+We take $d$, a $o$-tuple of $[|0,m-1|]\times[|0,n-1|]$.
+Since we already know that we are going to work on indices, we don't bother with the general sets $E$ and $F$ from the previous section.
+Instead we use $E=\{0,1,\cdots,m-1\}$ and $F=\{0,1,\cdots,n-1\}$.
+
+The direct approach to find an algorithm that maximizes $BA_{[|0,n-1|]}^d$ would be to compute the balanced accuracy for every function of $B_{m\rightarrow n}$.
+This method works fine for small values of $m$ and $n$ but becomes quickly impossible to compute as those values increase.
+Indeed, we know from combinatorics that $B_{m\rightarrow n}$ contains $n^m$ elements.
+It results in an algorithm with a number of $\mathcal{O}(on^m)$ operations.
+Instead, we propose an algorithm in $\mathcal{O}(onm)$ operations.
+
+To build it we are going to "distribute" the argmax operator, simplifying the expression of the optimal balanced accuracy.
+This distribution requires to find an expression of the balanced accuracy that is context-wise more appropriate.
+We express this alternative form of the balanced accuracy in the following lemma.
+
+\begin{lemma}
+ \label{lem:sumei}
+ For every $i$ in $[|0,m-1|]$, we introduction the following $n$-tuple:
+ \begin{equation*}
+ e_i:\left\{
+ \begin{matrix}
+ [|0,n-1|]&\longrightarrow&\mathbb{N}\\
+ l&\mapsto&
+ \frac{
+ \#\{j\in[|0,o-1|]\quad| d_0(j)=i\text{ and }d_1(j)=l\}
+ }{
+ \#\{j\in[|0,o-1|]\quad| d_1(j)=l\}
+ }\\
+ \end{matrix}
+ \right.
+ \end{equation*}
+
+ Then we can write the balanced accuracy as:
+ \begin{equation*}
+ BA_{[|0,n-1|]}^d(h) = \frac{1}{n}
+ \sum_{i=0}^{m-1} e_i(h(i))
+ \end{equation*}
+ Where $h\in B_{m\rightarrow n}$.
+\end{lemma}
+\begin{proof}
+ Let $l\in[|0,n-1|]$ and $h$, a function in $B_{m\rightarrow n}$.
+ \begin{align*}
+ &
+ \frac{
+ \#\{j\in[|0,o-1|]\quad| h(d_0(j))=d_1(j)\text{ and }d_1(j)=l\}
+ }{
+ \#\{j\in[|0,o-1|]\quad| d_1(j)=l\}
+ }\\
+ =&
+ \frac{
+ \#\{j\in[|0,o-1|]\quad| h(d_0(j))=l\text{ and }d_1(j)=l\}
+ }{
+ \#\{j\in[|0,o-1|]\quad| d_1(j)=l\}
+ }
+ \end{align*}
+ \begin{align*}
+ =&
+ \frac{
+ \#\{j\in[|0,o-1|]\quad| h(d_0(j))=l\text{ and }d_1(j)=h(d_0(j))\}
+ }{
+ \#\{j\in[|0,o-1|]\quad| d_1(j)=l\}
+ }
+ \end{align*}
+ \begin{align}
+ \label{eq:sansi}
+ =&
+ \frac{
+ \#\left(\{j\in[|0,o-1|]\quad| h(d_0(j))=l\}\cap\{j\in[|0,o-1|]\quad| d_1(j)=h(d_0(j))\}\right)
+ }{
+ \#\{j\in[|0,o-1|]\quad| d_1(j)=l\}
+ }
+ \end{align}
+
+ In the previous expression, $l$ refers to an element of the label set $F$.
+ To prove the result we substitute in equation \ref{eq:sansi} $h(d_0(j))$ by an expression containing $i$: an element of $E$.
+ The end goal is to exhibit the quantity of interest $e_{i,j}$.
+ We first remark that for all $j$ in $[|0,o-1|]$
+ \begin{equation*}
+ h(d_0(j))=l
+ \Leftrightarrow d_0(j)\in h^{-1}(\{l\})
+ \Leftrightarrow \exists i\in h^{-1}(\{l\}), d_0(j)=i
+ \end{equation*}
+
+ Which means that
+ \begin{align*}
+ &\left\{
+ j\in[|0,o-1|]\quad| h(d_0(j)) = l
+ \right\}\\
+ =&\left\{
+ j\in[|0,o-1|]\quad| \exists i\in h^{-1}(\{l\}), d_0(j)=i
+ \right\}\\
+ =&\bigcup_{i\in h^{-1}(\{l\})}
+ \left\{
+ j\in[|0,o-1|]\quad| d_0(j)=i
+ \right\}
+ \end{align*}
+
+ Hence, by substitution of $\{j\in[|0,o-1|]\quad| h(d_0(j)) = l\}$ en equation \ref{eq:sansi}, we obtain
+ \begin{align*}
+ &\frac{
+ \#\left(
+ \left(
+ \bigcup_{i\in h^{-1}(\{l\})}
+ \left\{
+ j\in[|0,o-1|]\quad| d_0(j)=i
+ \right\}
+ \right)
+ \cap\{j\in[|0,o-1|]\quad| d_1(j)=h(d_0(j))\}
+ \right)
+ }{
+ \#\{j\in[|0,o-1|]\quad| d_1(j)=l\}
+ }\\
+ =&\frac{
+ \#\left(
+ \bigcup_{i\in h^{-1}(\{l\})}
+ \left\{
+ j\in[|0,o-1|]\quad| d_0(j)=i
+ \text{ and }
+ d_1(j)=h(d_0(j))
+ \right\}
+ \right)
+ }{
+ \#\{j\in[|0,o-1|]\quad| d_1(j)=l\}
+ }\\
+ =&\sum_{i\in h^{-1}(\{l\})}
+ \frac{
+ \#\left\{
+ j\in[|0,o-1|]\quad| d_0(j)=i
+ \text{ and }
+ d_1(j)=h(i)
+ \right\}
+ }{
+ \#\{j\in[|0,o-1|]\quad| d_1(j)=h(i)\}
+ }
+ \end{align*}
+ \begin{align}
+ \label{eq:sumei}
+ =&\sum_{i\in h^{-1}(\{l\})}
+ e_i(h(i))
+ \end{align}
+
+ Then, according to definition \ref{def:BA}
+ \begin{equation*}
+ BA_{[|0,n-1|]}^d(h) = \frac{1}{n}
+ \sum_{l=0}^{n-1}
+ \frac{
+ \#\left\{j\in [|0,o-1|]\quad| h(d_0(j))=d_1(j)\text{ and }d_1(j) = l\right\}
+ }
+ {\#\{j\in [|0,o-1|]\quad| d_1(j)=l\}}
+ \end{equation*}
+ We substitute the general term of this sum by the result obtained in equation \ref{eq:sumei}:
+
+ \begin{align*}
+ &BA_{[|0,n-1|]}^d(h)\\
+ =&\frac{1}{n}
+ \sum_{l=0}^{n-1}\sum_{i\in h^{-1}(\{l\})} e_i(h(i))\\
+ =&\frac{1}{n}
+ \sum_{l=0}^{n-1}\sum_{i=0}^{m-1}1_{h^{-1}(\{l\})}(i) e_i(h(i))\\
+ =&\frac{1}{n}
+ \sum_{i=0}^{m-1} e_i(h(i))\sum_{l=0}^{n-1}1_{h^{-1}(\{l\})}(i)\\
+ \end{align*}
+
+ Since $1_{h^{-1}(\{l\})}(i) = 1$ if and only if $l=h(i)$,
+ $\sum_{l=0}^{n-1}1_{h^{-1}(\{l\})}(i) = 1$.
+
+ Hence we have the expected result.
+\end{proof}
+
+This lemma allows us to shift the computing of the argmax from all possible functions of $B_{m\rightarrow n}$ to the entries of the matrix $M = \left(e_i(l)\right)_{i\in[|0,m-1|],l\in[|0,m-1|]}$.
+More precisely, we find the maximum of each row of $M$ resulting in parcouring once every element of $M$.
+We formalise this idea in the following theorem.
+
+\begin{theorem}
+ Let $e_i$ be the following $n$-tuples of $\mathbb{N}$:
+ \begin{equation*}
+ e_i:\left\{
+ \begin{matrix}
+ [|0,n-1|]&\longrightarrow&\mathbb{N}\\
+ l&\mapsto&
+ \frac{
+ \#\{j\in[|0,o-1|]\quad| d_0(j)=i\text{ and }d_1(j)=l\}
+ }{
+ \#\{j\in[|0,o-1|]\quad| d_1(j)=l\}
+ }\\
+ \end{matrix}
+ \right.
+ \end{equation*}
+ Let $f\in B_{m\rightarrow n}$ such that for all $i$ in $[|0,m-1|]$
+ \begin{equation*}
+ f(i) = \text{argmax}\left(e_i\right)
+ \end{equation*}
+
+ Then
+ \begin{equation*}
+ f = \text{argmax}\left(BA_{[|0,n-1|]}^d\right)
+ \end{equation*}
+\end{theorem}
+
+\begin{proof}
+ Let $g\in B_{m\rightarrow n}$.
+ We are going to show that $BA_{[|0,n-1|]}^d(g)\leq BA_{[|0,n-1|]}^d(f)$.
+ We start by saying that
+ for all $i\in[|0,n-1|]$, $0\leq e_i(g(i))\leq e_i(f(i))$.
+ Which gives us that
+ \begin{equation*}
+ \sum_{i=0}^{m-1}e_i(g(i)) \leq \sum_{i=0}^{m-1}e_i(f(i))
+ \end{equation*}
+ and hence
+ \begin{equation*}
+ \frac{1}{n}\sum_{i=0}^{m-1}e_i(g(i)) \leq \frac{1}{n}\sum_{i=0}^{m-1}e_i(f(i))
+ \end{equation*}
+
+ And finally, according to lemma \ref{lem:sumei} we have the result.
+\end{proof}
+
+According to this result, we can write the following algorithm in $\mathcal{O}(onm)$ to solve our optimisation problem.
+
+\begin{algorithm}
+ \caption{Optimisation: finding $\text{argmax}\left(BA^d_{[|0,n-1|]}\right)$}
+ \label{algo:argmax}
+ \begin{algorithmic}
+ \For{$i\gets 0,\cdots,m-1$}
+ \For{$l\gets 0,\cdots,n-1$}
+ \State $e_{i,l}\gets
+ \frac{
+ \#\{j\in[|0,o-1|]\quad | d_0(j)=i\text{ and }d_1(j)=l\}
+ }{
+ \#\{j\in[|0,o-1|]\quad | d_1(j)=l\}
+ }$
+ \footnotesize
+ \Comment{Compute $e_i(l)$}
+ \normalsize
+ \EndFor
+ \EndFor
+ \For{$i\gets 0,\cdots,n-1$}
+ \State $f(i)\gets\text{argmax}_l(e_{i,l})$
+ \footnotesize
+ \Comment{Value of $l$ that maximizes $e_{i,l}$}
+ \normalsize
+ \EndFor
+ \State \Return $f$
+ \end{algorithmic}
+\end{algorithm}
+
+\FloatBarrier
+\subsection{Extention to unseen data}
+Alogrithm \ref{algo:argmax} is an efficient algorithm to find a classifier the maximizes balanced accuracy on the set of indices.
+From the result $f$ of this alogrithm we find a classifier that solves the problem of maximizing the balanced accuracy on element by applying the inversse of $\Phi$.
+Hence $\Phi^{-1}(f)$ is solution.
+Computing it requires $\mathcal{O}(on)$ operations resulting in an overall complexity of $\mathcal{O}(onm)$.
+
+This classifier algorithm is limited to finite feature space but there are cases where we can find workaround to still use it.
+For instance, by using clusturing prior to our method we can reduce to a finit feature space.
+Also, if $(E, O)$ is a sub-topology we can match any element of the englobing set to its nearest counterpart in $E$.
+We did that on LAW and COMPAS dataset and compare our approach to a random forest.
+
+
+The main takeaway from figures \ref{fig:ba} and \ref{fig:time} is that our finite classifier alogirthm outperforms state of the art in terms of balanced accuracy and is way faster at achieving this result.
+