--- title: "ETC3250/5250: Dimension reduction" subtitle: "Semester 1, 2020" author: "
Professor Di Cook

Monash University" date: "Week 4 (b)" output: xaringan::moon_reader: css: ["kunoichi", "ninjutsu", "mystyle.css", "libs/animate.css"] lib_dir: libs nature: ratio: '16:9' highlightStyle: github highlightLines: true countIncrementalSlides: false --- {r setup, include=FALSE} library(knitr) knitr::opts_chunkset(tidy = FALSE, message = FALSE, warning = FALSE, echo = FALSE, fig.width=8, fig.height=6, fig.align = "center", fig.retina = 4) options(htmltools.dir.version = FALSE) library(magick)  ## PCA vs LDA .tip[.orange[Discriminant space]: is the low-dimensional space where the class means are the furthest apart relative to the common variance-covariance.] The discriminant space is provided by the eigenvectors after making an eigen-decomposition of\Sigma^{-1}\Sigma_B$, where $$\small{\Sigma_B = \frac{1}{K}\sum_{i=1}^{K} (\mu_i-\mu)(\mu_i-\mu)'} ~~~\text{and}~~~ \small{\Sigma = \frac{1}{K}\sum_{k=1}^K\frac{1}{n_k}\sum_{i=1}^{n_k} (x_i-\mu_k)(x_i-\mu_k)'}$$ --- class: split-two layout: false .column[.pad50px[ ## Mahalanobis distance For two$p$-dimensional vectors, Euclidean distance is $$d(x,y) = \sqrt{(x-y)'(x-y)}$$ and Mahalanobs distance is $$d(x,y) = \sqrt{(x-y)'\Sigma^{-1}(x-y)}$$ Which points are closest according to .orange[Euclidean] distance? Which points are closest relative to the .orange[variance-covariance]? ]] .column[.content.vmiddle.center[ {r} countdown::countdown(minutes=0, seconds=30, left = 0, right = 0, padding = "1px", margin = "1%", font_size = "1em", style = "position: relative; width: min-content;")  {r} # Utility functions library(tidyverse) f.norm.vec<-function(x) { x<-x/f.norm(x) x } f.norm<-function(x) { sqrt(sum(x^2)) } f.gen.sphere<-function(n=100,p=5) { x<-matrix(rnorm(n*p),ncol=p) xnew<-t(apply(x,1,f.norm.vec)) xnew } f.vc.ellipse <- function(vc, xm, n=500) { p<-ncol(vc) x<-f.gen.sphere(n,p) evc<-eigen(vc) vc2<-(evc$vectors)%*%diag(sqrt(evc$values))%*%t(evc$vectors) x<-x%*%vc2 x + matrix(rep(xm, each=n),ncol=p) } df <- f.vc.ellipse(vc=matrix(c(1,1.2,1.2,2), ncol=2), xm=c(0,0), n=1000) df <- as_tibble(df)  {r} pts <- tibble(V1=c(0, -0.5, -0.8), V2=c(0, 0.5, -1.1), label=c("A", "B", "C")) ggplot(df, aes(x=V1, y=V2)) + geom_point() + geom_point(data=pts, aes(x=V1, y=V2, colour=label)) + geom_text(data=pts, aes(x=V1, y=V2, label=label), nudge_x = 0.1, nudge_y = 0.1) + scale_colour_brewer("", palette="Dark2") + xlim(c(-1.5, 1.5)) + ylim(c(-1.5, 1.5)) + theme(legend.position = "none", aspect.ratio=1)  ]] --- ## Discriminant space Both means the same. Two different variance-covariance matrices. .purple[Discriminant space] depends on the variance-covariance matrix. 
{r out.width="70%", fig.width=8} library(gridExtra) df1 <- f.vc.ellipse(vc=matrix(c(1,1.2,1.2,2), ncol=2), xm=c(0,0), n=1000) df1 <- as_tibble(df1) df2 <- f.vc.ellipse(vc=matrix(c(1,-0.3,-0.3,0.5), ncol=2), xm=c(0,0), n=1000) df2 <- as_tibble(df2) means <- tibble(V1=c(0.5, -0.5), V2=c(-0.5, 0.5), label=c("mu1", "mu2")) df3 <- df1 %>% mutate(V1=V1+means$V1, V2=V2+means$V2) df4 <- df1 %>% mutate(V1=V1+means$V1, V2=V2+means$V2) df <- bind_rows(df3, df4) p1 <- ggplot(df, aes(x=V1, y=V2)) + geom_point() + geom_point(data=means, aes(x=V1, y=V2, colour=label)) + geom_text(data=means, aes(x=V1, y=V2, label=label), nudge_x = 0.2, nudge_y = 0.2) + geom_abline(intercept=0, slope=-0.67, colour="purple") + scale_colour_brewer("", palette="Dark2") + xlim(c(-2, 2)) + ylim(c(-2, 2)) + theme(legend.position = "none", aspect.ratio=1) + ggtitle("Scenario 1") df3 <- df2 %>% mutate(V1=V1+means$V1, V2=V2+means$V2) df4 <- df2 %>% mutate(V1=V1+means$V1, V2=V2+means$V2) df <- bind_rows(df3, df4) p2 <- ggplot(df, aes(x=V1, y=V2)) + geom_point() + geom_point(data=means, aes(x=V1, y=V2, colour=label)) + geom_text(data=means, aes(x=V1, y=V2, label=label), nudge_x = 0.2, nudge_y = 0.2) + geom_abline(intercept=0, slope=3.03, colour="purple") + scale_colour_brewer("", palette="Dark2") + xlim(c(-1.7, 1.7)) + ylim(c(-1.7, 1.7)) + theme(legend.position = "none", aspect.ratio=1) + ggtitle("Scenario 2") grid.arrange(p1, p2, ncol=2)  {r eval=FALSE} # This code helps estimate the slope in the above diagram library(mvtnorm) library(MASS) mydat1 <- data.frame(rbind(rmvnorm(250, mean=c(0.5, -0.5), sigma=matrix(c(1,1.2,1.2,2), ncol=2)), rmvnorm(250, mean=c(-0.5, 0.5), sigma=matrix(c(1,1.2,1.2,2), ncol=2)))) mydat1$class <- c(rep(1, 250), rep(2, 250)) lda(class~X1+X2, data=mydat1) mydat2 <- data.frame(rbind( rmvnorm(250, mean=c(0.5, -0.5), sigma=matrix(c(1,-0.3,-0.3,0.5), ncol=2)), rmvnorm(250, mean=c(-0.5, 0.5), sigma=matrix(c(1,-0.3,-0.3,0.5), ncol=2)))) mydat2$class <- c(rep(1, 250), rep(2, 250)) lda(class~X1+X2, data=mydat2)  --- ## Projection pursuit (PP) generalises PCA .green[PCA:] $$\mathop{\text{maximize}}_{\phi_{11},\dots,\phi_{p1}} \frac{1}{n}\sum_{i=1}^n \left(\sum_{j=1}^p \phi_{j1}x_{ij}\right)^{\!\!\!2} \text{ subject to } \sum_{j=1}^p \phi^2_{j1} = 1$$ .green[PP:] $$\mathop{\text{maximize}}_{\phi_{11},\dots,\phi_{p1}} f\left(\sum_{j=1}^p \phi_{j1}x_{ij}\right) \text{ subject to } \sum_{j=1}^p \phi^2_{j1} = 1$$ --- ## MDS .tip[.orange[Multidimensional scaling (MDS)] finds a low-dimensional layout of points that minimises the difference between distances computed in the *p*-dimensional space, and those computed in the low-dimensional space. ] $$\mbox{Stress}_D(x_1, ..., x_N) = \left(\sum_{i, j=1; i\neq j}^N (d_{ij} - d_k(i,j))^2\right)^{1/2}$$ where $D$ is an $N\times N$ matrix of distances $(d_{ij})$ between all pairs of points, and $d_k(i,j)$ is the distance between the points in the low-dimensional space. --- class: split-two ## MDS .column[.pad50px[
---
# Challenge

For each of these distance matrices, find a layout in 1 or 2D that accurately reflects the full distances.

```{r}
d1 <- tibble(name = c("A", "B", "C"),
             A = c(0.1, 3.2, 3.9),
             B = c(3.2, -0.1, 5.1),
             C = c(3.9, 5.1, 0))
d1
d2 <- tibble(name = c("A", "B", "C", "D"),
             A = c(0.1, 0.9, 2.1, 3.0),
             B = c(0.9, 0.0, 1.1, 1.9),
             C = c(2.1, 1.1, 0.1, 1.1),
             D = c(3.0, 1.9, 1.1, -0.1))
d2
```

```{r}
countdown::countdown(minutes=0, seconds=30, left = 0, right = 0,
                     padding = "1px", margin = "1%", font_size = "1em",
                     style = "position: relative; width: min-content;")
```

---
## Non-linear dimension reduction

- .orange[t-distributed Stochastic Neighbor Embedding (t-SNE)]: similar to MDS, except emphasis is placed on grouping observations into clusters. Observations within a cluster are placed close together in the low-dimensional representation, and clusters themselves are placed far apart.

---
## Non-linear dimension reduction

- .orange[Local linear embedding (LLE)]: finds the nearest neighbours of each point, defines interpoint distances relative to those neighbours, and preserves these local proximities in the low-dimensional mapping. The low-dimensional coordinates are obtained from an eigen-decomposition of a matrix built from the nearest-neighbour weights.

---
## Non-linear dimension reduction

- .orange[Self-organising maps (SOM)]: first clusters the observations into $k \times k$ groups, then uses the mean of each group, laid out on a constrained 2D grid, to create a 2D projection.

---
layout: false

# `r set.seed(2022); emo::ji("technologist")` Made by a human with a computer

### Slides at [https://iml.numbat.space](https://iml.numbat.space).
### Code and data at [https://github.com/numbats/iml](https://github.com/numbats/iml).