##
## Using R as a toy laboratory for DSM research
##

##
## 1) Setup (read data files)
##

# read.table() is the generic R function for reading tabular data in various text formats
# (such as CSV and TAB-delimited text).  The data are loaded more efficiently if we tell
# R that all values are strings; we also need to make sure that R doesn't misinterpret a
# stray " character as the start of a (multi-line) quoted string.  Setting column names
# makes it easier to access the data.  Note that R can uncompress files on the fly; here
# we use the gzfile() function for GZip-compressed data.  If your browser has automatically
# uncompressed the file after download, you can simply omit the gzfile() call.  If you
# experience any problems with the gzfile() function (we heard such reports in R for Windows),
# try to uncompress the file outside R, and then load the resulting text file without gzfile().

# Load the verb-object co-occurrence tokens.  colClasses="character" skips
# type guessing (faster for large files); quote="" prevents a stray " from
# being misread as the start of a multi-line quoted field; col.names labels
# the two columns so they can be accessed as tokens$verb and tokens$noun.
tokens <- read.table(gzfile("bnc_vobj_filtered.txt.gz"), colClasses="character", quote="", col.names=c("verb", "noun"))

# The variable 'tokens' now holds co-occurrence tokens as a table
# (in R lingo, such tables are called a data.frame)

# Size of the table (rows, columns) and first 10 rows
dim(tokens)
head(tokens, 10)

# Small illustration matrix will be built for these selected nouns and verbs
selected.nouns <- c("knife","cat","dog","boat","cup","pig")
selected.verbs <- c("get","see","use","hear","eat","kill")

##
## 2) Build co-occurrence matrix
##

# The %in% operator tests set membership (vectorised); note the single &
# for the elementwise logical "and" (we are combining two logical vectors)
tokens <- subset(tokens, verb %in% selected.verbs & noun %in% selected.nouns)

# How many co-occurrence tokens are left?
dim(tokens)
head(tokens, 10)

# Construct matrix of co-occurrence counts (contingency table)
M <- table(tokens$noun, tokens$verb)
M

# Use subscripts to extract row and column vectors
M["cat", ]
M[, "use"]

# For calculating association scores, we need the marginal frequencies
# of the nouns and verbs; for simplicity, we obtain them by summing over the
# rows and columns of the table (this is not mathematically correct!)
f.nouns <- rowSums(M)
f.verbs <- colSums(M)
N <- sum(M)  # sample size (sum over all cells of the table)

# Calculate expected frequencies from row and column marginals;
# outer() computes the same outer product as f.nouns %*% t(f.verbs),
# but keeps the row/column names so the printed table is labelled
E <- outer(f.nouns, f.verbs) / N
round(E, 1)

# Observed frequencies are simply the entries of M
O <- M

##
## 3) Feature scaling
##

# Because of Zipf's law, frequency distributions are highly skewed;
# DSM matrix M will be dominated by high-frequency entries

# Solution 1: transform into log-frequencies
M1 <- log10(M + 1)  # discounted (+1) to avoid log(0)
round(M1, 2)

# Solution 2: association scores, e.g. t-score
M2 <- (O - E) / sqrt(O + 1)  # discounted to avoid division by 0
round(M2, 2)

# "Sparse" association measures set all negative associations to 0;
# can be done with ifelse(), a vectorised if statement.  We test O > E
# (strictly greater) rather than O >= E: ties yield a score of 0 either
# way, and the strict test avoids 0/0 = NaN for cells with O = E = 0.
M3 <- ifelse(O > E, (O - E) / sqrt(O), 0)
round(M3, 2)

# Pick your favourite scaling method here!
M <- M2

# A simple visualisation: plot nouns in two selected dimensions
M.2d <- M[, c("get", "use")]
round(M.2d, 2)

# Two-column matrix automatically interpreted as x- and y-coordinates
plot(M.2d, pch=20, col="red", main="DSM visualisation")
# Add labels: the text strings are the rownames of M
text(M.2d, labels=rownames(M.2d), pos=3)

#-- internal use: plot for lecture slides
par(cex=1.2, mar=c(4,4,2,1)+.1)
plot(M.2d, pch=19, col="red", main="DSM visualisation", xlim=c(-5,5), ylim=c(-10,10))
text(M.2d, labels=rownames(M.2d), pos=3)
dev.copy2pdf(file="dsm_lab_2d_plot.pdf", onefile=FALSE, bg="white")

##
## 4) Norms and distances
##

# Euclidean (L2) norm of a numeric vector x: the square root of the
# sum of squared components.  Note how closely the R definition mirrors
# the mathematical one.
euclid.norm <- function (x) {
  sqrt(sum(x^2))
}

# Euclidean distance between vectors x and y, i.e. the Euclidean norm
# of the difference vector (x - y), written out explicitly here
euclid.dist <- function (x, y) {
  delta <- x - y
  sqrt(sum(delta * delta))
}

# Note: We will discuss alternative norms and distances on Thursday!

# Compute lengths (norms) of all row vectors
# Compute the Euclidean length (norm) of every row vector of M
row.norms <- apply(M, 1, euclid.norm) # 1 = rows, 2 = columns
round(row.norms, 2)

# Normalisation: divide each row by its norm; this is a rescaling of the
# row "dimensions" and can be done by multiplying from the left with a
# diagonal matrix holding the reciprocal norms
scaling.matrix <- diag(1 / row.norms)
round(scaling.matrix, 3)

# Every row of M.norm is now a unit vector
M.norm <- scaling.matrix %*% M
round(M.norm, 2)

##
## 5) Nearest neighbours
##

# Matrix multiplication has lost the row labels (copy from M)
# Matrix multiplication has lost the row labels (copy them back from M)
rownames(M.norm) <- rownames(M)

# Distances of all terms from "dog": apply euclid.dist() to each row of
# the normalised matrix, with the "dog" vector as the second argument
v.dog <- M.norm["dog", ]
dist.dog <- apply(M.norm, 1, function(row) euclid.dist(row, v.dog))

# Sorting the distance vector puts the nearest neighbours first
sort(dist.dog)

# R's built-in dist() computes all pairwise row distances at once
distances <- dist(M.norm, method="euclidean")
round(distances, 2)

# For nearest-neighbour search, expand the triangular distance object
# into a full symmetric matrix and extract distance vectors from its rows
dist.matrix <- as.matrix(distances)
sort(dist.matrix["dog", ])

##
## 6) Clustering & semantic maps
##

# The distance matrix is also the basis for a hierarchical cluster
# analysis; plotting an hclust object draws the dendrogram
plot(hclust(distances))

# Visualisation as semantic map by projection into 2-dimensional space;
# isoMDS() from the MASS package performs non-linear (non-metric)
# multidimensional scaling; $points holds the 2-D coordinates
library(MASS)
M.mds <- isoMDS(distances)$points

# Plot works as for the two selected dimensions above
plot(M.mds, pch=20, col="red", main="Semantic map", xlab="Dim 1", ylab="Dim 2")
text(M.mds, labels=rownames(M.mds), pos=3)

#-- internal use: plots for lecture slides
# (larger text for projection, trimmed margins, fixed axis ranges)
par(cex=1.4, mar=c(4,4,2,1)+.1)
plot(M.mds, pch=19, col="red", main="Semantic map", xlim=c(-1,1), ylim=c(-1,1), xlab="Dim 1", ylab="Dim 2")
text(M.mds, labels=rownames(M.mds), pos=3)
dev.copy2pdf(file="dsm_lab_semantic_map.pdf", onefile=FALSE, bg="white")

# Dendrogram again, with an axis label, saved for the slides
plot(hclust(distances), xlab="Euclidean distance")
dev.copy2pdf(file="dsm_lab_cluster.pdf", onefile=FALSE, bg="white")
