##
##  Build window-based term-term DSM from 5 M word corpus entirely in R
##

##
## 1) Setup: load & preprocess data
##

# Our corpus is a 5 M word collection of Harry Potter fan fiction mined from the Web.
# It has been lemmatised and part-of-speech tagged with TreeTagger (standard parameter file).
# "potter_tokens.txt.gz" contains this corpus in one-word-per-line format, aka. "verticalised text";
# each line corresponds to a single token given in the form <lemma>_<pos>, e.g. "mouse_NOUN" for "mice".

# Read the compressed file into a vector of lines (a vector of mode "character" in R);
# each element of the vector is one token of the corpus in <lemma>_<pos> format.
potter.tokens <- readLines(gzfile("potter_tokens.txt.gz"))

# NB: it is possible that your Web browser automatically decompressed the file after the download
# (e.g., Safari on Mac OS X has an unfortunate tendency to do this).  In this case, you have to omit
# the gzfile(...) function from the call:
#   potter.tokens <- readLines("potter_tokens.txt")

# Now check that we have all 5 million tokens and take a first look at the corpus:
length(potter.tokens) # should be 5,000,010
head(potter.tokens, 200)

# If you want to try and read it as text, you have to concatenate the tokens with paste().
# NB: use collapse=" " to join the elements of a single vector into one long string;
# sep=" " would only separate *multiple* vector arguments and leaves a single vector unchanged.
cat(paste(potter.tokens[1:200], collapse=" "), "\n")

# You'll notice lots of tokens that are simply given as "*" in the text.  These include all
# punctuation symbols, numbers and words containing "funny" characters.  Such prior "cleaning"
# of the text makes large corpora more manageable; we'll do some more cleaning in R below.

# First, we're not interested in function words except for prepositions for this DSM
# (and we don't expect them to be useful as features), so we will now replace all
# function word tokens by "*" entries.
# For this purpose, we have to find all items in potter.tokens that end in the string "_FUNC";
# this is conveniently achieved with regular expressions and the grep function
# (the "$" anchors the match at the end of the token string).
idx <- grep("_FUNC$", potter.tokens)

# Now idx (short for "index") contains the indices of all such items in the vector potter.tokens
length(idx) # almost 1.5 million function words to be deleted

# We can use idx as a subscript to potter.tokens, which allows us to assign "*" to all these entries
potter.tokens[idx] <- "*"

# Compare against the first version above: many more "*" now and the text becomes difficult to decipher
cat(paste(potter.tokens[1:200], collapse=" "), "\n")

##
## 2) Types & frequency information
##

# The next step is to compile a list of word types (i.e. distinct words), which will
# become the rows and columns of our term-term matrix.  This can be done in different
# ways; here we use the table() function to obtain frequency counts for all types.
type.freqs <- table(potter.tokens)

# Number of types, most frequent types, number of hapaxes (f=1) and dis legomena (f=2)
length(type.freqs)   # number of distinct types
head(sort(type.freqs, decreasing = TRUE), 50) # 50 most frequent types
sum(type.freqs == 1) # number of hapax legomena (sum() over Boolean vector = number of TRUE items)
sum(type.freqs == 2) # number of dis legomena

# We certainly want to remove the hapax and dis legomena because they will not provide any
# useful distributional information.  But there are still more than 20,000 types left, which
# exceeds the problem sizes that 32-bit versions of R (which most people use) can handle.
# For this reason, we set a much higher frequency threshold, e.g. f >= 100
threshold <- 100  # leaves about 2,500 types; if you're adventurous, you can also try f >= 50

# Identify the "rare" words, i.e. types that fall below the frequency threshold.  This is a 
# bit tricky, because type.freqs is an integer vector of frequency counts _labelled_ with the
# strings.  We can access these labels with the names() function:
rare.types <- (names(type.freqs))[type.freqs < threshold]
length(rare.types) # number of "rare" words (which will be discarded in the next step)

# Now we want to find all tokens listed in rare.types and replace them by "*".  In Python or Perl,
# this is fairly simple: store rare.types as keys in a hash ("dictionary"), then loop through 
# potter.tokens and check for each token whether the string is contained in the hash.
# In R, however ... this task is even easier, thanks to the %in% operator:
idx <- potter.tokens %in% rare.types # TRUE for every token that is listed in rare.types
potter.tokens[idx] <- "*"

# If you look carefully at the text, a few more words (like "Bingblot" and "unrequited") are gone now
# (NB: collapse=" " joins the single vector into one string; sep=" " would leave it unchanged)
cat(paste(potter.tokens[1:200], collapse=" "), "\n")

# Now it's time to rebuild our list of types and type frequencies; then we can use the standard
# cross-tabulation with table() to generate a term-term co-occurrence matrix.  However, the
# resulting type list and co-occurrence matrix would include "*" as an extremely frequent word.
# In order to remove this spurious entry and reduce processing load in the cross-tabulation, we
# want to tell R to ignore all "*" items in the token list, i.e. replace them by "undefined" values
# (in R, an undefined value is written as NA, for "not available" or "n/a").
potter.tokens[potter.tokens == "*"] <- NA # note that NA is a constant like TRUE or FALSE, not a string "NA"

# There's a little catch here: the vocabulary still contains a "boilerplate" term "None_NAME"
# that always appears in the headers of stories ("Spoilers: None" etc.).  After our cleanup
# procedure, this term no longer has any co-occurrences left (all marked "*"), which leads
# to problems when we try to normalise the rows of our term-term matrix to a length of 1.
# For this reason, we will now delete the offending term.  A more general solution would be
# to examine the generated co-occurrence matrix M below for terms with few co-occurrences, and
# then remove them from the matrix (while not forgetting to update the type list etc. accordingly)
potter.tokens[potter.tokens == "None_NAME"] <- NA

# When we call table() to build a frequency list, R actually compiles a list of all types first and
# then transforms the character vector into a "factor", i.e. an indexed categorical variable with
# well-defined levels (corresponding to the distinct word types).  This means that (a) we had it all
# backwards when we extracted the list of types from the frequency table produced by table(), and
# (b) that R has to re-compile the factor for every tabulation and cross-tabulation we perform.
# In order to make processing more efficient, we will now convert the "cleaned" token vector explicitly
# into a factor.  If we had done this earlier, R would have kept the original type list as the levels
# of the factor, even when we remove the low-frequency types from the vector.
potter.tokens <- factor(potter.tokens) # by default, levels will be word types in alphabetical order
potter.types <- levels(potter.tokens)  # extract the pre-compiled type list from the factor object
type.freqs <- table(potter.tokens)     # we will need the updated frequency information later
V <- length(potter.types)              # V is the standard notation for vocabulary size (number of types)
head(potter.tokens, 200)  # still essentially the same ("Levels" line at the bottom reveals it is a factor)

# In your experiments later on, you will often want to select words from a certain part of speech,
# or pick a lemma regardless of whether it is a noun or verb, etc.  For this purpose, let us construct
# a data.frame (the standard R data structure for tabular data) that contains the following information:
#   idx   = row/column index of the word type in the co-occurrence matrix M (see below)
#   lemma = lemma (without part of speech)
#   pos   = part of speech code (e.g. NOUN)
#   f     = frequency of this word type
# Unlike e.g. Perl, R wasn't designed for string manipulation, so things become a bit tricky at
# this point (that's why we're setting up a data.frame that you can access more easily).
# It's often easiest to compute such tables outside R and load them from a text file.
# NB: this assumes every type contains exactly one "_" separator; if some type split into a
# different number of pieces, rbind() would silently recycle (with a warning) — TODO verify.
split.types <- do.call(rbind, strsplit(potter.types, "_", fixed=TRUE))
# seq_len(V) is the safe idiom for 1:V (it correctly yields an empty sequence if V were 0)
type.info <- data.frame(idx=seq_len(V), lemma=split.types[,1], pos=split.types[,2], f=as.double(type.freqs))
head(type.info, 25)

# NB: If you want to calculate DSM distances for nouns only, or check how many of the nearest
# neighbours are adjectives, you can use conditions on type.info to subset the matrix M,
# or use indices from M to access the relevant information about the terms in type.info

##
## 3) Window-based co-occurrence counts
##

# Our goal is to build a term-term co-occurrence matrix for a (L3, R3) window, i.e. each entry
# records how often the corresponding row word and column word type occur within 3 words of each other.
# R has no built-in support for such an "extended" cross-tabulation, so we need to compute the
# term-term matrix in multiple steps: one cross-tabulation per collocate offset, summed up.
# The window size is controlled by a single parameter W below, so you can easily experiment
# with larger (or smaller) windows.

# For this part, we use the "node" / "collocate" terminology of Sinclair (1991); i.e., each row of the
# matrix represents a node word, and the entries in this row are co-occurrence frequencies for all its
# collocates within the specified window (here L3, R3).

W <- 3 # half-window size: collocates up to W tokens to the left / right of the node

# First, let us build a vector of the node words.  Note that we cut off the start and end of potter.tokens
# where the (LW, RW) window does not fit completely in the corpus.
N <- length(potter.tokens) # corpus size N
idx <- (W+1):(N-W-1) # valid node positions; = (4, 5, ..., N-4) for W=3; NB: mind the parentheses!
node.tokens <- potter.tokens[idx] # these are all valid node positions in the corpus

# It is easy to build a co-occurrence matrix for a specific distance between node and collocate,
# e.g. if the collocate is the second word after the node (but not the first, and not before the node).
# To do this, we construct a suitable shifted (or "lagged") token vector for the collocates:
collocate.tokens <- potter.tokens[idx + 2] # resulting subscript is the vector (6, 7, 8, ..., N-3, N-2)

# For every node token in node.tokens, the corresponding entry of collocate.tokens contains the second
# word after this node, i.e. the collocate token at position +2.  Confused?  Let us print the two
# vectors side by side (as aligned rows) to get a better idea of what they look like.  Can you see how
# these vectors pair every node token with the second word after it?
data.frame(node=node.tokens[1:25], collocate=collocate.tokens[1:25])

# A cross-tabulation of node.tokens against a lagged collocate vector produces a term-term
# co-occurrence matrix for a single collocate position.  We now loop over every offset in the
# window (-W, ..., -1, +1, ..., +W; offset 0 would pair each node with itself) and add up the
# resulting co-occurrence matrices.  In order to save memory, we add one position at a time.
M <- 0 # scalar zero: adding the first table() to it yields that table unchanged
for (k in setdiff(-W:W, 0)) {
  collocate.tokens <- potter.tokens[idx + k]
  M <- M + table(node.tokens, collocate.tokens)
}

# As a plausibility test, let us check the frequencies of the most frequent words
idx <- order(type.freqs, decreasing=TRUE)[1:20] # row/column indices of 20 most frequent word types
M[idx, idx] # co-occurrences between these frequent words

# M and type.freqs are integer vectors, which can lead to overflow errors if we use them in
# arithmetic operations later on.  So now is a good time to convert them to floating-point (double):
mode(type.freqs) <- "double"  # NB: as.double() would discard labels and other information
mode(M) <- "double"

##
## 4) Feature scaling and normalisation
##

# We decide to use log frequencies as feature and normalise rows according to the Euclidean metric.
# You can and should try to experiment with different feature scaling and normalisation options.
# If you compute association measures, use N (= 5 million) as the sample size and multiply the
# expected frequency with the size of the co-occurrence window (a factor of 6 in our case); see
# Evert (2008) for an explanation of this scaling factor.

# Building M took considerable time and CPU power, so keep a pristine copy of the raw counts
# around — both as a safety net against mistakes and as a starting point for trying out
# alternative feature scalings later on.
M.orig <- M

# Feature scaling: "sparse" logarithmic frequencies, f' = log10(f + 1);
# the +1 shift maps zero counts to exactly 0, so empty cells stay empty.
M <- log10(M + 1)

##
## Now it's your turn: normalise the rows of M as you've learned in the lecture (or apply other
## scaling and normalisation methods of your choice), then calculate distances between the rows,
## find nearest neighbours, visualise some coordinates, plot a semantic map, etc.
##
