##
##  Build window-based term-term DSM from 5 M word corpus entirely in R
##

##
## 1) Setup: load & preprocess data
##

# Our corpus is a 5 M word collection of Harry Potter fan fiction mined from the Web.
# It has been lemmatised and part-of-speech tagged with TreeTagger (standard parameter file).
# "potter_tokens.txt.gz" contains this corpus in one-word-per-line format, aka. "verticalised text";
# each line corresponds to a single token given in the form <lemma>_<pos>, e.g. "mouse_NOUN" for "mice".

# Read the compressed file into a vector of lines (a vector of mode "character" in R);
# each element of the vector is one token of the corpus in <lemma>_<pos> format.
potter.tokens <- readLines(gzfile("potter_tokens.txt.gz"))

# NB: it is possible that your Web browser automatically decompressed the file after the download
# (e.g., Safari on Mac OS X has an unfortunate tendency to do this).  In this case, you have to omit
# the gzfile(...) function from the call:
#   potter.tokens <- readLines("potter_tokens.txt")

# Now check that we have all 5 million tokens and take a first look at the corpus:
length(potter.tokens) # should be 5,000,010
head(potter.tokens, 200)

# If you want to try and read it as text, you have to concatenate the tokens with paste().
# NB: use collapse=" " to join the elements of a single vector into one long string;
# sep=" " would only separate *multiple* vector arguments and leaves a single vector unchanged.
cat(paste(potter.tokens[1:200], collapse=" "), "\n")

# You'll notice lots of tokens that are simply given as "*" in the text.  These include all
# punctuation symbols, numbers and words containing "funny" characters.  Such prior "cleaning"
# of the text makes large corpora more manageable; we'll do some more cleaning in R below.

# First, we're not interested in function words except for prepositions for this DSM
# (and we don't expect them to be useful as features), so we will now replace all
# function word tokens by "*" entries.
# For this purpose, we have to find all items in potter.tokens that end in the string "_FUNC";
# this is conveniently achieved with regular expressions and the grep function
# (the "$" anchors the match at the end of the token string).
idx <- grep("_FUNC$", potter.tokens)

# Now idx (short for "index") contains the indices of all such items in the vector potter.tokens
length(idx) # almost 1.5 million function words to be deleted

# We can use idx as a subscript to potter.tokens, which allows us to assign "*" to all these entries
potter.tokens[idx] <- "*"

# Compare against the first version above: many more "*" now and the text becomes difficult to decipher
cat(paste(potter.tokens[1:200], collapse=" "), "\n")

##
## 2) Types & frequency information
##

# The next step is to compile a list of word types (i.e. distinct words), which will
# become the rows and columns of our term-term matrix.  This can be done in different
# ways; here we use the table() function to obtain frequency counts for all types.
type.freqs <- table(potter.tokens)

# Number of types, most frequent types, number of hapaxes (f=1) and dis legomena (f=2)
length(type.freqs)   # number of distinct types
head(sort(type.freqs, decreasing = TRUE), 50) # 50 most frequent types
sum(type.freqs == 1) # number of hapax legomena (sum() over Boolean vector = number of TRUE items)
sum(type.freqs == 2) # number of dis legomena

# We certainly want to remove the hapax and dis legomena because they will not provide any
# useful distributional information.  But there are still more than 20,000 types left, which
# exceeds the problem sizes that 32-bit versions of R (which most people use) can handle.
# For this reason, we set a much higher frequency threshold, e.g. f >= 100
threshold <- 100  # leaves about 2,500 types; if you're adventurous, you can also try f >= 50

# Identify the "rare" words, i.e. types that fall below the frequency threshold.  This is a 
# bit tricky, because type.freqs is an integer vector of frequency counts _labelled_ with the
# strings.  We can access these labels with the names() function:
rare.types <- (names(type.freqs))[type.freqs < threshold]
length(rare.types) # number of "rare" words (which will be discarded in the next step)

# Now we want to find all tokens listed in rare.types and replace them by "*".  In Python or Perl,
# this is fairly simple: store rare.types as keys in a hash ("dictionary"), then loop through 
# potter.tokens and check for each token whether the string is contained in the hash.
# In R, however ... this task is even easier, thanks to the %in% operator:
idx <- potter.tokens %in% rare.types # TRUE for every token that is listed in rare.types
potter.tokens[idx] <- "*"

# If you look carefully at the text, a few more words (like "Bingblot" and "unrequited") are gone now
# (NB: collapse=" " joins the single vector into one string; sep=" " would leave it unchanged)
cat(paste(potter.tokens[1:200], collapse=" "), "\n")

# Now it's time to rebuild our list of types and type frequencies; then we can use the standard
# cross-tabulation with table() to generate a term-term co-occurrence matrix.  However, the
# resulting type list and co-occurrence matrix would include "*" as an extremely frequent word.
# In order to remove this spurious entry and reduce processing load in the cross-tabulation, we
# want to tell R to ignore all "*" items in the token list, i.e. replace them by "undefined" values
# (in R, an undefined value is written as NA, for "not available" or "n/a").
potter.tokens[potter.tokens == "*"] <- NA # note that NA is a constant like TRUE or FALSE, not a string "NA"

# There's a little catch here: the vocabulary still contains a "boilerplate" term "None_NAME"
# that always appears in the headers of stories ("Spoilers: None" etc.).  After our cleanup
# procedure, this term no longer has any co-occurrences left (all marked "*"), which leads
# to problems when we try to normalise the rows of our term-term matrix to a length of 1.
# For this reason, we will now delete the offending term.  A more general solution would be
# to examine the generated co-occurrence matrix M below for terms with few co-occurrences, and
# then remove them from the matrix (while not forgetting to update the type list etc. accordingly)
potter.tokens[potter.tokens == "None_NAME"] <- NA

# When we call table() to build a frequency list, R actually compiles a list of all types first and
# then transforms the character vector into a "factor", i.e. an indexed categorical variable with
# well-defined levels (corresponding to the distinct word types).  This means that (a) we had it all
# backwards when we extracted the list of types from the frequency table produced by table(), and
# (b) that R has to re-compile the factor for every tabulation and cross-tabulation we perform.
# In order to make processing more efficient, we will now convert the "cleaned" token vector explicitly
# into a factor.  If we had done this earlier, R would have kept the original type list as the levels
# of the factor, even when we remove the low-frequency types from the vector.
potter.tokens <- factor(potter.tokens) # by default, levels will be word types in alphabetical order
potter.types <- levels(potter.tokens)  # extract the pre-compiled type list from the factor object
type.freqs <- table(potter.tokens)     # we will need the updated frequency information later
V <- length(potter.types)              # V is the standard notation for vocabulary size (number of types)
head(potter.tokens, 200)  # still essentially the same ("Levels" line at the bottom reveals it is a factor)

# In your experiments later on, you will often want to select words from a certain part of speech,
# or pick a lemma regardless of whether it is a noun or verb, etc.  For this purpose, let us construct
# a data.frame (the standard R data structure for tabular data) that contains the following information:
#   idx   = row/column index of the word type in the co-occurrence matrix M (see below)
#   lemma = lemma (without part of speech)
#   pos   = part of speech code (e.g. NOUN)
#   f     = frequency of this word type
# Unlike e.g. Perl, R wasn't designed for string manipulation, so things become a bit tricky at
# this point (that's why we're setting up a data.frame that you can access more easily).
# It's often easiest to compute such tables outside R and load them from a text file.
# NB: this assumes every type contains exactly one "_" separator; if some type split into a
# different number of pieces, rbind() would silently recycle (with a warning) — TODO verify.
split.types <- do.call(rbind, strsplit(potter.types, "_", fixed=TRUE))
# seq_len(V) is the safe idiom for 1:V (it correctly yields an empty sequence if V were 0)
type.info <- data.frame(idx=seq_len(V), lemma=split.types[,1], pos=split.types[,2], f=as.double(type.freqs))
head(type.info, 25)

# NB: If you want to calculate DSM distances for nouns only, or check how many of the nearest
# neighbours are adjectives, you can use conditions on type.info to subset the matrix M,
# or use indices from M to access the relevant information about the terms in type.info

##
## 3) Window-based co-occurrence counts
##

# Our goal is to build a term-term co-occurrence matrix for a (L3, R3) window, i.e. each entry
# records how often the corresponding row word and column word type occur within 3 words of each other.
# R has no built-in support for such an "extended" cross-tabulation, so we need to compute the
# term-term matrix in multiple steps: one cross-tabulation per collocate offset, summed up.
# The window size is controlled by a single parameter W below, so you can easily experiment
# with larger (or smaller) windows.

# For this part, we use the "node" / "collocate" terminology of Sinclair (1991); i.e., each row of the
# matrix represents a node word, and the entries in this row are co-occurrence frequencies for all its
# collocates within the specified window (here L3, R3).

W <- 3 # half-window size: collocates up to W tokens to the left / right of the node

# First, let us build a vector of the node words.  Note that we cut off the start and end of potter.tokens
# where the (LW, RW) window does not fit completely in the corpus.
N <- length(potter.tokens) # corpus size N
idx <- (W+1):(N-W-1) # valid node positions; = (4, 5, ..., N-4) for W=3; NB: mind the parentheses!
node.tokens <- potter.tokens[idx] # these are all valid node positions in the corpus

# It is easy to build a co-occurrence matrix for a specific distance between node and collocate,
# e.g. if the collocate is the second word after the node (but not the first, and not before the node).
# To do this, we construct a suitable shifted (or "lagged") token vector for the collocates:
collocate.tokens <- potter.tokens[idx + 2] # resulting subscript is the vector (6, 7, 8, ..., N-3, N-2)

# For every node token in node.tokens, the corresponding entry of collocate.tokens contains the second
# word after this node, i.e. the collocate token at position +2.  Confused?  Let us print the two
# vectors side by side (as aligned rows) to get a better idea of what they look like.  Can you see how
# these vectors pair every node token with the second word after it?
data.frame(node=node.tokens[1:25], collocate=collocate.tokens[1:25])

# A cross-tabulation of node.tokens against a lagged collocate vector produces a term-term
# co-occurrence matrix for a single collocate position.  We now loop over every offset in the
# window (-W, ..., -1, +1, ..., +W; offset 0 would pair each node with itself) and add up the
# resulting co-occurrence matrices.  In order to save memory, we add one position at a time.
M <- 0 # scalar zero: adding the first table() to it yields that table unchanged
for (k in setdiff(-W:W, 0)) {
  collocate.tokens <- potter.tokens[idx + k]
  M <- M + table(node.tokens, collocate.tokens)
}

# As a plausibility test, let us check the frequencies of the most frequent words
idx <- order(type.freqs, decreasing=TRUE)[1:20] # row/column indices of 20 most frequent word types
M[idx, idx] # co-occurrences between these frequent words

# M and type.freqs are integer vectors, which can lead to overflow errors if we use them in
# arithmetic operations later on.  So now is a good time to convert them to floating-point (double):
mode(type.freqs) <- "double"  # NB: as.double() would discard labels and other information
mode(M) <- "double"

##
## 4) Feature scaling and normalisation
##

# We decide to use log frequencies as feature and normalise rows according to the Euclidean metric.
# You can and should try to experiment with different feature scaling and normalisation options.
# If you compute association measures, use N (= 5 million) as the sample size and multiply the
# expected frequency with the size of the co-occurrence window (a factor of 6 in our case); see
# Evert (2008) for an explanation of this scaling factor.

# Building M took considerable time and CPU power, so keep a pristine copy of the raw counts
# around — both as a safety net against mistakes and as a starting point for trying out
# alternative feature scalings later on.
M.orig <- M

# Feature scaling: "sparse" logarithmic frequencies, f' = log10(f + 1);
# the +1 shift maps zero counts to exactly 0, so empty cells stay empty.
M <- log10(M + 1)

##
## Now it's your turn: normalise the rows of M as you've learned in the lecture (or apply other
## scaling and normalisation methods of your choice), then calculate distances between the rows,
## find nearest neighbours, visualise some coordinates, plot a semantic map, etc.
##
