Please note that this book is a work in progress, the equivalent of a pre-alpha release. All the code should work, because if it didn’t the site could not be built. But there is still a lot of work to do to explain the historical methods under discussion. Feel free to leave feedback as issues on the GitHub repository, or to e-mail me.

Install mallet, an R wrapper around the Java-based MALLET topic modeling toolkit (it depends on rJava): install.packages("mallet")

library(mallet)
## Loading required package: rJava
library(tm)
## Loading required package: NLP
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Read in Tracts for the Times:

# Read the tracts from the data directory. The result is used further down
# through `tracts$id` and `tracts$text`.
tracts <- mallet.read.dir("data/tracts-for-the-times/")

Write the English stop word list from tm to a file so that it can be passed to mallet:

# Write tm's English stop word list to disk, one word per line; the same
# path is passed to mallet.import() later on.
stops <- stopwords("english")

# writeLines() accepts a file path and opens and closes the file itself, so
# no connection object has to be created and closed by hand (and none is left
# dangling if the write fails).
writeLines(stops, "data/stopwords.txt")

Create the mallet instances from the tract ids and texts, filtering out the stop words:

# Build the mallet instances from the tract ids and texts. The third argument
# is the path of the stop word file written in the previous step.
inst <- mallet.import(tracts$id, tracts$text, "data/stopwords.txt")

Create a topic model with 30 topics and load the documents into it:

# Set up an LDA topic model with 30 topics ...
topic_model <- MalletLDA(num.topics = 30)
# ... and hand it the instances built by mallet.import().
topic_model$loadDocuments(inst)

What do we have?

# topic_model$getVocabulary()[1:100]
# topic_model$getDocumentNames()

# Per-word counts for the loaded documents; the table has the columns
# `words`, `term.freq` and `doc.freq`.
freq <- mallet.word.freqs(topic_model)

# The 20 words with the highest term frequency, most frequent first.
# `wt` is spelled out on purpose: without it top_n() ranks by the last column
# (doc.freq), so the rows were sorted by term.freq but selected by doc.freq.
# NOTE(review): the printed table that follows this chunk in the rendered page
# came from the old `top_n(20)` call and will change on the next render.
freq %>%
  arrange(desc(term.freq)) %>%
  top_n(20, term.freq)
## Selecting by doc.freq
##        words term.freq doc.freq
## 1        god      3831       80
## 2     church      3606       79
## 3          s      3241       79
## 4     christ      2924       76
## 5         us      2676       79
## 6        may      2576       79
## 7        one      2359       78
## 8       will      1994       77
## 9       lord      1734       77
## 10      upon      1665       76
## 11        st      1407       75
## 12     shall      1281       76
## 13      made      1057       75
## 14       men      1011       76
## 15     great      1008       75
## 16       man       974       78
## 17       can       967       75
## 18      time       876       76
## 19      even       826       75
## 20 christian       761       75
## 21     times       496       79
## 22    number       255       80
# The same table ordered by document frequency, highest first.
# top_n() is deliberately left without `wt`: it then ranks by the last column,
# doc.freq, which is the column wanted here (hence the "Selecting by doc.freq"
# message). Ties at the cut-off are kept, so more than 20 rows can come back.
top_n(arrange(freq, desc(doc.freq)), 20)
## Selecting by doc.freq
##        words term.freq doc.freq
## 1     number       255       80
## 2        god      3831       80
## 3      times       496       79
## 4     church      3606       79
## 5        may      2576       79
## 6         us      2676       79
## 7          s      3241       79
## 8        one      2359       78
## 9        man       974       78
## 10      will      1994       77
## 11      lord      1734       77
## 12      upon      1665       76
## 13    christ      2924       76
## 14       men      1011       76
## 15      time       876       76
## 16     shall      1281       76
## 17        st      1407       75
## 18       can       967       75
## 19      made      1057       75
## 20     great      1008       75
## 21 christian       761       75
## 22      even       826       75

Now train the model and extract the document-topic and topic-word matrices:

# Fit the topic model; 500 is the argument passed to the trainer.
topic_model$train(500)

# Extract the fitted weights, smoothed and normalized. TRUE is written out in
# full because `T` is an ordinary variable that can be reassigned.
# doc_topics has one row per document; topic_words has one row per topic.
doc_topics <- mallet.doc.topics(
  topic_model,
  smoothed = TRUE,
  normalized = TRUE
)
topic_words <- mallet.topic.words(
  topic_model,
  smoothed = TRUE,
  normalized = TRUE
)

# Transposed copy: one row per topic, one column per document.
topic_docs <- t(doc_topics)

# Show the 20 highest-weighted words of topic 4 as a sample.
mallet.top.words(topic_model, topic_words[4, ], num.top.words = 20)
##        words     weights
## 1     christ 0.052891346
## 2       body 0.050358324
## 3      bread 0.030094147
## 4      blood 0.028675655
## 5  sacrament 0.016415828
## 6       wine 0.014896014
## 7  substance 0.013072238
## 8      given 0.010944500
## 9     things 0.010032612
## 10     flesh 0.009829970
## 11      said 0.009728649
## 12   fathers 0.009627328
## 13    manner 0.009627328
## 14     words 0.008715440
## 15      doth 0.008310157
## 16  presence 0.008006194
## 17   without 0.007904873
## 18        st 0.007904873
## 19   natural 0.007904873
## 20 spiritual 0.007600911
# Build a label for every topic from its 100 highest-weighted words.
# NOTE(review): `topics` is not passed to the plot at the end of this chunk.
topics <- mallet.topic.labels(topic_model, topic_words, num.top.words = 100)

# Turn the topics-by-documents matrix into a data frame whose columns are
# named after the tract ids.
topic_docs <- setNames(as.data.frame(topic_docs), tracts$id)

# Hierarchically cluster the topics by the distance between their word
# weights and draw the dendrogram. The hclust() call is kept as one
# expression because the plot takes its axis label from it.
clust <- hclust(dist(topic_words))
plot(clust)