Hands-on Tutorials (Day Two)

Topic modeling

Install the required packages for this tutorial.

# Package names
packages <- c(

# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
# Put quanteda's UK 2010 immigration texts into a data frame and keep the
# names of the character vector (the parties) as a separate column.
ukimmig2010 <- data.frame(text = quanteda::data_char_ukimmig2010)
ukimmig2010[["party"]] <- names(quanteda::data_char_ukimmig2010)

# Lower-case the raw text and lemmatize it in one step.
ukimmig2010 <- ukimmig2010 %>%
  mutate(lemma = lemmatize_strings(tolower(text)))

# Build the corpus from the lemmatized text, using the party as document id.
ukimmig2010_corpus <- quanteda::corpus(
  ukimmig2010,
  docid_field = "party",
  text_field = "lemma"
)

# Tokenize, clean and trim the corpus into a document-feature matrix.
# The intermediate objects live inside local() so that only the final
# dfm is created in the workspace.
ukimmig2010_dfm <- local({
  # Tokenize, dropping punctuation, symbols and numbers.
  toks <- quanteda::tokens(
    ukimmig2010_corpus,
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE
  )
  # Lower-case the tokens and remove English stopwords.
  toks <- tokens_remove(tokens_tolower(toks), stopwords("en"))
  # Keep features whose document frequency, as a proportion of documents,
  # lies between 0.5 and 0.99.
  trimmed <- dfm_trim(
    dfm(toks),
    min_docfreq = 0.5,
    max_docfreq = 0.99,
    docfreq_type = "prop"
  )
  # Drop documents left without any token after trimming.
  dfm_subset(trimmed, ntoken(trimmed) > 0)
})
# Convert to the document-term matrix format expected by topicmodels.
ukimmig2010_dtm <- convert(ukimmig2010_dfm, to = "topicmodels")

The basic function to fit a topic model is LDA:

topicModel <- LDA(ukimmig2010_dtm, 
                  k = 5, 

topicmodels::terms(topicModel, 5)
     Topic 1   Topic 2  Topic 3      Topic 4       Topic 5  
[1,] "control" "new"    "uk"         "people"      "asylum" 
[2,] "year"    "border" "british"    "system"      "right"  
[3,] "work"    "must"   "immigrant"  "citizenship" "country"
[4,] "eu"      "can"    "national"   "end"         "much"   
[5,] "student" "live"   "government" "refugee"     "illegal"

Considering the output of the function LDA, the beta matrix includes the information about the distribution of terms by topics.

# Per-topic term probabilities (the beta matrix) in tidy format:
# one row per topic-term pair.
tidy_model <- tidy(topicModel)

# Keep the 10 highest-probability terms of each topic (ties are kept).
# slice_max() replaces the superseded top_n().
top_terms <- tidy_model %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One panel per topic. reorder_within() and scale_x_reordered() order the
# bars inside each facet independently of the other facets.
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta)) +
  geom_col() +
  scale_x_reordered() +
  facet_wrap(~ topic, scales = "free_x") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Information about the distribution of topics in each document is in the gamma matrix.

# Per-document topic proportions (the gamma matrix) in tidy format:
# one row per document-topic pair.
tidy_model_gamma <- tidy(topicModel, matrix = "gamma")
# A tibble: 6 × 3
  document     topic gamma
  <chr>        <int> <dbl>
1 BNP              1 0.167
2 Coalition        1 0.258
3 Conservative     1 0.305
4 Greens           1 0.215
5 Labour           1 0.243
6 LibDem           1 0.156
# Relabel the documents with the party names of the original data set.
# NOTE(review): the corpus was built with docid_field = "party", so the
# document ids already look like party names and this substitution may be
# a no-op -- confirm before relying on it.
tidy_model_gamma$document <- mgsub::mgsub(
  string = tidy_model_gamma$document,
  pattern = unique(tidy_model_gamma$document),
  replacement = names(data_char_ukimmig2010),
  fixed = TRUE
)

# Topic proportions (gamma) by topic, one panel per document.
ggplot(tidy_model_gamma, aes(x = topic, y = gamma)) +
  geom_col() +
  facet_wrap(vars(document), nrow = 3)

You may want to assign the most prevalent topic to each document in the corpus.

# Store each document's most likely topic as the document variable
# "pred_topic" of the corpus.
docvars(ukimmig2010_corpus, "pred_topic") <- topicmodels::topics(topicModel)
'data.frame':   9 obs. of  2 variables:
 $ text      : chr  "IMMIGRATION: AN UNPARALLELED CRISIS WHICH ONLY THE BNP CAN SOLVE. \n\n- At current immigration and birth rates,"| __truncated__ "IMMIGRATION. \n\nThe Government believes that immigration has enriched our culture and strengthened our economy"| __truncated__ "Attract the brightest and best to our country.\n\nImmigration has enriched our nation over the years and we wan"| __truncated__ "Immigration.\n\nMigration is a fact of life.  People have always moved from one country to another, and as a pr"| __truncated__ ...
 $ pred_topic: int  3 2 1 1 1 4 5 5 1

Advanced topic modeling methods

Identify the number of topics

There are different algorithms for estimating the optimal number of topics. The ldatuning package provides the function FindTopicsNumber, which calculates different metrics to estimate the most suitable number of topics for an LDA model.

tn <- FindTopicsNumber(ukimmig2010_dtm, 
                       topics = seq(5, 50, by = 5),
                       metrics = c("Griffiths2004", 
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
of ggplot2 3.3.4.
ℹ The deprecated feature was likely used in the ldatuning package.
  Please report the issue at <https://github.com/nikita-moor/ldatuning/issues>.

Here, the Griffiths2004 approach is based on log-likelihood maximization and is described in the related paper [1]. It is also the default approach of ldatuning. Deveaud2014 is another common choice.

       aes(x = topics, y = Griffiths2004)) +
  geom_point() +
  geom_line() + 

Based on this indication, we can perhaps fit a model with about 15 topics.

topicModel15 <- LDA(ukimmig2010_dtm, 
                  k = 15, 

topicmodels::terms(topicModel15, 5)
     Topic 1  Topic 2       Topic 3     Topic 4   Topic 5      Topic 6 
[1,] "much"   "people"      "limit"     "seeker"  "government" "asylum"
[2,] "uk"     "make"        "can"       "system"  "must"       "right" 
[3,] "eu"     "country"     "eu"        "part"    "child"      "live"  
[4,] "can"    "ensure"      "immigrant" "allow"   "national"   "high"  
[5,] "arrive" "citizenship" "give"      "control" "immigrant"  "allow" 
     Topic 7   Topic 8   Topic 9       Topic 10  Topic 11   Topic 12   
[1,] "end"     "uk"      "migrant"     "system"  "country"  "british"  
[2,] "control" "asylum"  "citizenship" "house"   "year"     "illegal"  
[3,] "support" "control" "detention"   "work"    "national" "immigrant"
[4,] "new"     "citizen" "non"         "uk"      "work"     "right"    
[5,] "people"  "year"    "control"     "british" "apply"    "country"  
     Topic 13  Topic 14  Topic 15 
[1,] "student" "border"  "work"   
[2,] "border"  "point"   "economy"
[3,] "need"    "benefit" "make"   
[4,] "police"  "new"     "future" 
[5,] "ensure"  "take"    "people" 
# Per-document topic proportions (gamma) of the 15-topic model.
tidy_model_gamma_15 <- tidy(topicModel15, matrix = "gamma")

# Relabel the documents with the party names of the original data set.
tidy_model_gamma_15$document <- mgsub::mgsub(
  string = tidy_model_gamma_15$document,
  pattern = unique(tidy_model_gamma_15$document),
  replacement = names(data_char_ukimmig2010),
  fixed = TRUE
)

# Topic proportions by topic, one panel per document.
ggplot(tidy_model_gamma_15, aes(x = topic, y = gamma)) +
  geom_col() +
  facet_wrap(vars(document), nrow = 3)

Coherence and exclusivity

The package topicdoc provides diagnostic measures for topic models. They can be used to compare different models. Usually, models with a different number of topics are being compared.

# Compute per-topic diagnostic measures for the 5-topic model; the result
# has one row per topic (columns such as topic_num, topic_coherence and
# topic_exclusivity are used in the plot that follows).
topicModel_diag <- topic_diagnostics(topicModel, ukimmig2010_dtm)

Two particularly useful and commonly used metrics are semantic coherence and exclusivity. A good topic model should have coherent topics (i.e., each about a single theme rather than a mixture of different themes) that are also clearly distinguishable from each other, without overlaps (exclusivity).

# Scatter plot of the topics: exclusivity on the x axis and semantic
# coherence on the y axis. The aesthetics now agree with the axis labels
# (they were previously swapped, so each axis showed the other metric).
topicModel_diag %>%
  mutate(topic = as_factor(topic_num)) %>%
  ggplot() +
  geom_point(aes(x = topic_exclusivity, y = topic_coherence, color = topic),
             size = 3) +
  ylab(label = "Semantic Coherence") +
  xlab("Exclusivity") +
  ggtitle("A topic model with 5 topics")

Held-out likelihood (perplexity)

Perplexity is a metric for the accuracy of a probability model in predicting a sample and can be used as a measure of a topic model’s ability to predict new data. The lower the perplexity, the better the model.

Topic models with different number of topics can be compared based on perplexity using cross-validation. This involves dividing data into subsets (usually 5), and using one subset as the validation set while using the remaining as the training set. This ensures that each data point has an equal opportunity of being part of the validation and training sets.

This method is useful in evaluating the overall performance of the model on unseen data and in determining optimal values for tuning the number of topics.

# Leave one core free for the rest of the system, but always start at least
# one worker: detectCores() can return 1 (or NA when the count is unknown),
# and subtracting 1 would then request zero or NA workers.
cluster <- makeCluster(max(1L, detectCores(logical = TRUE) - 1L, na.rm = TRUE))

clusterEvalQ(cluster, {
[1] "topicmodels" "stats"       "graphics"    "grDevices"   "utils"      
[6] "datasets"    "methods"     "base"       

[1] "topicmodels" "stats"       "graphics"    "grDevices"   "utils"      
[6] "datasets"    "methods"     "base"       

[1] "topicmodels" "stats"       "graphics"    "grDevices"   "utils"      
[6] "datasets"    "methods"     "base"       

[1] "topicmodels" "stats"       "graphics"    "grDevices"   "utils"      
[6] "datasets"    "methods"     "base"       

[1] "topicmodels" "stats"       "graphics"    "grDevices"   "utils"      
[6] "datasets"    "methods"     "base"       

[1] "topicmodels" "stats"       "graphics"    "grDevices"   "utils"      
[6] "datasets"    "methods"     "base"       

[1] "topicmodels" "stats"       "graphics"    "grDevices"   "utils"      
[6] "datasets"    "methods"     "base"       
# Gibbs sampler settings shared by every cross-validation fit.
burnin <- 1000
iter <- 1000
keep <- 50

full_data <- ukimmig2010_dtm
n <- nrow(full_data)
folds <- 5
# Balanced random fold assignment: fold labels are recycled to length n and
# then shuffled, so every fold receives at least one document whenever
# n >= folds. Sampling labels with replacement could leave a fold empty
# (quite likely with so few documents), giving an empty validation set.
splitfolds <- sample(rep(seq_len(folds), length.out = n))
candidate_k <- c(2, 5, 7, 10, 15, 20, 50, 100)

# Make the data and settings available on every worker of the cluster.
clusterExport(
  cluster,
  c("full_data", "burnin", "iter", "keep", "splitfolds", "folds",
    "candidate_k")
)

# We parallelise over the candidate numbers of topics.
# Each worker is allocated one value of k and runs the cross-validation folds for it serially. This assumes there are more candidate values of k than cross-validation folds, so it is more efficient to parallelise over k than over folds.
results <- foreach(j = 1:length(candidate_k), .combine = rbind) %dopar%{
   k <- candidate_k[j]
   results_1k <- matrix(0, nrow = folds, ncol = 2)
   colnames(results_1k) <- c("k", "perplexity")
   for(i in 1:folds){
      train_set <- full_data[splitfolds != i , ]
      valid_set <- full_data[splitfolds == i, ]
      fitted <- LDA(train_set, k = k, method = "Gibbs",
                    control = list(burnin = burnin, iter = iter, keep = keep) )
      results_1k[i,] <- c(k, perplexity(fitted, newdata = valid_set))
   user  system elapsed 
  0.018   0.001   3.589 

# Turn the combined (k, perplexity) matrix from the parallel loop into a
# data frame for plotting.
results_df <- as.data.frame(results)

# Perplexity on the hold-out folds against the number of topics, with a
# smoothed trend line (no confidence band).
ggplot(data = results_df, mapping = aes(x = k, y = perplexity)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(
    title = "5-fold cross-validation",
    x = "Candidate number of topics",
    y = "Perplexity when fitting the trained model to the hold-out set"
  )
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Structural Topic Models

# Read the US inaugural address data from a CSV file in the local data/
# folder (path is relative to the working directory).
usa_inaugural_df <- read.csv(file = "data/usa_inaugural_df.csv")
# Load the English udpipe model; the .udpipe file is expected in the
# working directory.
udpipe_english_model <- udpipe_load_model(file = "./english-ewt-ud-2.5-191206.udpipe")

# annotate the text
usa_inaugural_udpipe <- udpipe_annotate(udpipe_english_model, 
                                  x = usa_inaugural_df$text, 
                                  tagger = "default", 
                                  parser = "none") %>%
inaug_dfm <- usa_inaugural_df %>%
  corpus() %>%
  quanteda::tokens(remove_punct = TRUE,
         remove_symbols = TRUE,
         remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(stopwords("en")) %>%
  # lemmatization
    pattern = usa_inaugural_udpipe$token, 
    replacement = usa_inaugural_udpipe$lemma,
    valuetype = "fixed") %>%
    dfm() %>%
    dfm_trim(min_docfreq = 0.5, max_docfreq = 0.99, 
           docfreq_type = "prop") %>%
  dfm_subset(ntoken(.) > 0)
inaug_stm_dfm <- convert(inaug_dfm, to = "stm")
out <- prepDocuments(inaug_stm_dfm$documents, 

Determine the number of topics

There is not a “right” answer to the number of topics that are appropriate for a given corpus, but the function searchK uses a data-driven approach to selecting the number of topics. The function will perform several automated tests to help choose the number of topics.

# Run searchK for K = 5, 10, 15 and 20 topics and collect diagnostics for
# each K (stored in k_search$results). Topic prevalence is specified as a
# function of a spline of Year plus Party, with spectral initialization.
k_search <- searchK(out$documents, out$vocab, K = c(5, 10, 15, 20),
                    prevalence = ~s(Year) + Party, 
                    data = out$meta, init.type = "Spectral")
Beginning Spectral Initialization 
     Calculating the gram matrix...
     Finding anchor words...
     Recovering initialization...
Initialization complete.
Beginning Spectral Initialization 
     Calculating the gram matrix...
     Finding anchor words...
     Recovering initialization...
Initialization complete.
Beginning Spectral Initialization 
     Calculating the gram matrix...
     Finding anchor words...
     Recovering initialization...
Initialization complete.
   K   exclus    semcoh   heldout residual     bound    lbound em.its
1  5 8.782945 -7.944008 -4.960393 1.624187   -117078 -117073.2     48
2 10 9.285102 -11.98722 -4.967642 1.532483 -116254.2 -116239.1     73
3 15  9.29123 -11.69239 -5.034053 1.615663 -115704.1 -115676.2     50
4 20 9.328248 -12.40123  -5.00695 1.760186 -115273.8 -115231.5     61

searchK(documents = out$documents, vocab = out$vocab, K = c(5, 
    10, 15, 20), init.type = "Spectral", prevalence = ~s(Year) + 
    Party, data = out$meta)

[1] "searchK"
k_search$results %>%
  select(K, exclus, semcoh) %>%
  mutate(K = unlist(K),
         exclus = unlist(exclus),
         semcoh = unlist(semcoh)) %>%
  mutate(K = as_factor(K)) %>%
  ggplot() +
  geom_point(aes(x = exclus, y = semcoh, color = K), size = 5) +
  ggtitle("Semantic Coherence vs Exclusivity") +
  ylab("Semantic Coherence") +

If init.type = "Spectral", you can also set K = 0 to use the algorithm of Lee and Mimno (2014) to set the number of topics.

# Fit the structural topic model with 9 topics. Topic prevalence depends on
# a spline of Year plus Party; the covariates are taken from out$meta.
inaug_stm_fit <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 9,
  prevalence = ~ s(Year) + Party,
  data = out$meta,
  init.type = "Spectral"
)
Estimate effects

It is then possible to analyze the results. In this case, by checking the variation in topic prevalence over time, and by party.

# Regress the prevalence of each of the 9 topics on a spline of Year plus
# Party. `stmobj` (the fitted model) is required and was missing from the
# call; `metadata` is spelled out instead of relying on partial matching of
# `meta`.
estimate_inaug <- estimateEffect(1:9 ~ s(Year) + Party,
                                 stmobj = inaug_stm_fit,
                                 metadata = out$meta,
                                 uncertainty = "Global")
# summary(estimate_inaug, topics = 1)
for(i in 1:9){
  covariate = "Year",
  method = "continuous",
  topics = i,
  model = inaug_stm_fit,
  printlegend = FALSE,
  xlab = "Time")

par(mfrow = c(3, 3))
for(i in 1:9){
  covariate = "Party",
  method = "difference",
  cov.value1 = "Democratic",
  cov.value2 = "Republican",
  topics = i,
  model = inaug_stm_fit,
  printlegend = FALSE,
  xlab = "← Republican     Democratic →",
  main = "Democratic vs Republican (D-R)",
  xlim = c(-0.2, 0.2),
  cex = 0.5,
  labeltype = "custom",
  custom.labels = c("", "")


The result can be read as a regression model.


estimateEffect(formula = 1:9 ~ s(Year) + Party, stmobj = inaug_stm_fit, 
    metadata = out$meta, uncertainty = "Global")

Topic 1:

                             Estimate Std. Error t value Pr(>|t|)   
(Intercept)                -0.6560642  0.2999152  -2.187  0.03419 * 
s(Year)1                    0.9657122  0.3717516   2.598  0.01280 * 
s(Year)2                    0.6685440  0.3153818   2.120  0.03983 * 
s(Year)3                    0.7169954  0.3254208   2.203  0.03298 * 
s(Year)4                    1.0737266  0.3139968   3.420  0.00138 **
s(Year)5                    0.8424747  0.3212461   2.623  0.01203 * 
s(Year)6                    0.6791383  0.3041562   2.233  0.03082 * 
s(Year)7                    0.7288687  0.3114955   2.340  0.02400 * 
s(Year)8                    0.6185376  0.3128653   1.977  0.05448 . 
s(Year)9                    0.7332742  0.3126560   2.345  0.02370 * 
s(Year)10                   0.6863272  0.3074015   2.233  0.03083 * 
PartyDemocratic-Republican -0.0778787  0.0994000  -0.783  0.43763   
PartyFederalist             0.1195849  0.1499224   0.798  0.42946   
Partynone                   0.7672243  0.2803215   2.737  0.00898 **
PartyRepublican             0.0006405  0.0281950   0.023  0.98198   
PartyWhig                   0.0247918  0.0642640   0.386  0.70156   
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Topic 2:

                            Estimate Std. Error t value Pr(>|t|)  
(Intercept)                 0.159194   0.275049   0.579   0.5658  
s(Year)1                   -0.092249   0.310864  -0.297   0.7681  
s(Year)2                   -0.164481   0.273439  -0.602   0.5506  
s(Year)3                   -0.122783   0.312284  -0.393   0.6961  
s(Year)4                   -0.098119   0.281270  -0.349   0.7289  
s(Year)5                   -0.073466   0.290830  -0.253   0.8018  
s(Year)6                   -0.060987   0.282222  -0.216   0.8299  
s(Year)7                    0.057262   0.296491   0.193   0.8478  
s(Year)8                    0.594031   0.304308   1.952   0.0575 .
s(Year)9                    0.046384   0.301654   0.154   0.8785  
s(Year)10                   0.518333   0.287353   1.804   0.0783 .
PartyDemocratic-Republican -0.008223   0.100854  -0.082   0.9354  
PartyFederalist            -0.068284   0.162362  -0.421   0.6762  
Partynone                  -0.120181   0.235861  -0.510   0.6130  
PartyRepublican            -0.070150   0.036009  -1.948   0.0579 .
PartyWhig                  -0.025729   0.065478  -0.393   0.6963  
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Topic 3:

                           Estimate Std. Error t value Pr(>|t|)  
(Intercept)                 0.89739    0.50796   1.767   0.0844 .
s(Year)1                   -0.57591    0.58753  -0.980   0.3325  
s(Year)2                    0.07553    0.52506   0.144   0.8863  
s(Year)3                   -0.97702    0.56196  -1.739   0.0893 .
s(Year)4                   -0.44988    0.51076  -0.881   0.3833  
s(Year)5                   -0.78101    0.52481  -1.488   0.1440  
s(Year)6                   -0.80381    0.51485  -1.561   0.1258  
s(Year)7                   -0.88691    0.52150  -1.701   0.0962 .
s(Year)8                   -0.86330    0.52334  -1.650   0.1063  
s(Year)9                   -0.71904    0.52677  -1.365   0.1794  
s(Year)10                  -0.83511    0.52123  -1.602   0.1164  
PartyDemocratic-Republican -0.18159    0.18610  -0.976   0.3346  
PartyFederalist             0.01745    0.30671   0.057   0.9549  
Partynone                  -0.37717    0.41122  -0.917   0.3642  
PartyRepublican            -0.02725    0.04284  -0.636   0.5282  
PartyWhig                   0.08192    0.11245   0.728   0.4703  
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Topic 4:

                           Estimate Std. Error t value Pr(>|t|)  
(Intercept)                -0.41019    0.33566  -1.222   0.2283  
s(Year)1                    0.13105    0.38345   0.342   0.7342  
s(Year)2                    0.47780    0.33686   1.418   0.1633  
s(Year)3                    0.46778    0.37991   1.231   0.2249  
s(Year)4                    0.56383    0.34830   1.619   0.1128  
s(Year)5                    0.39631    0.35063   1.130   0.2646  
s(Year)6                    0.46536    0.34506   1.349   0.1845  
s(Year)7                    0.44604    0.35054   1.272   0.2101  
s(Year)8                    0.42641    0.35410   1.204   0.2351  
s(Year)9                    0.41181    0.35459   1.161   0.2519  
s(Year)10                   0.47667    0.34950   1.364   0.1797  
PartyDemocratic-Republican  0.29777    0.13150   2.264   0.0286 *
PartyFederalist             0.30649    0.20591   1.488   0.1439  
Partynone                   0.40160    0.28898   1.390   0.1718  
PartyRepublican             0.02766    0.03288   0.841   0.4048  
PartyWhig                  -0.04556    0.08050  -0.566   0.5743  
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Topic 5:

                             Estimate Std. Error t value Pr(>|t|)
(Intercept)                 0.3374745  0.4975994   0.678    0.501
s(Year)1                   -0.1323070  0.5492672  -0.241    0.811
s(Year)2                   -0.4817526  0.4983039  -0.967    0.339
s(Year)3                   -0.0744738  0.5670069  -0.131    0.896
s(Year)4                   -0.3888787  0.5040379  -0.772    0.445
s(Year)5                   -0.0006305  0.5321625  -0.001    0.999
s(Year)6                   -0.0066862  0.5116620  -0.013    0.990
s(Year)7                   -0.2903559  0.5197831  -0.559    0.579
s(Year)8                   -0.1463314  0.5268193  -0.278    0.783
s(Year)9                   -0.2777969  0.5277732  -0.526    0.601
s(Year)10                  -0.1956399  0.5140284  -0.381    0.705
PartyDemocratic-Republican -0.0118565  0.1774551  -0.067    0.947
PartyFederalist            -0.1412754  0.2999661  -0.471    0.640
Partynone                  -0.2698651  0.4269787  -0.632    0.531
PartyRepublican            -0.0726179  0.0497082  -1.461    0.151
PartyWhig                  -0.0924995  0.1172027  -0.789    0.434

Topic 6:

                            Estimate Std. Error t value Pr(>|t|)
(Intercept)                 0.186685   0.418899   0.446    0.658
s(Year)1                   -0.003462   0.471555  -0.007    0.994
s(Year)2                   -0.190691   0.421232  -0.453    0.653
s(Year)3                   -0.159271   0.474074  -0.336    0.739
s(Year)4                   -0.220197   0.426641  -0.516    0.608
s(Year)5                   -0.241876   0.440493  -0.549    0.586
s(Year)6                   -0.065019   0.429880  -0.151    0.880
s(Year)7                    0.398757   0.442540   0.901    0.373
s(Year)8                   -0.572770   0.447366  -1.280    0.207
s(Year)9                    0.407283   0.455279   0.895    0.376
s(Year)10                  -0.289771   0.432293  -0.670    0.506
PartyDemocratic-Republican -0.008977   0.151854  -0.059    0.953
PartyFederalist            -0.075178   0.254177  -0.296    0.769
Partynone                  -0.168644   0.359689  -0.469    0.642
PartyRepublican             0.068664   0.043387   1.583    0.121
PartyWhig                   0.010496   0.098449   0.107    0.916

Topic 7:

                            Estimate Std. Error t value Pr(>|t|)
(Intercept)                 0.096747   0.379711   0.255    0.800
s(Year)1                    0.018962   0.429947   0.044    0.965
s(Year)2                   -0.046506   0.378933  -0.123    0.903
s(Year)3                    0.010050   0.438249   0.023    0.982
s(Year)4                   -0.107207   0.390743  -0.274    0.785
s(Year)5                    0.107546   0.404945   0.266    0.792
s(Year)6                   -0.072010   0.393124  -0.183    0.856
s(Year)7                   -0.042701   0.398436  -0.107    0.915
s(Year)8                    0.174676   0.407118   0.429    0.670
s(Year)9                   -0.236162   0.404217  -0.584    0.562
s(Year)10                  -0.042249   0.391550  -0.108    0.915
PartyDemocratic-Republican -0.045572   0.147148  -0.310    0.758
PartyFederalist            -0.078189   0.226204  -0.346    0.731
Partynone                  -0.081952   0.325836  -0.252    0.803
PartyRepublican             0.033166   0.040568   0.818    0.418
PartyWhig                  -0.003141   0.094894  -0.033    0.974

Topic 8:

                            Estimate Std. Error t value Pr(>|t|)
(Intercept)                 0.058007   0.346890   0.167    0.868
s(Year)1                   -0.054100   0.398536  -0.136    0.893
s(Year)2                   -0.110797   0.345581  -0.321    0.750
s(Year)3                    0.074318   0.395220   0.188    0.852
s(Year)4                   -0.109914   0.354446  -0.310    0.758
s(Year)5                    0.096857   0.367451   0.264    0.793
s(Year)6                    0.199124   0.356652   0.558    0.580
s(Year)7                   -0.093319   0.361749  -0.258    0.798
s(Year)8                    0.046202   0.373882   0.124    0.902
s(Year)9                   -0.047498   0.369585  -0.129    0.898
s(Year)10                  -0.019955   0.357884  -0.056    0.956
PartyDemocratic-Republican  0.032947   0.127849   0.258    0.798
PartyFederalist            -0.007213   0.202854  -0.036    0.972
Partynone                   0.025119   0.294464   0.085    0.932
PartyRepublican             0.034428   0.035829   0.961    0.342
PartyWhig                  -0.049514   0.082733  -0.598    0.553

Topic 9:

                            Estimate Std. Error t value Pr(>|t|)
(Intercept)                 0.333138   0.355843   0.936    0.354
s(Year)1                   -0.262287   0.394364  -0.665    0.510
s(Year)2                   -0.227670   0.368189  -0.618    0.540
s(Year)3                    0.058907   0.412951   0.143    0.887
s(Year)4                   -0.266731   0.357028  -0.747    0.459
s(Year)5                   -0.346621   0.370819  -0.935    0.355
s(Year)6                   -0.334761   0.361622  -0.926    0.360
s(Year)7                   -0.319509   0.367711  -0.869    0.390
s(Year)8                   -0.282890   0.371225  -0.762    0.450
s(Year)9                   -0.318967   0.372158  -0.857    0.396
s(Year)10                  -0.301906   0.364843  -0.827    0.413
PartyDemocratic-Republican  0.003670   0.141389   0.026    0.979
PartyFederalist            -0.070476   0.220473  -0.320    0.751
Partynone                  -0.176665   0.301264  -0.586    0.561
PartyRepublican             0.005748   0.031769   0.181    0.857
PartyWhig                   0.098983   0.110352   0.897    0.375


The eighth topic looks slightly more prevalent in Democratic than in Republican inaugural addresses.

labelTopics(inaug_stm_fit, 8)
Topic 8 Top Words:
     Highest Prob: must, can, change, make, government, one, great 
     FREX: change, must, old, action, order, among, continue 
     Lift: change, old, order, action, hold, continue, maintain 
     Score: change, old, must, america, action, policy, order 
plot(inaug_stm_fit, type = "summary", xlim = c(0, 0.5))

# Retrieve the two texts most strongly associated with topic 8.
thoughts8 <- findThoughts(inaug_stm_fit,
                          texts = usa_inaugural_df$text,
                          n = 2,
                          topics = 8)$docs[[1]]

# Plot the quotes; the title now names the topic actually shown
# (it previously read "Topic 6").
plotQuote(thoughts8, width = 30, main = "Topic 8")

inaug_stm_fit_corr <- topicCorr(inaug_stm_fit)

Seeded Topic Models

# Read the Guardian data set directly from the URL below
# (needs an internet connection).
guardian <- read.csv("http://www.luigicurini.com/uploads/6/7/9/8/67985527/guardian.csv")

Create a small dictionary of keywords (seed words) to define the desired topics.

# Seed-word dictionary with one key per desired topic (immigration frame).
imm_frames <- dictionary(
  list(
    securitarian = c(
      "control", "border", "police",
      "detention", "illegal", "legal"
    ),
    humanitarian = c(
      "asylum", "child", "seeker",
      "refugee", "human", "right"
    )
  )
)

Dictionary object with 2 key entries.
- [securitarian]:
  - control, border, police, detention, illegal, legal
- [humanitarian]:
  - asylum, child, seeker, refugee, human, right

Fit the model.

# Fit a seeded LDA on the immigration dfm, using the dictionary keys as
# seeded topics; residual = TRUE adds an unseeded topic (printed as "other").
slda <- textmodel_seededlda(ukimmig2010_dfm, imm_frames, residual = TRUE)
# Show the 20 top terms of each topic.
print(terms(slda, 20))
      securitarian humanitarian  other        
 [1,] "border"     "asylum"      "system"     
 [2,] "control"    "british"     "people"     
 [3,] "country"    "right"       "new"        
 [4,] "illegal"    "immigrant"   "can"        
 [5,] "work"       "much"        "ensure"     
 [6,] "police"     "seeker"      "citizenship"
 [7,] "eu"         "year"        "need"       
 [8,] "detention"  "uk"          "end"        
 [9,] "government" "child"       "make"       
[10,] "student"    "people"      "migrant"    
[11,] "uk"         "refugee"     "point"      
[12,] "national"   "act"         "agency"     
[13,] "must"       "take"        "house"      
[14,] "live"       "allow"       "high"       
[15,] "non"        "national"    "support"    
[16,] "citizen"    "benefit"     "british"    
[17,] "limit"      "arrive"      "give"       
[18,] "ensure"     "citizenship" "thousand"   
[19,] "future"     "high"        "economy"    
[20,] "economic"   "work"        "priority"   

Check which documents are assigned to which topic.


securitarian humanitarian        other 
           3            4            2 

Laboratory with real-world data


  1. Griffiths, T. L., & Steyvers, M. (2004). Finding scientific topics. Proceedings of the National Academy of Sciences, 101(suppl_1), 5228-5235. ↩︎