# Here is the raw data from Data Carpentry's post-workshop surveys. The open-ended questions are:

# Please list the major strengths of this workshop.
# Please list the ways the workshop could be improved
# Were there any accessibility issues that affected your ability to participate in this workshop?
# If you answered yes to the question above, please describe what the issues were.
# Please provide an example of how an instructor or helper affected your learning experience.
# 

# I am working on a mock-up report to send to instructors after their workshop and to trainers after their instructor training. It will include a synopsis of the post-workshop surveys and, where needed, resources to help them improve. We want to pull out the important parts of these open-ended responses so that we can give instructors and trainers useful feedback, and to surface any accessibility issues so they can be addressed.

# prepare the text -----------------------------------------

# read in the data
dat <- readr::read_csv("180308_postARCHIVED.csv")
## Warning: Missing column names filled in: 'X14' [14], 'X18' [18],
## 'X19' [19], 'X20' [20], 'X21' [21], 'X22' [22], 'X23' [23], 'X24' [24],
## 'X25' [25], 'X26' [26], 'X32' [32], 'X33' [33], 'X34' [34]
## Warning: Duplicated column names deduplicated: 'Which workshop did you
## attend?' => 'Which workshop did you attend?_1' [13], 'Please rate your
## level of agreement with the following statements:' => 'Please rate your
## level of agreement with the following statements:_1' [31]
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Respondent ID` = col_double(),
##   `Collector ID` = col_double(),
##   `Email Address` = col_logical(),
##   `First Name` = col_logical(),
##   `Last Name` = col_logical(),
##   `Custom Data 1` = col_logical()
## )
## See spec(...) for full column specifications.
# clean the col names a bit
names(dat) <- gsub("[^[:alnum:][:blank:]?&/\\-]", "", names(dat))
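# for reference, that character class keeps letters, digits, blanks and
# ? & / - and drops everything else (colons, underscores, ...); a quick
# sanity check on one of the deduplicated names from the warning above:
gsub("[^[:alnum:][:blank:]?&/\\-]", "",
     "Please rate your level of agreement with the following statements:_1")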

# get the open-ended questions
library(tidyverse)
## -- Attaching packages ------------------------------ tidyverse 1.2.1 --
## v ggplot2 2.2.1.9000     v purrr   0.2.4     
## v tibble  1.4.2          v dplyr   0.7.4     
## v tidyr   0.8.0.9000     v stringr 1.3.0     
## v readr   1.2.0          v forcats 0.3.0
## -- Conflicts --------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::vars()   masks ggplot2::vars()
please <- 
dat %>% 
  select(`Respondent ID`,
         starts_with("Please list"), 
         starts_with("Were there"),
         starts_with("Please provide"))

# Major strengths -------------------------------------------

# get the first one by itself
major_strengths <- 
  please %>% 
  select(`Respondent ID`, 
         `Please list the major strengths of this workshop?`) %>% 
  rename(text = `Please list the major strengths of this workshop?`)

# analyse the text in responses
library(tidytext)
# I am working through some of the examples here:
# https://www.tidytextmining.com

major_strengths_words <- 
major_strengths %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words)
## Joining, by = "word"
# most frequent words
library(wordcloud)
## Loading required package: RColorBrewer
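# wordcloud() places words with some randomness, so fix a seed first
# (1234 is an arbitrary choice) if the figure needs to be reproducible
set.seed(1234)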
major_strengths_words %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

# sentiment analysis

# how many +ve words across all responses? How many -ve? (the second
# tally sums the per-respondent word counts)
major_strengths_words %>% 
  inner_join(get_sentiments("bing")) %>% 
  group_by(`Respondent ID`, sentiment) %>% 
  tally() %>% 
  ungroup() %>% 
  group_by(sentiment) %>% 
  tally()
## Joining, by = "word"
## Using `n` as weighting variable
## # A tibble: 2 x 2
##   sentiment    nn
##   <chr>     <int>
## 1 negative     46
## 2 positive    354
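# note those figures are totals of sentiment-coded words, not of
# responses; the same totals drop out in one step:
major_strengths_words %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(sentiment)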
# plot words important to sentiment
major_strengths_words %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup() %>% 
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Joining, by = "word"
## Selecting by n

# bigrams ---------------------------------------------

# score each response's net sentiment, so we can pull top bigrams from
# the most +ve and most -ve responses
major_strengths_sentiments <- 
major_strengths_words %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(`Respondent ID`, sentiment) %>%
  spread(sentiment, n, fill = 0) %>% 
  mutate(sentiment = positive - negative) %>% 
  arrange(desc(sentiment))
## Joining, by = "word"
# top bigrams

## bigrams for the most positive responses
major_strengths_bigrams_positive <- 
major_strengths %>% 
unnest_tokens(bigram, 
              text, 
              token = "ngrams", 
              n = 2) %>% 
  left_join(major_strengths_sentiments) %>% 
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>% 
  filter(sentiment > 0) %>% 
  arrange(desc(sentiment)) %>% 
  count(word1, word2, sort = TRUE)
## Joining, by = "Respondent ID"
major_strengths_bigrams_positive
## # A tibble: 493 x 3
##    word1         word2            n
##    <chr>         <chr>        <int>
##  1 knowledgeable instructors      6
##  2 data          analysis         5
##  3 cover         page             4
##  4 time          management       4
##  5 enthusiastic  instructors      3
##  6 friendly      atmosphere       3
##  7 nice          atmosphere       3
##  8 prior         knowledge        3
##  9 step          instructions     3
## 10 answer        questions        2
## # ... with 483 more rows
## bigrams for the most negative responses
major_strengths_bigrams_negative <- 
  major_strengths %>% 
  unnest_tokens(bigram, 
                text, 
                token = "ngrams", 
                n = 2) %>% 
  left_join(major_strengths_sentiments) %>% 
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>% 
  filter(sentiment < 0) %>% 
  arrange(desc(sentiment)) %>% 
  count(word1, word2, sort = TRUE)
## Joining, by = "Respondent ID"
major_strengths_bigrams_negative
## # A tibble: 27 x 3
##    word1    word2          n
##    <chr>    <chr>      <int>
##  1 added    advanced       1
##  2 advance  skills         1
##  3 advanced actives        1
##  4 beginner level          1
##  5 boring   people         1
##  6 cheat    sheets         1
##  7 complex  scripts        1
##  8 covered  background     1
##  9 entire   pipeline       1
## 10 graphic  plot           1
## # ... with 17 more rows
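# these bigram counts can also be drawn as a network, as in chapter 4 of
# https://www.tidytextmining.com -- a sketch for the positive set,
# assuming the igraph and ggraph packages are installed:
library(igraph)
library(ggraph)
set.seed(2018)
major_strengths_bigrams_positive %>% 
  filter(n > 1) %>%   # keep bigrams seen more than once
  graph_from_data_frame() %>% 
  ggraph(layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)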
# Improvement ------------------------------------------------

improved <- 
  please %>% 
  select(`Respondent ID`, 
         `Please list the ways the workshop could be improved?`) %>% 
  rename(text = `Please list the ways the workshop could be improved?`)

improved_words <- 
  improved %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words)
## Joining, by = "word"
# most frequent words

improved_words %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

# top bigrams
improved %>% 
  unnest_tokens(bigram, 
                text, 
                token = "ngrams", 
                n = 2) %>% 
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>% 
  count(word1, word2, sort = TRUE)
## # A tibble: 691 x 3
##    word1  word2          n
##    <chr>  <chr>      <int>
##  1 2      days           7
##  2 data   analysis       6
##  3 pre    workshop       4
##  4 3      day            3
##  5 3      days           3
##  6 data   management     3
##  7 day    workshop       3
##  8 half   day            3
##  9 social science        3
## 10 time   spent          3
## # ... with 681 more rows
# topic model
improved_words_dtm <- 
improved_words %>%
  group_by(`Respondent ID`) %>% 
  count(word) %>% 
  filter(str_detect(word,"[^0-9]")) %>% 
  cast_dtm(`Respondent ID`, word, n)

library(topicmodels)
library(Rmpfr)
## Loading required package: gmp
## 
## Attaching package: 'gmp'
## The following objects are masked from 'package:base':
## 
##     %*%, apply, crossprod, matrix, tcrossprod
## C code of R package 'Rmpfr': GMP using 64 bits per limb
## 
## Attaching package: 'Rmpfr'
## The following objects are masked from 'package:stats':
## 
##     dbinom, dnorm, dpois, pnorm
## The following objects are masked from 'package:base':
## 
##     cbind, pmax, pmin, rbind
library(ldatuning)
# how many topics are suitable? This takes a few moments...
how_many_topics <- FindTopicsNumber(improved_words_dtm)
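# FindTopicsNumber() returns one row per candidate k with its metric
# scores, so [1, 1] below just takes the k in the first row, not
# necessarily the best one. A more deliberate choice (Griffiths2004 is
# ldatuning's default metric, where higher is better) might be:
FindTopicsNumber_plot(how_many_topics)   # eyeball the metric across k
best_k <- how_many_topics$topics[which.max(how_many_topics$Griffiths2004)]
# best_k could then replace how_many_topics[1, 1] here and in the later
# sections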

improved_lda <- LDA(improved_words_dtm, 
                    k = how_many_topics[1,1], 
                    control = list(seed = 1234))

# plot main words in each topic
improved_lda %>% 
  tidy(matrix = "beta") %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

# accessibility ------------------------------------

# only look at the improvement suggestions from respondents who reported
# accessibility issues
accessibility <- 
  please %>% 
  select(`Respondent ID`, 
         `Were there any accessibility issues that affected your ability to participate in this workshop?`,
         `Please list the ways the workshop could be improved?`) %>% 
  filter(`Were there any accessibility issues that affected your ability to participate in this workshop?` %in% c("yes", "Yes")) %>% 
  rename(text = `Please list the ways the workshop could be improved?`) %>% 
  select(-`Were there any accessibility issues that affected your ability to participate in this workshop?`)
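# the %in% c("yes", "Yes") test misses other capitalisations and stray
# whitespace; if the raw answers are messier, a more forgiving filter
# (hypothetical variant name) would be:
accessibility_yes_ids <- please %>% 
  filter(str_to_lower(trimws(
    `Were there any accessibility issues that affected your ability to participate in this workshop?`)) == "yes") %>% 
  pull(`Respondent ID`)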

accessibility_words <- 
  accessibility %>% 
  unnest_tokens(word, text) %>% 
  filter(!is.na(word)) %>% 
  anti_join(stop_words)
## Joining, by = "word"
# most frequent words

accessibility_words %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

# top bigrams
accessibility %>% 
  unnest_tokens(bigram, 
                text, 
                token = "ngrams", 
                n = 2) %>% 
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>% 
  count(word1, word2, sort = TRUE)
## # A tibble: 83 x 3
##    word1    word2        n
##    <chr>    <chr>    <int>
##  1 2        days         2
##  2 1        day          1
##  3 1        hour         1
##  4 10       minutes      1
##  5 3        day          1
##  6 3        days         1
##  7 5        10           1
##  8 9        30           1
##  9 analysis examples     1
## 10 beginner level        1
## # ... with 73 more rows
# topic model
accessibility_words_dtm <- 
  accessibility_words %>%
  group_by(`Respondent ID`) %>% 
  count(word) %>% 
  filter(str_detect(word,"[^0-9]")) %>% 
  cast_dtm(`Respondent ID`, word, n)
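# only a handful of respondents answered yes, so this matrix has very few
# documents and LDA on it can be unstable; a minimal guard (the threshold
# of 5 documents is an arbitrary assumption):
if (nrow(accessibility_words_dtm) < 5) {
  warning("very few accessibility responses; topic model will be fragile")
}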

library(topicmodels)
library(Rmpfr)
library(ldatuning)
# how many topics are suitable? This takes a few moments...
how_many_topics <- FindTopicsNumber(accessibility_words_dtm)

accessibility_lda <- LDA(accessibility_words_dtm, 
                         k = how_many_topics[1,1], 
                         control = list(seed = 1234))

# plot main words in each topic
accessibility_lda %>% 
  tidy(matrix = "beta") %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

# experience ------------------------------------

experience <- 
  please %>% 
  select(`Respondent ID`, 
         `Please provide an example of how an instructor or helper affected your learning experience`) %>% 
  rename(text = `Please provide an example of how an instructor or helper affected your learning experience`)

experience_words <- 
  experience %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words)
## Joining, by = "word"
# most frequent words

experience_words %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

# top bigrams
experience %>% 
  unnest_tokens(bigram, 
                text, 
                token = "ngrams", 
                n = 2) %>% 
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>% 
  count(word1, word2, sort = TRUE)
## # A tibble: 397 x 3
##    word1       word2          n
##    <chr>       <chr>      <int>
##  1 answer      questions      7
##  2 answering   questions      3
##  3 command     line           3
##  4 instructors helpers        3
##  5 learning    experience     3
##  6 red         post           3
##  7 answered    quickly        2
##  8 computer    geniuses       2
##  9 data        analysis       2
## 10 data        set            2
## # ... with 387 more rows
# topic model
experience_words_dtm <- 
  experience_words %>%
  group_by(`Respondent ID`) %>% 
  count(word) %>% 
  filter(str_detect(word,"[^0-9]")) %>% 
  cast_dtm(`Respondent ID`, word, n)

library(topicmodels)
library(Rmpfr)
library(ldatuning)
# how many topics are suitable? This takes a few moments...
how_many_topics <- FindTopicsNumber(experience_words_dtm)

experience_lda <- LDA(experience_words_dtm, 
                      k = how_many_topics[1,1], 
                      control = list(seed = 1234))

# plot main words in each topic
experience_lda %>% 
  tidy(matrix = "beta") %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
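
# wrap-up ---------------------------------------------------

# the wordcloud / bigrams / topic-model pipeline above is repeated
# verbatim for each question; a sketch of a reusable helper (hypothetical,
# same steps and same packages as above) to cut the duplication:
summarise_question <- function(responses) {
  # responses: a data frame with `Respondent ID` and text columns
  words <- responses %>% 
    unnest_tokens(word, text) %>% 
    filter(!is.na(word)) %>% 
    anti_join(stop_words, by = "word")
  # most frequent words
  words %>% count(word) %>% with(wordcloud(word, n, max.words = 100))
  # top bigrams
  bigrams <- responses %>% 
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>% 
    separate(bigram, c("word1", "word2"), sep = " ") %>% 
    filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>% 
    count(word1, word2, sort = TRUE)
  # topic model
  dtm <- words %>% 
    group_by(`Respondent ID`) %>% 
    count(word) %>% 
    filter(str_detect(word, "[^0-9]")) %>% 
    cast_dtm(`Respondent ID`, word, n)
  lda <- LDA(dtm, k = FindTopicsNumber(dtm)[1, 1], 
             control = list(seed = 1234))
  list(bigrams = bigrams, lda = lda)
}
# e.g. experience_summary <- summarise_question(experience)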