# Here is the raw data from Data Carpentry's post-workshop surveys. The open-ended questions are:
# Please list the major strengths of this workshop.
# Please list the ways the workshop could be improved
# Were there any accessibility issues that affected your ability to participate in this workshop? If you answered yes to the question above, please describe what the issues were.
# Please provide an example of how an instructor or helper affected your learning experience.
#
# I am working on a mock-up report that will be sent to instructors after their workshop and to trainers after their instructor training. It will include a synopsis of the post-workshop survey responses and resources to help them improve (if necessary). We want to pull out the important parts of these open-ended responses so that we can give instructors and trainers useful feedback, and so that we can address any accessibility issues that come up.
# Prepare the text -----------------------------------------
# read in the data
dat <- readr::read_csv("180308_postARCHIVED.csv")
## Warning: Missing column names filled in: 'X14' [14], 'X18' [18],
## 'X19' [19], 'X20' [20], 'X21' [21], 'X22' [22], 'X23' [23], 'X24' [24],
## 'X25' [25], 'X26' [26], 'X32' [32], 'X33' [33], 'X34' [34]
## Warning: Duplicated column names deduplicated: 'Which workshop did you
## attend?' => 'Which workshop did you attend?_1' [13], 'Please rate your
## level of agreement with the following statements:' => 'Please rate your
## level of agreement with the following statements:_1' [31]
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Respondent ID` = col_double(),
## `Collector ID` = col_double(),
## `Email Address` = col_logical(),
## `First Name` = col_logical(),
## `Last Name` = col_logical(),
## `Custom Data 1` = col_logical()
## )
## See spec(...) for full column specifications.
# clean the col names a bit
names(dat) <- gsub("[^[:alnum:][:blank:]?&/\\-]", "", names(dat))
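# for example, this drops the trailing colon from a name like
# "Please rate your level of agreement with the following statements:"
# while keeping letters, digits, blanks, and the characters ? & / -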
# get the open-ended questions
library(tidyverse)
## -- Attaching packages ------------------------------ tidyverse 1.2.1 --
## v ggplot2 2.2.1.9000 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.8.0.9000 v stringr 1.3.0
## v readr 1.2.0 v forcats 0.3.0
## -- Conflicts --------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::vars() masks ggplot2::vars()
please <-
dat %>%
select(`Respondent ID`,
starts_with("Please list"),
starts_with("Were there"),
starts_with("Please provide"))
# Major strengths -------------------------------------------
# pull out the first question on its own
major_strengths <-
please %>%
select(`Respondent ID`,
`Please list the major strengths of this workshop?`) %>%
rename(text = `Please list the major strengths of this workshop?`)
# analyse the text in the responses
library(tidytext)
# I am working through some of the examples here:
# https://www.tidytextmining.com
major_strengths_words <-
major_strengths %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
# most frequent words
library(wordcloud)
## Loading required package: RColorBrewer
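# note: wordcloud() places words at random, so the layout changes between
# runs; fixing a seed first (the value itself is arbitrary) makes the
# clouds below reproducible
set.seed(2018)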
major_strengths_words %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
# sentiment analysis
# how many positive and how many negative sentiment words in total?
major_strengths_words %>%
inner_join(get_sentiments("bing")) %>%
group_by(`Respondent ID`, sentiment) %>%
tally() %>%
ungroup() %>%
group_by(sentiment) %>%
tally()
## Joining, by = "word"
## Using `n` as weighting variable
## # A tibble: 2 x 2
## sentiment nn
## <chr> <int>
## 1 negative 46
## 2 positive 354
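# the same totals can be reached more directly by counting the
# sentiment-matched words in one step (equivalent to the grouped tallies above)
major_strengths_words %>%
  inner_join(get_sentiments("bing")) %>%
  count(sentiment)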
# plot words important to sentiment
major_strengths_words %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup() %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Joining, by = "word"
## Selecting by n
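# note: top_n() keeps ties, so a facet can show more than ten bars when
# several words share the tenth-highest count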
# Bigrams ---------------------------------------------
# get top bigrams from the net-positive and the net-negative responses
major_strengths_sentiments <-
major_strengths_words %>%
inner_join(get_sentiments("bing")) %>%
  count(`Respondent ID`, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative) %>%
arrange(desc(sentiment))
## Joining, by = "word"
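# a quick sanity check: bag-of-words sentiment can misread answers to a
# "strengths" question (e.g. negations), so read the net-negative
# responses in full before drawing conclusions
major_strengths_sentiments %>%
  filter(sentiment < 0) %>%
  left_join(major_strengths, by = "Respondent ID") %>%
  select(`Respondent ID`, sentiment, text)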
# top bigrams
# bigrams from the net-positive responses
major_strengths_bigrams_positive <-
major_strengths %>%
unnest_tokens(bigram,
text,
token = "ngrams",
n = 2) %>%
left_join(major_strengths_sentiments) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(sentiment > 0) %>%
arrange(desc(sentiment)) %>%
count(word1, word2, sort = TRUE)
## Joining, by = "Respondent ID"
major_strengths_bigrams_positive
## # A tibble: 493 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 knowledgeable instructors 6
## 2 data analysis 5
## 3 cover page 4
## 4 time management 4
## 5 enthusiastic instructors 3
## 6 friendly atmosphere 3
## 7 nice atmosphere 3
## 8 prior knowledge 3
## 9 step instructions 3
## 10 answer questions 2
## # ... with 483 more rows
# bigrams from the net-negative responses
major_strengths_bigrams_negative <-
major_strengths %>%
unnest_tokens(bigram,
text,
token = "ngrams",
n = 2) %>%
left_join(major_strengths_sentiments) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(sentiment < 0) %>%
arrange(desc(sentiment)) %>%
count(word1, word2, sort = TRUE)
## Joining, by = "Respondent ID"
major_strengths_bigrams_negative
## # A tibble: 27 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 added advanced 1
## 2 advance skills 1
## 3 advanced actives 1
## 4 beginner level 1
## 5 boring people 1
## 6 cheat sheets 1
## 7 complex scripts 1
## 8 covered background 1
## 9 entire pipeline 1
## 10 graphic plot 1
## # ... with 17 more rows
# Improvement ------------------------------------------------
improved <-
please %>%
select(`Respondent ID`,
`Please list the ways the workshop could be improved?`) %>%
rename(text = `Please list the ways the workshop could be improved?`)
improved_words <-
improved %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
# most frequent words
improved_words %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
# top bigrams
improved %>%
unnest_tokens(bigram,
text,
token = "ngrams",
n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
count(word1, word2, sort = TRUE)
## # A tibble: 691 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 2 days 7
## 2 data analysis 6
## 3 pre workshop 4
## 4 3 day 3
## 5 3 days 3
## 6 data management 3
## 7 day workshop 3
## 8 half day 3
## 9 social science 3
## 10 time spent 3
## # ... with 681 more rows
# topic model
improved_words_dtm <-
improved_words %>%
group_by(`Respondent ID`) %>%
count(word) %>%
  filter(str_detect(word, "[^0-9]")) %>% # drop tokens that are digits only
cast_dtm(`Respondent ID`, word, n)
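# quick sanity check: printing the document-term matrix shows how many
# documents and terms it has, and how sparse it is
improved_words_dtm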
library(topicmodels)
library(Rmpfr)
## Loading required package: gmp
##
## Attaching package: 'gmp'
## The following objects are masked from 'package:base':
##
## %*%, apply, crossprod, matrix, tcrossprod
## C code of R package 'Rmpfr': GMP using 64 bits per limb
##
## Attaching package: 'Rmpfr'
## The following objects are masked from 'package:stats':
##
## dbinom, dnorm, dpois, pnorm
## The following objects are masked from 'package:base':
##
## cbind, pmax, pmin, rbind
library(ldatuning)
# how many topics are suitable? This takes a few moments...
how_many_topics <- FindTopicsNumber(improved_words_dtm)
# choose the candidate k that maximises the Griffiths2004 metric
# (the default metric computed by FindTopicsNumber; higher is better)
best_k <- how_many_topics$topics[which.max(how_many_topics$Griffiths2004)]
improved_lda <- LDA(improved_words_dtm,
                    k = best_k,
                    control = list(seed = 1234))
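# optionally, inspect how the metric behaves across the candidate values
# of k with ldatuning's built-in diagnostic plot
FindTopicsNumber_plot(how_many_topics)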
# plot main words in each topic
improved_lda %>%
tidy(matrix = "beta") %>%
group_by(topic) %>%
top_n(5, beta) %>%
ungroup() %>%
arrange(topic, -beta) %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
# Accessibility ------------------------------------
# keep only the respondents who reported accessibility issues, and analyse
# their suggestions for improving the workshop
accessibility <-
please %>%
select(`Respondent ID`,
`Were there any accessibility issues that affected your ability to participate in this workshop?`,
`Please list the ways the workshop could be improved?`) %>%
filter(`Were there any accessibility issues that affected your ability to participate in this workshop?` %in% c("yes", "Yes")) %>%
rename(text = `Please list the ways the workshop could be improved?`) %>%
select(-`Were there any accessibility issues that affected your ability to participate in this workshop?`)
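# quick check: how many respondents reported accessibility issues?
nrow(accessibility)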
accessibility_words <-
accessibility %>%
unnest_tokens(word, text) %>%
filter(!is.na(word)) %>%
anti_join(stop_words)
## Joining, by = "word"
# most frequent words
accessibility_words %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
# top bigrams
accessibility %>%
unnest_tokens(bigram,
text,
token = "ngrams",
n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
count(word1, word2, sort = TRUE)
## # A tibble: 83 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 2 days 2
## 2 1 day 1
## 3 1 hour 1
## 4 10 minutes 1
## 5 3 day 1
## 6 3 days 1
## 7 5 10 1
## 8 9 30 1
## 9 analysis examples 1
## 10 beginner level 1
## # ... with 73 more rows
# topic model
accessibility_words_dtm <-
accessibility_words %>%
group_by(`Respondent ID`) %>%
count(word) %>%
filter(str_detect(word,"[^0-9]")) %>%
cast_dtm(`Respondent ID`, word, n)
library(topicmodels)
library(Rmpfr)
library(ldatuning)
# how many topics are suitable? This takes a few moments...
how_many_topics <- FindTopicsNumber(accessibility_words_dtm)
# as before, choose the k that maximises the Griffiths2004 metric
best_k <- how_many_topics$topics[which.max(how_many_topics$Griffiths2004)]
accessibility_lda <- LDA(accessibility_words_dtm,
                         k = best_k,
                         control = list(seed = 1234))
# plot main words in each topic
accessibility_lda %>%
tidy(matrix = "beta") %>%
group_by(topic) %>%
top_n(5, beta) %>%
ungroup() %>%
arrange(topic, -beta) %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
# Experience ------------------------------------
experience <-
please %>%
select(`Respondent ID`,
`Please provide an example of how an instructor or helper affected your learning experience`) %>%
rename(text = `Please provide an example of how an instructor or helper affected your learning experience`)
experience_words <-
experience %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
# most frequent words
experience_words %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
# top bigrams
experience %>%
unnest_tokens(bigram,
text,
token = "ngrams",
n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
count(word1, word2, sort = TRUE)
## # A tibble: 397 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 answer questions 7
## 2 answering questions 3
## 3 command line 3
## 4 instructors helpers 3
## 5 learning experience 3
## 6 red post 3
## 7 answered quickly 2
## 8 computer geniuses 2
## 9 data analysis 2
## 10 data set 2
## # ... with 387 more rows
# topic model
experience_words_dtm <-
experience_words %>%
group_by(`Respondent ID`) %>%
count(word) %>%
filter(str_detect(word,"[^0-9]")) %>%
cast_dtm(`Respondent ID`, word, n)
library(topicmodels)
library(Rmpfr)
library(ldatuning)
# how many topics are suitable? This takes a few moments...
how_many_topics <- FindTopicsNumber(experience_words_dtm)
# as before, choose the k that maximises the Griffiths2004 metric
best_k <- how_many_topics$topics[which.max(how_many_topics$Griffiths2004)]
experience_lda <- LDA(experience_words_dtm,
                      k = best_k,
                      control = list(seed = 1234))
# plot main words in each topic
experience_lda %>%
tidy(matrix = "beta") %>%
group_by(topic) %>%
top_n(5, beta) %>%
ungroup() %>%
arrange(topic, -beta) %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()