Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ###################################################################################################
- # Women of 2020 - Word clouds of The BBC's 100 Women of 2020.
- # Created by: reddit.com/user/brianhaas19
- # Notes: This code is a bit hacky, particularly the section for cleaning the tokens. Using functions to clean up
- # the tokens wasn't producing a nice result so I ended up going with manual cleaning. For a larger data set
- # this would be unfeasible and an automatic process would be required.
- # Also creating the four plots manually is inefficient. If I spent more time on this I would try to assemble them
- # with a loop or purrr::map() function.
- # Link to data
- # https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-12-08
- # Setup
- library(tidyverse)
- library(scales)
- library(patchwork)
- library(tidytext)
- library(SnowballC)
- library(ggwordcloud)
- library(cowplot)
- library(magick)
- theme_set(theme_minimal())
- # Colors (mostly taken from the BBC article)
- light_grey <- "grey70"
- dark_grey <- "#404040"
- colors = c("Knowledge" = "#5AC2DE",
- "Leadership" = "#EE741C",
- "Creativity" = "#D04592",
- "Identity" = "#34AA4D")
- colors_df <- data.frame(
- category = names(colors),
- color = colors
- )
- # Load data
- ### Read in raw data:
- women <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-12-08/women.csv')
- # Clean data
- ### Get tokens from raw data and remove stop words:
- word_count_raw <- unnest_tokens(women, word, description) %>% # word is the output column, description is the input column
- anti_join(get_stopwords(), by = "word") %>%
- filter(category != "All") %>%
- group_by(category) %>%
- count(word)
- ### Clean up tokens:
- word_count_raw$word[word_count_raw$word %>% str_detect("^\\d*$")] <- NA_character_
- word_count_raw$word[word_count_raw$word == "5,000"] <- NA_character_
- word_count_raw$word[word_count_raw$word == "activists"] <- "activist"
- word_count_raw$word[word_count_raw$word == "accessible"] <- "access"
- word_count_raw$word[word_count_raw$word == "accusations"] <- "accused"
- word_count_raw$word[word_count_raw$word == "actresses"] <- "actress"
- word_count_raw$word[word_count_raw$word == "advance"] <- "advancing"
- word_count_raw$word[word_count_raw$word == "advanced"] <- "advancing"
- word_count_raw$word[word_count_raw$word == "advocate"] <- "advocacy"
- word_count_raw$word[word_count_raw$word == "advocating"] <- "advocacy"
- word_count_raw$word[word_count_raw$word == "afghan"] <- "afghanistan"
- word_count_raw$word[word_count_raw$word == "afghanistan’s"] <- "afghanistan"
- word_count_raw$word[word_count_raw$word == "african"] <- "africa"
- word_count_raw$word[word_count_raw$word == "afrika"] <- "africa"
- word_count_raw$word[word_count_raw$word == "afrika"] <- "africa"
- word_count_raw$word[word_count_raw$word == "ageing"] <- "age"
- word_count_raw$word[word_count_raw$word == "aim"] <- "aims"
- word_count_raw$word[word_count_raw$word == "also"] <- NA_character_
- word_count_raw$word[word_count_raw$word == "american"] <- "america"
- word_count_raw$word[word_count_raw$word == "americas"] <- "america"
- word_count_raw$word[word_count_raw$word == "archaeological"] <- "archaeology"
- word_count_raw$word[word_count_raw$word == "argentine"] <- "argentinian"
- word_count_raw$word[word_count_raw$word == "arts"] <- "art"
- word_count_raw$word[word_count_raw$word == "artist"] <- "art"
- word_count_raw$word[word_count_raw$word == "artists"] <- "art"
- word_count_raw$word[word_count_raw$word == "asian"] <- "asia"
- word_count_raw$word[word_count_raw$word == "became"] <- "becoming"
- word_count_raw$word[word_count_raw$word == "biological"] <- "biology"
- word_count_raw$word[word_count_raw$word == "book"] <- "books"
- word_count_raw$word[word_count_raw$word == "breweries"] <- "brewing"
- word_count_raw$word[word_count_raw$word == "built"] <- "building"
- word_count_raw$word[word_count_raw$word == "campaigns"] <- "campaign"
- word_count_raw$word[word_count_raw$word == "centre"] <- "center"
- word_count_raw$word[word_count_raw$word == "centres"] <- "center"
- word_count_raw$word[word_count_raw$word == "changing"] <- "change"
- word_count_raw$word[word_count_raw$word == "child"] <- "children"
- word_count_raw$word[word_count_raw$word == "children's"] <- "children"
- word_count_raw$word[word_count_raw$word == "children’s"] <- "children"
- word_count_raw$word[word_count_raw$word == "chilean"] <- "chile"
- word_count_raw$word[word_count_raw$word == "club's"] <- "club"
- word_count_raw$word[word_count_raw$word == "collaborates"] <- "collaborate"
- word_count_raw$word[word_count_raw$word == "collaborated"] <- "collaborate"
- word_count_raw$word[word_count_raw$word == "come"] <- "coming"
- word_count_raw$word[word_count_raw$word == "communicator"] <- "communicate"
- word_count_raw$word[word_count_raw$word == "communities"] <- "community"
- word_count_raw$word[word_count_raw$word == "conflicts"] <- "conflict"
- word_count_raw$word[word_count_raw$word == "contributed"] <- "contribution"
- word_count_raw$word[word_count_raw$word == "coronavirus"] <- "covid"
- word_count_raw$word[word_count_raw$word == "countries"] <- "country"
- word_count_raw$word[word_count_raw$word == "country’s"] <- "collaborate"
- word_count_raw$word[word_count_raw$word == "creating"] <- "create"
- word_count_raw$word[word_count_raw$word == "creator"] <- "create"
- word_count_raw$word[word_count_raw$word == "culturally"] <- "culture"
- word_count_raw$word[word_count_raw$word == "cultural"] <- "culture"
- word_count_raw$word[word_count_raw$word == "currently"] <- "current"
- word_count_raw$word[word_count_raw$word == "debates"] <- "debate"
- word_count_raw$word[word_count_raw$word == "defending"] <- "defender"
- word_count_raw$word[word_count_raw$word == "democratic"] <- "democracy"
- word_count_raw$word[word_count_raw$word == "democratic"] <- "democracy"
- word_count_raw$word[word_count_raw$word == "describes"] <- "describe"
- word_count_raw$word[word_count_raw$word == "described"] <- "describe"
- word_count_raw$word[word_count_raw$word == "developing"] <- "develope"
- word_count_raw$word[word_count_raw$word == "development"] <- "develope"
- word_count_raw$word[word_count_raw$word == "directed"] <- "director"
- word_count_raw$word[word_count_raw$word == "directing"] <- "director"
- word_count_raw$word[word_count_raw$word == "disabilities"] <- "disability"
- word_count_raw$word[word_count_raw$word == "disabled"] <- "disability"
- word_count_raw$word[word_count_raw$word == "diseases"] <- "disease"
- word_count_raw$word[word_count_raw$word == "documents"] <- "documentary"
- word_count_raw$word[word_count_raw$word == "documenting"] <- "documentary"
- word_count_raw$word[word_count_raw$word == "dr"] <- "DR"
- word_count_raw$word[word_count_raw$word == "educating"] <- "education"
- word_count_raw$word[word_count_raw$word == "educator"] <- "education"
- word_count_raw$word[word_count_raw$word == "empower"] <- "empowerment"
- word_count_raw$word[word_count_raw$word == "empowered"] <- "empowerment"
- word_count_raw$word[word_count_raw$word == "engineer"] <- "engineering"
- word_count_raw$word[word_count_raw$word == "entrepreneur"] <- "entrepreneurship"
- word_count_raw$word[word_count_raw$word == "environmental"] <- "environment"
- word_count_raw$word[word_count_raw$word == "ethiopian"] <- "ethiopia"
- word_count_raw$word[word_count_raw$word == "experienced"] <- "experience"
- word_count_raw$word[word_count_raw$word == "experiences"] <- "experience"
- word_count_raw$word[word_count_raw$word == "eyes"] <- "eye"
- word_count_raw$word[word_count_raw$word == "feminists"] <- "feminist"
- word_count_raw$word[word_count_raw$word == "festivals"] <- "festival"
- word_count_raw$word[word_count_raw$word == "films"] <- "film"
- word_count_raw$word[word_count_raw$word == "focusing"] <- "focus"
- word_count_raw$word[word_count_raw$word == "focuses"] <- "focus"
- word_count_raw$word[word_count_raw$word == "footballer"] <- "football"
- word_count_raw$word[word_count_raw$word == "forces"] <- "force"
- word_count_raw$word[word_count_raw$word == "formerly"] <- "former"
- word_count_raw$word[word_count_raw$word == "found"] <- "founder"
- word_count_raw$word[word_count_raw$word == "founded"] <- "founder"
- word_count_raw$word[word_count_raw$word == "founders"] <- "founder"
- word_count_raw$word[word_count_raw$word == "girl"] <- "girls"
- word_count_raw$word[word_count_raw$word == "governance"] <- "government"
- word_count_raw$word[word_count_raw$word == "governmental"] <- "government"
- word_count_raw$word[word_count_raw$word == "governments"] <- "government"
- word_count_raw$word[word_count_raw$word == "governing"] <- "government"
- word_count_raw$word[word_count_raw$word == "groups"] <- "group"
- word_count_raw$word[word_count_raw$word == "heads"] <- "head"
- word_count_raw$word[word_count_raw$word == "helps"] <- "help"
- word_count_raw$word[word_count_raw$word == "helped"] <- "help"
- word_count_raw$word[word_count_raw$word == "helping"] <- "help"
- word_count_raw$word[word_count_raw$word == "helpers"] <- "help"
- word_count_raw$word[word_count_raw$word == "improved"] <- "improving"
- word_count_raw$word[word_count_raw$word == "inclusive"] <- "include"
- word_count_raw$word[word_count_raw$word == "includes"] <- "include"
- word_count_raw$word[word_count_raw$word == "including"] <- "include"
- word_count_raw$word[word_count_raw$word == "industries"] <- "industry"
- word_count_raw$word[word_count_raw$word == "industrial"] <- "industry"
- word_count_raw$word[word_count_raw$word == "influences"] <- "influence"
- word_count_raw$word[word_count_raw$word == "influential"] <- "influence"
- word_count_raw$word[word_count_raw$word == "institutes"] <- "institute"
- word_count_raw$word[word_count_raw$word == "internationally"] <- "international"
- word_count_raw$word[word_count_raw$word == "invented"] <- "invent"
- word_count_raw$word[word_count_raw$word == "invention"] <- "invent"
- word_count_raw$word[word_count_raw$word == "inventions"] <- "invent"
- word_count_raw$word[word_count_raw$word == "involved"] <- "involvement"
- word_count_raw$word[word_count_raw$word == "improve"] <- "improving"
- word_count_raw$word[word_count_raw$word == "iranian"] <- "iran"
- word_count_raw$word[word_count_raw$word == "irish"] <- "ireland"
- word_count_raw$word[word_count_raw$word == "journalist"] <- "journalism"
- word_count_raw$word[word_count_raw$word == "job"] <- "jobs"
- word_count_raw$word[word_count_raw$word == "known"] <- "know"
- word_count_raw$word[word_count_raw$word == "kyrgyz"] <- "kyrgyzstan"
- word_count_raw$word[word_count_raw$word == "lead"] <- "leader"
- word_count_raw$word[word_count_raw$word == "leads"] <- "leader"
- word_count_raw$word[word_count_raw$word == "leaders"] <- "leader"
- word_count_raw$word[word_count_raw$word == "leadership"] <- "leader"
- word_count_raw$word[word_count_raw$word == "life’s"] <- "life"
- word_count_raw$word[word_count_raw$word == "lives"] <- "life"
- word_count_raw$word[word_count_raw$word == "lines"] <- "line"
- word_count_raw$word[word_count_raw$word == "locally"] <- "local"
- word_count_raw$word[word_count_raw$word == "looks"] <- "look"
- word_count_raw$word[word_count_raw$word == "looking"] <- "look"
- word_count_raw$word[word_count_raw$word == "lyrical"] <- "lyrics"
- word_count_raw$word[word_count_raw$word == "maker"] <- "make"
- word_count_raw$word[word_count_raw$word == "making"] <- "make"
- word_count_raw$word[word_count_raw$word == "makers"] <- "make"
- word_count_raw$word[word_count_raw$word == "manages"] <- "manage"
- word_count_raw$word[word_count_raw$word == "manager"] <- "manage"
- word_count_raw$word[word_count_raw$word == "medical"] <- "medicine"
- word_count_raw$word[word_count_raw$word == "ministry"] <- "minister"
- word_count_raw$word[word_count_raw$word == "models"] <- "model"
- word_count_raw$word[word_count_raw$word == "movements"] <- "movement"
- word_count_raw$word[word_count_raw$word == "musician"] <- "music"
- word_count_raw$word[word_count_raw$word == "named"] <- "name"
- word_count_raw$word[word_count_raw$word == "nation"] <- "national"
- word_count_raw$word[word_count_raw$word == "networks"] <- "network"
- word_count_raw$word[word_count_raw$word == "nigerian"] <- "nigeria"
- word_count_raw$word[word_count_raw$word == "organisations"] <- "organisation"
- word_count_raw$word[word_count_raw$word == "outbreaks"] <- "outbreak"
- word_count_raw$word[word_count_raw$word == "pakistanis"] <- "pakistan"
- word_count_raw$word[word_count_raw$word == "parliamentary"] <- "parliament"
- word_count_raw$word[word_count_raw$word == "parties"] <- "party"
- word_count_raw$word[word_count_raw$word == "peacebuilders"] <- "peace"
- word_count_raw$word[word_count_raw$word == "peacefully"] <- "peace"
- word_count_raw$word[word_count_raw$word == "people’s"] <- "people"
- word_count_raw$word[word_count_raw$word == "performs"] <- "performing"
- word_count_raw$word[word_count_raw$word == "petitioners"] <- "petition"
- word_count_raw$word[word_count_raw$word == "pioneered"] <- "pioneer"
- word_count_raw$word[word_count_raw$word == "presidential"] <- "president"
- word_count_raw$word[word_count_raw$word == "preventive"] <- "prevention"
- word_count_raw$word[word_count_raw$word == "prevent"] <- "prevention"
- word_count_raw$word[word_count_raw$word == "prisoners"] <- "prison"
- word_count_raw$word[word_count_raw$word == "promote"] <- "promoting"
- word_count_raw$word[word_count_raw$word == "protect"] <- "protection"
- word_count_raw$word[word_count_raw$word == "protested"] <- "protest"
- word_count_raw$word[word_count_raw$word == "protester"] <- "protest"
- word_count_raw$word[word_count_raw$word == "protests"] <- "protest"
- word_count_raw$word[word_count_raw$word == "recorded"] <- "record"
- word_count_raw$word[word_count_raw$word == "reported"] <- "reporting"
- word_count_raw$word[word_count_raw$word == "reports"] <- "reporting"
- word_count_raw$word[word_count_raw$word == "researchers"] <- "research"
- word_count_raw$word[word_count_raw$word == "researches"] <- "research"
- word_count_raw$word[word_count_raw$word == "result"] <- "results"
- word_count_raw$word[word_count_raw$word == "right"] <- "rights"
- word_count_raw$word[word_count_raw$word == "russia's"] <- "russia"
- word_count_raw$word[word_count_raw$word == "russian"] <- "russia"
- word_count_raw$word[word_count_raw$word == "schoolchildren"] <- "children"
- word_count_raw$word[word_count_raw$word == "sciences"] <- "science"
- word_count_raw$word[word_count_raw$word == "scientist"] <- "science"
- word_count_raw$word[word_count_raw$word == "scientists"] <- "science"
- word_count_raw$word[word_count_raw$word == "scientists"] <- "science"
- word_count_raw$word[word_count_raw$word == "scots"] <- "scotland"
- word_count_raw$word[word_count_raw$word == "scottish"] <- "scotland"
- word_count_raw$word[word_count_raw$word == "screened"] <- "screen"
- word_count_raw$word[word_count_raw$word == "scientists"] <- "science"
- word_count_raw$word[word_count_raw$word == "sexual"] <- "sex"
- word_count_raw$word[word_count_raw$word == "showed"] <- "show"
- word_count_raw$word[word_count_raw$word == "singer"] <- "singing"
- word_count_raw$word[word_count_raw$word == "stories"] <- "story"
- word_count_raw$word[word_count_raw$word == "storyteller"] <- "story"
- word_count_raw$word[word_count_raw$word == "students"] <- "student"
- word_count_raw$word[word_count_raw$word == "studying"] <- "study"
- word_count_raw$word[word_count_raw$word == "supported"] <- "support"
- word_count_raw$word[word_count_raw$word == "supporting"] <- "support"
- word_count_raw$word[word_count_raw$word == "supports"] <- "support"
- word_count_raw$word[word_count_raw$word == "syrian"] <- "syria"
- word_count_raw$word[word_count_raw$word == "systems"] <- "system"
- word_count_raw$word[word_count_raw$word == "tackling"] <- "tackle"
- word_count_raw$word[word_count_raw$word == "talks"] <- "talk"
- word_count_raw$word[word_count_raw$word == "tech"] <- "technology"
- word_count_raw$word[word_count_raw$word == "technologies"] <- "technology"
- word_count_raw$word[word_count_raw$word == "tools"] <- "tool"
- word_count_raw$word[word_count_raw$word == "transformative"] <- "transform"
- word_count_raw$word[word_count_raw$word == "transforming"] <- "transform"
- word_count_raw$word[word_count_raw$word == "translated"] <- "translate"
- word_count_raw$word[word_count_raw$word == "translator"] <- "translate"
- word_count_raw$word[word_count_raw$word == "travelled"] <- "travel"
- word_count_raw$word[word_count_raw$word == "travellers"] <- "travel"
- word_count_raw$word[word_count_raw$word == "treated"] <- "treat"
- word_count_raw$word[word_count_raw$word == "turkish"] <- "turkey"
- word_count_raw$word[word_count_raw$word == "uk's"] <- "UK"
- word_count_raw$word[word_count_raw$word == "uk’s"] <- "UK"
- word_count_raw$word[word_count_raw$word == "uk"] <- "UK"
- word_count_raw$word[word_count_raw$word == "un"] <- "UN"
- word_count_raw$word[word_count_raw$word == "visualisation"] <- "visual"
- word_count_raw$word[word_count_raw$word == "vulnerability"] <- "vulnerable"
- word_count_raw$word[word_count_raw$word == "weeks"] <- "week"
- word_count_raw$word[word_count_raw$word == "winner"] <- "winning"
- word_count_raw$word[word_count_raw$word == "woman’s"] <- "women"
- word_count_raw$word[word_count_raw$word == "woman"] <- "winning"
- word_count_raw$word[word_count_raw$word == "women's"] <- "winning"
- word_count_raw$word[word_count_raw$word == "women’s"] <- "winning"
- word_count_raw$word[word_count_raw$word == "working"] <- "work"
- word_count_raw$word[word_count_raw$word == "worker"] <- "work"
- word_count_raw$word[word_count_raw$word == "workers"] <- "work"
- word_count_raw$word[word_count_raw$word == "worked"] <- "work"
- word_count_raw$word[word_count_raw$word == "works"] <- "work"
- word_count_raw$word[word_count_raw$word == "writing"] <- "writer"
- word_count_raw$word[word_count_raw$word == "written"] <- "writer"
- word_count_raw$word[word_count_raw$word == "years"] <- "year"
- word_count_raw$word[word_count_raw$word == "young"] <- "youth"
- word_count_raw$word[word_count_raw$word == "zambian"] <- "zambia"
- ### Generate word counts from cleaned data:
- word_count_clean <-
- word_count_raw %>%
- filter(!is.na(word)) %>%
- group_by(category, word) %>%
- summarise(n = sum(n))
- # Visualize
- # Suggested parameters for R notebook: fig.height=9, fig.width=8
- ## Create a plot for each category:
- ### Creativity
- set.seed(42)
- creativity <-
- word_count_clean %>%
- slice_max(n, n = 25) %>%
- filter(category == "Creativity") %>%
- slice_head(n = 25) %>%
- left_join(colors_df, by = "category") %>%
- ggplot(aes(label = word, size = n, color = I(color))) +
- geom_text_wordcloud(shape = "square", eccentricity = 1) +
- scale_size_area(max_size = 12) +
- facet_wrap(~category) +
- theme(panel.border = element_blank(),
- strip.background = element_rect(fill = "white", color = NA),
- strip.text = element_text(size = 22, color = colors["Creativity"], face = "bold.italic"))
- ### Identity
- set.seed(42)
- identity <-
- word_count_clean %>%
- slice_max(n, n = 25) %>%
- filter(category == "Identity") %>%
- slice_head(n = 25) %>%
- left_join(colors_df, by = "category") %>%
- ggplot(aes(label = word, size = n, color = I(color))) +
- geom_text_wordcloud(shape = "square", eccentricity = 1) +
- scale_size_area(max_size = 12) +
- facet_wrap(~category) +
- theme(panel.border = element_blank(),
- strip.background = element_rect(fill = "white", color = NA),
- strip.text = element_text(size = 22, color = colors["Identity"], face = "bold.italic"))
- ### Knowledge
- set.seed(42)
- knowledge <-
- word_count_clean %>%
- slice_max(n, n = 25) %>%
- filter(category == "Knowledge") %>%
- slice_head(n = 25) %>%
- left_join(colors_df, by = "category") %>%
- ggplot(aes(label = word, size = n, color = I(color))) +
- geom_text_wordcloud(shape = "square", eccentricity = 1) +
- scale_size_area(max_size = 10) +
- facet_wrap(~category) +
- theme(panel.border = element_blank(),
- strip.background = element_rect(fill = "white", color = NA),
- strip.text = element_text(size = 22, color = colors["Knowledge"], face = "bold.italic"))
- ### Leadership
- set.seed(42)
- leadership <-
- word_count_clean %>%
- slice_max(n, n = 25) %>%
- filter(category == "Leadership") %>%
- slice_head(n = 25) %>%
- left_join(colors_df, by = "category") %>%
- ggplot(aes(label = word, size = n, color = I(color))) +
- geom_text_wordcloud(shape = "square", eccentricity = 1) +
- scale_size_area(max_size = 10) +
- facet_wrap(~category) +
- theme(panel.border = element_blank(),
- strip.background = element_rect(fill = "white", color = NA),
- strip.text = element_text(size = 22, color = colors["Leadership"], face = "bold.italic"))
- ### Overall theme for patchwork:
- plot_theme <- theme(plot.title = element_text(colour = dark_grey),
- plot.subtitle = element_text(color = dark_grey),
- plot.caption = element_text(color = dark_grey))
- ### Use patchwork to piece together the four plots:
- plot1 <- (creativity + identity) / (knowledge + leadership) +
- plot_annotation(title = expression(paste(bold("Women of 2020"), " - Word clouds of The BBC's 100 Women of 2020.")),
- subtitle = str_wrap("The word clouds are grouped by category and are generated from the text description for each Woman of 2020."),
- caption = expression(paste("Created by: ", italic("reddit.com/user/brianhaas19"), " Data source: ", italic("https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-12-08"))),
- theme = plot_theme)
- ### File path of logo:
- logo_file <- str_c(getwd(), "/img/logo.png")
- ### Add the logo to the plot:
- plot2 <- ggdraw(plot1) +
- draw_image(logo_file, x = 1.15, y = 1,
- hjust = 1, vjust = 1,
- width = 0.4, height = 0.08)
- ### View/save the plot:
- #png(filename = str_c(getwd(), "/women_of_2020.png"), width = 8, height = 9, units = "in", res = 200) # uncomment to save to disk
- plot2
- #dev.off() # uncomment to save to disk
- ###################################################################################################
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement