AE 11: Iterating in R
Suggested answers
Packages
We will use the following packages in this application exercise.
- {tidyverse}: For data import, wrangling, and visualization.
- {palmerpenguins}: For the penguins dataset.
- {rvest}: For scraping HTML files.
- {robotstxt}: For verifying if we can scrape a website.
Part 1: Iterating over columns
Your turn: Write a function that summarizes multiple specified columns of a data frame and calculates their arithmetic mean and standard deviation using across().
# simple version: summarize the supplied columns with mean and sd
my_summary <- function(df, cols) {
  df |>
    summarize(
      across(
        {{ cols }},
        list(
          mean = \(x) mean(x, na.rm = TRUE),
          sd = \(x) sd(x, na.rm = TRUE)
        )
      ),
      .groups = "drop"
    )
}

penguins |>
  group_by(species) |>
  my_summary(cols = ends_with("mm"))
# A tibble: 3 × 7
species bill_length_mm_mean bill_length_mm_sd bill_depth_mm_mean
<fct> <dbl> <dbl> <dbl>
1 Adelie 38.8 2.66 18.3
2 Chinstrap 48.8 3.34 18.4
3 Gentoo 47.5 3.08 15.0
# ℹ 3 more variables: bill_depth_mm_sd <dbl>, flipper_length_mm_mean <dbl>,
# flipper_length_mm_sd <dbl>
# include a default set of columns: all numeric columns unless told otherwise
my_summary <- function(df, cols = where(is.numeric)) {
  df |>
    summarize(
      across(
        {{ cols }},
        list(
          mean = \(x) mean(x, na.rm = TRUE),
          sd = \(x) sd(x, na.rm = TRUE)
        )
      ),
      .groups = "drop"
    )
}

penguins |>
  select(-year) |>
  my_summary()
# A tibble: 1 × 8
bill_length_mm_mean bill_length_mm_sd bill_depth_mm_mean bill_depth_mm_sd
<dbl> <dbl> <dbl> <dbl>
1 43.9 5.46 17.2 1.97
# ℹ 4 more variables: flipper_length_mm_mean <dbl>, flipper_length_mm_sd <dbl>,
# body_mass_g_mean <dbl>, body_mass_g_sd <dbl>
Part 2: Data scraping
See the code below, stored in iterate-cornell-review.R.
# load packages
library(tidyverse)
library(rvest)
library(robotstxt)

# confirm that scraping The Cornell Review is permitted
paths_allowed("https://www.thecornellreview.org/")

# read the first page
page <- read_html("https://www.thecornellreview.org/")

# extract each desired component from the parsed page
titles <- page |>
  html_elements(css = "#main .read-title a") |>
  html_text2()
authors <- page |>
  html_elements(css = "#main .byline a") |>
  html_text2()
article_dates <- page |>
  html_elements(css = "#main .posts-date") |>
  html_text2()
topics <- page |>
  html_elements(css = "#main .cat-links") |>
  html_text2()
abstracts <- page |>
  html_elements(css = ".post-description") |>
  html_text2()
post_urls <- page |>
  html_elements(css = ".aft-readmore") |>
  html_attr(name = "href")

# combine the components into a single tibble
review_raw <- tibble(
  title = titles,
  author = authors,
  date = article_dates,
  topic = topics,
  description = abstracts,
  url = post_urls
)

# clean up: parse dates and strip the trailing "Read More" marker
review <- review_raw |>
  mutate(
    date = mdy(date),
    description = str_remove(string = description, pattern = "\nRead More")
  )
######## write a function to scrape a single page and use a map() function
######## to iterate over the first ten pages

# scrape one listing page of The Cornell Review and return a cleaned tibble
# with one row per article (title, author, date, topic, description, url)
scrape_review <- function(url) {
  # pause for a couple of seconds to prevent rapid HTTP requests
  Sys.sleep(2)

  # download and parse the requested page
  page <- read_html(url)

  # extract every component and assemble the raw tibble in one step
  review_raw <- tibble(
    title = page |> html_elements(css = "#main .read-title a") |> html_text2(),
    author = page |> html_elements(css = "#main .byline a") |> html_text2(),
    date = page |> html_elements(css = "#main .posts-date") |> html_text2(),
    topic = page |> html_elements(css = "#main .cat-links") |> html_text2(),
    description = page |> html_elements(css = ".post-description") |> html_text2(),
    url = page |> html_elements(css = ".aft-readmore") |> html_attr(name = "href")
  )

  # clean up: parse dates and strip the trailing "Read More" marker,
  # then return the result (last expression)
  review_raw |>
    mutate(
      date = mdy(date),
      description = str_remove(string = description, pattern = "\nRead More")
    )
}
# test the function on the first two pages
scrape_review(url = "https://www.thecornellreview.org/page/1/")
scrape_review(url = "https://www.thecornellreview.org/page/2/")

# build a character vector of URLs for pages 1 through 10
page_nums <- 1:10
cr_urls <- str_glue("https://www.thecornellreview.org/page/{page_nums}/")
cr_urls

# scrape every page and row-bind the per-page tibbles into one data frame
cr_reviews <- cr_urls |>
  map(scrape_review, .progress = TRUE) |>
  list_rbind()

# save the combined data to disk
write_csv(x = cr_reviews, file = "data/cornell-review-all.csv")
Part 3: Data analysis
Demo: Import the scraped data set.
cr_reviews <- read_csv(file = "data/cornell-review-all.csv")
Rows: 100 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): title, author, topic, description, url
date (1): date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cr_reviews
# A tibble: 100 × 6
title author date topic description url
<chr> <chr> <date> <chr> <chr> <chr>
1 Playing the Race Card Revie… 2024-10-07 "Cam… CML and BS… http…
2 Should Joel Malina Be Fired? Revie… 2024-10-07 "Bey… Cornell’s … http…
3 Cornell Drops in 2025 FIRE Free Sp… Revie… 2024-10-03 "Cam… Each year,… http…
4 Interim Expressive Activity Policy… Revie… 2024-10-02 "Cor… On October… http…
5 Daryl Davis To Speak on Race Relat… Revie… 2024-10-01 "Cam… Daryl Davi… http…
6 Happy 100th Birthday, President Ca… Revie… 2024-10-01 "Bey… President … http…
7 Kavita Bala Named Cornell Provost Revie… 2024-09-25 "Cam… On Septemb… http…
8 Ithaca Labor News Revie… 2024-09-25 "Ith… Here are t… http…
9 CML Realizes It Overstepped Social… Revie… 2024-09-25 "Cam… On Wednesd… http…
10 Cornell Republicans to Host Ben Sh… Revie… 2024-09-24 "Ith… On Monday,… http…
# ℹ 90 more rows
Demo: Who are the most prolific authors?
Demo: What topics does The Cornell Review write about?
Not super helpful. Each article can have multiple topics. How are multiple topics stored in this column?
# inspect the raw topic column
cr_reviews |>
  select(topic)
# A tibble: 100 × 1
topic
<chr>
1 "Campus"
2 "Beyond Cayuga's Waters"
3 "Campus"
4 "Cornell Politics"
5 "Campus"
6 "Beyond Cayuga's Waters\nUncategorized"
7 "Campus"
8 "Ithaca"
9 "Campus"
10 "Ithaca\nPolitics"
# ℹ 90 more rows
Each topic is separated by a "\n". Since the number of topics varies for each article, we should separate_longer_delim() this column. (Alternatively, we could use a stringr function to split them into distinct character strings.)
# split multi-topic strings so each article-topic pair gets its own row
cr_reviews |>
  separate_longer_delim(cols = topic, delim = "\n")
# A tibble: 133 × 6
title author date topic description url
<chr> <chr> <date> <chr> <chr> <chr>
1 Playing the Race Card Revie… 2024-10-07 Camp… CML and BS… http…
2 Should Joel Malina Be Fired? Revie… 2024-10-07 Beyo… Cornell’s … http…
3 Cornell Drops in 2025 FIRE Free Sp… Revie… 2024-10-03 Camp… Each year,… http…
4 Interim Expressive Activity Policy… Revie… 2024-10-02 Corn… On October… http…
5 Daryl Davis To Speak on Race Relat… Revie… 2024-10-01 Camp… Daryl Davi… http…
6 Happy 100th Birthday, President Ca… Revie… 2024-10-01 Beyo… President … http…
7 Happy 100th Birthday, President Ca… Revie… 2024-10-01 Unca… President … http…
8 Kavita Bala Named Cornell Provost Revie… 2024-09-25 Camp… On Septemb… http…
9 Ithaca Labor News Revie… 2024-09-25 Itha… Here are t… http…
10 CML Realizes It Overstepped Social… Revie… 2024-09-25 Camp… On Wednesd… http…
# ℹ 123 more rows
Notice the data frame now has additional rows. The unit of analysis is now an article-topic combination, rather than one-row-per-article. Not entirely a tidy structure, but necessary to construct a chart to visualize topic frequency.
# one row per article-topic pair, then count topics with a bar chart
cr_reviews |>
  separate_longer_delim(cols = topic, delim = "\n") |>
  ggplot(aes(y = topic)) +
  geom_bar()
Let’s clean this up like the previous chart.
# same chart, with topics ordered from most to least frequent
cr_reviews |>
  separate_longer_delim(cols = topic, delim = "\n") |>
  mutate(topic = fct_rev(fct_infreq(topic))) |>
  ggplot(aes(y = topic)) +
  geom_bar()
Acknowledgments
- Part 1 is derived from From R User to R Programmer and licensed under CC BY 4.0.
sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.4.2 (2024-10-31)
os macOS Sonoma 14.6.1
system aarch64, darwin20
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz America/New_York
date 2025-03-07
pandoc 3.4 @ /usr/local/bin/ (via rmarkdown)
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
archive 1.1.9 2024-09-12 [1] CRAN (R 4.4.1)
bit 4.0.5 2022-11-15 [1] CRAN (R 4.3.0)
bit64 4.0.5 2020-08-30 [1] CRAN (R 4.3.0)
chromote 0.2.0 2024-02-12 [1] CRAN (R 4.4.0)
cli 3.6.3 2024-06-21 [1] CRAN (R 4.4.0)
crayon 1.5.3 2024-06-20 [1] CRAN (R 4.4.0)
dichromat 2.0-0.1 2022-05-02 [1] CRAN (R 4.3.0)
digest 0.6.37 2024-08-19 [1] CRAN (R 4.4.1)
dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.1)
evaluate 1.0.3 2025-01-10 [1] CRAN (R 4.4.1)
farver 2.1.2 2024-05-13 [1] CRAN (R 4.3.3)
fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.4.0)
forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.0)
generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0)
ggplot2 * 3.5.1 2024-04-23 [1] CRAN (R 4.3.1)
glue 1.8.0 2024-09-30 [1] CRAN (R 4.4.1)
gtable 0.3.6 2024-10-25 [1] CRAN (R 4.4.1)
here 1.0.1 2020-12-13 [1] CRAN (R 4.3.0)
hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.0)
htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.3.1)
htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.3.1)
httr 1.4.7 2023-08-15 [1] CRAN (R 4.3.0)
jsonlite 1.8.9 2024-09-20 [1] CRAN (R 4.4.1)
knitr 1.49 2024-11-08 [1] CRAN (R 4.4.1)
labeling 0.4.3 2023-08-29 [1] CRAN (R 4.3.0)
later 1.4.1 2024-11-27 [1] CRAN (R 4.4.1)
lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.1)
lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.3.1)
magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0)
palmerpenguins * 0.1.1 2022-08-15 [1] CRAN (R 4.3.0)
pillar 1.10.1 2025-01-07 [1] CRAN (R 4.4.1)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0)
processx 3.8.4 2024-03-16 [1] CRAN (R 4.3.1)
promises 1.3.2 2024-11-28 [1] CRAN (R 4.4.1)
ps 1.8.1 2024-10-28 [1] CRAN (R 4.4.1)
purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.3.0)
R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0)
RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.3.0)
Rcpp 1.0.14 2025-01-12 [1] CRAN (R 4.4.1)
readr * 2.1.5 2024-01-10 [1] CRAN (R 4.3.1)
rlang 1.1.5 2025-01-17 [1] CRAN (R 4.4.1)
rmarkdown 2.29 2024-11-04 [1] CRAN (R 4.4.1)
robotstxt * 0.7.13 2020-09-03 [1] CRAN (R 4.3.0)
rprojroot 2.0.4 2023-11-05 [1] CRAN (R 4.3.1)
rvest * 1.0.4 2024-02-12 [1] CRAN (R 4.3.1)
scales 1.3.0.9000 2024-11-14 [1] Github (r-lib/scales@ee03582)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
stringi 1.8.4 2024-05-06 [1] CRAN (R 4.3.1)
stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.3.1)
tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.3.0)
tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.3.1)
tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.3.1)
tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.3.0)
timechange 0.3.0 2024-01-18 [1] CRAN (R 4.3.1)
tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.3.0)
utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.1)
vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.1)
vroom 1.6.5 2023-12-05 [1] CRAN (R 4.3.1)
websocket 1.4.1 2021-08-18 [1] CRAN (R 4.3.0)
withr 3.0.2 2024-10-28 [1] CRAN (R 4.4.1)
xfun 0.50.5 2025-01-15 [1] https://yihui.r-universe.dev (R 4.4.2)
xml2 1.3.6 2023-12-04 [1] CRAN (R 4.3.1)
yaml 2.3.10 2024-07-26 [1] CRAN (R 4.4.0)
[1] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
──────────────────────────────────────────────────────────────────────────────