AE 11: Iterating in R

Suggested answers

Application exercise
Answers
Modified

March 6, 2025

Packages

We will use the following packages in this application exercise.

  • {tidyverse}: For data import, wrangling, and visualization.
  • {palmerpenguins}: For the penguins dataset.
  • {rvest}: For scraping HTML files.
  • {robotstxt}: For verifying if we can scrape a website.

Part 1: Iterating over columns

Your turn: Write a function that summarizes multiple specified columns of a data frame and calculates their arithmetic mean and standard deviation using across().

# simple version
my_summary <- function(df, cols) {
  df |>
    summarize(
      across(
        .cols = {{ cols }},
        .fns = list(
          mean = \(x) mean(x, na.rm = TRUE),
          sd = \(x) sd(x, na.rm = TRUE)
        )
      ),
      .groups = "drop"
    )
}

penguins |>
  group_by(species) |>
  my_summary(ends_with("mm"))
# A tibble: 3 × 7
  species   bill_length_mm_mean bill_length_mm_sd bill_depth_mm_mean
  <fct>                   <dbl>             <dbl>              <dbl>
1 Adelie                   38.8              2.66               18.3
2 Chinstrap                48.8              3.34               18.4
3 Gentoo                   47.5              3.08               15.0
# ℹ 3 more variables: bill_depth_mm_sd <dbl>, flipper_length_mm_mean <dbl>,
#   flipper_length_mm_sd <dbl>
# include a default set of columns: when `cols` is not supplied,
# summarize every numeric column
my_summary <- function(df, cols = where(is.numeric)) {
  # named list of summary functions; across() appends each name as a
  # suffix to the column name (e.g. bill_length_mm_mean)
  summary_fns <- list(
    mean = \(x) mean(x, na.rm = TRUE),
    sd = \(x) sd(x, na.rm = TRUE)
  )

  df |>
    summarize(
      across(.cols = {{ cols }}, .fns = summary_fns),
      .groups = "drop"
    )
}

penguins |>
  select(-year) |>
  my_summary()
# A tibble: 1 × 8
  bill_length_mm_mean bill_length_mm_sd bill_depth_mm_mean bill_depth_mm_sd
                <dbl>             <dbl>              <dbl>            <dbl>
1                43.9              5.46               17.2             1.97
# ℹ 4 more variables: flipper_length_mm_mean <dbl>, flipper_length_mm_sd <dbl>,
#   body_mass_g_mean <dbl>, body_mass_g_sd <dbl>

Part 2: Data scraping

See the code below stored in iterate-cornell-review.R.

# load packages
library(tidyverse)
library(rvest)
library(robotstxt)

# check that we can scrape data from the cornell review
# (paths_allowed() consults the site's robots.txt for permission)
paths_allowed("https://www.thecornellreview.org/")

# read the first page
page <- read_html("https://www.thecornellreview.org/")

# extract desired components, one CSS selector per field
titles <- html_elements(x = page, css = "#main .read-title a") |>
  html_text2()

authors <- html_elements(x = page, css = "#main .byline a") |>
  html_text2()

article_dates <- html_elements(x = page, css = "#main .posts-date") |>
  html_text2()

topics <- html_elements(x = page, css = "#main .cat-links") |>
  html_text2()

abstracts <- html_elements(x = page, css = ".post-description") |>
  html_text2()

# for the article link we want the href attribute, not the link text
post_urls <- html_elements(x = page, css = ".aft-readmore") |>
  html_attr(name = "href")

# create a tibble with this data
# (tibble() errors if the selectors matched different numbers of
# elements — a useful sanity check that the scraped fields stay aligned)
review_raw <- tibble(
  title = titles,
  author = authors,
  date = article_dates,
  topic = topics,
  description = abstracts,
  url = post_urls
)

# clean up the data: parse month-day-year date strings and strip the
# trailing "Read More" link text from each description
review <- review_raw |>
  mutate(
    date = mdy(date),
    description = str_remove(string = description, pattern = "\nRead More")
  )

######## write a function to scrape a single page and use a map() function
######## to iterate over the first ten pages
# convert to a function
# Scrape one listing page of The Cornell Review and return a tibble with
# one row per article: title, author, date, topic, description, and url.
scrape_review <- function(url) {
  # pause for a couple of seconds to prevent rapid HTTP requests
  Sys.sleep(2)

  # read the requested page
  page <- read_html(url)

  # helper: collect the text of every element matching a CSS selector
  grab_text <- function(css) {
    page |>
      html_elements(css = css) |>
      html_text2()
  }

  # assemble the raw fields; the article link comes from the href
  # attribute rather than the link text
  review_raw <- tibble(
    title = grab_text("#main .read-title a"),
    author = grab_text("#main .byline a"),
    date = grab_text("#main .posts-date"),
    topic = grab_text("#main .cat-links"),
    description = grab_text(".post-description"),
    url = html_elements(x = page, css = ".aft-readmore") |>
      html_attr(name = "href")
  )

  # clean up: parse the dates and drop the trailing "Read More" text,
  # then return the tidy data frame
  review_raw |>
    mutate(
      date = mdy(date),
      description = str_remove(string = description, pattern = "\nRead More")
    )
}

# sanity-check the function on the first two listing pages individually
scrape_review(url = "https://www.thecornellreview.org/page/1/")
scrape_review(url = "https://www.thecornellreview.org/page/2/")

# build the URLs for the first ten listing pages
page_nums <- 1:10
cr_urls <- str_glue("https://www.thecornellreview.org/page/{page_nums}/")
cr_urls

# scrape every page and stack the per-page tibbles into one data frame
cr_reviews <- cr_urls |>
  map(scrape_review, .progress = TRUE) |>
  list_rbind()

# save a local copy so later analysis does not need to re-scrape
write_csv(x = cr_reviews, file = "data/cornell-review-all.csv")

Part 3: Data analysis

Demo: Import the scraped data set.

# import the articles scraped by iterate-cornell-review.R
cr_reviews <- read_csv(file = "data/cornell-review-all.csv")
Rows: 100 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (5): title, author, topic, description, url
date (1): date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# preview the imported data
cr_reviews
# A tibble: 100 × 6
   title                               author date       topic description url  
   <chr>                               <chr>  <date>     <chr> <chr>       <chr>
 1 Playing the Race Card               Revie… 2024-10-07 "Cam… CML and BS… http…
 2 Should Joel Malina Be Fired?        Revie… 2024-10-07 "Bey… Cornell’s … http…
 3 Cornell Drops in 2025 FIRE Free Sp… Revie… 2024-10-03 "Cam… Each year,… http…
 4 Interim Expressive Activity Policy… Revie… 2024-10-02 "Cor… On October… http…
 5 Daryl Davis To Speak on Race Relat… Revie… 2024-10-01 "Cam… Daryl Davi… http…
 6 Happy 100th Birthday, President Ca… Revie… 2024-10-01 "Bey… President … http…
 7 Kavita Bala Named Cornell Provost   Revie… 2024-09-25 "Cam… On Septemb… http…
 8 Ithaca Labor News                   Revie… 2024-09-25 "Ith… Here are t… http…
 9 CML Realizes It Overstepped Social… Revie… 2024-09-25 "Cam… On Wednesd… http…
10 Cornell Republicans to Host Ben Sh… Revie… 2024-09-24 "Ith… On Monday,… http…
# ℹ 90 more rows

Demo: Who are the most prolific authors?

cr_reviews |>
  # order authors by frequency (fct_infreq), then reverse so the most
  # prolific author appears at the top of the horizontal bar chart
  mutate(author = fct_rev(fct_infreq(f = author))) |>
  ggplot(mapping = aes(y = author)) +
  geom_bar()

Demo: What topics does The Cornell Review write about?

# basic bar plot: treats each multi-topic string as one category
cr_reviews |>
  ggplot(mapping = aes(y = topic)) +
  geom_bar()

Not super helpful. Each article can have multiple topics. How are multiple topics stored within this single column?

# inspect how the raw topic values are stored
select(cr_reviews, topic)
# A tibble: 100 × 1
   topic                                  
   <chr>                                  
 1 "Campus"                               
 2 "Beyond Cayuga's Waters"               
 3 "Campus"                               
 4 "Cornell Politics"                     
 5 "Campus"                               
 6 "Beyond Cayuga's Waters\nUncategorized"
 7 "Campus"                               
 8 "Ithaca"                               
 9 "Campus"                               
10 "Ithaca\nPolitics"                     
# ℹ 90 more rows

Each topic is separated by a "\n". Since the number of topics varies for each article, we should use separate_longer_delim() on this column so that each article-topic pair gets its own row. (Alternatively, a stringr function such as str_split() could split each value into a vector of distinct character strings.)

# split each newline-delimited topic string into its own row
separate_longer_delim(cr_reviews, cols = topic, delim = "\n")
# A tibble: 133 × 6
   title                               author date       topic description url  
   <chr>                               <chr>  <date>     <chr> <chr>       <chr>
 1 Playing the Race Card               Revie… 2024-10-07 Camp… CML and BS… http…
 2 Should Joel Malina Be Fired?        Revie… 2024-10-07 Beyo… Cornell’s … http…
 3 Cornell Drops in 2025 FIRE Free Sp… Revie… 2024-10-03 Camp… Each year,… http…
 4 Interim Expressive Activity Policy… Revie… 2024-10-02 Corn… On October… http…
 5 Daryl Davis To Speak on Race Relat… Revie… 2024-10-01 Camp… Daryl Davi… http…
 6 Happy 100th Birthday, President Ca… Revie… 2024-10-01 Beyo… President … http…
 7 Happy 100th Birthday, President Ca… Revie… 2024-10-01 Unca… President … http…
 8 Kavita Bala Named Cornell Provost   Revie… 2024-09-25 Camp… On Septemb… http…
 9 Ithaca Labor News                   Revie… 2024-09-25 Itha… Here are t… http…
10 CML Realizes It Overstepped Social… Revie… 2024-09-25 Camp… On Wednesd… http…
# ℹ 123 more rows

Notice the data frame now has additional rows. The unit of analysis is now an article-topic combination, rather than one-row-per-article. Not entirely a tidy structure, but necessary to construct a chart to visualize topic frequency.

cr_reviews |>
  # one row per article-topic combination before counting
  separate_longer_delim(cols = topic, delim = "\n") |>
  ggplot(mapping = aes(y = topic)) +
  geom_bar()

Let’s clean this up like the previous chart.

cr_reviews |>
  # one row per article-topic combination before counting
  separate_longer_delim(cols = topic, delim = "\n") |>
  # order topics by frequency, reversed so the most common sits on top
  mutate(topic = fct_rev(fct_infreq(f = topic))) |>
  ggplot(mapping = aes(y = topic)) +
  geom_bar()

Acknowledgments

sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.4.2 (2024-10-31)
 os       macOS Sonoma 14.6.1
 system   aarch64, darwin20
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/New_York
 date     2025-03-07
 pandoc   3.4 @ /usr/local/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package        * version    date (UTC) lib source
 archive          1.1.9      2024-09-12 [1] CRAN (R 4.4.1)
 bit              4.0.5      2022-11-15 [1] CRAN (R 4.3.0)
 bit64            4.0.5      2020-08-30 [1] CRAN (R 4.3.0)
 chromote         0.2.0      2024-02-12 [1] CRAN (R 4.4.0)
 cli              3.6.3      2024-06-21 [1] CRAN (R 4.4.0)
 crayon           1.5.3      2024-06-20 [1] CRAN (R 4.4.0)
 dichromat        2.0-0.1    2022-05-02 [1] CRAN (R 4.3.0)
 digest           0.6.37     2024-08-19 [1] CRAN (R 4.4.1)
 dplyr          * 1.1.4      2023-11-17 [1] CRAN (R 4.3.1)
 evaluate         1.0.3      2025-01-10 [1] CRAN (R 4.4.1)
 farver           2.1.2      2024-05-13 [1] CRAN (R 4.3.3)
 fastmap          1.2.0      2024-05-15 [1] CRAN (R 4.4.0)
 forcats        * 1.0.0      2023-01-29 [1] CRAN (R 4.3.0)
 generics         0.1.3      2022-07-05 [1] CRAN (R 4.3.0)
 ggplot2        * 3.5.1      2024-04-23 [1] CRAN (R 4.3.1)
 glue             1.8.0      2024-09-30 [1] CRAN (R 4.4.1)
 gtable           0.3.6      2024-10-25 [1] CRAN (R 4.4.1)
 here             1.0.1      2020-12-13 [1] CRAN (R 4.3.0)
 hms              1.1.3      2023-03-21 [1] CRAN (R 4.3.0)
 htmltools        0.5.8.1    2024-04-04 [1] CRAN (R 4.3.1)
 htmlwidgets      1.6.4      2023-12-06 [1] CRAN (R 4.3.1)
 httr             1.4.7      2023-08-15 [1] CRAN (R 4.3.0)
 jsonlite         1.8.9      2024-09-20 [1] CRAN (R 4.4.1)
 knitr            1.49       2024-11-08 [1] CRAN (R 4.4.1)
 labeling         0.4.3      2023-08-29 [1] CRAN (R 4.3.0)
 later            1.4.1      2024-11-27 [1] CRAN (R 4.4.1)
 lifecycle        1.0.4      2023-11-07 [1] CRAN (R 4.3.1)
 lubridate      * 1.9.3      2023-09-27 [1] CRAN (R 4.3.1)
 magrittr         2.0.3      2022-03-30 [1] CRAN (R 4.3.0)
 palmerpenguins * 0.1.1      2022-08-15 [1] CRAN (R 4.3.0)
 pillar           1.10.1     2025-01-07 [1] CRAN (R 4.4.1)
 pkgconfig        2.0.3      2019-09-22 [1] CRAN (R 4.3.0)
 processx         3.8.4      2024-03-16 [1] CRAN (R 4.3.1)
 promises         1.3.2      2024-11-28 [1] CRAN (R 4.4.1)
 ps               1.8.1      2024-10-28 [1] CRAN (R 4.4.1)
 purrr          * 1.0.2      2023-08-10 [1] CRAN (R 4.3.0)
 R6               2.5.1      2021-08-19 [1] CRAN (R 4.3.0)
 RColorBrewer     1.1-3      2022-04-03 [1] CRAN (R 4.3.0)
 Rcpp             1.0.14     2025-01-12 [1] CRAN (R 4.4.1)
 readr          * 2.1.5      2024-01-10 [1] CRAN (R 4.3.1)
 rlang            1.1.5      2025-01-17 [1] CRAN (R 4.4.1)
 rmarkdown        2.29       2024-11-04 [1] CRAN (R 4.4.1)
 robotstxt      * 0.7.13     2020-09-03 [1] CRAN (R 4.3.0)
 rprojroot        2.0.4      2023-11-05 [1] CRAN (R 4.3.1)
 rvest          * 1.0.4      2024-02-12 [1] CRAN (R 4.3.1)
 scales           1.3.0.9000 2024-11-14 [1] Github (r-lib/scales@ee03582)
 sessioninfo      1.2.2      2021-12-06 [1] CRAN (R 4.3.0)
 stringi          1.8.4      2024-05-06 [1] CRAN (R 4.3.1)
 stringr        * 1.5.1      2023-11-14 [1] CRAN (R 4.3.1)
 tibble         * 3.2.1      2023-03-20 [1] CRAN (R 4.3.0)
 tidyr          * 1.3.1      2024-01-24 [1] CRAN (R 4.3.1)
 tidyselect       1.2.1      2024-03-11 [1] CRAN (R 4.3.1)
 tidyverse      * 2.0.0      2023-02-22 [1] CRAN (R 4.3.0)
 timechange       0.3.0      2024-01-18 [1] CRAN (R 4.3.1)
 tzdb             0.4.0      2023-05-12 [1] CRAN (R 4.3.0)
 utf8             1.2.4      2023-10-22 [1] CRAN (R 4.3.1)
 vctrs            0.6.5      2023-12-01 [1] CRAN (R 4.3.1)
 vroom            1.6.5      2023-12-05 [1] CRAN (R 4.3.1)
 websocket        1.4.1      2021-08-18 [1] CRAN (R 4.3.0)
 withr            3.0.2      2024-10-28 [1] CRAN (R 4.4.1)
 xfun             0.50.5     2025-01-15 [1] https://yihui.r-universe.dev (R 4.4.2)
 xml2             1.3.6      2023-12-04 [1] CRAN (R 4.3.1)
 yaml             2.3.10     2024-07-26 [1] CRAN (R 4.4.0)

 [1] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library

──────────────────────────────────────────────────────────────────────────────