Daily Exercise 07

ESS 330 - Quantitative Reasoning

# 20 Feburary 2025
# ESS 330 - Daily Assignment 07
# This file is a copy of the R script for assignment 07 where the COVID-19 data from 
# assignment 06 is visualized.

## COVID-19 Data

### Data

# We are going to practice some data wrangling skills using a real-world 
# data set about COVID cases curated and maintained by the New York Times. The
# data are archived on a GitHub repo [here](https://github.com/nytimes/covid-19-data). 

#Prepare libraries
library(tidyverse)
Warning: package 'lubridate' was built under R version 4.4.3
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)

# Read-in and store NY-Times data
url <- 'https://raw.githubusercontent.com/nytimes/covid-19-data/refs/heads/master/us-states.csv'
covid <- read_csv(url)
Rows: 61942 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): state, fips
dbl  (2): cases, deaths
date (1): date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### Question 1

# Find the most recent date
max_date <- max(covid$date)
most_recent_data <- filter(covid, date == max_date)

# Get the 6 states with the most cases
top_6_states <- most_recent_data %>%
  arrange(desc(cases)) %>% 
  slice(1:6) %>% 
  pull(state)

# Filter data to the top 6 states
top_6_states_data <- covid %>% 
  filter(state %in% top_6_states) %>% 
  group_by(state, date) %>% 
  summarize(state_cases = sum(cases, na.rm = TRUE))
`summarise()` has grouped output by 'state'. You can override using the
`.groups` argument.
# Set up a gg plot
top_6_states_plot = ggplot(top_6_states_data, aes(x = date, y = state_cases, group = state, color = state)) +
  geom_line() +
  labs(
    title = "Cumulative Case Counts: COVID-19 Pandemic",
    caption = "Based on NY-Times COVID-19 Data.",
    x = "Date",
    y = "Number of Cases",
    color = "State"
  ) +
  facet_wrap(~state) + 
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), 
    legend.position = "none"
  ) + 
  scale_x_date(date_breaks = "8 month", date_labels = "%b %y")

# Echo plot to dev env
top_6_states_plot

# Save the top_6_states_plot as an image
#ggsave(top_6_states_plot, 
      # file = "images\\top_6_state_cases_plot.jpg", 
      # width = 10,
      # height = 6,
      # units = c("in"))

### Question 2

# Find daily total cases in the US
us_daily_cases_data <- covid %>% 
  group_by(date) %>% 
  summarize(us_cases = sum(cases, na.rm = TRUE))

# Plot us_daily_cases_data
us_daily_cases_plot = ggplot(us_daily_cases_data, aes(x = date, y = us_cases)) + 
  geom_col(color = "darkred") + 
  labs(
    title = "Cumulative Case Counts: US COVID-19 Pandemic",
    caption = "Based on NY-Times COVID-19 Data.",
    author = "Zachary Cramton",
    x = "Date",
    y = "Number of Cases") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_x_date(date_breaks = "8 month", date_labels = "%b %y")

# Echo the plot to dev env
us_daily_cases_plot

# Save the us_daily_plot as an image
#ggsave(us_daily_cases_plot,
      # file = "images\\us_daily_cases_plot.jpg",
      # width = 10,
      # height = 10,
      # units = c("in"))