Lecture Notes: Data Wrangling Part II

Demonstration

We will continue with Worksheet A-2.

library(tidyverse)
library(gapminder)
gapminder
## # A tibble: 1,704 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # … with 1,694 more rows

Demo 1

Compute a weighted mean of life expectancy for different years, weighting by population.

gapminder %>% 
  group_by(year) %>% 
  mutate(wmean = sum(lifeExp * pop) / sum(pop))
## # A tibble: 1,704 x 7
## # Groups:   year [12]
##    country     continent  year lifeExp      pop gdpPercap wmean
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl> <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.  48.9
##  2 Afghanistan Asia       1957    30.3  9240934      821.  52.1
##  3 Afghanistan Asia       1962    32.0 10267083      853.  52.3
##  4 Afghanistan Asia       1967    34.0 11537966      836.  57.0
##  5 Afghanistan Asia       1972    36.1 13079460      740.  59.5
##  6 Afghanistan Asia       1977    38.4 14880372      786.  61.2
##  7 Afghanistan Asia       1982    39.9 12881816      978.  62.9
##  8 Afghanistan Asia       1987    40.8 13867957      852.  64.4
##  9 Afghanistan Asia       1992    41.7 16317921      649.  65.6
## 10 Afghanistan Asia       1997    41.8 22227415      635.  66.8
## # … with 1,694 more rows

What if I wanted those values repeated for each line?

Demo 2

For each continent and year, how many countries have a “low” life expectancy? Let’s say life expectancy is “low” if it’s less than the median most recent life expectancy for each continent.

v <- c(4, 7, 2, 4, 5, 9)
v[v < 6]
## [1] 4 2 4 5
v < 6
## [1]  TRUE FALSE  TRUE  TRUE  TRUE FALSE
v[1:2]
## [1] 4 7
gapminder %>% 
  group_by(continent) %>% 
  mutate(median = median(lifeExp[year == 2007])) %>% 
  filter(lifeExp < median) %>% 
  count(continent, year)
## # A tibble: 60 x 3
## # Groups:   continent [5]
##    continent  year     n
##    <fct>     <int> <int>
##  1 Africa     1952    52
##  2 Africa     1957    50
##  3 Africa     1962    50
##  4 Africa     1967    47
##  5 Africa     1972    41
##  6 Africa     1977    38
##  7 Africa     1982    29
##  8 Africa     1987    28
##  9 Africa     1992    27
## 10 Africa     1997    26
## # … with 50 more rows
  # group_by(continent, year) %>% 
  # summarise(n = n())

gapminder %>% 
  mutate(asia = continent == "Asia")
## # A tibble: 1,704 x 7
##    country     continent  year lifeExp      pop gdpPercap asia 
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl> <lgl>
##  1 Afghanistan Asia       1952    28.8  8425333      779. TRUE 
##  2 Afghanistan Asia       1957    30.3  9240934      821. TRUE 
##  3 Afghanistan Asia       1962    32.0 10267083      853. TRUE 
##  4 Afghanistan Asia       1967    34.0 11537966      836. TRUE 
##  5 Afghanistan Asia       1972    36.1 13079460      740. TRUE 
##  6 Afghanistan Asia       1977    38.4 14880372      786. TRUE 
##  7 Afghanistan Asia       1982    39.9 12881816      978. TRUE 
##  8 Afghanistan Asia       1987    40.8 13867957      852. TRUE 
##  9 Afghanistan Asia       1992    41.7 16317921      649. TRUE 
## 10 Afghanistan Asia       1997    41.8 22227415      635. TRUE 
## # … with 1,694 more rows

Demo 3

Get the maximum and minimum gdpPercap for all continents.

gapminder %>% 
  group_by(continent) %>% 
  summarise(max_g = max(gdpPercap),
            min_g = min(gdpPercap)) %>% 
  group_by(continent == "Asia") %>% 
  summarise(max2 = max(max_g))
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
##   `continent == "Asia"`    max2
##   <lgl>                   <dbl>
## 1 FALSE                  49357.
## 2 TRUE                  113523.

Demo 4

Filter gapminder to all entries that have experienced a drop in life expectancy. Hint: dplyr::lag().

v <- 1:4
v
## [1] 1 2 3 4
dplyr::lag(v)
## [1] NA  1  2  3
gapminder %>% 
  group_by(country) %>% 
  arrange(year) %>% 
  mutate(inc = lifeExp - lag(lifeExp)) %>% 
  filter(inc < 0)
## # A tibble: 102 x 7
## # Groups:   country [52]
##    country         continent  year lifeExp       pop gdpPercap     inc
##    <fct>           <fct>     <int>   <dbl>     <int>     <dbl>   <dbl>
##  1 China           Asia       1962    44.5 665770000      488. -6.05  
##  2 Cambodia        Asia       1972    40.3   7450606      422. -5.10  
##  3 Czech Republic  Europe     1972    70.3   9862158    13108. -0.0900
##  4 Netherlands     Europe     1972    73.8  13329874    18795. -0.0700
##  5 Slovak Republic Europe     1972    70.4   4593433     9674. -0.63  
##  6 Bulgaria        Europe     1977    70.8   8797022     7612. -0.09  
##  7 Cambodia        Asia       1977    31.2   6978607      525. -9.10  
##  8 El Salvador     Americas   1977    56.7   4282586     5139. -1.51  
##  9 Poland          Europe     1977    70.7  34621254     9508. -0.180 
## 10 Uganda          Africa     1977    50.4  11457758      844. -0.666 
## # … with 92 more rows

Demo 5

Filter gapminder to contain six rows: the rows with the three largest GDP per capita, and the rows with the three smallest GDP per capita. Don’t make any excerpts.

gap1 <- filter(gapminder, continent == "Europe")
gap2 <- summarise(gap1, mean_gdp = mean(gdpPercap))
gap2
## # A tibble: 1 x 1
##   mean_gdp
##      <dbl>
## 1   14469.
nrow(gapminder)
## [1] 1704
gapminder %>% 
  arrange(gdpPercap) %>% 
  slice(c(1   , 2   , 3, 1702, 1703 ,1704)
    #c(1:3, (n() - 2):n())
  )
## # A tibble: 6 x 6
##   country          continent  year lifeExp      pop gdpPercap
##   <fct>            <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Congo, Dem. Rep. Africa     2002    45.0 55379852      241.
## 2 Congo, Dem. Rep. Africa     2007    46.5 64606759      278.
## 3 Lesotho          Africa     1952    42.1   748747      299.
## 4 Kuwait           Asia       1952    55.6   160000   108382.
## 5 Kuwait           Asia       1972    67.7   841934   109348.
## 6 Kuwait           Asia       1957    58.0   212846   113523.
c(1:3, (1704 - 2):1704)
## [1]    1    2    3 1702 1703 1704
c(1:3, 1702:1704)
## [1]    1    2    3 1702 1703 1704
gapminder %>% 
  group_by(continent) %>% 
  mutate(n_function = n())
## # A tibble: 1,704 x 7
## # Groups:   continent [5]
##    country     continent  year lifeExp      pop gdpPercap n_function
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>      <int>
##  1 Afghanistan Asia       1952    28.8  8425333      779.        396
##  2 Afghanistan Asia       1957    30.3  9240934      821.        396
##  3 Afghanistan Asia       1962    32.0 10267083      853.        396
##  4 Afghanistan Asia       1967    34.0 11537966      836.        396
##  5 Afghanistan Asia       1972    36.1 13079460      740.        396
##  6 Afghanistan Asia       1977    38.4 14880372      786.        396
##  7 Afghanistan Asia       1982    39.9 12881816      978.        396
##  8 Afghanistan Asia       1987    40.8 13867957      852.        396
##  9 Afghanistan Asia       1992    41.7 16317921      649.        396
## 10 Afghanistan Asia       1997    41.8 22227415      635.        396
## # … with 1,694 more rows
gapminder %>% 
  mutate(decade = floor(year / 10)) %>% 
  group_by(country, continent, decade) %>% 
  summarise(across(where(is.numeric), mean)) %>% 
  select(-year)
## `summarise()` regrouping output by 'country', 'continent' (override with `.groups` argument)
## # A tibble: 852 x 6
## # Groups:   country, continent [142]
##    country     continent decade lifeExp       pop gdpPercap
##    <fct>       <fct>      <dbl>   <dbl>     <dbl>     <dbl>
##  1 Afghanistan Asia         195    29.6  8833134.      800.
##  2 Afghanistan Asia         196    33.0 10902524.      845.
##  3 Afghanistan Asia         197    37.3 13979916       763.
##  4 Afghanistan Asia         198    40.3 13374886.      915.
##  5 Afghanistan Asia         199    41.7 19272668       642.
##  6 Afghanistan Asia         200    43.0 28579164       851.
##  7 Albania     Europe       195    57.3  1379601      1772.
##  8 Albania     Europe       196    65.5  1856098.     2537.
##  9 Albania     Europe       197    68.3  2386301      3423.
## 10 Albania     Europe       198    71.2  2927709      3685.
## # … with 842 more rows