Visualizing population growth rate

Posted by Hai Vo on Tuesday, April 19, 2022

Get the data

The data comes from the General Statistics Office of Vietnam (GSO). To access it easily from R, I built a simple package, gsovn, that scrapes the data from the GSO website.

library(tidyverse)
# Read the dataset behind the 30th link listed in gsovn::avail_dataset; the
# table itself is stored in the `data` element of the result.
data <- gsovn::gso_read(gsovn::avail_dataset$link[30])

# Preview the first ten rows as a formatted table.
knitr::kable(head(data$data, 10))
2005 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 Prel. 2020
WHOLE COUNTRY 1.17 1.09 1.07 1.06 1.07 1.05 1.08 1.07 1.08 1.12 1.11 1.11 1.17 1.15 1.14
Red River Delta 0.90 0.63 1.28 0.74 1.19 1.08 1.04 1.02 1.09 1.37 1.41 1.38 1.47 1.48 1.33
Ha Noi 2.02 1.37 NA 1.41 2.50 1.93 1.54 1.63 1.70 2.03 2.11 1.99 2.23 2.27 1.89
Ha Tay 2.03 1.14 NA NA NA NA NA NA NA NA NA NA NA NA NA
Vinh Phuc 1.03 0.69 NA 0.66 0.72 0.38 1.09 0.68 1.22 1.26 1.45 1.54 1.36 1.45 1.42
Bac Ninh 0.80 0.95 0.87 0.82 1.73 1.84 2.10 2.06 2.08 3.52 3.23 3.17 3.05 3.08 2.94
Quang Ninh 1.33 1.19 1.12 0.97 0.97 0.93 0.83 0.82 1.01 1.13 1.58 1.90 1.46 1.61 0.96
Hai Duong 0.30 0.30 0.36 0.35 0.56 0.78 0.69 0.58 0.65 0.95 1.11 0.97 1.46 1.02 1.05
Hai Phong 0.89 0.98 0.98 0.89 0.94 1.19 1.29 1.11 1.08 0.96 0.80 0.81 0.75 0.83 1.00
Hung Yen 0.60 0.43 0.44 0.21 0.33 0.54 0.50 0.66 0.56 1.12 1.12 1.10 1.10 1.08 1.06

This dataset contains the population growth rate (in percent) in Vietnam. The data is in fairly good shape, but it still needs some cleaning.

Cleaning

First, I need to remove extra whitespace from the province/city names.

# Name the first column `region` and collapse stray whitespace in its values.
df <- data$data %>%
  rename(region = 1) %>%
  mutate(region = str_squish(region))

Next, I group the provinces/cities by the region they belong to.

# Labels of the aggregate rows: the national total and the six regions.
region <- c(
  "WHOLE COUNTRY",
  "Northern midlands and mountain areas",
  "Red River Delta",
  "Northern Central area and Central coastal area",
  "Central Highlands",
  "South East",
  "Mekong River Delta"
)

# Alternation pattern that matches any of the aggregate labels.
region_regex <- str_c("(", str_c(region, collapse = "|"), ")")

# Every aggregate row opens a new block of rows. The running count of
# aggregate rows identifies the block, and each row is then tagged with the
# label of the first row of its block.
df <- df %>%
  mutate(across(!region, as.numeric)) %>%
  group_by(group = cumsum(str_detect(.data$region, .env$region_regex))) %>%
  mutate(group = first(region)) %>%
  ungroup()

Then I pivot the data to the long format.

# Reshape to long format: every column except the two label columns is turned
# into a (year, pop_growth_rate) pair.
df_long <- pivot_longer(
  df,
  cols = !c(region, group),
  names_to = "year",
  values_to = "pop_growth_rate"
)

# Preview the first five rows as a formatted table.
knitr::kable(head(df_long, 5))
region group year pop_growth_rate
WHOLE COUNTRY WHOLE COUNTRY 2005 1.17
WHOLE COUNTRY WHOLE COUNTRY 2007 1.09
WHOLE COUNTRY WHOLE COUNTRY 2008 1.07
WHOLE COUNTRY WHOLE COUNTRY 2009 1.06
WHOLE COUNTRY WHOLE COUNTRY 2010 1.07

Prepare data for visualization

I want the audience to know which provinces/cities belong to which region, so I need map data of Vietnam split by region. This is quite a common task for me, hence I saved the map in my personal package {haitools}. Saving the data this way allows me to load it even when I am not working on my own computer.

library(sf)
library(ggrepel)
# Use the light theme as the default for the plots that follow.
theme_set(theme_light())

# Map data shipped with the haitools package.
vn_sf <- haitools::small_vnsf

# Fix the spelling of Thua Thien Hue and Ho Chi Minh City
# (presumably to match the province names used in the map data).
df_long <- df_long %>%
  mutate(
    region = if_else(region == "Thua Thien-Hue", "Thua Thien Hue", region),
    region = if_else(region == "Ho Chi Minh city", "Ho Chi Minh", region)
  )

# For each geometry, take the element at position [[1]][[1]] and average its
# columns. The two means are stored as x/y coordinates, presumably as anchor
# points for province labels.
a <- map(vn_sf$geometry, function(geom) colMeans(chuck(geom, 1, 1)))
coord_x <- map_dbl(a, 1)
coord_y <- map_dbl(a, 2)

vn_sf <- mutate(vn_sf, coord_x = coord_x, coord_y = coord_y)

Vietnam map by region

# Map of the provinces, filled by the region each one belongs to.
p1 <- df_long %>%
  distinct(region, group) %>%
  # Order the fill levels like the `region` vector, minus the national total.
  mutate(group = fct_relevel(group, .env$region[-1])) %>%
  # Keep only rows whose name matches a province in the map data.
  inner_join(vn_sf, by = c("region" = "province")) %>%
  ggplot(aes(geometry = geometry, fill = group)) +
  geom_sf(colour = "white", alpha = 0.5) +
  theme_void() +
  theme(
    legend.position = c(0.4, 0.35),
    text = element_text(size = 16)
  ) +
  labs(fill = "") +
  guides(fill = guide_legend(ncol = 1))
# Optional province labels (currently disabled):
# geom_text_repel(aes(x = coord_x, y = coord_y, label = region),
#                 max.overlaps = 20, size = 3)
p1

Next, I visualize the distribution of the growth rate for each region. There is an interesting package, {ggdist}, which draws half-eye plots of a distribution, as an alternative to the typical box plot or violin plot.

# Number of provinces per region, used for the "(n=...)" facet labels.
# Only provinces that actually report a 2019 value are counted: a unit with
# no value in that year (e.g. Ha Tay, which has no data after 2007) would
# otherwise inflate the count of its region.
# `year` is a character column, so it is compared against the string "2019".
df_count_group <- df_long %>%
  filter(!region %in% .env$region) %>%
  filter(year == "2019", !is.na(pop_growth_rate)) %>%
  count(group)

# Province-level rows only, with the per-region count `n` attached and the
# regions ordered like the `region` vector (minus the national total).
df_by_group <- df_long %>%
  filter(!region %in% .env$region) %>%
  left_join(df_count_group, by = "group") %>%
  mutate(group = fct_relevel(group, .env$region[-1]))

# Lookup from region name to its facet label, "<region> (n=<count>)".
df_by_group_n <- df_by_group %>%
  distinct(group, n) %>%
  mutate(new_name = paste0(group, " (n=", n, ")"))
new_name <- setNames(df_by_group_n$new_name, df_by_group_n$group)

# Replace each factor level by its facet label.
df_by_group <- df_by_group %>%
  mutate(group = fct_relabel(group, function(lvl) new_name[lvl]))


# National series only.
df_whole <- filter(df_long, region == "WHOLE COUNTRY")

# Repeat every national row once per facet label so the national series can be
# drawn in each panel of the faceted plot.
df_whole2 <- df_whole %>%
  mutate(group = rep(list(unique(df_by_group$group)), n())) %>%
  unnest(group)
# Half-eye plots of the provincial growth rates along the year axis, one panel
# per region, with the national series overlaid as a dashed line.
p2 <- ggplot(
  df_by_group,
  aes(x = year, y = pop_growth_rate, group = group, fill = group)
) +
  ggdist::stat_halfeye(alpha = 0.7) +
  geom_line(
    data = df_whole2,
    mapping = aes(x = year, y = pop_growth_rate, group = region),
    linetype = 2,
    inherit.aes = FALSE
  ) +
  facet_wrap(vars(group), scales = "free", ncol = 2) +
  # Values are already in percent, so only the "%" suffix is added.
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  guides(fill = "none") +
  labs(x = NULL, y = NULL) +
  theme(
    text = element_text(size = 18),
    strip.text = element_text(colour = "#2d3443", size = 18),
    strip.background = element_rect(fill = "#9eba89"),
    # Optional grey backgrounds (currently disabled):
    # plot.background = element_rect(fill = "grey80"),
    # panel.background = (fill ="grey80"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

p2

Finally, I combine the two plots with {patchwork}.

library(patchwork)
library(ggtext)
# Combine the map and the distribution panels into one figure with a shared
# title and subtitle; both are rendered as markdown through {ggtext}.
p <- p1 + p2 + plot_annotation(
  theme = theme(
    plot.title = element_markdown(size = 22),
    plot.subtitle = element_markdown(size = 18)
  ),
  title = "Distribution of population growth rate group by area, from 2005 to 2020",
  subtitle = "**(n)** is number of provinces in each group <br/>
                  *Dash-line* is the average of whole country"
)

To save the image, use ggsave().

# Embed the image file in the rendered post (presumably the combined plot
# written out earlier with ggsave()).
knitr::include_graphics("Dist-of-pop-growth.png")