Scatter Plot

This document focuses solely on creating a production-ready scatter plot, without going into the interpretation of the data. The source code can be obtained here.

library(tidyr)
library(dplyr)
library(readr)
library(scales)
library(ggplot2) 
# NOTE(review): absolute, machine-specific path. The relative data file name
# used below resolves against this directory, so adjust it before running the
# script on another machine.
setwd("/Users/ethen/Business-Analytics/articles/nyt_scatter")


# Local copy of the NYT county-level vote data; it is fetched only when the
# file is not already present in the working directory.
file <- "nytimes_vote.tsv"
if (!file.exists(file)) {
  url <- "https://static01.nyt.com/newsgraphics/2016/04/21/undervote/ad8bd3e44231c1091e75621b9f27fe31d116999f/data.tsv"
  download.file(url = url, destfile = file)
}

# Read the raw file and reshape it into one row per county (fips) and
# candidate:
#   1. give the three share columns human-readable names
#   2. drop the total vote count and stack the share columns into a
#      party / pct pair, keeping ratio and fips as identifiers
#   3. sort by county, then turn party into a factor with a fixed level order
df <- read_tsv(file) %>%
  rename(
    "Someone else" = undervt,
    "Hilary Clinton" = clintonpct,
    "Bernie Sanders" = sanderspct
  ) %>%
  select(-tvotes) %>%
  gather(party, pct, -ratio, -fips) %>%
  arrange(fips, ratio) %>%
  mutate(
    party = factor(
      party,
      levels = c("Hilary Clinton", "Bernie Sanders", "Someone else")
    )
  )
df
## # A tibble: 1,461 × 4
##     fips     ratio          party        pct
##    <int>     <dbl>         <fctr>      <dbl>
## 1   4001 0.8504684 Hilary Clinton 0.66388184
## 2   4001 0.8504684 Bernie Sanders 0.28837834
## 3   4001 0.8504684   Someone else 0.04773982
## 4   4003 0.7078563 Hilary Clinton 0.56740814
## 5   4003 0.7078563 Bernie Sanders 0.39412560
## 6   4003 0.7078563   Someone else 0.03846626
## 7   4005 0.6586770 Hilary Clinton 0.44145253
## 8   4005 0.6586770 Bernie Sanders 0.53400523
## 9   4005 0.6586770   Someone else 0.02454224
## 10  4007 0.9009450 Hilary Clinton 0.59508695
## # ... with 1,451 more rows
# Styling notes:
# 1. shape = 21 is a circle with separate fill and outline colours, so the
#    candidate is mapped to `fill` while the outline stays a constant white
# 2. scale_fill_manual( name = "" ) gets rid of the legend title
# 3. theme's legend.key controls the boxes around the legend's shape
fill_color <- c(
    "Hilary Clinton" = "#5fa0d6",
    "Bernie Sanders" = "#83BC57",
    "Someone else" = "#d65454"
)

# The values of `party` spell the first candidate "Hilary". The keys are left
# untouched so they keep matching the data; only the text shown in the legend
# is corrected to "Hillary Clinton".
legend_label <- c(
    "Hilary Clinton" = "Hillary Clinton",
    "Bernie Sanders" = "Bernie Sanders",
    "Someone else" = "Someone else"
)

ggplot( df, aes(x = ratio, y = pct) ) +
    geom_point( aes(fill = party), size = 3, alpha = 0.8, color = "white", shape = 21 ) +
    scale_fill_manual( name = "", values = fill_color,
                       breaks = names(legend_label),
                       labels = unname(legend_label) ) +
    theme_bw( base_family = "Arial Narrow" ) +
    # `labels` is spelled out in full instead of relying on partial argument
    # matching of `label`
    scale_y_continuous( labels = percent, limits = c(0, 1.05) ) +
    scale_x_continuous( limits = c(0, 4.5), breaks = seq(0, 4.5, 0.5) ) +
    # the y axis title is drawn as an annotation in the top-left corner of the
    # panel (the regular y title is switched off in labs() below)
    geom_text( data = data.frame(label = "↑ Share of 2016 primary vote"),
               aes(x = 0, y = 1, label = label), vjust = -1, hjust = 0, size = 3,
               fontface = "bold", family = "Arial Narrow" ) +
    labs( x = "Ratio of registered Democrats to Obama voters →",
          y = NULL, title = "The Kinds of Places Sanders Beats Clinton",
          subtitle = "Each dot on this chart represents the share of a county's vote for a candidate in the 2016 Democratic primary" ) +
    theme( legend.key = element_blank(),
           legend.position = "top",
           plot.title = element_text(face = "bold"),
           axis.ticks = element_blank(),
           axis.text = element_text(size = 8),
           axis.title.x = element_text(hjust = 1, face = "bold", size = 9),
           panel.grid.minor = element_blank(),
           panel.grid.major = element_line(linetype = "dotted", size = 0.5),
           panel.border = element_blank(),
           plot.margin = margin(t = 10, r = 10, b = 10, l = 10) )

R Session Information

# Print the R version, platform settings and package versions of the current
# session.
devtools::session_info()
## Session info --------------------------------------------------------------
##  setting  value                       
##  version  R version 3.2.4 (2016-03-10)
##  system   x86_64, darwin13.4.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  tz       America/Chicago             
##  date     2016-12-28
## Packages ------------------------------------------------------------------
##  package    * version date       source        
##  assertthat   0.1     2013-12-06 CRAN (R 3.2.0)
##  bookdown     0.1     2016-07-13 CRAN (R 3.2.5)
##  colorspace   1.2-6   2015-03-11 CRAN (R 3.2.0)
##  DBI          0.4-1   2016-05-08 CRAN (R 3.2.5)
##  devtools     1.12.0  2016-06-24 CRAN (R 3.2.5)
##  digest       0.6.9   2016-01-08 CRAN (R 3.2.3)
##  dplyr      * 0.5.0   2016-06-24 CRAN (R 3.2.5)
##  evaluate     0.9     2016-04-29 cran (@0.9)   
##  formatR      1.4     2016-05-09 cran (@1.4)   
##  ggplot2    * 2.2.0   2016-11-11 CRAN (R 3.2.5)
##  gtable       0.2.0   2016-02-26 CRAN (R 3.2.3)
##  highr        0.6     2016-05-09 cran (@0.6)   
##  htmltools    0.3.5   2016-03-21 CRAN (R 3.2.4)
##  httpuv       1.3.3   2015-08-04 CRAN (R 3.2.0)
##  knitr        1.14    2016-08-13 CRAN (R 3.2.4)
##  labeling     0.3     2014-08-23 CRAN (R 3.2.0)
##  lazyeval     0.2.0   2016-06-12 CRAN (R 3.2.5)
##  magrittr     1.5     2014-11-22 CRAN (R 3.2.0)
##  memoise      1.0.0   2016-01-29 CRAN (R 3.2.3)
##  mime         0.4     2015-09-03 CRAN (R 3.2.0)
##  miniUI       0.1.1   2016-01-15 CRAN (R 3.2.3)
##  munsell      0.4.3   2016-02-13 CRAN (R 3.2.3)
##  plyr         1.8.4   2016-06-08 cran (@1.8.4) 
##  questionr    0.5     2016-03-15 CRAN (R 3.2.4)
##  R6           2.1.2   2016-01-26 CRAN (R 3.2.3)
##  Rcpp         0.12.5  2016-05-14 cran (@0.12.5)
##  readr      * 0.2.2   2015-10-22 CRAN (R 3.2.0)
##  rmarkdown    1.1     2016-10-16 CRAN (R 3.2.4)
##  rmdformats   0.3     2016-09-05 CRAN (R 3.2.5)
##  rstudioapi   0.6     2016-06-27 CRAN (R 3.2.5)
##  scales     * 0.4.1   2016-11-09 CRAN (R 3.2.5)
##  shiny        0.13.2  2016-03-28 CRAN (R 3.2.4)
##  stringi      1.0-1   2015-10-22 CRAN (R 3.2.0)
##  stringr      1.0.0   2015-04-30 CRAN (R 3.2.0)
##  tibble       1.2     2016-08-26 CRAN (R 3.2.5)
##  tidyr      * 0.5.1   2016-06-14 CRAN (R 3.2.5)
##  withr        1.0.1   2016-02-04 CRAN (R 3.2.3)
##  xtable       1.8-2   2016-02-05 CRAN (R 3.2.3)
##  yaml         2.1.13  2014-06-12 CRAN (R 3.2.0)

Reference

Ethen Liu

2016-12-28