Calendar Heatmaps

Source code can be obtained here

We’ll use an attacker data with 200,000 “events”. It has the timestamp, and the source_country ISO 3166-1 alpha-2 country code (which is the source of the attack) plus the tz time zone of the source IP address. Let’s have a look:

library(dplyr)
library(scales)
library(viridis)
library(ggplot2)
library(ggthemes)
library(gridExtra)
library(lubridate)
library(data.table)
setwd('/Users/ethen/Business-Analytics/articles/calendar_heatmaps')

attacks <- fread("data/eventlog.csv")
knitr::kable( head(attacks) )
timestamp source_country tz
2015-03-12T15:59:16.718901Z CN Asia/Shanghai
2015-03-12T16:00:48.841746Z FR Europe/Paris
2015-03-12T16:02:26.731256Z CN Asia/Shanghai
2015-03-12T16:02:38.469907Z US America/Chicago
2015-03-12T16:03:22.201903Z CN Asia/Shanghai
2015-03-12T16:03:45.984616Z CN Asia/Shanghai

A quick exploratory data analysis on this attacker data is what were the “working hours” by country, and we can visualize this with a calendar heatmap. To create the heatmap, we’ll need the weekday and hour of each event, or as granular as you want to get.

make_hr_wkday <- function(ts, sc, tz) {
    # convert each time with the appropriate timezone,
    # the timezone parameter, tz, only takes a single value,
    # then extract its weekdays and hour
    real_times <- ymd_hms( ts, tz = tz[1], quiet = TRUE )
    dt <- data.table( source_country = sc,
                      wkday = weekdays(real_times),
                      hour = hour(real_times) )
    return(dt)
}

# convert weekday and hour into factor so they'll be ordered when plotting
wkday_levels <- c('Sunday', 'Monday', 'Tuesday', 'Wednesday', 
                  'Thursday', 'Friday', 'Saturday')
attacks <- attacks %>% 
           group_by(tz) %>%
           do( make_hr_wkday( .$timestamp, .$source_country, .$tz ) ) %>% 
           ungroup() %>% 
           mutate( wkday = factor(wkday, levels = wkday_levels),
                   hour  = factor(hour, levels = 0:23) )

knitr::kable( head(attacks) )
tz source_country wkday hour
Africa/Cairo BG Saturday 22
Africa/Cairo TW Sunday 8
Africa/Cairo TW Sunday 10
Africa/Cairo CN Sunday 13
Africa/Cairo US Sunday 17
Africa/Cairo CA Monday 13

Then we can simply group the count by hour and wkday and plot it, since we know that we have values for every combination there’s no need to further preprocess the data.

grouped <- attacks %>% count(wkday, hour) %>% ungroup()

ggplot( grouped, aes(hour, wkday, fill = n) ) + 
geom_tile(color = "white", size = 0.1) + 
theme_tufte(base_family = "Helvetica") + 
coord_equal() + 
scale_fill_viridis(name = "# of Events", label = comma) + 
labs(x = NULL, y = NULL, title = "Events per weekday & time of day") +
theme( axis.ticks = element_blank(),
       plot.title = element_text(hjust = 0.5),
       legend.title = element_text(size = 8),
       legend.text = element_text(size = 6) )

Some ggplot2 explanation:

That’s great, but what if we wanted the heatmap breakdown by country? We’ll can do this in two ways, first with each country’s heatmap using the same scale (using facet), then with each one using it’s own scale (using grid.arrange).

events_by_country <- count( attacks, source_country ) %>% 
                     mutate( percent = percent( n / sum(n) ) ) %>%
                     arrange( desc(n) )
head(events_by_country)
## # A tibble: 6 × 3
##   source_country     n percent
##            <chr> <int>   <chr>
## 1             CN 85243   42.6%
## 2             US 48684   24.3%
## 3             KR 12648    6.3%
## 4             NL  8572    4.3%
## 5             VN  6340    3.2%
## 6             TW  3469    1.7%
top_country <- events_by_country$source_country[1:4]
top_country_attacks <- attacks %>%
                       filter( source_country %in% top_country ) %>% 
                       count( source_country, wkday, hour ) %>% 
                       ungroup() %>% 
                       mutate( source_country = factor( source_country, levels = top_country ) )


gg <- ggplot( top_country_attacks, aes(x = hour, y = wkday, fill = n) ) + 
      geom_tile(color = "white", size = 0.1) +
      scale_fill_viridis(name = "# Events") + 
      coord_equal() + 
      facet_wrap( ~source_country, ncol = 2 ) +
      labs(x = NULL, y = NULL, title = "Events per weekday & time of day by country\n") + 
      theme_tufte(base_family = "Helvetica") + 
      theme( axis.ticks = element_blank(),
             axis.text = element_text(size = 8),
             panel.border = element_blank(),
             plot.title = element_text(hjust = 0.5),
             strip.text = element_text(hjust = 0.5),
             panel.margin = unit(0.1, "cm"),
             legend.position = "bottom",
             legend.title = element_text(size = 8),
             legend.text = element_text(size = 6),
             legend.key.size = unit(0.4, "cm"),
             legend.key.width = unit(1, "cm") )
gg

plots <- lapply(top_country, function(x) {

    subset_data <- top_country_attacks %>% filter(source_country == x)
    gg <- ggplot( subset_data, aes(x = hour, y = wkday, fill = n) ) + 
          geom_tile(color = "white", size = 0.1) +
          scale_fill_viridis(name = "# Events") + 
          scale_y_discrete( expand = c(0, 0) ) +
          coord_equal() + 
          labs(x = NULL, y = NULL, title = x) + 
          theme_tufte(base_family = "Helvetica") + 
          theme( axis.ticks = element_blank(),
                 axis.text = element_text(size = 7),
                 panel.border = element_blank(),
                 plot.title = element_text(hjust = 0.5),
                 strip.text = element_text(hjust = 0.5),
                 panel.margin = unit(0.1, "cm"),
                 legend.position = "bottom",
                 legend.title = element_text(size = 6),
                 legend.text = element_text(size = 6),
                 legend.key.size = unit(0.4, "cm") )
    return(gg)
})

# specify the additional arguments to grid.arrange
# by adding it to the list that's going to be do.called
plots[['ncol']] = 2
do.call( grid.arrange, plots )

This type of calendar heatmap can also be used to detect patterns in IT support logs, the chart will then show the number of tickets raised (day-wise). We can then detect patterns to see commonly raised issues and figure out ways to address it.

R Session Information

devtools::session_info()
##  setting  value                       
##  version  R version 3.2.4 (2016-03-10)
##  system   x86_64, darwin13.4.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  tz       America/Chicago             
##  date     2016-12-28                  
## 
##  package    * version date       source        
##  assertthat   0.1     2013-12-06 CRAN (R 3.2.0)
##  bookdown     0.1     2016-07-13 CRAN (R 3.2.5)
##  colorspace   1.2-6   2015-03-11 CRAN (R 3.2.0)
##  data.table * 1.10.0  2016-12-03 CRAN (R 3.2.5)
##  DBI          0.4-1   2016-05-08 CRAN (R 3.2.5)
##  devtools     1.12.0  2016-06-24 CRAN (R 3.2.5)
##  digest       0.6.9   2016-01-08 CRAN (R 3.2.3)
##  dplyr      * 0.5.0   2016-06-24 CRAN (R 3.2.5)
##  evaluate     0.9     2016-04-29 cran (@0.9)   
##  formatR      1.4     2016-05-09 cran (@1.4)   
##  ggplot2    * 2.2.0   2016-11-11 CRAN (R 3.2.5)
##  ggthemes   * 3.0.3   2016-04-09 CRAN (R 3.2.4)
##  gridExtra  * 2.2.1   2016-02-29 CRAN (R 3.2.4)
##  gtable       0.2.0   2016-02-26 CRAN (R 3.2.3)
##  highr        0.6     2016-05-09 cran (@0.6)   
##  htmltools    0.3.5   2016-03-21 CRAN (R 3.2.4)
##  httpuv       1.3.3   2015-08-04 CRAN (R 3.2.0)
##  knitr        1.14    2016-08-13 CRAN (R 3.2.4)
##  labeling     0.3     2014-08-23 CRAN (R 3.2.0)
##  lazyeval     0.2.0   2016-06-12 CRAN (R 3.2.5)
##  lubridate  * 1.5.6   2016-04-06 CRAN (R 3.2.4)
##  magrittr     1.5     2014-11-22 CRAN (R 3.2.0)
##  memoise      1.0.0   2016-01-29 CRAN (R 3.2.3)
##  mime         0.4     2015-09-03 CRAN (R 3.2.0)
##  miniUI       0.1.1   2016-01-15 CRAN (R 3.2.3)
##  munsell      0.4.3   2016-02-13 CRAN (R 3.2.3)
##  plyr         1.8.4   2016-06-08 cran (@1.8.4) 
##  questionr    0.5     2016-03-15 CRAN (R 3.2.4)
##  R6           2.1.2   2016-01-26 CRAN (R 3.2.3)
##  Rcpp         0.12.5  2016-05-14 cran (@0.12.5)
##  rmarkdown    1.1     2016-10-16 CRAN (R 3.2.4)
##  rmdformats   0.3     2016-09-05 CRAN (R 3.2.5)
##  rstudioapi   0.6     2016-06-27 CRAN (R 3.2.5)
##  scales     * 0.4.1   2016-11-09 CRAN (R 3.2.5)
##  shiny        0.13.2  2016-03-28 CRAN (R 3.2.4)
##  stringi      1.0-1   2015-10-22 CRAN (R 3.2.0)
##  stringr      1.0.0   2015-04-30 CRAN (R 3.2.0)
##  tibble       1.2     2016-08-26 CRAN (R 3.2.5)
##  viridis    * 0.3.4   2016-03-12 CRAN (R 3.2.4)
##  withr        1.0.1   2016-02-04 CRAN (R 3.2.3)
##  xtable       1.8-2   2016-02-05 CRAN (R 3.2.3)
##  yaml         2.1.13  2014-06-12 CRAN (R 3.2.0)

Ethen Liu

2016-12-28