vignettes/example-gallery-05-scatter-plots.Rmd
example-gallery-05-scatter-plots.Rmd
This document is adapted from the Scatter Plots section of the Altair Example Gallery.
Our first step is to set up our environment:
library("altair")
library("tibble")
library("dplyr")
library("purrr")
library("tidyr")
library("jsonlite")
vega_data <- import_vega_data()
glimpse(fromJSON(vega_data$movies$url))
#> Rows: 3,201
#> Columns: 16
#> $ Title <chr> "The Land Girls", "First Love, Last Rites", "I …
#> $ US_Gross <int> 146083, 10876, 203134, 373615, 1009819, 24551, …
#> $ Worldwide_Gross <dbl> 146083, 10876, 203134, 373615, 1087521, 2624551…
#> $ US_DVD_Sales <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ Production_Budget <int> 8000000, 300000, 250000, 300000, 1000000, 16000…
#> $ Release_Date <chr> "Jun 12 1998", "Aug 07 1998", "Aug 28 1998", "S…
#> $ MPAA_Rating <chr> "R", "R", NA, NA, "R", NA, "R", "R", "R", NA, N…
#> $ Running_Time_min <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ Distributor <chr> "Gramercy", "Strand", "Lionsgate", "Fine Line",…
#> $ Source <chr> NA, NA, NA, NA, "Original Screenplay", NA, NA, …
#> $ Major_Genre <chr> NA, "Drama", "Comedy", "Comedy", "Drama", NA, N…
#> $ Creative_Type <chr> NA, NA, NA, NA, "Contemporary Fiction", NA, NA,…
#> $ Director <chr> NA, NA, NA, NA, NA, NA, "Christopher Nolan", NA…
#> $ Rotten_Tomatoes_Rating <int> NA, NA, NA, 13, 62, NA, NA, NA, 25, 86, 81, 84,…
#> $ IMDB_Rating <dbl> 6.1, 6.9, 6.8, NA, 3.4, NA, 7.7, 3.8, 5.8, 7.0,…
#> $ IMDB_Votes <int> 1071, 207, 865, NA, 165, NA, 15133, 353, 3275, …
glimpse(vega_data$cars())
#> Rows: 406
#> Columns: 9
#> $ Name <chr> "chevrolet chevelle malibu", "buick skylark 320", "pl…
#> $ Miles_per_Gallon <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, NaN, NaN, NaN…
#> $ Cylinders <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8,…
#> $ Displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 133…
#> $ Horsepower <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 115…
#> $ Weight_in_lbs <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425,…
#> $ Acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8…
#> $ Year <dttm> 1970-01-01, 1970-01-01, 1970-01-01, 1970-01-01, 1970…
#> $ Origin <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA…
source <- vega_data$cars()
# Brush for selection
brush <- alt$selection(type = "interval")
# Scatter Plot
points <-
alt$Chart(source)$
mark_point()$
encode(
x = "Horsepower:Q",
y = "Miles_per_Gallon:Q",
color = alt$condition(brush, "Cylinders:O", alt$value("grey"))
)$add_selection(brush)
# Base chart for data tables
ranked_text <-
alt$Chart(source)$
mark_text()$
encode(
y = alt$Y('row_number:O', axis = NULL)
)$
transform_window(row_number = "row_number()")$
transform_filter(brush)$
transform_window(rank = "rank(row_number)")$
transform_filter("datum.rank<20")
# Data Tables
horsepower <-
ranked_text$
encode(
text = "Horsepower:N"
)$
properties(title = "Horsepower")
mpg <-
ranked_text$
encode(
text = "Miles_per_Gallon:N"
)$
properties(title = "MPG")
origin <-
ranked_text$
encode(
text = "Origin:N"
)$
properties(title = "Origin")
text <- (horsepower | mpg | origin) # Combine data tables
# Build chart
chart <- (points | text)$resolve_legend(color = "independent")
chart
glimpse(vega_data$cars())
#> Rows: 406
#> Columns: 9
#> $ Name <chr> "chevrolet chevelle malibu", "buick skylark 320", "pl…
#> $ Miles_per_Gallon <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, NaN, NaN, NaN…
#> $ Cylinders <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8,…
#> $ Displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 133…
#> $ Horsepower <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 115…
#> $ Weight_in_lbs <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425,…
#> $ Acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8…
#> $ Year <dttm> 1970-01-01, 1970-01-01, 1970-01-01, 1970-01-01, 1970…
#> $ Origin <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA…
This example shows how layering can be used to build a plot. This dataset tracks miles driven per capita along with gas prices annually from 1956 to 2010. It is based on the May 2, 2010 New York Times article ‘Driving Shifts Into Reverse’. See this reference.
glimpse(vega_data$driving())
#> Rows: 55
#> Columns: 4
#> $ side <chr> "left", "right", "bottom", "top", "right", "bottom", "right", "b…
#> $ year <dbl> 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966…
#> $ miles <dbl> 3675, 3706, 3766, 3905, 3935, 3977, 4085, 4218, 4369, 4538, 4676…
#> $ gas <dbl> 2.38, 2.40, 2.26, 2.31, 2.27, 2.25, 2.22, 2.12, 2.11, 2.14, 2.14…
lines <-
alt$Chart(vega_data$driving())$
mark_line()$
encode(
x = alt$X("miles", scale = alt$Scale(zero = FALSE)),
y = alt$Y("gas", scale = alt$Scale(zero = FALSE)),
order = "year"
)
points <-
alt$Chart(vega_data$driving())$
mark_circle()$
encode(
alt$X("miles", scale = alt$Scale(zero = FALSE)),
alt$Y("gas", scale = alt$Scale(zero = FALSE))
)
chart <- (lines + points)
chart
This example shows how to make a dot-dash plot presented in Edward Tufte’s book Visual Display of Quantitative Information on page 133. This example is based on g3o2’s block.
glimpse(vega_data$cars())
#> Rows: 406
#> Columns: 9
#> $ Name <chr> "chevrolet chevelle malibu", "buick skylark 320", "pl…
#> $ Miles_per_Gallon <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, NaN, NaN, NaN…
#> $ Cylinders <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8,…
#> $ Displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 133…
#> $ Horsepower <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 115…
#> $ Weight_in_lbs <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425,…
#> $ Acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8…
#> $ Year <dttm> 1970-01-01, 1970-01-01, 1970-01-01, 1970-01-01, 1970…
#> $ Origin <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA…
cars <- vega_data$cars()
brush <- alt$selection(type = "interval")
tick_axis <- alt$Axis(labels=FALSE, domain=FALSE, ticks=FALSE)
tick_axis_notitle <-
alt$Axis(labels=FALSE, domain=FALSE, ticks=FALSE, title="")
points <-
alt$Chart(cars)$
mark_point()$
encode(
x = alt$X("Miles_per_Gallon", axis = alt$Axis(title = "")),
y = alt$Y("Horsepower", axis = alt$Axis(title = "")),
color = alt$condition(brush, "Origin", alt$value("grey"))
)$
properties(selection = brush)
x_ticks <-
alt$Chart(cars)$
mark_tick()$
encode(
x = alt$X("Miles_per_Gallon", axis = tick_axis),
y = alt$Y("Origin", axis = tick_axis_notitle),
color = alt$condition(brush, "Origin", alt$value("lightgrey"))
)$
properties(selection = brush)
y_ticks <-
alt$Chart(cars)$
mark_tick()$
encode(
alt$X("Origin", axis = tick_axis_notitle),
alt$Y("Horsepower", axis = tick_axis),
color=alt$condition(brush, "Origin", alt$value("lightgrey"))
)$
properties(selection = brush)
chart <- (y_ticks | (points & x_ticks))
chart
We will use the cars dataset here.
glimpse(vega_data$cars())
#> Rows: 406
#> Columns: 9
#> $ Name <chr> "chevrolet chevelle malibu", "buick skylark 320", "pl…
#> $ Miles_per_Gallon <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, NaN, NaN, NaN…
#> $ Cylinders <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8,…
#> $ Displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 133…
#> $ Horsepower <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 115…
#> $ Weight_in_lbs <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425,…
#> $ Acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8…
#> $ Year <dttm> 1970-01-01, 1970-01-01, 1970-01-01, 1970-01-01, 1970…
#> $ Origin <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA…
n <- 40
data <-
tibble(
x = runif(n)**2,
y = 10 - 1 / (x + 0.1) + runif(n)
)
degree <- c(1, 3, 5)
model_poly <-
map(degree, ~lm(y ~ poly(x, degree = .x), data = data))
x_pred <- seq(min(data$x), max(data$x), length.out = 500)
data_predict <-
crossing(degree, x = x_pred) %>%
nest(x) %>%
mutate(y = map2(model_poly, data, predict)) %>%
unnest()
#> Warning: Supplying `...` without names was deprecated in tidyr 1.0.0.
#> ℹ Please specify a name for each selection.
#> ℹ Did you want `data = x`?
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
#> Warning: `cols` is now required when using `unnest()`.
#> ℹ Please use `cols = c(data, y)`.
glimpse(data)
#> Rows: 40
#> Columns: 2
#> $ x <dbl> 0.031205102, 0.015095396, 0.671742465, 0.259855934, 0.284727607, 0.8…
#> $ y <dbl> 2.9709110, 1.9634312, 9.4441738, 7.6829600, 7.5133925, 9.1575295, 9.…
glimpse(data_predict)
#> Rows: 1,500
#> Columns: 3
#> $ degree <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
#> $ x <dbl> 6.695829e-05, 2.008303e-03, 3.949647e-03, 5.890992e-03, 7.83233…
#> $ y <dbl> 3.583545, 3.600563, 3.617581, 3.634599, 3.651616, 3.668634, 3.6…
glimpse(vega_data$cars())
#> Rows: 406
#> Columns: 9
#> $ Name <chr> "chevrolet chevelle malibu", "buick skylark 320", "pl…
#> $ Miles_per_Gallon <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, NaN, NaN, NaN…
#> $ Cylinders <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8,…
#> $ Displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 133…
#> $ Horsepower <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 115…
#> $ Weight_in_lbs <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425,…
#> $ Acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8…
#> $ Year <dttm> 1970-01-01, 1970-01-01, 1970-01-01, 1970-01-01, 1970…
#> $ Origin <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA…
chart <-
alt$Chart(vega_data$cars())$
mark_circle()$
encode(
x = alt$X(alt$`repeat`("column"), type = "quantitative"),
y = alt$Y(alt$`repeat`("row"), type = "quantitative"),
color = "Origin:N"
)$
properties(width = 150, height = 150)$
`repeat`(
row = list("Horsepower", "Acceleration", "Miles_per_Gallon"),
column = list("Miles_per_Gallon", "Acceleration", "Horsepower")
)$
interactive()
chart
glimpse(vega_data$cars())
#> Rows: 406
#> Columns: 9
#> $ Name <chr> "chevrolet chevelle malibu", "buick skylark 320", "pl…
#> $ Miles_per_Gallon <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, NaN, NaN, NaN…
#> $ Cylinders <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8,…
#> $ Displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 133…
#> $ Horsepower <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 115…
#> $ Weight_in_lbs <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425,…
#> $ Acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8…
#> $ Year <dttm> 1970-01-01, 1970-01-01, 1970-01-01, 1970-01-01, 1970…
#> $ Origin <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA…
chart <-
alt$Chart(vega_data$cars())$
transform_calculate(
"url", "'https://www.google.com/search?q=' + datum.Name"
)$
mark_point()$
encode(
x = "Horsepower:Q",
y = "Miles_per_Gallon:Q",
color = "Origin:N",
href = "url:N",
tooltip = list("Name:N", "url:N")
)
chart
glimpse(vega_data$seattle_weather())
#> Rows: 1,461
#> Columns: 6
#> $ date <dttm> 2012-01-01, 2012-01-02, 2012-01-03, 2012-01-04, 2012-01…
#> $ precipitation <dbl> 0.0, 10.9, 0.8, 20.3, 1.3, 2.5, 0.0, 0.0, 4.3, 1.0, 0.0,…
#> $ temp_max <dbl> 12.8, 10.6, 11.7, 12.2, 8.9, 4.4, 7.2, 10.0, 9.4, 6.1, 6…
#> $ temp_min <dbl> 5.0, 2.8, 7.2, 5.6, 2.8, 2.2, 2.8, 2.8, 5.0, 0.6, -1.1, …
#> $ wind <dbl> 4.7, 4.5, 2.3, 4.7, 6.1, 2.2, 2.3, 2.0, 3.4, 3.4, 5.1, 1…
#> $ weather <chr> "drizzle", "rain", "rain", "rain", "rain", "rain", "rain…
source <- vega_data$seattle_weather()
line <-
alt$Chart(source)$
mark_line(
color = "red",
size = 3
)$
transform_window(
rolling_mean = "mean(temp_max)",
frame = c(-15, 15)
)$
encode(
x = "date:T",
y = "rolling_mean:Q"
)
points <-
alt$Chart(source)$
mark_point()$
encode(
x = "date:T",
y = alt$Y(
"temp_max:Q",
axis = alt$Axis(title = "Max Temp")
)
)
chart <- points + line
chart
glimpse(source)
#> Rows: 5
#> Columns: 3
#> $ x <int> 1, 2, 3, 4, 5
#> $ y <dbl> 10.631477, 9.836883, 10.664900, 10.636215, 10.207321
#> $ yerr <dbl> 0.2, 0.2, 0.2, 0.2, 0.2
# the base chart
base <-
alt$Chart(source)$
transform_calculate(
ymin = "datum.y-datum.yerr",
ymax = "datum.y+datum.yerr"
)
# generate the points
points <-
base$
mark_point(filled = TRUE, size = 50, color = "black")$
encode(
x=alt$X("x", scale = alt$Scale(domain=c(0, 6))),
y=alt$Y("y", scale = alt$Scale(domain=c(10, 11)))
)
# generate the error bars
errorbars <-
base$
mark_errorbar()$
encode(
x = "x",
y = "ymin:Q",
y2 = "ymax:Q"
)
points + errorbars
Layering can now work with the +
operator as it does in
Python. Additionally, it can be declared as
alt$layer(chart1, chart2)
or as
alt$ChartLayer(layer = list(chart1, chart2))
glimpse(data)
#> Rows: 5
#> Columns: 3
#> $ x <dbl> 1, 3, 5, 7, 9
#> $ y <dbl> 1, 3, 5, 7, 9
#> $ label <chr> "A", "B", "C", "D", "E"
glimpse(fromJSON(vega_data$movies$url))
#> Rows: 3,201
#> Columns: 16
#> $ Title <chr> "The Land Girls", "First Love, Last Rites", "I …
#> $ US_Gross <int> 146083, 10876, 203134, 373615, 1009819, 24551, …
#> $ Worldwide_Gross <dbl> 146083, 10876, 203134, 373615, 1087521, 2624551…
#> $ US_DVD_Sales <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ Production_Budget <int> 8000000, 300000, 250000, 300000, 1000000, 16000…
#> $ Release_Date <chr> "Jun 12 1998", "Aug 07 1998", "Aug 28 1998", "S…
#> $ MPAA_Rating <chr> "R", "R", NA, NA, "R", NA, "R", "R", "R", NA, N…
#> $ Running_Time_min <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ Distributor <chr> "Gramercy", "Strand", "Lionsgate", "Fine Line",…
#> $ Source <chr> NA, NA, NA, NA, "Original Screenplay", NA, NA, …
#> $ Major_Genre <chr> NA, "Drama", "Comedy", "Comedy", "Drama", NA, N…
#> $ Creative_Type <chr> NA, NA, NA, NA, "Contemporary Fiction", NA, NA,…
#> $ Director <chr> NA, NA, NA, NA, NA, NA, "Christopher Nolan", NA…
#> $ Rotten_Tomatoes_Rating <int> NA, NA, NA, 13, 62, NA, NA, NA, 25, 86, 81, 84,…
#> $ IMDB_Rating <dbl> 6.1, 6.9, 6.8, NA, 3.4, NA, 7.7, 3.8, 5.8, 7.0,…
#> $ IMDB_Votes <int> 1071, 207, 865, NA, 165, NA, 15133, 353, 3275, …
source <- vega_data$movies$url
stripplot <-
alt$Chart(source, width = 40)$
transform_calculate(
# Generate Gaussian jitter with a Box-Muller transform
jitter = 'sqrt(-2*log(random()))*cos(2*PI*random())'
)$
mark_circle(size = 8)$
encode(
x = alt$X(
'jitter:Q',
title = " ",
axis = alt$Axis(
values = list(0),
ticks = TRUE,
grid = FALSE,
labels = FALSE
),
scale = alt$Scale()
),
y = alt$Y("IMDB_Rating:Q"),
color = alt$Color("Major_Genre:N", legend = NULL),
column = alt$Column(
"Major_Genre:N",
header = alt$Header(
labelAngle = -90,
titleOrient = "top",
labelOrient = "bottom",
labelAlign = "right",
labelPadding = 3
)
)
)$
configure_facet(spacing = 0)$
configure_view(stroke = "transparent")
stripplot
glimpse(vega_data$github())
#> Rows: 955
#> Columns: 2
#> $ time <dttm> 2015-01-01 01:00:00, 2015-01-01 04:00:00, 2015-01-01 05:00:00, …
#> $ count <dbl> 2, 3, 1, 1, 3, 1, 3, 1, 1, 2, 1, 3, 2, 2, 3, 1, 2, 2, 1, 2, 2, 1…
This example demonstrates the need for a
ggplot2::facet_wrap()
-like capability, which we understand
is coming to Vega-Lite in the no-so-distant future.
glimpse(vega_data$cars())
#> Rows: 406
#> Columns: 9
#> $ Name <chr> "chevrolet chevelle malibu", "buick skylark 320", "pl…
#> $ Miles_per_Gallon <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, NaN, NaN, NaN…
#> $ Cylinders <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8,…
#> $ Displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 133…
#> $ Horsepower <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 115…
#> $ Weight_in_lbs <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425,…
#> $ Acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8…
#> $ Year <dttm> 1970-01-01, 1970-01-01, 1970-01-01, 1970-01-01, 1970…
#> $ Origin <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA…