I'm using a modified version of the Cereals dataset from Chapter 28 of the textbook as an example. The original dataset is available here: datasets from the text

For many more graphs using R, see the R graph gallery.

# Load the cereal data from a CSV file in the home directory (`~`);
# change the path if the file was saved somewhere else.
Cereals <- read.csv("~/Ch28_Cereals.csv")

Here's an idea of what the data looks like:

# Preview the first six rows and every column of the data frame.
head(Cereals)
##                        name     brand IsKellogs calories sugars carbo protein
## 1                 100%_Bran   Nabisco        no       70      6   5.0       4
## 2         100%_Natural_Bran    Quaker        no      120      8   8.0       3
## 3                  All-Bran   Kellogs       yes       70      5   7.0       4
## 4 All-Bran_with_Extra_Fiber   Kellogs       yes       50      0   8.0       4
## 5            Almond_Delight   Ralston        no      110      8  14.0       2
## 6   Apple_Cinnamon_Cheerios Gen-Mills        no      110     10  10.5       2
##   fat sodium fiber potass shelf
## 1   1    130  10.0    280 Lower
## 2   5     15   2.0    135 Lower
## 3   1    260   9.0    320 Lower
## 4   0    140  14.0    330 Lower
## 5   2    200   1.0     -1 Lower
## 6   2    180   1.5     70 Upper

To load ggplot for fancier graphics:

# Install ggplot2 only when it is not already available, so re-running this
# script does not download and reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

library("ggplot2")
## Warning: package 'ggplot2' was built under R version 3.6.3

Histograms

# Default histogram of the sugars column; R chooses the bins and labels.
hist(Cereals$sugars)

A fancier histogram still without using ggplot:

# Customised histogram of sugar content.
#   breaks  - suggested number of bins (R may adjust it to get round edges)
#   freq    - FALSE plots densities instead of counts
#   density - shading-line density used to fill the bars
#   col     - colour of the bar shading
hist(
  Cereals$sugars,
  breaks = 5,
  freq = FALSE,
  density = 50,
  col = "red",
  main = "Sugar content in cereals",
  xlab = "sugar content (g)"
)

Specifying specific break points for the bars:
# Template: a numeric vector passed to `breaks` gives the exact bin edges
# (here 0-7, 7-12 and 12-15). Replace dataset$variable with your own column.
hist(dataset$variable, breaks=c(0, 7, 12,15))

Specifying break points every 10 units, from a minimum of 0 to a maximum of 120:
# Template: seq(0, 120, 10) generates the bin edges 0, 10, 20, ..., 120.
hist(dataset$variable, breaks=seq(0,120,10))

Adding a normal curve:
# Template: draw the histogram on the density scale (freq = FALSE) so the
# normal density curve added below is on the same vertical scale.
hist(dataset$variable, freq=FALSE)
# Replace each `number` with the mean and standard deviation to overlay;
# add = TRUE draws on top of the existing histogram.
curve(dnorm(x, mean=number, sd=number), add=TRUE, col="red")

Using ggplot:

# ggplot2 histogram of sugar content, split into five bins.
ggplot(data = Cereals, mapping = aes(x = sugars)) +
  geom_histogram(bins = 5) +
  ggtitle("Histogram of sugar content in cereals") +
  theme_bw()

Boxplots

# Single boxplot of sugar content with a labelled y axis.
boxplot(Cereals$sugars, ylab="sugar content (g)")

Using ggplot:

# ggplot2 boxplot of sugar content (only the y aesthetic is mapped, so a
# single box is drawn).
ggplot(data = Cereals, mapping = aes(y = sugars)) +
  geom_boxplot() +
  ggtitle("Boxplot of sugar content in cereals") +
  theme_bw()

Side-by-side boxplots

Each box is a category of some categorical variable:

# Formula interface: sugar content split by brand, one box per brand.
boxplot(Cereals$sugars ~ Cereals$brand)

Each box is a different quantitative variable:

# One box per quantitative variable on a shared axis. Without `names` the
# boxes are labelled only 1, 2, 3, so label each box with its variable.
boxplot(
  Cereals$calories, Cereals$carbo, Cereals$sodium,
  names = c("calories", "carbo", "sodium")
)

Using ggplot:

# ggplot2 side-by-side boxplots: one box per brand, each with its own fill.
ggplot(
  data = Cereals,
  mapping = aes(x = factor(brand), y = sugars, fill = factor(brand))
) +
  geom_boxplot() +
  ggtitle("Side-by-Side Boxplot of Sugar Content by Brand") +
  theme_bw()

Dotplots

Using ggplot:

# ggplot2 dot plot of sugar content. No binwidth is given, so ggplot2 uses
# its default binning and prints a message suggesting a better value.
ggplot(data = Cereals, mapping = aes(x = sugars)) +
  geom_dotplot() +
  ggtitle("Dot Plot of sugars in cereal brands") +
  theme_bw()
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

Density plots

Using ggplot:

# ggplot2 density curve of the carbo (carbohydrates) column.
ggplot(data = Cereals, mapping = aes(x = carbo)) +
  geom_density() +
  ggtitle("Density Plot of carbohydrates") +
  theme_bw()

Stem-and-leaf display

# Print a text stem-and-leaf display of the potass column to the console.
stem(Cereals$potass)
## 
##   The decimal point is 2 digit(s) to the right of the |
## 
##   -0 | 00
##   0 | 2233333333444444444
##   0 | 55555666666778999999
##   1 | 0000001111111222233444
##   1 | 667799
##   2 | 034
##   2 | 68
##   3 | 23

Scatterplots

# Basic scatterplot: calories on the x axis, sugars on the y axis.
plot(Cereals$calories, Cereals$sugars)

# Same plot with jitter(), which adds a small amount of random noise so
# points that share identical values no longer sit on top of each other.
plot(jitter(Cereals$calories), jitter(Cereals$sugars))

Changing x and y ranges:
# Template: xlim and ylim set the visible range of the x and y axes.
plot(dataset$variable1, dataset$variable2, xlim = c(0, 100), ylim = c(0, 100))

Comparing multiple variables in a matrix
# Template: scatterplot matrix of every pair of the listed variables.
pairs(dataset$variable1 ~ dataset$variable2 + dataset$variable3)

Pie charts

# table() counts the cereals per brand; pie() draws one slice per brand.
pie(table(Cereals$brand))

Side-by-side pie charts

# Put both pies in one 1 x 2 panel so they really appear side by side.
# par() returns the previous settings, which are restored afterwards so
# later plots go back to the original layout.
old_par <- par(mfrow = c(1, 2))

# Brand breakdown among cereals on the upper shelf.
pie(table(subset(Cereals$brand, Cereals$shelf == "Upper")), main = "Upper shelf")

# Brand breakdown among cereals on the lower shelf.
pie(table(subset(Cereals$brand, Cereals$shelf == "Lower")), main = "Lower shelf")

par(old_par)

Bar chart

# One bar per brand; bar height is the number of cereals of that brand.
barplot(table(Cereals$brand))

Combining categories:
# Template: relabel every row whose brand is "K" as "N", merging the two
# categories into one.
# NOTE(review): if brand is stored as a factor, "N" must already be one of
# its levels; otherwise R assigns NA with a warning.
Cereals$brand[Cereals$brand == "K"] <- "N"

With already summarized data:

# When the counts are already known, pass them directly as bar heights and
# supply the bar labels through names.arg (one label per height).
barplot(c(5,4,3,5), names.arg = c("category1", "category2", "category3", "category4"))

Using ggplot:

# ggplot2 bar chart: one bar per brand, each brand with its own fill colour.
ggplot(data = Cereals, mapping = aes(x = brand, fill = brand)) +
  geom_bar() +
  ggtitle("Bar chart of cereal brand")

Stacked or segmented bar chart

Find color choices here

# table(shelf, brand) has one row per shelf and one column per brand, so
# each bar is a brand and its stacked segments are the shelf counts.
# `col` supplies one colour per shelf level.
barplot(table(Cereals$shelf,Cereals$brand), col=c("darkslategray3","sandybrown"))

Using ggplot:

# ggplot2 stacked bar chart: one bar per brand, segmented by shelf.
ggplot(data = Cereals, mapping = aes(x = brand, fill = shelf)) +
  geom_bar() +
  ggtitle("Bar Chart of Cereal brand by shelf location")

Grouped or clustered bar chart

# table(brand, shelf) has one row per brand and one column per shelf;
# beside = TRUE draws the brand bars next to each other within each shelf
# group instead of stacking them. `col` supplies one colour per brand.
barplot(table(Cereals$brand,Cereals$shelf), col=c("paleturquoise1","darkslategray3","paleturquoise4","darkseagreen3","sandybrown","chocolate"), beside=TRUE)

Using ggplot:

# ggplot2 grouped bar chart: position = "dodge" places the brand bars
# next to each other within each shelf instead of stacking them.
ggplot(data = Cereals, mapping = aes(x = shelf, fill = brand)) +
  geom_bar(position = "dodge") +
  ggtitle("Bar Chart of Shelf Distribution by Brand")

With summarized data in a table:

# Counts that are already summarised: one row per Survival x Class
# combination with the count in People. The leading 1-8 on each data line
# becomes the row name because the header line has one fewer field.
titanic <- read.table(
  text = 'Survival Class People
                       1 Alive First 202
                       2 Dead First 123
                       3 Alive Second 118
                       4 Dead Second 167
                       5 Alive Third 178
                       6 Dead Third 528
                       7 Alive Crew 212
                       8 Dead Crew 673',
  header = TRUE
)

# stat = "identity" uses People as the bar height instead of counting rows;
# position = "dodge" puts the classes next to each other within each group.
ggplot(
  data = titanic,
  mapping = aes(x = Survival, y = People, fill = Class)
) +
  geom_bar(stat = "identity", position = "dodge")

Frequency tables

# Frequency table: number of cereals per brand.
table(Cereals$brand)
## 
## Gen-Mills   Kellogs   Nabisco      Post    Quaker   Ralston 
##        22        23         6         9         8         8

As percentages or proportions instead of counts:

# prop.table() divides each count by the total, giving proportions that
# sum to 1 (multiply by 100 for percentages).
prop.table(table(Cereals$brand))
## 
##  Gen-Mills    Kellogs    Nabisco       Post     Quaker    Ralston 
## 0.28947368 0.30263158 0.07894737 0.11842105 0.10526316 0.10526316

Contingency table

# Two-way contingency table: brands as rows, shelf locations as columns.
table(Cereals$brand, Cereals$shelf)
##            
##             Lower Upper
##   Gen-Mills    16     6
##   Kellogs      19     4
##   Nabisco       3     3
##   Post          7     2
##   Quaker        7     1
##   Ralston       4     4