Cereals <- read.csv("~/Ch28_Cereals.csv")
Here's an idea of what the data looks like:
head(Cereals)
## name brand IsKellogs calories sugars carbo protein
## 1 100%_Bran Nabisco no 70 6 5.0 4
## 2 100%_Natural_Bran Quaker no 120 8 8.0 3
## 3 All-Bran Kellogs yes 70 5 7.0 4
## 4 All-Bran_with_Extra_Fiber Kellogs yes 50 0 8.0 4
## 5 Almond_Delight Ralston no 110 8 14.0 2
## 6 Apple_Cinnamon_Cheerios Gen-Mills no 110 10 10.5 2
## fat sodium fiber potass shelf
## 1 1 130 10.0 280 Lower
## 2 5 15 2.0 135 Lower
## 3 1 260 9.0 320 Lower
## 4 0 140 14.0 330 Lower
## 5 2 200 1.0 -1 Lower
## 6 2 180 1.5 70 Upper
Remove missing values with the na.rm option
mean(Cereals$sugars)
## [1] 6.986842
median(Cereals$sugars)
## [1] 7
sd(Cereals$sugars, na.rm=TRUE)
## [1] 4.428675
IQR(Cereals$sugars)
## [1] 8
summary(Cereals$sugars)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 7.000 6.987 11.000 15.000
Make a new dataset with only the brands that aren't Kellogs:
newdata <- subset(Cereals, brand !="Kellogs")
Make a new dataset with only Kellogs:
newdata <- subset(Cereals, brand =="Kellogs")
Make a new dataset with only the cereals with less than 100 calories:
newdata <- subset(Cereals, calories < 100)
qqnorm(Cereals$sugars)
qqline(Cereals$sugars)
cor(Cereals$sugars, Cereals$carbo, use="pairwise.complete.obs")
## [1] -0.4775783
lm stands for linear model.
lm(Cereals$sugars ~ Cereals$carbo)
##
## Call:
## lm(formula = Cereals$sugars ~ Cereals$carbo)
##
## Coefficients:
## (Intercept) Cereals$carbo
## 14.9833 -0.5392
summary(lm(Cereals$sugars ~ Cereals$carbo))
##
## Call:
## lm(formula = Cereals$sugars ~ Cereals$carbo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.6693 -2.7060 -0.3554 3.5073 7.1839
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.9833 1.7682 8.474 1.64e-12 ***
## Cereals$carbo -0.5392 0.1153 -4.676 1.29e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.917 on 74 degrees of freedom
## Multiple R-squared: 0.2281, Adjusted R-squared: 0.2176
## F-statistic: 21.86 on 1 and 74 DF, p-value: 1.286e-05
plot(jitter(Cereals$carbo), jitter(Cereals$sugars))
abline(lm(Cereals$sugars ~ Cereals$carbo))
Finding the residuals:
temp <- resid(lm(Cereals$sugars ~ Cereals$carbo))
plot(Cereals$carbo,temp)
abline(lm(temp ~ Cereals$carbo))
If you have a missing value, use the option na.action=na.exclude:
Cereals$carbo[1] <- NA
In this new variable, "Kellogs" gets converted to 1 and other values get converted to 0.
Kellogscheck <- ifelse(Cereals$brand == "Kellogs", 1, 0)
table(Kellogscheck)
## Kellogscheck
## 0 1
## 53 23
Cereals$protein <- as.factor(Cereals$protein)
library(plyr)
## Warning: package 'plyr' was built under R version 3.6.3
levels(Cereals$brand) <- sub("^Ralston$", "Other", levels(Cereals$brand))
combinedCereals <- revalue(Cereals$brand, c("Ralston"="Other", "Quaker"="Other", "Post"="Other"))
barplot(table(combinedCereals))
sqrtfiber <- sqrt(Cereals$fiber)
logfiber <- log(Cereals$fiber)
boxplot(Cereals$fiber, sqrtfiber, logfiber)
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z$group
## == : Outlier (-Inf) in boxplot 3 is not drawn
Re-expressing and un-re-expressing invididual values:
log(10)
## [1] 2.302585
exp(2.302585)
## [1] 9.999999
sqrt(10)
## [1] 3.162278
(3.162278)^2
## [1] 10