An R Function and Procedures Reference
Following is an alphabetical list of most of the functions used in these tutorials, and a few choice others. If you've been looking at R help pages, you've probably discovered by now that the examples there can be positively Martian! So I've tried to supply simple examples for most of these functions. The output produced by the commands in the examples is not shown. The arguments given in the function syntax attempt to convey what type of objects the function works on, and are not necessarily the correct argument names, for which you should see the help pages. The data sets used in the examples are all built in. Get info about them with help(dataset_name) or summary(dataset_name).
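For example, to get acquainted with one of those built-in data sets: > help(women) # the help page for the data set (?women does the same thing) > summary(women) # a quick numerical overview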
abline(a=intercept, b=slope) - draw a line on an existing graph abline(h=y_value) - draw a horizontal line on an existing graph abline(v=x_value) - draw a vertical line on an existing graph abline(lm_model) - plot the regression line on an existing scatterplot > plot(women) # a graph to work with > lm.out = lm(weight ~ height, data=women) # an lm_model to work with > abline(lm.out) # draw the regression line > abline(v=64) # a vertical line at x=64 > abline(h=132) # a horizontal line at y=132 > abline(a=-170, b=5, lty="dashed") # line with arbitrary intercept and slope abs(value or vector) - absolute value > abs(-9) > abs(c(-9,10,11,-20,3)) acf(vector) - autocorrelation function (draws an autocorrelation plot) > acf(rivers) > acf(sunspots, lag.max=150) addmargins(contingency_table) - print contingency table with marginal sums > HairEyeColor > addmargins(HairEyeColor) aggregate(formula, data, function) - split variable into groups and calculate function > summary(warpbreaks) # a data frame > aggregate(breaks ~ wool, data=warpbreaks, mean) > aggregate(breaks ~ tension, data=warpbreaks, median) > aggregate(breaks ~ wool + tension, data=warpbreaks, FUN=mean) all(logical_vector) - are all the values in this vector TRUE? (see any) > all(c(T,T,T,T,T,T,F,T,T,T)) anova(lm_model) - view the ANOVA table for a stored linear regression model anova(lm_model1, lm_model2) - test two stored regression models against each other > lm.out = lm(mpg ~ hp + wt, data=mtcars) # an lm_model > lm.out2 = lm(mpg ~ wt, data=mtcars) # another lm_model (subset of first) > anova(lm.out) # sequential ANOVA > anova(lm.out2, lm.out) # hp contributes significantly any(logical_vector) - are any of the values in this vector TRUE? (see all) > any(c(F,F,F,T,F,F,F,F,F,F)) > any(c(4<2, 6>9, 8>=4)) aov(formula, data) - ANOVA using the given model formula and dataframe (Type I SS if unbalanced) > summary(warpbreaks) # a data frame > aov.out = aov(breaks ~ wool * tension, data=warpbreaks) > summary(aov.out) apropos("string") - return all function names of which "string" is part > apropos("mean") args(function_name) - sometimes shows arguments taken by a function (not always!)
> args(plot) # fairly useless > args(plot.default) # unless you know the tricks array(data, dim) - create an array of given dimensions from data > array(data=1:20, dim=c(2,5,2)) as.data.frame(table or matrix) - coerce a table-like object to a data frame > class(state.x77) # it's a matrix > states = as.data.frame(state.x77) > class(states) # now it's a data frame as.data.frame.table(contingency_table) - coerce contingency table to a data frame > HairEyeColor # a contingency table > as.data.frame.table(HairEyeColor) as.matrix(all_numeric_tablelike_object) - coerce to a matrix > str(USArrests) # it's an all numeric data frame > arrests = as.matrix(USArrests) > class(arrests) # not any more attach(dataframe_name) - attach a dataframe to the search path (see detach) > search() # look at the search path > attach(women) > search() # "women" is now in position 2 > detach(women) > search() # and now it's gone barplot(vector or table_name) - plot a bar graph of values in table (must be tabled first) barplot(2D_table_name, beside=T, legend=T) - side-by-side bar graph with legend > means = with(warpbreaks, tapply(breaks, tension, mean)) # a vector of means > barplot(means) > HairEyeColor[,,1] # a 2D contingency table > barplot(HairEyeColor[,,1], beside=T, legend=T) bartlett.test(formula, data) - test for homogeneity of variance (fails with 2-way classifications) > summary(warpbreaks) # a data frame > bartlett.test(breaks ~ tension, data=warpbreaks) > bartlett.test(breaks ~ tension + wool, data=warpbreaks) # fail! binom.test(x=number_of_successes, n=trials, p=null_prob) - exact binomial test > binom.test(x=60, n=100, p=.5) # 60 heads in 100 coin tosses > binom.test(x=10, n=100, p=1/36) # 10 boxcars in 100 dice rolls boxplot(vector_name) - plot a boxplot of a numeric variable boxplot(formula, data) - plot boxplots at each level of an independent variable > boxplot(rivers) > boxplot(breaks ~ tension, data=warpbreaks) > boxplot(count ~ spray, data=InsectSprays) by(DV, IV, function) - break down DV by IV and calculate function (see tapply) > attach(warpbreaks) # a data frame to work with, attached to search path > names(warpbreaks) # see the variable names > by(breaks, tension, median) # group medians (breaks by tension) > by(breaks, tension, var) # group variances > detach(warpbreaks) # DON'T leave it attached to the search path! c(data_values) - concatenate; group values into a vector > xy = c(12, 17, 22, 18); xy # create and print a numeric vector > xz = c("Fred","Barney","Wilma","Betty"); xz # character vector > zz = c(T, F, F, T); zz # logical vector cbind(vector1, vector2, vector3, ...) - bind vectors into columns of a matrix > red = c(1,2,3,4) # create some vectors (must be same length) > blue = c(5,6,7,8) > green = c(9,10,11,12) > cbind(red, blue, green) > cbind(green, red, blue) > zz = cbind(red, blue) > cbind(zz, green); cbind(green, zz) chisq.test(vector_name, p=vector_null_probs) - chi square goodness of fit test chisq.test(table_name) - chi square significance test of independence chisq.test_result$expected - examine expected frequencies from chi square test chisq.test_result$residuals - examine residuals from chi square test > obs.freqs = c(15, 15, 20, 10) > chisq.test(obs.freqs) # test of all expected freqs. 
equal > chisq.test(obs.freqs, p = c(1/8, 1/4, 1/2, 1/8)) # test of unequal EFs > eyes = HairEyeColor[,,1] # a two-way contingency table > chisq.test(eyes) # chi square test of independence (warning: at least 1 EF<5) > chisq.test(eyes) -> chi.out # save output > chi.out$expected # see the expected freqs. > chi.out$resid # see cell residuals (unsquared chi values) choose(n, k) - number of possible combinations of n things taken k at a time > choose(n=10, k=5) > choose(15,4) class(object_name) - find out what kind of object R considers this to be > class(USArrests) # a data frame > class(state.x77) # a matrix > class(rivers) # numeric values (a vector) coef(lm_model) or coefficients(lm_model) - get regression coefficients > lm.out = lm(weight ~ height + I(height^2), data=women) # an lm_model to work with > coef(lm.out) # lm.out$coef does the same thing colMeans(matrix or table) - get column means from a table-like object > colMeans(state.x77) # a matrix > colMeans(USArrests) # an all numeric data frame colnames(tablelike_object) - create or get column names (see names) > head(women) > colnames(women) > colnames(women) = c("hgt","wgt") > head(women) colors() - display the names of colors available for graphing > colors() # be prepared for a lot of output > barplot(c(15,22,17), col=c("red","wheat","blue")) > rainbow(3) # get three colors from the rainbow palette > topo.colors(4) # four colors from the topo.colors palette > barplot(c(15,22,17), col=rainbow(3)) > barplot(c(15,22,17,28), col=heat.colors(4)) confint(lm_model) - get confidence intervals for regression coefficients > lm.out = lm(weight ~ height, data=women) # an lm_model to work with > confint(lm.out) contrasts(factor_name) - set or check contrasts for a factor > data(ToothGrowth) > names(ToothGrowth) > ToothGrowth$dose = factor(ToothGrowth$dose) > contrasts(ToothGrowth$dose) > contr.treatment(3) # "treatment" contrasts for a factor with 3 levels > contr.helmert(3) # helmert contrasts for a factor with 3 levels > contr.sum(3) # sum contrasts for a factor with 3 levels > contrasts(ToothGrowth$dose) = contr.helmert(3) > contrasts(ToothGrowth$dose) cooks.distance(lm_model) - report Cook's distances from a regression model > lm.out = lm(weight ~ height, data=women) # an lm_model to work with > cooks.distance(lm.out) > plot(cooks.distance(lm.out)) > plot(cooks.distance(lm.out), type="h") cor(vector1, vector2) - Pearson's r between numeric variables cor(vector1, vector2, use="pairwise.complete") - if there are missing values cor(vector1, vector2, method="spearman") - Spearman rank correlation cor(dataframe or matrix) - get a correlation matrix if all numeric variables > names(USArrests) > with(USArrests, cor(Murder, Rape)) > with(USArrests, cor(Murder, Rape, method="spearman")) > cor(USArrests) # an all numeric data frame > cor(state.x77) # a matrix > state.x77[20,2] = NA > cor(state.x77) # effect of a missing value > cor(state.x77, use="pairwise.complete") cor.test(vector1, vector2) - significance test for correlation cor.test(vector1, vector2, alternative="greater" or "less") - one-sided tests > cor.test(~ Murder + Rape, data=USArrests) > cor.test(~ Murder + UrbanPop, data=USArrests) > cor.test(~ Murder + UrbanPop, data=USArrests, alternative="greater") cos(angle_in_radians or vector_of_angles) - cosine function > cos(pi) > cos(c(pi/3, pi/4, pi, 5.01, 25.01)) cov(dataframe or matrix) - get a covariance matrix if all numeric variables > cov(USArrests) # an all numeric data frame > cov(state.x77) # a matrix curve(function_of_x, from=value, 
to=value) - plot an equation curve(function_of_x, from=value, to=value, add=T) - add curve to existing plot > curve(sin(x), from=0, to=6.28) > plot(weight ~ height, data=women) # get a graph to draw on > lm(weight ~ height + I(height^2), data=women) > curve(261.9 - 7.35 * x + 0.083 * x^2, add=T) cut(vector, breaks) - divide vector values into intervals defined by breaks > cut(rivers, breaks=4) > table(cut(rivers, breaks=4)) data(dataset_name) - copy a built in data set to workspace > data(women) > ls() # built in data frame "women" is now in workspace > data(anorexia, package="MASS") # get data from MASS even though it is not loaded data.frame(vector1, vector2, vector3, ...) - create a dataframe from vectors > subj.names = c("Terry","Joan","Emalee") > height = c(69, 65, 67) > age = c(40, 66, 28) > own.teeth = factor(c("yes","yes","yes")) > my.data = data.frame(subj.names, height, age, own.teeth) > my.data dataframe_name[i,] - get row i of the dataframe dataframe_name[i:j,] - get rows i through j of the dataframe dataframe_name[,k] - get column k of the dataframe; or add column to dataframe dataframe_name$variable_name - get column of dataframe with given variable name > data(women) # a data frame to work with > women # see it > women[7,] # row 7 > women[7:10,] # rows 7 through 10 > women[,2] # column 2 > women$weight # also column 2 > women[,3] = (1:15)^2 # create a new column > women date() - get the current system date and time > date() dbinom(x, size, prob) - binomial probability of x successes in size trials > dbinom(x=7, size=10, prob=.5) # probability of 7 heads in 10 coin tosses > dbinom(x=3, size=10, prob=1/36) # probability of 3 snake eyes in 10 dice rolls density(vector) - kernel density smoothing of vector > density(faithful$eruptions) > plot(density(faithful$eruptions)) detach(dataframe_name) - remove dataframe from the search path (see attach) > search() # look at the search path > attach(women) > search() # "women" is now in position 2 > detach(women) > search() # and now it's gone difftime(time1, time2) - time between two times or dates > difftime("1955-09-12", "2016-02-23", units="days") # nice function, horrible help page! > difftime("1960-10-09 15:46:00", "2016-02-23 11:34:00", units="mins") dim(object_name) - get the dimensions of a table, matrix, or dataframe > dim(USArrests) # a data frame of 50 rows by 4 columns > dim(state.x77) # a matrix of 50 rows by 8 columns > dim(HairEyeColor) # a 4x4x2 contingency table dimnames(matrix or array or dataframe) - get or create dimension names > dimnames(state.x77) > my.table = array(1:20, dim=c(2,5,2)) > dimnames(my.table) = list(Sex=c("male","female"), + Pet=c("cat","dog","bird","turtle","snake"), + Married=c("yes","no")) > my.table dir() - view files in the working directory on your hard drive > dir() dnorm(value or vector, mean, sd) - normal probability density function (height of normal curve) > dnorm(120, mean=100, sd=15) > plot(dnorm(seq(from=-3,to=3,by=.1), mean=0, sd=1)) # mean=0, sd=1 are defaults dpois(x, lambda) - poisson probability function > dpois(x=3, lambda=8) # prob.
of only 3 insect legs in a candy bar if average is 8 example(function_name) - get some examples for named function (sometimes useful) > example(friedman.test) # examples from the help page (somewhat sensible in this case) exp(value or vector) - raise e to the power of value; natural antilog > exp(1.632) > exp(1:20) factor(character_vector) - declare character_vector to be a factor factor(factor_name, labels) - set or change value labels for a factor > data(ToothGrowth) > summary(ToothGrowth) # one factor, two numeric variables > factor(ToothGrowth$dose) -> ToothGrowth$dose > summary(ToothGrowth) # now dose is also a factor > factor(ToothGrowth$dose, labels=c("Low","Moderate","High")) -> ToothGrowth$dose > summary(ToothGrowth) # now the levels of dose have value labels rather than numbers factorial(value or vector) - get value factorial (x!) > factorial(10) > factorial(1:10) fisher.test(contingency_table) - Fisher's exact test > TeaTasting <- matrix(c(3, 1, 1, 3), nrow = 2, + dimnames = list(Guess = c("Milk", "Tea"), + Truth = c("Milk", "Tea"))) # lady tasting tea problem > TeaTasting # a 2x2 contingency table > fisher.test(TeaTasting, alternative = "greater") # a brazen steal from the help page fivenum(vector) - five number summary of a numeric variable > fivenum(rivers) # compare to summary(rivers) for (i in 1:n) - start a loop that will run n times; put statements to be executed inside {} > for(i in 1:5) { + print(i) + print(i^2) + } ftable(contingency_table) - convert a contingency table to a flat table > HairEyeColor # a contingency table > ftable(HairEyeColor) function() - program your own functions into R > sem = function(x) sqrt(var(x,na.rm=T)/sum(!is.na(x))) # sqrt(var/n) > sem(rivers) # a newly created function for standard error of the mean getwd() - find out what your working directory is (see setwd) > getwd() gl(n, k) - generate n levels of a factor containing k values per level > gl(n=3, k=10, labels=c("frogs","hogs","dogs")) -> ogs > ogs glm(formula, family, data) - generalized linear model > data(menarche, package="MASS") > head(menarche) # number of girls reaching menarche by age group > glm(cbind(Menarche,Total-Menarche) ~ Age, family=binomial(logit), data=menarche) head(object_name) - see the first six cases in a data object > head(rivers) # a vector > head(USArrests) # a data frame > head(state.x77) # a matrix > head(rivers, n=10) # see 10 cases help(function_name) - view help page for a function > help(mean) # ?mean does the same thing help.search("topic") - see list of functions that might have to do with "topic" > help.search("mean") # ??mean does the same thing hist(vector) - plot a histogram of a numeric variable hist(vector, breaks=number or vector_of_desired_breaks) - with specified break points > hist(rivers) > hist(rivers, breaks=20) > hist(rivers, breaks=seq(from=0, to=4000, by=250)) I() - isolate a mathematical operation from the rest of a formula > lm.out = lm(weight ~ height + I(height^2), data=women) > summary(lm.out) ifelse(test, yes, no) - if test is TRUE do yes, else do no > ifelse(rivers > 1000, print("long"), print("short")) influence.measures(lm_model) - reports various measures of influence > lm.out = lm(weight ~ height, data=women) # an lm_model to work with > influence.measures(lm.out) interaction.plot(x.factor, trace.factor, response) - profile (interaction) plot > attach(ToothGrowth) # no formula interface here so attach is easiest > names(ToothGrowth) # get variable names > interaction.plot(x.factor=dose, trace.factor=supp, response=len) > 
detach(ToothGrowth) # get it out of search path when done! IQR(vector) - get the interquartile range using quantile() type=7 IQR(vector, type=2) - get "textbook" IQR > temp = c(3,7,8,6,5,1,8,3,4,5,2,6) > sort(temp) > IQR(temp) > IQR(temp, type=2) jitter(vector) - add a small amount of random error to values of a vector > jitter(1:7) > jitter(1:7, factor=.5) kruskal.test(formula, data) - Kruskal-Wallis rank-sum oneway ANOVA > names(InsectSprays) # count is numeric, spray is a factor > boxplot(count ~ spray, data=InsectSprays) # see what it looks like > kruskal.test(count ~ spray, data=InsectSprays) ks.test(vector1, vector2) - Kolmogorov-Smirnov test comparing distributions > ks.test(rnorm(100), rnorm(100)) # same distributions > ks.test(rnorm(100), runif(100)) # different distributions legend() - add a legend to an existing plot; complicated (see help page) > margin.table(Titanic, c(2,1)) # gender by class of ticket held on Titanic > margin.table(Titanic, c(2,1)) -> data.table > barplot(data.table, beside=T, axis.lty=1, ylim=c(0,1000), col=c("blue","pink")) > ### something to draw a legend on > legend(x=1, y=900, # where to put it + legend=c("males","females"), # what it should say + fill=c("blue","pink")) # colors matching bars length(vector) - get the no. of values in a vector, including NAs > length(rivers) letters - an object containing the lower case letters of the English alphabet LETTERS - an object containing the upper case letters of the English alphabet > letters > letters[5:10] > LETTERS[5:10] levels(factor_name) - get the levels of a categorical variable > levels(InsectSprays$spray) > levels(chickwts$feed) > summary(chickwts) library("library_name") - load an optional library, for example MASS > search() # see the search path > library("MASS") > search() # MASS library has been put in position 2 > detach("package:MASS") > search() # gone lines(x, y) - draw a line on an existing graph through (x,y) points > x = 1:10 > y = x^2 > plot(x, y) # plot x,y points > lines(x=x, y=y) # connect them with lines > lines(x=c(4,8), y=c(0,100), col="red") # line from (4,0) to (8,100) list(object_names) - create a list of the named objects > my.list = list(rivers, HairEyeColor, women) > summary(my.list) > my.list lm(formula, data) - linear regression of DV on IV using least squares lm(DV ~ IV + I(IV^2) + I(IV^3) + ..., data) - polynomial regression lm(DV ~ IV1 + IV2 + IV3 + ..., data) - multiple linear regression lm_model$fitted - examine the fitted values from a stored model lm_model$residuals - examine the residuals from a stored lm model > lm.out = lm(weight ~ height, data=women) > lm.out.cubic = lm(weight ~ height + I(height^2) + I(height^3), data=women) > summary(lm.out) # see results of first regression > summary(lm.out.cubic) # see the cubic model > anova(lm.out, lm.out.cubic) # the cubic model is significantly better > lm.out$fitted # fitted values for lm.out > lm.out$residuals # residuals for lm.out > plot(lm.out$fitted, lm.out$residuals) # clearly missed curvilinear trend > rm(lm.out, lm.out.cubic) # keep it clean! > lm.out = lm(mpg ~ disp + hp + wt, data=mtcars) # multiple regression > summary(lm.out) load("pathname_of_previously_saved_file") - load a "saved" file (see save) > x = 1:7 > y = c("Bill","Fred","Diane") > z = matrix(1:9, nrow=3) # clutter up the workspace > ls() # see the mess you've made > save.image("mystuff.RData") # save workspace to working directory > rm(x,y,z) # clear the workspace > ls() # see? all gone! 
> load("mystuff.RData") # bring it back > ls() log(value or vector) - take the natural log of a number or values in a vector log(value, base) - log to any base log10(value or vector) - base-10 logs > log(8) > log(1:8) > log(1:8, base=2) > log(1:8, base=10) > log10(1:8) loglm(formula, data) - log linear analysis (this function is in the MASS library) > library("MASS") # necessary to get at the function > UCBAdmissions # UC Berkeley grad programs admissions data 1973 > loglm(~ Dept * Admit * Gender, data=UCBAdmissions) > ### this saturated model fits the data perfectly (LR X^2 = 0) > loglm(~ Dept * Admit, data=UCBAdmissions) > ### removing Gender results in a model that does not fit the data (LR X^2 = 1405, df=12, p=0) lowess(x, y) - return the coordinates of a lowess smoother for (x,y)-scatterplot > lowess(women) > plot(women, pch=2) > points(lowess(women)) ls() - list the objects in the workspace; also objects(); character(0)=empty > ls() > ls(pos=2) # show contents of position 2 on search path mad(vector) - get the median absolute deviation from the median mad(vector, center=mean(vector)) - gives a mean-centered MAD > mad(rivers) > mad(rivers, center=mean(rivers)) manova(formula, data) - multivariate analysis of variance > manova.out = manova(cbind(mpg, disp, hp, wt) ~ factor(gear), data=mtcars) > summary.aov(manova.out) > summary.manova(manova.out) > summary.manova(manova.out, test="Wilks") margin.table(contingency_table, 1) - get the row marginal sums margin.table(contingency_table, 2) - get the column marginal sums > eyes = HairEyeColor[,,1] # a two-way contingency table > eyes # see? > margin.table(eyes, margin=1) # row marginal sums > margin.table(eyes, margin=2) # column marginal sums matrix(vector, nrow=value, ncol=value) - create a nrow x ncol matrix from a vector > matrix(1:10, nrow=2) > matrix(1:10, ncol=5) > matrix(1:10, ncol=5, byrow=T) # fill across the rows max(vector) - get maximum value in a numeric variable; na.rm=T to remove missings > max(rivers) mean(vector) - get mean of a numeric variable; chokes on NAs mean(vector, na.rm=T) - mean of variable after removing missing values > mean(rivers) > rivers[200] = NA # insert a missing value > mean(rivers) # doesn't work anymore > mean(rivers, na.rm=T) # there we go! median(vector) - get median of the numeric variable; na.rm=T to remove missings > median(rivers) min(vector) - get the minimum value in a numeric variable; na.rm=T to remove missings > min(rivers) model.tables(aov_model, type="means") - get group means from a _balanced_ ANOVA (only!) > aov.out = aov(len ~ supp * factor(dose), data=ToothGrowth) > model.tables(aov.out, type="means") NA - a missing value (R's missing value code) na.omit(vector or dataframe_name) - remove missings or cases with missing values > women # a data frame > women[10,2] = NA # insert a missing value > women > na.omit(women) # remove cases with NAs > na.omit(women$weight) # works on vectors, too names() - name variables or retrieve the names of variables > names(mtcars) > my.df = data.frame(1:5,6:10,11:15) > names(my.df) = c("first","second","third") # same as colnames() in this case > my.df numeric(size) - create a numeric vector of size size > numeric(20) # filled with zeros by default objects() - see objects in the workspace; same as ls() > objects() # does the same thing as ls() oneway.test(formula, data) - one-way Welch-corrected between subjects ANOVA oneway.test(formula, data, var.equal=T) - one-way betw. 
ANOVA assuming homogeneity > names(InsectSprays) # count is numeric, spray is a factor > oneway.test(count ~ spray, data=InsectSprays) > oneway.test(count ~ spray, data=InsectSprays, var.eq=T) order() - can be used to reorder the rows in a dataframe, but syntax is twisted and the help page is useless in sorting it out, because it doesn't give a single useful example; here's one > data(women) > women # see how orderly it is > women[sample(1:15),]->women # mix it up > women # see how mixed up it is now > women[order(women$weight),] # sort on the weight variable pairs(all_numeric_dataframe or matrix) - plot a scatterplot matrix > pairs(USArrests) # plot(USArrests) will do the same in this case > pairs(state.x77) # matrix; plot() will not work here pairwise.t.test(x=vector, g=factor, p.adjust.method) - all possible pairwise t-tests > attach(InsectSprays) > names(InsectSprays) # count is numeric, spray is a factor > pairwise.t.test(x=count, g=spray) # default p.adjust is holm > pairwise.t.test(x=count, g=spray, p.adjust="bonf") # Bonferroni adjustments > pairwise.t.test(x=count, g=spray, p.adjust="none") # Fisher LSD tests > detach(InsectSprays) # don't leave it attached when done paste() - one of many string manipulation functions available > paste("subject", as.character(1:10), sep=".") > paste("subject", as.character(1:10), sep="") > paste("William", "Blackstone", "King", sep=" ") pbinom(successes, size, prob, lower.tail=T) - binomial cumulative probability function > pbinom(40, size=100, p=.5) # Pr(40 or fewer tails in 100 coin tosses) > pbinom(40, size=100, p=.5, lower=F) # Pr(more than 40 tails in 100 tosses) > pbinom(40, size=100, p=.5, lower=F) + pbinom(40, size=100, p=.5) pf(F-value, df1, df2, lower.tail=T) - F cumulative distribution function (F p-values) > pf(3.12, df1=3, df2=10) # area in lower tail > pf(3.12, df1=3, df2=10, lower=F) # area in upper tail pi - the only built in mathematical constant in R > pi > r = 10; 2 * pi * r > r = 1:20; 2 * pi * r pie(table_name) - plot a piechart of values in table (must be tabled first) > HairColorFreqs = margin.table(HairEyeColor[,,1], 1) > HairColorFreqs # see what you're dealing with > pie(HairColorFreqs) > pie(HairColorFreqs, col=c("black","brown","red","yellow")) plot(predictor_var, response_var) - scatterplot of two numeric variables plot(response_var ~ predictor_var, data) - formula interface plot(var1, var2, log="y") - plot with a logarithmic y-axis; fails if any y<=0 > plot(pressure) # pressure vs. temperature, the two numerics in the data frame > with(pressure, plot(temperature, pressure)) > plot(pressure ~ temperature, data=pressure) > plot(pressure, log="y") > plot(pressure, log="xy") plot(dataframe_name) - plot a matrix of scatterplots of numerical variables (see pairs) > plot(USArrests) # an all numeric data frame (seems to fail with matrices!)
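A few of the more common optional arguments to plot(), shown as one possible combination (see help(par) for the full list of graphical parameters): > plot(pressure, type="l", col="blue", main="pressure vs. temperature", xlab="temperature", ylab="pressure") # type="l" connects the points with lines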
plot(density(variable_name)) - kernel density smoother for numerical variable (see density) > plot(density(faithful$waiting)) plot(lm_model) - plot four regression diagnostic plots from stored lm results > lm.out = lm(weight ~ height, data=women) # get an lm_model to work with > par(mfrow=c(2,2)) # set the graphics device to plot 2 rows of 2 graphs > plot(lm.out) plot.new() - open the graphics device for plotting > plot.new() pnorm(z, lower.tail=T) - get the area under the unit normal curve that is (at or) below z > pnorm(1.28) > pnorm(1.28, lower=F) points(vector_of_x-values, vector_of_y-values) - plot points on a graph > plot(1:10, 1:10, type="n") # a blank graph to draw on > points(x=c(2,4,6,8), y=c(6,10,2,4)) > points(x=c(2,4,6,8), y=c(6,10,2,4), type="b") power.anova.test(groups, n, between.var, within.var, sig.level, power) - power for balanced anova > ### one-way balanced between subjects designs only > ### within.var is anticipated pooled variance (i.e., MSE) > ### between.var is anticipated variance of group means (i.e., MSB/n) > DV = c(c(2,5,3,4),c(3,7,5,6),c(8,6,7,4)); IV = gl(3,4) # a pilot study > aov(DV ~ IV) # SSB=15.5 on df=2, SSE=22.5 on df=9 > power.anova.test(groups=3, n=NULL, between.var=15.5/2/4, within.var=22.5/9, + sig.level=.05, power=.8) # n = 7.3 subjects per group for 80% power power.prop.test(n, p1, p2, sig.level, power) - power for two-sample proportions test > p1 = .4 # anticipated proportion in group 1 > p2 = .5 # anticipated proportion in group 2 > power.prop.test(n=NULL, p1=p1, p2=p2, sig.level=.05, power=.8) > ### n = 387.3 subjects per group for 80% power power.t.test(n, delta, sd, sig.level, power) - t-test power function > ### set exactly one of those things equal to null and R will solve for it > power.t.test(n=NULL, delta=3, sd=7, sig.level=.05, power=.8) > ### for delta (anticipated difference in means) = 3, n = 86.4 subjects per group ppois(value, lambda, lower.tail=T) - poisson cumulative probability function > ppois(3, lambda=8) # prob. of 3 or fewer insect legs in a candy bar when average is 8 > ppois(3, lambda=8, lower=F) # prob. of more than 3 when average is 8 predict(lm_model, list(values_to_predict_from)) - make predictions from a regression model > names(mtcars) > lm.out = lm(mpg ~ cyl + hp + wt, data=mtcars) # get a regression model > predict(lm.out, list(cyl=4, hp=120, wt=2)) print(anything) - does what it says > print(women) > print("Hello there!") > print(pi, digits=15) prop.table(table_name) - convert tabled frequencies to proportions of N prop.table(table_name, 1) - convert tabled freqs. to proportions by rows prop.table(table_name, 2) - convert tabled freqs. to proportions by columns > margin.table(UCBAdmissions, margin=c(1,3)) # a 2x5 contingency table > margin.table(UCBAdmissions, margin=c(1,3)) -> inornot > sum(inornot) # total no. of cases in table (4526) > prop.table(inornot) # proportions with respect to 4526 (not very useful) > prop.table(inornot, margin=1) # with respect to row marginal totals > prop.table(inornot, margin=2) # with respect to column marginal totals prop.test(number_of_hits, N, p=null_prob.) - one-proportion test prop.test(vector_of_hits, vector_of_Ns) - multiple-proportion test > addmargins(margin.table(Titanic, margin=c(4,2))) # survival by gender; Titanic > prop.test(1490, n=2201, p=.5) # hypothesis of 50% survival chance rejected > prop.test(c(367, 344), n=c(1731, 470)) # two-proportion test, men vs. 
women pt(t-value, df, lower.tail=T) - t cumulative probability function (t p-values) > pt(2.05, df=25) > pt(2.05, df=25, lower=F) > pt(2.05, df=25, lower=F) * 2 q() - quit R (same as quit) > q("yes") # save the workspace and quit qnorm(probability_in_lower_tail) - inverse normal distribution > qnorm(.95) # get z-value with 95% of normal dist. (at or) below it > qnorm(.975) qqline(variable_name) - draw the normal line on an existing qqnorm() plot qqnorm(variable_name) - plot a normal probability plot of a numerical variable qqplot(variable1, variable2) - compare distributions of two variables > qqnorm(rivers) > qqline(rivers) > qqplot(rnorm(141), rivers) # compare rivers to random normal dist. > qqplot(rexp(141), rivers) # compare rivers to random exponential dist. quantile(variable_name, probs=vector_of_probs.) - get desired quantiles quantile(variable_name, probs=c(1/4, 3/4), type=2) - get textbook Q1 and Q3 > quantile(rivers, probs=c(.25,.5,.75)) # compare to fivenum(rivers) > quantile(rivers, probs=c(1/4, 3/4), type=2) # usually same result for large vectors > ### see IQR quartz(title, width, height, pointsize) - open a graphics window (Mac only) > quartz("Q", width=5, height=5, pointsize=10) # width, height in inches > plot(pressure) quit() - quit R (same as q) > quit() > quit("no") # quit without saving the workspace range(variable_name) - reports min() and max() (see min and max) > range(rivers) rank(vector) - returns the rank (relative size or position upon sort) of values > rank(c(20, 18, 13, 25, 30, 25)) rbind(vector1, vector2, vector3, ...) - bind vectors into a matrix by rows > X = 1:5; Y = 6:10; Z = 11:15 # some vectors to bind > rbind(X, Y, Z) rbinom(n, size, prob) - generates random binomially distributed values > rbinom(n=5, size=100, prob=.5) # no. of heads in 5 trials of 100 coin tosses read.csv("file_location") - read in a csv formatted data file (special case of read.table) > write.csv(women, file="women.csv", row.names=F, quote=F) # create one to read > read.csv(file="women.csv") # read in from your working directory > write.csv(women, file="women.csv", row.names=T, quote=F) > read.csv(file="women.csv", row.names=1) # if rownames are in the first column read.table("file_location", header=T) - read in tabled data with headers > ### same as read.csv() except for white space separated values > ### reads files created by write.table() > ### the value separator can be specified in an option making it very versatile > ### read.table can be used for inline data entry as follows (best done as a script!)
> Flintstones = read.table(header=T, text=" + first.name last.name related.to season job voiced.by + Fred Flintstone Fred 1 stone.quarry Alan.Reed + Wilma Flintstone Fred 1 housewife Jean.Vander.Pyl + Pebbles Flintstone Fred 3 child Jean.Vander.Pyl + Dino none Fred 1 pet Mel.Blanc + Barney Rubble Barney 1 unknown Mel.Blanc + Betty Rubble Barney 1 housewife Bea.Benaderet + Bamm.Bamm Rubble Barney 4 child Don.Messick + Hoppy none Barney 5 pet Don.Messick + Mr Slate NA 1 boss John.Stephenson + Arnold NA NA 1 paperboy Don.Messick + ") > summary(Flintstones) # option row.names=1 might have been a good idea relevel(factor_name, ref) - changes the reference level of a factor > data(InsectSprays) > str(InsectSprays) > levels(InsectSprays$spray) > relevel(InsectSprays$spray, ref="F") > relevel(InsectSprays$spray, ref="F") -> InsectSprays$spray > levels(InsectSprays$spray) rep(data_vector, times or each) - creates a vector with repeating elements; times can also be a vector of frequencies > rep(1:5, times=3) > rep(1:5, each=3) > rep(c("Fred","Barney"), each=3) > rep(c("Fred","Barney"), c(5, 2)) remove(object_name or names) - same as rm() (see rm) > x = 7 > ls() > remove(x) > ls() require(library_names) - load a library, same as library() (see library) > require("MASS") # except unlike library(), it gives a little message rm(object_name or names) - remove an object from the workspace (see remove) > rm(list=ls()) # completely clear workspace; DON'T do it unless you really mean it! > x=1:7; y="Fred"; z=c(T,F); xx=matrix(1:6,nrow=3); yy=list(x,y,z) # mess up workspace > rm(x) # remove x > rm(y, z) # remove y and z > rm(list=ls()) # remove everything else rnorm(n, mean, sd) - generates random normally distributed values > rnorm(n=5) # mean=0, sd=1 are defaults > rnorm(n=5, mean=10, sd=5) round(value or vector, digits) - round values to digits decimal places > xyz = rnorm(5) > xyz > round(xyz, digits=3) > round(xyz, digits=0) > round(cor(state.x77), digits=3) # try it without rounding!
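One more detail about round() worth knowing: digits can also be negative, which rounds to the left of the decimal point (a small extra illustration). > round(123456.789, digits=-3) # round to the nearest thousand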
rowMeans(all_numeric_tablelike_object) - get row means from a table-like object > my.matrix = matrix(runif(16,min=10,max=20), nrow=4) # create such an object > rowMeans(my.matrix) > colMeans(my.matrix) rownames(tablelike_object) - create or get row names (see colnames) > head(state.x77) # a matrix > rownames(state.x77) > rownames(mtcars) # a data frame > rownames(UCBAdmissions) # a contingency table > my.df = data.frame(1:5, 6:10, 11:15) # create a data frame > my.df > colnames(my.df) = c("little","bigger","biggest") > rownames(my.df) = c("Bob.Smith","Fred.Skinner","Lucy.Rich","Stan.TheMan","Susan.Saugt") > my.df runif(n, min, max) - generates random uniformly distributed values > runif(n=5) # min=0, max=1 are defaults > runif(n=5, min=10, max=20) sample(vector, size, replace) - take a random sample from a given vector > sample(1:10, size=3) # sample of 3 without replacement > sample(1:10, size=20, replace=T) # sample of 20 with replacement save(R_object, file="filename") - save a data object to a file in working directory > data(mtcars) # get something to save > ls() > save(mtcars, file="mtcars.rda") # save to working directory (proprietary format) > rm(mtcars) > ls() > load(file="mtcars.rda") # read it back in with load > ls() save.image(file) - save a copy of the workspace to the working directory > save.image() # saves workspace to a file called .RData > save.image(file="mystuff.RData") # saves workspace to a named file (see load) scan(what) - most useful for entering data directly from the keyboard > ### enter these two vectors > ### 12, 16, 22, 10, 23, 18, 15, 17 > ### Fred, Barney, Wilma, Betty, Pebbles, Dino > my.vector = scan() # press Enter 1: 12 16 22 10 23 18 15 17 9: # press Enter again Read 8 items > ### you can also press enter after each data value is entered > ### leave a blank value at the end to terminate entry > Flintstones = scan(what="char") # must say so when entering character data 1: Fred 2: Barney 3: Wilma 4: Betty 5: Pebbles 6: Dino 7: Read 6 items > my.vector > Flintstones scatter.smooth(predictor_var, response_var) - scatterplot with lowess smoother > with(faithful, plot(waiting, eruptions)) # for comparison > with(faithful, scatter.smooth(waiting, eruptions)) sd(vector) - get the sample standard deviation of a numeric variable > sd(rivers) > rivers[200] = NA # uh oh! there's a missing value > sd(rivers) # fails > sd(rivers, na.rm=T) # remove missing values and calculate search() - ask R where it is looking for stuff (the search path) > search() # look at the search path > attach(women) > search() # "women" is now in position 2 > detach(women) > search() # and now it's gone sem=function(x) sd(x)/sqrt(length(x)) - quick and dirty sem function > sem(rivers) # will only work after the function is created > sem=function(x) sqrt(var(x,na.rm=T)/sum(!is.na(x))) # won't choke on NAs seq(from, to, by) - create a regular sequence of numbers > seq(from=1, to=10, by=1) # same as 1:10 > seq(from=10, to=1, by=-1) # same as 10:1 > seq(from=10, to=100, by=10) setwd() - set or change the working directory (better to use the menus) > setwd("Rspace") # if you've created this directory shapiro.test(vector) - Shapiro-Wilk test for normal distribution > shapiro.test(rivers) # not even close!
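For contrast with rivers, try the test on a sample that really is drawn from a normal distribution (the exact p-value will differ each time, since the data are random): > shapiro.test(rnorm(100))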
sin(angle_in_radians or vector of angles) - sine function > sin(pi/2) > sin(5) > sin(c(pi/4, pi/2, pi)) sort(variable_name) - sort values in ascending order sort(variable_name, decreasing=T) - sort values in descending order sort(unique(vector)) - sort only the unique (non-repeated) values > sort(rivers) # sort in ascending order > sort(rivers, decreasing=T) # sort in descending order > sort(unique(rivers)) # sort just nonrepeats source("location_of_script") - read in and execute a text file of R commands > source(file="myscript.R") # has to exist in your working directory first > file = "http://ww2.coastal.edu/kingw/statistics/R-tutorials/text/myscript.R" > source(file, echo=T) sqrt(number or vector) - take the square root of number or numbers > sqrt(120) > sqrt(1:10) stack(wide_format_dataframe, select) - convert wide to long format data frame > data(anorexia, package="MASS") > anorexia # a wide_format data frame > stack(anorexia, select=c(Prewt,Postwt)) # stack these two columns > ### we would need to add the Treat info and a subject identifier to make this useful stem(vector) - get a stem and leaf display of a numeric variable > stem(rivers) > stem(rivers, scale=.5) # fewer intervals > stem(rivers, scale=2) # more intervals step(lm_model, direction) - automated forward, backward, or stepwise regression > lm.out = lm(mpg ~ cyl + disp + hp + wt, data=mtcars) > step(lm.out, direction="backward") > step(lm.out, direction="backward") -> lm.out2 > summary(lm.out2) # cyl and hp retained even though not significant; look at AIC to see why str(object_name) - find out the structure of an R object > str(rivers) # a numeric vector > str(USArrests) # a data frame > str(state.x77) # a matrix stripchart(vector or formula, method) - one-dimensional scatterplot; method can be "jitter" or "stack" > stripchart(rivers, method="jitter", pch=1) # try it without jitter > stripchart(rivers, method="stack") # an alternative to jitter > stripchart(mpg ~ gear, data=mtcars, method="jitter", pch=1) subset(object_name, subset, select) - subset a vector, matrix, or dataframe > head(mtcars) # dats a lotta columns! > fewer.columns = subset(mtcars, select=c(mpg,cyl,hp,wt)) > head(fewer.columns) # that's better > subset(fewer.columns, subset=cyl>4) # just sixes and eights > subset(fewer.columns, subset=(cyl==4 | cyl==8)) # just fours and eights > subset(fewer.columns, subset=hp>200) # muscle cars and roadhogs > subset(rivers, subset=rivers>1000) # works with vectors, too sum(vector_name) - sum the values in a vector; na.rm is available > vector = c(6, 10, 12) > sum(vector) > vector[2] = NA # oh dear! a missing value > sum(vector) # fails > sum(vector, na.rm=T) # sum after removing missing values summary(object_name) - summarize an R object; summary stats from a variable, etc summary(aov_model) - get the anova summary table from an anova summary(lm_model) - get a detailed summary of stored linear regression results > summary(chickwts$weight) # summary of a numeric variable > summary(chickwts$feed) # summary of a factor (a frequency table) > summary(chickwts) # summary of a data frame > summary(state.x77) # summary of a matrix > aov.out = aov(weight ~ feed, data=chickwts) > summary(aov.out) # summary of an aov_model > lm.out = lm(weight ~ height, data=women) > summary(lm.out) # summary of an lm_model t(matrix) - transpose a matrix > my.matrix = matrix(1:15, nrow=3) > my.matrix > t(my.matrix) > t(my.matrix) %*% my.matrix # matrix multiplication table(vector) - create a 1D frequency table of a variable table(vector1, vector2, ...)
- create a contingency table with vector1 in rows > data(birthwt, package="MASS") # get a data frame with lots of factors (it lives in the MASS package) > str(birthwt) # but they're not declared as factors > for (i in 4:9) birthwt[,i]=factor(birthwt[,i]) # call a factor a factor! > str(birthwt) # better! > table(birthwt$ftv) > table(birthwt$race) > table(birthwt$smoke, birthwt$race) > xtabs(~ smoke + race, data=birthwt) # same result with better labels > table(birthwt$smoke, birthwt$race, birthwt$ht) > xtabs(~ smoke + race + ht, data=birthwt) tan(angle_in_radians or vector thereof) - tangent function > tan(pi/3) > tan(seq(from=0, to=2*pi, by=pi/4)) tapply(DV, IV, function) - calculate a function of a DV at each level of an IV (see by) > summary(mtcars) > attach(mtcars) # easiest as there is no formula interface > tapply(mpg, factor(am), mean) # group means > tapply(mpg, factor(am), var) # group variances > tapply(mpg, am, sd) # it doesn't really have to be a factor > tapply(mpg, list(am,gear), mean) # or a two-way classification > detach(mtcars) # don't leave it attached when done t.test(vector, mu=null_mean) - single-sample t test t.test(vector, mu=null_mean, alternative="greater" or "less") - one-sided test t.test(vector1, vector2) - two-sample t test with Welch correction t.test(vector1, vector2, var.equal=T) - no Welch correction (pooled variance t test) t.test(vector1, vector2, alternative="greater" or "less") - one-sided test (gp1 - gp2) t.test(formula, data) - two-sample t test using formula interface (from a dataframe) t.test(... , paired=T) - related samples t test > mpg = mtcars$mpg # a vector of automobile gas mileages > t.test(mpg, mu=20) # H0: mu = 20 > t.test(mpg, mu=20, alternative="greater") # H1: mu > 20 > mpg.4speeds = mtcars$mpg[mtcars$gear==4] # a vector: mpg for 4-speeds > mpg.5speeds = mtcars$mpg[mtcars$gear==5] # a vector: mpg for 5-speeds > t.test(mpg.4speeds, mpg.5speeds) # Welch-corrected t-test > t.test(mpg.4speeds, mpg.5speeds, var.eq=T) # pooled-variance t-test > t.test(mpg.4speeds, mpg.5speeds, var.eq=T, alternative="greater") # directional > data(mtcars) # a data frame > mtcars$am = factor(mtcars$am, labels=c("yes","no")) # automatic trans (yes/no) > t.test(mpg ~ am, data=mtcars) # Welch-corrected t-test > t.test(mpg ~ am, data=mtcars, var.eq=T) # pooled-variance t-test > t.test(mpg ~ am, data=mtcars, var.eq=T, alternative="less") # directional > data(sleep) # a data frame with paired scores > t.test(sleep$extra[1:10], sleep$extra[11:20], paired=T) > t.test(extra ~ group, data=sleep, paired=T) > t.test(extra ~ group, data=sleep, paired=T, alternative="greater") # wrong way! > sleep.wide = data.frame(sleep$extra[1:10], sleep$extra[11:20]) # more sensibly tabled!
> colnames(sleep.wide) = c("control","drug") > sleep.wide > with(sleep.wide, t.test(control, drug, paired=T)) text(x-coord, y-coord, "string of text") - add text to an existing plot > plot(pressure) # get a graph to write on > text(x=200, y=600, "text is centered at coords") title(main, xlab, ylab) - add titles and axis labels to an existing plot > plot(pressure, xlab="", ylab="", main="") # graph with no labels > title(main="The Main Title") > title(xlab="This is the x-axis") > title(ylab="This is the y-axis", cex.lab=1.3) TukeyHSD(aov_model, which) - Tukey HSD post hoc test on an anova model > aov.out = aov(count ~ spray, data=InsectSprays) # get an aov_model to work with > TukeyHSD(aov.out) > aov.out = aov(len ~ supp * factor(dose), data=ToothGrowth) # a factorial > TukeyHSD(aov.out, which="factor(dose)") # Tukey on dose > ### don't forget the quotes in which; I always do because they seem so unnecessary unique(vector) - get the unique values in a vector > unique(c(1,1,1,1,1,1,2,3,2,1,1,1,1)) > vector = sample(c("Fred","Barney"), size=50, replace=T) > unique(vector) update(lm_model, ~.-variable_name) - update an lm_model by deleting a variable > names(mtcars) > lm.out = lm(mpg ~ cyl + disp + hp + wt, data=mtcars) > summary(lm.out) > lm.out2 = update(lm.out, ~.-disp) > summary(lm.out2) > lm.out3 = update(lm.out2, ~.+am) # variables can also be added > summary(lm.out3) var(vector) - get the sample variance of a numeric variable > var(rivers) > var(USArrests) # variance-covariance matrix (same as cov in this case) var.test(vector1, vector2) - test for a significant difference betw. variances > var.test(rivers[1:70], rivers[71:140]) # test 1st half of rivers vector against 2nd half > var(cbind(rivers[1:70], rivers[71:140])) # check the variance-covariance matrix > summary(warpbreaks) # a data frame > var.test(breaks ~ wool, data=warpbreaks) # formula interface vector_name[i] - get the value at position i of the vector vector_name[i:j] - get values at positions i through j of the vector > rivers[100] > rivers[100:120] weighted.mean(vector, w=weights) - weighted mean of a vector > weighted.mean(c(12, 15, 10, 8, 10)) # gives ordinary mean if no weights given > weighted.mean(c(12, 15, 10, 8, 10), w=c(23, 25, 20, 21, 20)) # perhaps an unbalanced design > grades = c(3, 2.5, 3, 4, 4, 3.5) # calculation of GPA (B, C+, B, A, A, B+) > course.credits = c(1, 3, 3, 2, 1, 3) > weighted.mean(grades, w=course.credits) # compare to mean(grades) which(vector .test. value) - find out which values pass the test > ### .test.
may be < (less than), > (greater than), == (equals), != (not equal) > which(rivers > 1000) # returns case numbers that pass the test > rivers[which(rivers>1000)] # see the actual values in the vector > rivers[rivers>1000] # same result as last one > which(rivers == 1000) > which(rivers < 200) > which(rivers > 500 & rivers < 1000) # and > which(rivers < 500 | rivers > 1000) # or wilcox.test(formula, data) - commonly called the Mann-Whitney U test, for independent groups wilcox.test(formula, data, paired=T) - Wilcoxon signed-rank test for paired scores > wilcox.test(extra ~ group, data=sleep) > wilcox.test(extra ~ group, data=sleep, paired=T) # in fact, these scores are paired windows(width, height, pointsize) - open a graphics window (Windows only; see quartz for Mac) > windows(width=5, height=5, pointsize=10) # width and height in inches with(dataframe_name, function) - specify dataframe to be used with a function > with(women, mean(weight)) # mean of weight variable in women data frame > mean(women$weight) # accomplishes the same thing > attach(women); mean(weight); detach(women) # if you really want to work for it! write.csv(dataframe_name, file) - write out a data frame as a CSV file > women # a data frame to write out > write.csv(women, file="women.csv", row.names=F, quote=F) > ### it's now in your working directory as women.csv without the rownames and with text unquoted > ### use quote=T if you've been foolish enough to put commas or blank spaces in text items > ### use row.names=T if you have meaningful rownames xtabs(formula, data) - crosstabulation > X = sample(c("Wilma","Betty"), size=50, replace=T) # some data to crosstabulate > Y = sample(c("Fred","Barney"), size=50, replace=T) > xtabs(~ X + Y) > df = as.data.frame.table(UCBAdmissions) # a data frame with a Freq column > head(df) > xtabs(Freq ~ Admit + Gender, data=df) created 2016 February 23