Looking at 4 main analyses:

Regression

# data
xVar <- 1:10
yVar <- runif(10)
dataFrame <- data.frame(xVar,yVar)

# model
regModel <- lm(yVar~xVar,data=dataFrame)

# model output
print(regModel)

## 
## Call:
## lm(formula = yVar ~ xVar, data = dataFrame)
## 
## Coefficients:
## (Intercept)         xVar  
##    0.709479    -0.003963

print(summary(regModel))

## 
## Call:
## lm(formula = yVar ~ xVar, data = dataFrame)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28713 -0.07299  0.01080  0.07893  0.26695 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.709479   0.113304   6.262 0.000243 ***
## xVar        -0.003963   0.018261  -0.217 0.833615    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1659 on 8 degrees of freedom
## Multiple R-squared:  0.005853,   Adjusted R-squared:  -0.1184 
## F-statistic: 0.0471 on 1 and 8 DF,  p-value: 0.8336

# plot
plot(y=dataFrame$yVar,x=dataFrame$xVar,pch=21,bg="lightblue",cex=2)
abline(regModel)

ANOVA

# data
xVar <- as.factor(rep(c("Control","Heated","Cooled"),each=5))
yVar <- c(rgamma(10,shape=5,scale=5),rgamma(5,shape=5,scale=10))
dataFrame <- data.frame(xVar,yVar)

# model
anovaModel <- aov(yVar~xVar,data=dataFrame)

# model output
print(anovaModel)

## Call:
##    aov(formula = yVar ~ xVar, data = dataFrame)
## 
## Terms:
##                     xVar Residuals
## Sum of Squares  2981.435  3202.527
## Deg. of Freedom        2        12
## 
## Residual standard error: 16.33638
## Estimated effects may be unbalanced

summary(anovaModel)

##             Df Sum Sq Mean Sq F value Pr(>F)  
## xVar         2   2981  1490.7   5.586 0.0193 *
## Residuals   12   3203   266.9                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# plot
boxplot(yVar~xVar,data=dataFrame,col=c("grey","thistle","orchid"))

CONTINGENCY TABLE ANALYSIS

# data
vec1 <- c(50,66,22)
vec2 <- c(120,22,30)
dataMatrix <- rbind(vec1,vec2)
rownames(dataMatrix) <- c("Cold","Warm")
colnames(dataMatrix) <-c("Aphaenogaster",
                         "Camponotus",
                         "Crematogaster")



# model + model output
print(chisq.test(dataMatrix))

## 
##  Pearson's Chi-squared test
## 
## data:  dataMatrix
## X-squared = 48.914, df = 2, p-value = 2.391e-11

# plot
mosaicplot(x=dataMatrix,
           col=c("goldenrod","grey","black"),
           shade=FALSE)

barplot(height=dataMatrix,
        beside=TRUE,
        col=c("cornflowerblue","tomato"))

LOGISTIC REGRESSION

# data
xVar <- rgamma(n=20,shape=5,scale=5)
yVar <- rbinom(n=20,size=1,p=0.5)
dataFrame <- data.frame(xVar,yVar)

# model
logRegMod <- glm(yVar ~ xVar,
                 data=dataFrame,
                 family=binomial(link="logit"))
# model output
print(logRegMod)

## 
## Call:  glm(formula = yVar ~ xVar, family = binomial(link = "logit"), 
##     data = dataFrame)
## 
## Coefficients:
## (Intercept)         xVar  
##     1.70227     -0.06478  
## 
## Degrees of Freedom: 19 Total (i.e. Null);  18 Residual
## Null Deviance:       27.53 
## Residual Deviance: 25.94     AIC: 29.94

summary(logRegMod)

## 
## Call:
## glm(formula = yVar ~ xVar, family = binomial(link = "logit"), 
##     data = dataFrame)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4925  -1.1161   0.7621   0.9819   1.8633  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  1.70227    1.35767   1.254    0.210
## xVar        -0.06478    0.05540  -1.169    0.242
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.526  on 19  degrees of freedom
## Residual deviance: 25.936  on 18  degrees of freedom
## AIC: 29.936
## 
## Number of Fisher Scoring iterations: 4

# plot
plot(x=dataFrame$xVar, y=dataFrame$yVar,pch=21,bg="tan",cex=2.5)
curve(predict(logRegMod,data.frame(xVar=x),type="response"),add=TRUE,lwd=2)

HOMEWORK:

1) Set up a new markdown file for this homework. For each of the 4 models create a function to run the model. You will need to think carefully about the formal parameters for the input, the default values, and the output from your function. The output should just include model results, not any graphics.

create the function in a single R chunk
immediately after the function run it with its default values
create a tiny fake data set (use some of the random number functions you now know about) and run your code on the fake data.

FAKE DATA SET

# Create fake Data Set:
Pop1 <- runif(20)
Pop2 <- runif(20)
Factor1 <- c(rep("A", 10), rep("B", 10))
Factor2 <- as.factor(c(rep(1, 10), rep(0, 10)))
MyDF <- data.frame(Pop1, Pop2, Factor1, Factor2)

REGRESSION:

###########################################################################
# Function for a simple linear regression (takes cont. y and x variables)
###########################################################################

Regression <- function(x=1:10, y=runif(10)){
  regModel <- lm(y~x)
  
  z <- list(regModel, summary(regModel))
return(z)
}

###########################################################################
# END OF FUNCITON
###########################################################################

Test Regression:

# run function on default values
Regression()

## [[1]]
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##     0.88550     -0.04699  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.35746 -0.26297 -0.02917  0.25538  0.43328 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.88550    0.22262   3.978  0.00408 **
## x           -0.04699    0.03588  -1.310  0.22667   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3259 on 8 degrees of freedom
## Multiple R-squared:  0.1765, Adjusted R-squared:  0.07361 
## F-statistic: 1.715 on 1 and 8 DF,  p-value: 0.2267

# Test Function with fake Data Set:
Regression(x=MyDF$Pop1, y=MyDF$Pop2)

## [[1]]
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      0.4757       0.1111  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44392 -0.21719 -0.06129  0.25113  0.45066 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   0.4757     0.1653   2.877    0.010 *
## x             0.1111     0.2798   0.397    0.696  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2825 on 18 degrees of freedom
## Multiple R-squared:  0.008678,   Adjusted R-squared:  -0.0464 
## F-statistic: 0.1576 on 1 and 18 DF,  p-value: 0.6961

ANOVA:

###########################################################################
######### Function for a simple ANOVA (takes cont. y and disc. x variables)
###########################################################################

myANOVA <- function(x=as.factor(c(rep("cat", 5), rep("dog", 5))), y=runif(10)){
  AnModel <- aov(y~x)
  
  z <- list(AnModel, summary(AnModel))
return(z)
}

###########################################################################
# END OF FUNCITON
###########################################################################

Test ANOVA:

# run function on default values
myANOVA()

## [[1]]
## Call:
##    aov(formula = y ~ x)
## 
## Terms:
##                         x Residuals
## Sum of Squares  0.0268161 0.8287665
## Deg. of Freedom         1         8
## 
## Residual standard error: 0.321863
## Estimated effects may be unbalanced
## 
## [[2]]
##             Df Sum Sq Mean Sq F value Pr(>F)
## x            1 0.0268 0.02682   0.259  0.625
## Residuals    8 0.8288 0.10360

# Test Function with fake Data Set:
myANOVA(x=MyDF$Factor1, y=MyDF$Pop2)

## [[1]]
## Call:
##    aov(formula = y ~ x)
## 
## Terms:
##                         x Residuals
## Sum of Squares  0.0248241 1.4247597
## Deg. of Freedom         1        18
## 
## Residual standard error: 0.281342
## Estimated effects may be unbalanced
## 
## [[2]]
##             Df Sum Sq Mean Sq F value Pr(>F)
## x            1 0.0248 0.02482   0.314  0.582
## Residuals   18 1.4248 0.07915

CONTINGENCY:

###########################################################################
# Function for a contingency anlaysis (takes disc. y and disc. x variables)
###########################################################################

myCONT <- function(x=as.factor(c(rep("cat", 3), rep("dog", 17))), y=as.factor(c(rep("0", 10), rep("1", 10)))){
  dataMatrix <- rbind(x,y)
  chiModel <- chisq.test(dataMatrix)
  
  z <- list(chiModel, summary(chiModel))
return(z)
}

###########################################################################
# END OF FUNCITON
###########################################################################

Test Contingency:

# run function on default values
myCONT()

## Warning in chisq.test(dataMatrix): Chi-squared approximation may be
## incorrect

## [[1]]
## 
##  Pearson's Chi-squared test
## 
## data:  dataMatrix
## X-squared = 1.6197, df = 19, p-value = 1
## 
## 
## [[2]]
##           Length Class  Mode     
## statistic  1     -none- numeric  
## parameter  1     -none- numeric  
## p.value    1     -none- numeric  
## method     1     -none- character
## data.name  1     -none- character
## observed  40     -none- numeric  
## expected  40     -none- numeric  
## residuals 40     -none- numeric  
## stdres    40     -none- numeric

# Test Function with fake Data Set:
myCONT(x=MyDF$Factor1, y=MyDF$Factor2)

## Warning in chisq.test(dataMatrix): Chi-squared approximation may be
## incorrect

## [[1]]
## 
##  Pearson's Chi-squared test
## 
## data:  dataMatrix
## X-squared = 6.6667, df = 19, p-value = 0.9958
## 
## 
## [[2]]
##           Length Class  Mode     
## statistic  1     -none- numeric  
## parameter  1     -none- numeric  
## p.value    1     -none- numeric  
## method     1     -none- character
## data.name  1     -none- character
## observed  40     -none- numeric  
## expected  40     -none- numeric  
## residuals 40     -none- numeric  
## stdres    40     -none- numeric

LOG REG:

###########################################################################
# Function for a logistic regression (takes disc. y and cont. x variables)
###########################################################################

LogReg <- function(x=runif(20), y=as.factor(c(rep("0", 10), rep("1", 10)))){
  logRegMod <- glm(y~x,family=binomial(link="logit"))
  
  z <- list(logRegMod, summary(logRegMod))
return(z)
}

###########################################################################
# END OF FUNCITON
###########################################################################

Test Log Reg.:

# run function on default values
LogReg()

## [[1]]
## 
## Call:  glm(formula = y ~ x, family = binomial(link = "logit"))
## 
## Coefficients:
## (Intercept)            x  
##     -0.3647       0.8009  
## 
## Degrees of Freedom: 19 Total (i.e. Null);  18 Residual
## Null Deviance:       27.73 
## Residual Deviance: 27.37     AIC: 31.37
## 
## [[2]]
## 
## Call:
## glm(formula = y ~ x, family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.36322  -1.13423  -0.02011   1.21856   1.31599  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -0.3647     0.7634  -0.478    0.633
## x             0.8009     1.3567   0.590    0.555
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.726  on 19  degrees of freedom
## Residual deviance: 27.372  on 18  degrees of freedom
## AIC: 31.372
## 
## Number of Fisher Scoring iterations: 4

# Test Function with fake Data Set:
LogReg(x=MyDF$Pop1, y=MyDF$Factor2)

## [[1]]
## 
## Call:  glm(formula = y ~ x, family = binomial(link = "logit"))
## 
## Coefficients:
## (Intercept)            x  
##      0.8077      -1.4757  
## 
## Degrees of Freedom: 19 Total (i.e. Null);  18 Residual
## Null Deviance:       27.73 
## Residual Deviance: 27.19     AIC: 31.19
## 
## [[2]]
## 
## Call:
## glm(formula = y ~ x, family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.33671  -1.12504  -0.07615   1.15935   1.41314  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   0.8077     1.2166   0.664    0.507
## x            -1.4757     2.0531  -0.719    0.472
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.726  on 19  degrees of freedom
## Residual deviance: 27.190  on 18  degrees of freedom
## AIC: 31.19
## 
## Number of Fisher Scoring iterations: 4

2) Now, for each of the 4 statistical models, write a graphics function that will generate a nice plot of the results. The formal parameters for input to your graphics function should be the same as the input for your corresponding stats function. Again, illustrate the graphics function for your default settings and for the tiny fake data set you created in 1.

REGRESSION PLOT:

###########################################################################
# function that plots data from regression (takes cont. y and x variables):
###########################################################################

RegressionPlot <- function(x=1:10, y=runif(10)){
  
  p <- plot(y=y,x=x,pch=21,bg="lightblue",cex=2, main="REGRESSION PLOT")
  regModel <- lm(y~x)
  abline(regModel)
  
return(p)
}

###########################################################################
# END OF FUNCITON
###########################################################################

Test Regression Plot:

# run the function with stock values:
RegressionPlot()

## NULL

# Test Function with fake Data Set:
RegressionPlot(x=MyDF$Pop1, y=MyDF$Pop2)

## NULL

ANOVA PLOT:

###########################################################################
# Function that plots box plots from ANOVA (takes cont. y and x variables)
###########################################################################

myANOVAplot <- function(x=as.factor(c(rep("cat", 5), rep("dog", 5))), y=runif(10)){
  
  p <- boxplot(y~x, main="ANOVA PLOT")
  
return(p)
}

###########################################################################
# END OF FUNCITON
###########################################################################

Test ANOVA Plot:

# run function on default values
myANOVAplot()

## $stats
##             [,1]       [,2]
## [1,] 0.007710193 0.01264113
## [2,] 0.027367512 0.11695454
## [3,] 0.248532671 0.19298769
## [4,] 0.723226626 0.58424330
## [5,] 0.885160061 0.86299149
## 
## $n
## [1] 5 5
## 
## $conf
##            [,1]       [,2]
## [1,] -0.2431596 -0.1371974
## [2,]  0.7402250  0.5231728
## 
## $out
## numeric(0)
## 
## $group
## numeric(0)
## 
## $names
## [1] "cat" "dog"

# Test Function with fake Data Set:
myANOVAplot(x=MyDF$Factor1, y=MyDF$Pop2)

## $stats
##            [,1]      [,2]
## [1,] 0.08352479 0.1248698
## [2,] 0.35611704 0.2963937
## [3,] 0.57059685 0.4195975
## [4,] 0.77422063 0.7734556
## [5,] 0.96052560 0.9747175
## 
## $n
## [1] 10 10
## 
## $conf
##           [,1]      [,2]
## [1,] 0.3616956 0.1812383
## [2,] 0.7794981 0.6579566
## 
## $out
## numeric(0)
## 
## $group
## numeric(0)
## 
## $names
## [1] "A" "B"

CONTINGENCY PLOT:

###########################################################################
#### Function for a contingency plot (takes disc. y and disc. x variables)
###########################################################################

myCONTplot <- function(x=as.factor(c(rep("cat", 10), rep("dog", 10))), y=as.factor(c(rep("0", 10), rep("1", 10)))){
  dataMatrix <- rbind(x,y)
 p <- mosaicplot(x=dataMatrix,
           shade=FALSE)

return(p)
}

###########################################################################
# END OF FUNCITON
###########################################################################

Test Contingency Plot:

# run function on default values
myCONTplot()

## NULL

# Test Function with fake Data Set:
myCONTplot(x=MyDF$Factor1, y=MyDF$Factor2)

## NULL

LOG REG PLOT:

###########################################################################
### Function for a logistic reg. plot (takes disc. y and cont. x variables)
###########################################################################

LogRegplot <- function(x=runif(20), y=as.factor(c(rep("0", 10), rep("1", 10)))){
  
  logRegMod <- glm(y~x,family=binomial(link="logit"))
  plot(x=x, y=y,pch=21,bg="tan",cex=2.5)
curve(predict(logRegMod,data.frame(x),type="response"),add=TRUE,lwd=2)
return()
}

###########################################################################
# END OF FUNCITON
###########################################################################

Test Log Reg. Plot:

# run function on default values
LogRegplot()

## NULL

# Test Function with fake Data Set:
LogRegplot(x=MyDF$Pop1, y=MyDF$Factor1)

## NULL

Computational Bio. - Homework 8

P. Alexander Burnham

March 8, 2017