Computational Biology 381

Homework 4 part A: Metadata

# Preliminaries ----
# Show which objects are currently in the workspace, then remove all of them
ls()
## character(0)
rm(list = ls())

# Set the working directory so the relative data path below resolves
setwd("~/ComputationalBiology/CompBio")

# Read the comma-separated raw data (first row is the header) and keep
# text columns as character vectors instead of converting them to factors
Data <- read.table(
  "RawData/hbdata.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

# Inspect the structure: dimensions, column names and column types
str(Data)
## 'data.frame':    172 obs. of  18 variables:
##  $ FieldID   : chr  "Y1L1" "Y1L2" "Y1L3" "Y1L4" ...
##  $ Time      : chr  "T1" "T1" "T1" "T1" ...
##  $ Origin    : chr  "Local" "Local" "Local" "Local" ...
##  $ Yard      : chr  "Yard1" "Yard1" "Yard1" "Yard1" ...
##  $ Mass      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Nosema    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Varroa    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Brood     : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ NosemaPA  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ VarroaPA  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ MassDay   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ NosemaDay : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ VarroaDay : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ BroodDay  : int  -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 ...
##  $ MassDate  : chr  NA NA NA NA ...
##  $ NosemaDate: chr  NA NA NA NA ...
##  $ VarroaDate: chr  NA NA NA NA ...
##  $ BroodDate : chr  "5/30/16" "5/30/16" "5/30/16" "5/30/16" ...
# Count how many rows fall under each value of the Origin column
table(Data$Origin)
## 
## California      Local 
##         79         93
# Preview the first rows of the data frame (head() shows six by default)
head(Data)
##   FieldID Time Origin  Yard Mass Nosema Varroa Brood NosemaPA VarroaPA
## 1    Y1L1   T1  Local Yard1   NA     NA     NA     4       NA       NA
## 2    Y1L2   T1  Local Yard1   NA     NA     NA     4       NA       NA
## 3    Y1L3   T1  Local Yard1   NA     NA     NA     4       NA       NA
## 4    Y1L4   T1  Local Yard1   NA     NA     NA     4       NA       NA
## 5    Y1L5   T1  Local Yard1   NA     NA     NA     4       NA       NA
## 6    Y1L6   T1  Local Yard1   NA     NA     NA     4       NA       NA
##   MassDay NosemaDay VarroaDay BroodDay MassDate NosemaDate VarroaDate
## 1      NA        NA        NA       -4     <NA>       <NA>       <NA>
## 2      NA        NA        NA       -4     <NA>       <NA>       <NA>
## 3      NA        NA        NA       -4     <NA>       <NA>       <NA>
## 4      NA        NA        NA       -4     <NA>       <NA>       <NA>
## 5      NA        NA        NA       -4     <NA>       <NA>       <NA>
## 6      NA        NA        NA       -4     <NA>       <NA>       <NA>
##   BroodDate
## 1   5/30/16
## 2   5/30/16
## 3   5/30/16
## 4   5/30/16
## 5   5/30/16
## 6   5/30/16
# Preview the last rows of the data frame (tail() shows six by default)
tail(Data)
##     FieldID Time     Origin  Yard Mass  Nosema Varroa Brood NosemaPA
## 167    Y1C9   T5 California Yard1 23.2 3100000      0     4       NA
## 168    Y2C1   T5 California Yard2 35.8 3275000      2     5       NA
## 169    Y2C3   T5 California Yard2 61.6 2550000      1     5       NA
## 170    Y2C4   T5 California Yard2 39.2 1300000      4     1       NA
## 171    Y2C7   T5 California Yard2 37.4 2300000      1     5       NA
## 172   Y2C10   T5 California Yard2 76.0 2200000      0     6       NA
##     VarroaPA MassDay NosemaDay VarroaDay BroodDay MassDate NosemaDate
## 167        0      67        67        61       61   8/9/16     8/9/16
## 168        1      67        67        61       61   8/9/16     8/9/16
## 169        1      67        67        61       61   8/9/16     8/9/16
## 170        1      67        67        61       61   8/9/16     8/9/16
## 171        1      67        67        61       61   8/9/16     8/9/16
## 172        0      67        67        61       61   8/9/16     8/9/16
##     VarroaDate BroodDate
## 167     8/3/16    8/3/16
## 168     8/3/16    8/3/16
## 169     8/3/16    8/3/16
## 170     8/3/16    8/3/16
## 171     8/3/16    8/3/16
## 172     8/3/16    8/3/16
# Per-column summary: quartiles, mean and NA counts for numeric columns;
# length, class and mode for character columns
summary(Data)
##    FieldID              Time              Origin         
##  Length:172         Length:172         Length:172        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##      Yard                Mass          Nosema             Varroa      
##  Length:172         Min.   :12.8   Min.   :       0   Min.   : 0.000  
##  Class :character   1st Qu.:24.4   1st Qu.:  100000   1st Qu.: 1.000  
##  Mode  :character   Median :31.6   Median :  762500   Median : 1.000  
##                     Mean   :36.2   Mean   : 1492708   Mean   : 3.376  
##                     3rd Qu.:40.6   3rd Qu.: 1943750   3rd Qu.: 4.000  
##                     Max.   :91.4   Max.   :10075000   Max.   :36.000  
##                     NA's   :45     NA's   :76         NA's   :47      
##      Brood         NosemaPA         VarroaPA        MassDay     
##  Min.   :0.00   Min.   :0.0000   Min.   :0.000   Min.   :11.00  
##  1st Qu.:4.00   1st Qu.:1.0000   1st Qu.:1.000   1st Qu.:11.00  
##  Median :4.00   Median :1.0000   Median :1.000   Median :34.00  
##  Mean   :4.17   Mean   :0.9143   Mean   :0.816   Mean   :37.99  
##  3rd Qu.:5.00   3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:53.00  
##  Max.   :8.00   Max.   :1.0000   Max.   :1.000   Max.   :67.00  
##  NA's   :1      NA's   :102      NA's   :47      NA's   :40     
##    NosemaDay       VarroaDay        BroodDay       MassDate        
##  Min.   :13.00   Min.   :12.00   Min.   :-4.00   Length:172        
##  1st Qu.:13.00   1st Qu.:12.00   1st Qu.:12.00   Class :character  
##  Median :46.00   Median :24.00   Median :24.00   Mode  :character  
##  Mean   :38.02   Mean   :33.05   Mean   :23.17                     
##  3rd Qu.:67.00   3rd Qu.:47.00   3rd Qu.:40.00                     
##  Max.   :67.00   Max.   :61.00   Max.   :61.00                     
##  NA's   :75      NA's   :40                                        
##   NosemaDate         VarroaDate         BroodDate        
##  Length:172         Length:172         Length:172        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
## 

Homework 4 part B: Regular Expressions

Solutions to the problems:

1)

Find: \s{2,}

Replace: , 

2)

Find: (\w+),\s(\w+),\s(.*)

Replace: \2 \1 (\3)

3a)

Find: \.mp3\s+

Replace: .mp3\n

3b)

Find: (\d+)\s(.*)(\.mp3)

Replace: \2_\1\3

4a)

Find: ([A-Z])(\w*),(\w*),(.*,)(\d+)

Replace: \1_\3,\5

4b)

Find: ([A-Z])(\w*),(\w{4})(.*,)(.*,)(\d+)

Replace: \1_\3,\6