Data organisation with Spreadsheets
Overview
Teaching: XX min
Exercises: XX min
Questions
How to organise tabular data?
Objectives
Learn about spreadsheets, their strengths and weaknesses
How do we format data in spreadsheets for effective data use?
Learn about common spreadsheet errors and how to correct them.
Organize your data according to tidy data principles.
Learn about text-based spreadsheet formats such as the comma-separated (CSV) or tab-separated formats.
Spreadsheet programs
Question
- What are basic principles for using spreadsheets for good data organization?
Objective
- Describe best practices for organizing data so computers can make the best use of data sets.
Keypoint
- Good data organization is the foundation of any research project.
Good data organization is the foundation of your research project. Most researchers have data or do data entry in spreadsheets. Spreadsheet programs are very useful graphical interfaces for designing data tables and handling very basic data quality control functions. See also @Broman:2018.
Spreadsheet outline
Spreadsheets are good for data entry. Therefore we have a lot of data in spreadsheets. Much of your time as a researcher will be spent in this ‘data wrangling’ stage. It’s not the most fun, but it’s necessary. We’ll teach you how to think about data organization and some practices for more effective data wrangling.
What this lesson will not teach you
- How to do statistics in a spreadsheet
- How to do plotting in a spreadsheet
- How to write code in spreadsheet programs
If you’re looking to do this, a good reference is Head First Excel, published by O’Reilly.
Why aren’t we teaching data analysis in spreadsheets?
- Data analysis in spreadsheets usually requires a lot of manual work. If you want to change a parameter or run an analysis with a new dataset, you usually have to redo everything by hand. (We do know that you can create macros, but see the next point.)
- It is also difficult to track or reproduce statistical or plotting analyses done in spreadsheet programs when you want to go back to your work or someone asks for details of your analysis.
Many spreadsheet programs are available. Since most participants utilise Excel as their primary spreadsheet program, this lesson will make use of Excel examples. A free spreadsheet program that can also be used is LibreOffice. Commands may differ a bit between programs, but the general idea is the same.
Spreadsheet programs encompass a lot of the things we need to be able to do as researchers. We can use them for:
- Data entry
- Organizing data
- Subsetting and sorting data
- Statistics
- Plotting
Spreadsheet programs use tables to represent and display data. Data formatted as tables is also the main theme of this chapter, and we will see how to organise data into tables in a standardised way to ensure efficient downstream analysis.
Problems with Spreadsheets
Spreadsheets are good for data entry, but in reality we tend to use spreadsheet programs for much more than data entry. We use them to create data tables for publications, to generate summary statistics, and make figures.
Generating tables for publications in a spreadsheet is not optimal - often, when formatting a data table for publication, we’re reporting key summary statistics in a way that is not really meant to be read as data, and often involves special formatting (merging cells, creating borders, making it pretty). We advise you to do this sort of operation within your document editing software.
The latter two applications, generating statistics and figures, should be used with caution: because of the graphical, drag and drop nature of spreadsheet programs, it can be very difficult, if not impossible, to replicate your steps (much less retrace anyone else’s), particularly if your stats or figures require you to do more complex calculations. Furthermore, in doing calculations in a spreadsheet, it’s easy to accidentally apply a slightly different formula to multiple adjacent cells. When using a command-line based statistics program like R or SAS, it’s practically impossible to apply a calculation to one observation in your dataset but not another unless you’re doing it on purpose.
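To see why scripted calculations are safer, here is a minimal sketch in Python with pandas (the table and column names are illustrative, not from the lesson's dataset): a single expression applies the same formula to every row, so there is no per-cell formula that can silently drift.

```python
import pandas as pd

# Toy patient table; column names are illustrative
patients = pd.DataFrame({"weight_kg": [61.2, 72.5, 80.1]})

# One expression converts every row with the same formula -
# unlike a spreadsheet, no individual cell can hold a slightly
# different version of the calculation
patients["weight_lb"] = patients["weight_kg"] * 2.20462
```

Rerunning the script on a corrected or extended dataset reproduces the calculation exactly, which is the point made above about R or SAS.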
Using Spreadsheets for Data Entry and Cleaning
However, there are circumstances where you might want to use a spreadsheet program to produce “quick and dirty” calculations or figures, and data cleaning will help you use some of these features. Data cleaning also puts your data in a better format prior to importation into a statistical analysis program. We will show you how to use some features of spreadsheet programs to check your data quality along the way and produce preliminary summary statistics.
In this lesson, we will assume that you are most likely using Excel as your primary spreadsheet program - there are others (Gnumeric, Calc from OpenOffice), and their functionality is similar, but Excel seems to be the program most used by biologists and biomedical researchers.
In this lesson we’re going to talk about:
- Formatting data tables in spreadsheets
- Formatting problems
- Dates as data
- Quality control
- Exporting data
Formatting data tables in spreadsheets
Questions
- How do we format data in spreadsheets for effective data use?
Objectives
- Describe best practices for data entry and formatting in spreadsheets.
- Apply best practices to arrange variables and observations in a spreadsheet.
Keypoints
- Never modify your raw data. Always make a copy before making any changes.
- Keep track of all of the steps you take to clean your data in a plain text file.
- Organize your data according to tidy data principles.
The most common mistake made is treating spreadsheet programs like lab notebooks, that is, relying on context, notes in the margin, spatial layout of data and fields to convey information. As humans, we can (usually) interpret these things, but computers don’t view information the same way, and unless we explain to the computer what every single thing means (and that can be hard!), it will not be able to see how our data fits together.
Using the power of computers, we can manage and analyze data in much more effective and faster ways, but to use that power, we have to set up our data for the computer to be able to understand it (and computers are very literal).
This is why it’s extremely important to set up well-formatted tables from the outset - before you even start entering data from your very first preliminary experiment. Data organization is the foundation of your research project. It can make it easier or harder to work with your data throughout your analysis, so it’s worth thinking about when you’re doing your data entry or setting up your experiment. You can set things up in different ways in spreadsheets, but some of these choices can limit your ability to work with the data in other programs, or make it harder for the you-of-6-months-from-now or for your collaborators to work with the data.
Note: the best layouts/formats (as well as software and interfaces) for data entry and data analysis might be different. It is important to take this into account, and ideally automate the conversion from one to another.
Keeping track of your analyses
When you’re working with spreadsheets, during data clean up or analyses, it’s very easy to end up with a spreadsheet that looks very different from the one you started with. In order to be able to reproduce your analyses or figure out what you did when a reviewer or instructor asks for a different analysis, you should
- create a new file with your cleaned or analyzed data. Don’t modify the original dataset, or you will never know where you started!
- keep track of the steps you took in your clean up or analysis. You should track these steps as you would any step in an experiment. We recommend that you do this in a plain text file stored in the same folder as the data file.
This might be an example of a spreadsheet setup:
Put these principles into practice today during your exercises.
While versioning is out of scope for this course, you can look at the Carpentries lesson on ‘Git’ to learn how to maintain version control over your data. See also this blog post for a quick tutorial or @Perez-Riverol:2016 for a more research-oriented use-case.
Structuring data in spreadsheets
The cardinal rules of using spreadsheet programs for data:
- Put all your variables in columns - the thing you’re measuring, like ‘weight’ or ‘temperature’.
- Put each observation in its own row.
- Don’t combine multiple pieces of information in one cell. Sometimes it just seems like one thing, but think if that’s the only way you’ll want to be able to use or sort that data.
- Leave the raw data raw - don’t change it!
- Export the cleaned data to a text-based format like CSV (comma-separated values) format. This ensures that anyone can use the data, and is required by most data repositories.
For instance, we have data from patients that visited several hospitals from Brussels, Belgium. They recorded the date of the visit, the hospital, the patients’ gender, weight and blood group.
If we were to keep track of the data like this:
the problem is that the ABO and Rhesus groups are in the same Blood type column. So, if we wanted to look at all observations of the A group or look at weight distributions by ABO group, it would be tricky to do this using this data setup. If instead we put the ABO and Rhesus groups in different columns, you can see that it would be much easier.
An important rule when setting up a datasheet is that columns are used for variables and rows are used for observations:
- columns are variables
- rows are observations
- cells are individual values
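The blood-type example above can be sketched in a few lines of Python with pandas (the column names here are illustrative): once ABO and Rhesus live in separate columns, subsetting by either one is trivial.

```python
import pandas as pd

# Hypothetical records with ABO and Rhesus combined in one cell
records = pd.DataFrame({"blood_type": ["A+", "B+", "AB-", "O+"]})

# Split into one variable per column: ABO group and Rhesus factor
records["abo_group"] = records["blood_type"].str[:-1]
records["rhesus"] = records["blood_type"].str[-1]

# Now selecting all observations of the A group is a one-liner
a_group = records[records["abo_group"] == "A"]
```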
Challenge: We’re going to take a messy dataset and describe how we would clean it up.
Download the messy dataset by clicking here.
Open up the data in a spreadsheet program.
You can see that there are two tabs. The data contains various clinical variables recorded in various hospitals in Brussels during the first and second COVID-19 waves in 2020. As you can see, the data have been recorded differently during the March and November waves. Now you’re the person in charge of this project and you want to be able to start analyzing the data.
With the person next to you, identify what is wrong with this spreadsheet. Also discuss the steps you would need to take to clean up first and second wave tabs, and to put them all together in one spreadsheet.
Important: Do not forget our first piece of advice: create a new file (or tab) for the cleaned data, and never modify your original (raw) data. However, in this case we will have taken care of this for you!
After you go through this exercise, we’ll discuss as a group what was wrong with this data and how you would fix it.
- Take about 7 minutes to work on this exercise.
- All the mistakes in the common mistakes section below are present in the messy dataset. If the exercise is done during a workshop, ask people what they saw as wrong with the data. As they bring up different points, you can refer to the common mistakes or expand a bit on the point they brought up.
- If you get a response where they’ve fixed the date, you can pause and go to the dates lesson. Or you can say you’ll come back to dates at the end.
Challenge: Once you have tidied up the data, answer the following questions:
- How many men and women took part in the study?
- How many A, AB, and B types have been tested?
- As above, but disregarding the contaminated samples?
- How many Rhesus + and - have been tested?
- How many universal donors (0-) have been tested?
- What is the average weight of AB men?
- How many samples have been tested in the different hospitals?
An excellent reference, in particular with regard to R scripting is the Tidy Data paper @Wickham:2014.
Common Spreadsheet Errors
Questions
- What are some common challenges with formatting data in spreadsheets and how can we avoid them?
Objectives
- Recognize and resolve common spreadsheet formatting problems.
Keypoints
- Avoid using multiple tables within one spreadsheet.
- Avoid spreading data across multiple tabs.
- Record zeros as zeros.
- Use an appropriate null value to record missing data.
- Don’t use formatting to convey information or to make your spreadsheet look pretty.
- Place comments in a separate column.
- Record units in column headers.
- Include only one piece of information in a cell.
- Avoid spaces, numbers and special characters in column headers.
- Avoid special characters in your data.
- Record metadata in a separate plain text file.
There are a few potential errors to be on the lookout for in your own data as well as data from collaborators or the Internet. If you are aware of the errors and their possible negative effects on downstream data analysis and result interpretation, it might motivate you and your project members to try and avoid them. Making small changes to the way you format your data in spreadsheets can have a great impact on efficiency and reliability when it comes to data cleaning and analysis.
- Using multiple tables
- Using multiple tabs
- Not filling in zeros
- Using problematic null values
- Using formatting to convey information
- Using formatting to make the data sheet look pretty
- Placing comments or units in cells
- Entering more than one piece of information in a cell
- Using problematic field names
- Using special characters in data
- Inclusion of metadata in data table
Using multiple tables
A common strategy is creating multiple data tables within one spreadsheet. This confuses the computer, so don’t do this! When you create multiple tables within one spreadsheet, you’re drawing false associations between things for the computer, which sees each row as an observation. You’re also potentially using the same field name in multiple places, which will make it harder to clean your data up into a usable form. The example below depicts the problem:
In the example above, the computer will see (for example) row 4 and assume that all columns A-AF refer to the same sample. This row actually represents four distinct samples (sample 1 for each of four different collection dates - May 29th, June 12th, June 19th, and June 26th), as well as some calculated summary statistics (an average (avr) and standard error of measurement (SEM)) for two of those samples. Other rows are similarly problematic.
Using multiple tabs
But what about workbook tabs? That seems like an easy way to organize data, right? Well, yes and no. When you create extra tabs, you fail to allow the computer to see connections in the data that are there (you have to introduce spreadsheet application-specific functions or scripting to ensure this connection). Say, for instance, you make a separate tab for each day you take a measurement.
This isn’t good practice for two reasons:
-
you are more likely to accidentally add inconsistencies to your data if each time you take a measurement, you start recording data in a new tab, and
-
even if you manage to prevent all inconsistencies from creeping in, you will add an extra step for yourself before you analyze the data because you will have to combine these data into a single datatable. You will have to explicitly tell the computer how to combine tabs - and if the tabs are inconsistently formatted, you might even have to do it manually.
The next time you’re entering data, and you go to create another tab or table, ask yourself if you could avoid adding this tab by adding another column to your original spreadsheet. We used multiple tabs in our example of a messy data file, but now you’ve seen how you can reorganize your data to consolidate across tabs.
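Consolidating per-tab tables is also easy to script. A minimal sketch in Python with pandas (the hospital names and columns are made up for illustration): each tab becomes one table, and a new column records which wave each row came from, so no information is lost.

```python
import pandas as pd

# Hypothetical per-wave tables, as if each had come from its own tab
wave1 = pd.DataFrame({"hospital": ["St Luc", "Erasme"], "weight_kg": [70.0, 65.5]})
wave2 = pd.DataFrame({"hospital": ["Brugmann"], "weight_kg": [82.3]})

# Keep the tab's identity as data: add a 'wave' column, then stack
wave1["wave"] = 1
wave2["wave"] = 2
combined = pd.concat([wave1, wave2], ignore_index=True)
```

This only works smoothly when the tabs share consistent headers, which is exactly the consistency argument made above.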
Your data sheet might get very long over the course of the experiment. This makes it harder to enter data if you can’t see your headers at the top of the spreadsheet. But don’t repeat your header row. These can easily get mixed into the data, leading to problems down the road. Instead you can freeze the column headers so that they remain visible even when you have a spreadsheet with many rows.
Not filling in zeros
It might be that when you’re measuring something, it’s usually a zero, say the number of times a rabbit is observed in the survey. Why bother writing in the number zero in that column, when it’s mostly zeros?
However, there’s a difference between a zero and a blank cell in a spreadsheet. To the computer, a zero is actually data. You measured or counted it. A blank cell means that it wasn’t measured and the computer will interpret it as an unknown value (also known as a null or missing value).
The spreadsheets or statistical programs will likely mis-interpret blank cells that you intend to be zeros. By not entering the value of your observation, you are telling your computer to represent that data as unknown or missing (null). This can cause problems with subsequent calculations or analyses. For example, the average of a set of numbers which includes a single null value is always null (because the computer can’t guess the value of the missing observations). Because of this, it’s very important to record zeros as zeros and truly missing data as nulls.
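The difference shows up immediately in a calculation. A minimal sketch with NumPy (the counts are invented for illustration):

```python
import numpy as np

observed = [2, 0, 1, 3]           # zeros recorded as zeros
with_blank = [2, np.nan, 1, 3]    # a blank cell read in as NaN (missing)

np.mean(observed)       # 1.5 - the zero counts as data
np.mean(with_blank)     # nan - the missing value propagates
np.nanmean(with_blank)  # 2.0 - missing value dropped: a different answer
```

Whether the second value was a zero or a blank changes the average, which is why the distinction must be recorded faithfully.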
Using problematic null values
Example: using -999 or other numerical values (or zero) to represent missing data.
Solutions:
There are a few reasons why null values get represented differently within a dataset. Sometimes confusing null values are automatically recorded from the measuring device. If that’s the case, there’s not much you can do, but it can be addressed in data cleaning with a tool like OpenRefine before analysis. Other times different null values are used to convey different reasons why the data isn’t there. This is important information to capture, but it is in effect using one column to capture two pieces of information. Like for [using formatting to convey information](#formatting), it would be good here to create a new column like ‘data_missing’ and use that column to capture the different reasons.
Whatever the reason, it’s a problem if unknown or missing data is recorded as -999, 999, or 0.
Many statistical programs will not recognize that these are intended to represent missing (null) values. How these values are interpreted will depend on the software you use to analyze your data. It is essential to use a clearly defined and consistent null indicator.
Blanks (most applications) and NA (for R) are good choices. @White:2013 explain good choices for indicating null values for different software applications in their article.
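If a sentinel like -999 is already in your data, most analysis tools can be told what it means at import time. A minimal sketch with pandas (the file contents are invented for illustration):

```python
import pandas as pd
from io import StringIO

# Hypothetical file using -999 as a missing-data sentinel
raw = StringIO("id,count\n1,5\n2,-999\n3,0\n")

# Declare which token means "missing"; real zeros stay zeros
df = pd.read_csv(raw, na_values=["-999"])
```

Without the `na_values` argument, -999 would silently enter any averages or sums as a real number.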
Using formatting to convey information
Example: highlighting cells, rows or columns that should be excluded from an analysis, leaving blank rows to indicate separations in data.
Solution: create a new field to encode which data should be excluded.
Using formatting to make the data sheet look pretty
Example: merging cells.
Solution: If you’re not careful, formatting a worksheet to be more aesthetically pleasing can compromise your computer’s ability to see associations in the data. Merged cells will make your data unreadable by statistics software. Consider restructuring your data in such a way that you will not need to merge cells to organize your data.
Placing comments or units in cells
Most analysis software can’t see Excel or LibreOffice comments, and would be confused by comments placed within your data cells. As described above for formatting, create another field if you need to add notes to cells. Similarly, don’t include units in cells: ideally, all the measurements you place in one column should be in the same unit, but if for some reason they aren’t, create another field and specify the units the cell is in.
Entering more than one piece of information in a cell
Example: Recording ABO and Rhesus groups in one cell, such as A+, B+, A-, …
Solution: Don’t include more than one piece of information in a cell. This will limit the ways in which you can analyze your data. If you need both these measurements, design your data sheet to include this information. For example, include one column for the ABO group and one for the Rhesus group.
Using problematic field names
Choose descriptive field names, but be careful not to include spaces, numbers, or special characters of any kind. Spaces can be misinterpreted by parsers that use whitespace as delimiters and some programs don’t like field names that are text strings that start with numbers.
Underscores (`_`) are a good alternative to spaces. Consider writing names in camel case (like this: ExampleFileName) to improve readability. Remember that abbreviations that make sense at the moment may not be so obvious in 6 months, but don’t overdo it with names that are excessively long. Including the units in the field names avoids confusion and enables others to readily interpret your fields.
Examples
| Good Name | Good Alternative | Avoid |
|---|---|---|
| Max_temp_C | MaxTemp | Maximum Temp (°C) |
| Precipitation_mm | Precipitation | precmm |
| Mean_year_growth | MeanYearGrowth | Mean growth/year |
| sex | sex | M/F |
| weight | weight | w. |
| cell_type | CellType | Cell Type |
| Observation_01 | first_observation | 1st Obs |
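Headers can also be checked by script before analysis. A minimal sketch (the regular expression encodes one reasonable convention: start with a letter, then letters, digits, or underscores):

```python
import re

# One reasonable convention: a letter, then letters/digits/underscores
VALID_NAME = re.compile(r"^[A-Za-z][A-Za-z0-9_]*$")

def check_headers(names):
    """Return the headers likely to cause trouble downstream."""
    return [name for name in names if not VALID_NAME.match(name)]

bad = check_headers(["Max_temp_C", "Maximum Temp (°C)", "1st Obs", "sex"])
```

Running this on the table above flags exactly the names in the "Avoid" column.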
Using special characters in data
Example: You treat your spreadsheet program as a word processor when writing notes, for example copying data directly from Word or other applications.
Solution: This is a common strategy. For example, when writing longer text in a cell, people often include line breaks, em-dashes, etc in their spreadsheet. Also, when copying data in from applications such as Word, formatting and fancy non-standard characters (such as left- and right-aligned quotation marks) are included. When exporting this data into a coding/statistical environment or into a relational database, dangerous things may occur, such as lines being cut in half and encoding errors being thrown.
General best practice is to avoid adding characters such as newlines, tabs, and vertical tabs. In other words, treat a text cell as if it were a simple web form that can only contain text and spaces.
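If such characters have already crept into your data, they can be scrubbed in an automated way. A minimal sketch (the replacement table covers only the common offenders mentioned above and is not exhaustive):

```python
def clean_cell(text):
    """Replace curly quotes and em-dashes, and collapse embedded line breaks."""
    replacements = {
        "\u201c": '"', "\u201d": '"',        # left/right double quotes
        "\u2018": "'", "\u2019": "'",        # left/right single quotes
        "\u2014": "-",                       # em-dash
        "\n": " ", "\t": " ", "\x0b": " ",   # newlines, tabs, vertical tabs
    }
    for bad, good in replacements.items():
        text = text.replace(bad, good)
    return " ".join(text.split())  # collapse repeated whitespace
```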
Inclusion of metadata in data table
Example: You add a legend at the top or bottom of your data table explaining column meaning, units, exceptions, etc.
Solution: Recording data about your data (“metadata”) is essential. You may be on intimate terms with your dataset while you are collecting and analysing it, but the chances that you will still remember that the variable “sglmemgp” means single member of group, for example, or the exact algorithm you used to transform a variable or create a derived one, after a few months, a year, or more are slim.
As well, there are many reasons other people may want to examine or use your data - to understand your findings, to verify your findings, to review your submitted publication, to replicate your results, to design a similar study, or even to archive your data for access and re-use by others. While digital data by definition are machine-readable, understanding their meaning is a job for human beings. The importance of documenting your data during the collection and analysis phase of your research cannot be overestimated, especially if your research is going to be part of the scholarly record.
However, metadata should not be contained in the data file itself. Unlike a table in a paper or a supplemental file, metadata (in the form of legends) should not be included in a data file since this information is not data, and including it can disrupt how computer programs interpret your data file. Rather, metadata should be stored as a separate file in the same directory as your data file, preferably in plain text format with a name that clearly associates it with your data file. Because metadata files are free text format, they also allow you to encode comments, units, information about how null values are encoded, etc. that are important to document but can disrupt the formatting of your data file.
Additionally, file or database level metadata describes how the files that make up the dataset relate to each other; what format they are in; and whether they supersede or are superseded by previous files. A folder-level readme.txt file is the classic way of accounting for all the files and folders in a project.
(Text on metadata adapted from the online course Research Data MANTRA by EDINA and Data Library, University of Edinburgh. MANTRA is licensed under a Creative Commons Attribution 4.0 International License.)
Exporting data
Question
- How can we export data from spreadsheets in a way that is useful for downstream applications?
Objectives
- Store spreadsheet data in universal file formats.
- Export data from a spreadsheet to a CSV file.
Keypoints
- Data stored in common spreadsheet formats will often not be read correctly into data analysis software, introducing errors into your data.
- Exporting data from spreadsheets to formats like CSV or TSV puts it in a format that can be used consistently by most programs.
Storing the data you’re going to work with for your analyses in Excel default file format (`*.xls` or `*.xlsx` - depending on the Excel version) isn’t a good idea. Why?
- Because it is a proprietary format, and it is possible that, in the future, the technology needed to open the file will no longer exist (or will become sufficiently rare), making it inconvenient, if not impossible, to open the file.
- Other spreadsheet software may not be able to open files saved in a proprietary Excel format.
- Different versions of Excel may handle data differently, leading to inconsistencies. Dates are a well-documented example of inconsistencies in data storage.
- Finally, more journals and grant agencies are requiring you to deposit your data in a data repository, and most of them don’t accept Excel format. It needs to be in one of the formats discussed below.
- The above points also apply to other formats, such as the open data formats used by LibreOffice / Open Office. These formats are not static and do not get parsed the same way by different software packages.
Storing data in a universal, open, and static format will help deal with this problem. Try tab-delimited (tab separated values or TSV) or comma-delimited (comma separated values or CSV). CSV files are plain text files where the columns are separated by commas, hence ‘comma separated values’ or CSV. The advantage of a CSV file over an Excel/SPSS/etc. file is that we can open and read a CSV file using just about any software, including plain text editors like TextEdit or NotePad. Data in a CSV file can also be easily imported into other formats and environments, such as SQLite and R. We’re not tied to a certain version of a certain expensive program when we work with CSV files, so it’s a good format to work with for maximum portability and endurance. Most spreadsheet programs can save to delimited text formats like CSV easily, although they may give you a warning during the file export.
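The portability claim is easy to demonstrate: any language's standard library can write and read this format without Excel installed. A minimal sketch in Python (the species rows reuse the example data shown later in this lesson):

```python
import csv
import io

rows = [
    ["species_id", "genus", "species", "taxa"],
    ["AB", "Amphispiza", "bilineata", "Bird"],
]

# Write CSV into an in-memory buffer using only the standard library
buffer = io.StringIO()
csv.writer(buffer).writerows(rows)
text = buffer.getvalue()
```

The same `csv` module reads the text back, and a plain text editor can open it too.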
To save a file you have opened in Excel in CSV format:
- From the top menu select ‘File’ and ‘Save as’.
- In the ‘Format’ field, from the list, select ‘Comma Separated Values’ (`*.csv`).
- Double check the file name and the location where you want to save it and hit ‘Save’.
An important note for backwards compatibility: you can open CSV files in Excel!
A note on R and `xls`: There are R packages that can read `xls` files (as well as Google spreadsheets). It is even possible to access different worksheets in the `xls` documents.
But
- some of these only work on Windows
- this equates to replacing a (simple but manual) export to `csv` with additional complexity/dependencies in the data analysis R code
- data formatting best practices still apply
- Is there really a good reason why `csv` (or similar) is not adequate?
Caveats on commas
In some datasets, the data values themselves may include commas (,). In that case, the software which you use (including Excel) will most likely incorrectly display the data in columns. This is because the commas which are a part of the data values will be interpreted as delimiters.
For example, our data might look like this:
species_id,genus,species,taxa
AB,Amphispiza,bilineata,Bird
AH,Ammospermophilus,harrisi,Rodent, not censused
AS,Ammodramus,savannarum,Bird
BA,Baiomys,taylori,Rodent
In the record `AH,Ammospermophilus,harrisi,Rodent, not censused` the value for `taxa` includes a comma (`Rodent, not censused`). If we try to read the above into Excel (or another spreadsheet program), we will get something like this:
The value for `taxa` was split into two columns (instead of being put in one column `D`). This can propagate to a number of further errors. For example, the extra column will be interpreted as a column with many missing values (and without a proper header). In addition to that, the value in column `D` for the record in row 3 (so the one where the value for ‘taxa’ contained the comma) is now incorrect.
If you want to store your data in `csv` format and expect that your data values may contain commas, you can avoid the problem discussed above by putting the values in quotes (“”). Applying this rule, our data might look like this:
species_id,genus,species,taxa
"AB","Amphispiza","bilineata","Bird"
"AH","Ammospermophilus","harrisi","Rodent, not censused"
"AS","Ammodramus","savannarum","Bird"
"BA","Baiomys","taylori","Rodent"
Now opening this file as a `csv` in Excel will not lead to an extra column, because Excel will only use commas that fall outside of quotation marks as delimiting characters.
Alternatively, if you are working with data that contains commas, you likely will need to use another delimiter when working in a spreadsheet[^1]. In this case, consider using tabs as your delimiter and working with TSV files. TSV files can be exported from spreadsheet programs in the same way as CSV files.
If you are working with an already existing dataset in which the data values are not included in “” but which have commas as both delimiters and parts of data values, you are potentially facing a major problem with data cleaning. If the dataset you’re dealing with contains hundreds or thousands of records, cleaning them up manually (by either removing commas from the data values or putting the values into quotes - “”) is not only going to take hours and hours but may potentially end up with you accidentally introducing many errors.
Cleaning up datasets is one of the major problems in many scientific disciplines. The approach almost always depends on the particular context. However, it is a good practice to clean the data in an automated fashion, for example by writing and running a script. The Python and R lessons will give you the basis for developing skills to build relevant scripts.
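As a taste of what such a script looks like, here is a minimal sketch in Python with pandas (the column names, the `clean` helper, and the -999 sentinel are all illustrative assumptions, not part of the lesson's dataset):

```python
import pandas as pd

def clean(df):
    """Normalise headers and replace the -999 sentinel with a proper missing value."""
    df = df.copy()  # leave the raw data raw
    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]
    return df.replace({-999: pd.NA})

messy = pd.DataFrame({"Blood Type": ["A+", "O-"], "Weight": [70, -999]})
tidy = clean(messy)
```

Because the steps live in a function, the same cleanup can be re-run identically on hundreds of files, which manual editing cannot guarantee.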
Summary
A typical data analysis workflow is illustrated in the figure above, where data is repeatedly transformed, visualised, and modelled. This iteration is repeated multiple times until the data is understood. In many real-life cases, however, most time is spent cleaning up and preparing the data, rather than actually analysing and understanding it.
An agile data analysis workflow, with several fast iterations of the transform/visualise/model cycle, is only feasible if the data is formatted in a predictable way and one can reason about the data without having to look at it and/or fix it.
[^1]: This is particularly relevant in European countries where the comma is used as a decimal separator. In such cases, the default value separator in a csv file will be the semi-colon (;), or values will be systematically quoted.
Key Points
Good data organization is the foundation of any research project.
R and RStudio
Overview
Teaching: XX min
Exercises: XX min
Questions
What are R and RStudio?
Objectives
Describe the purpose of the RStudio Script, Console, Environment, and Plots panes.
Organize files and directories for a set of analyses as an R project, and understand the purpose of the working directory.
Use the built-in RStudio help interface to search for more information on R functions.
Demonstrate how to provide sufficient information for troubleshooting with the R user community.
What is R? What is RStudio?
The term R is used to refer to both the programming language, the environment for statistical computing and the software that interprets the scripts written using it.
RStudio is currently a very popular way to not only write your R scripts but also to interact with the R software1. To function correctly, RStudio needs R and therefore both need to be installed on your computer.
The RStudio IDE Cheat Sheet provides much more information than will be covered here, but can be useful for learning keyboard shortcuts and discovering new features.
Why learn R?
R does not involve lots of pointing and clicking, and that’s a good thing
The learning curve might be steeper than with other software, but with R, the results of your analysis do not rely on remembering a succession of pointing and clicking, but instead on a series of written commands, and that’s a good thing! So, if you want to redo your analysis because you collected more data, you don’t have to remember which button you clicked in which order to obtain your results; you just have to run your script again.
Working with scripts makes the steps you used in your analysis clear, and the code you write can be inspected by someone else who can give you feedback and spot mistakes.
Working with scripts forces you to have a deeper understanding of what you are doing, and facilitates your learning and comprehension of the methods you use.
R code is great for reproducibility
Reproducibility means that someone else (including your future self) can obtain the same results from the same dataset when using the same analysis code.
R integrates with other tools to generate manuscripts or reports from your code. If you collect more data, or fix a mistake in your dataset, the figures and the statistical tests in your manuscript or report are updated automatically.
An increasing number of journals and funding agencies expect analyses to be reproducible, so knowing R will give you an edge with these requirements.
R is interdisciplinary and extensible
With 10000+ packages2 that can be installed to extend its capabilities, R provides a framework that allows you to combine statistical approaches from many scientific disciplines to best suit the analyses you need for your data. For instance, R has packages for image analysis, GIS, time series, population genetics, and a lot more.
R works on data of all shapes and sizes
The skills you learn with R scale easily with the size of your dataset. Whether your dataset has hundreds or millions of lines, it won’t make much difference to you.
R is designed for data analysis. It comes with special data structures and data types that make handling of missing data and statistical factors convenient.
R can connect to spreadsheets, databases, and many other data formats, on your computer or on the web.
R produces high-quality graphics
The plotting functionalities in R are extensive, and allow you to adjust any aspect of your graph to convey most effectively the message from your data.
R has a large and welcoming community
Thousands of people use R daily. Many of them are willing to help you through mailing lists and websites such as Stack Overflow, or on the RStudio community. This broad user community extends to specialised areas such as bioinformatics.
Not only is R free, but it is also open-source and cross-platform
Anyone can inspect the source code to see how R works. Because of this transparency, there is less chance for mistakes, and if you (or someone else) find some, you can report and fix bugs.
Knowing your way around RStudio
Let’s start by learning about RStudio, which is an Integrated Development Environment (IDE) for working with R.
The RStudio IDE open-source product is free under the Affero General Public License (AGPL) v3. The RStudio IDE is also available with a commercial license and priority email support from RStudio, Inc.
We will use the RStudio IDE to write code, navigate the files on our computer, inspect the variables we are going to create, and visualize the plots we will generate. RStudio can also be used for other things (e.g., version control, developing packages, writing Shiny apps) that we will not cover during the workshop.
The RStudio window is divided into 4 “Panes”:
- the Source for your scripts and documents (top-left, in the default layout)
- your Environment/History (top-right),
- your Files/Plots/Packages/Help/Viewer (bottom-right), and
- the R Console (bottom-left).
The placement of these panes and their content can be customized (see menu, Tools -> Global Options -> Pane Layout).
One of the advantages of using RStudio is that all the information you need to write code is available in a single window. Additionally, with many shortcuts, autocompletion, and highlighting for the major file types you use while developing in R, RStudio will make typing easier and less error-prone.
Getting set up
It is good practice to keep a set of related data, analyses, and text self-contained in a single folder, called the working directory. All of the scripts within this folder can then use relative paths to files that indicate where inside the project a file is located (as opposed to absolute paths, which point to where a file is on a specific computer). Working this way makes it a lot easier to move your project around on your computer and share it with others without worrying about whether or not the underlying scripts will still work.
RStudio provides a helpful set of tools to do this through its “Projects” interface, which not only creates a working directory for you, but also remembers its location (allowing you to quickly navigate to it) and optionally preserves custom settings and open files to make it easier to resume work after a break. Go through the steps for creating an “R Project” for this tutorial below.
- Start RStudio.
- Under the File menu, click on New project. Choose New directory, then New project.
- Enter a name for this new folder (or “directory”), and choose a convenient location for it. This will be your working directory for this session (or whole course) (e.g., bioc-intro).
- Click on Create project.
- (Optional) Set Preferences to ‘Never’ save workspace in RStudio.
RStudio’s default preferences generally work well, but saving a workspace to .RData can be cumbersome, especially if you are working with larger datasets. To turn that off, go to Tools -> Global Options and select the ‘Never’ option for ‘Save workspace to .RData on exit’.
To avoid character encoding issues between Windows and other operating systems, we are going to set UTF-8 as the default encoding.
Organizing your working directory
Using a consistent folder structure across your projects will help keep things organized, and will also make it easy to find/file things in the future. This can be especially helpful when you have multiple projects. In general, you may create directories (folders) for scripts, data, and documents.
data/ Use this folder to store your raw data and any intermediate datasets you may create for a particular analysis. For the sake of transparency and provenance, you should always keep a copy of your raw data accessible, and do as much of your data cleanup and preprocessing programmatically (i.e., with scripts rather than manually) as possible. Separating raw data from processed data is also a good idea. For example, you could have files data/raw/tree_survey.plot1.txt and ...plot2.txt kept separate from a data/processed/tree.survey.csv file generated by the scripts/01.preprocess.tree_survey.R script.
documents/ This would be a place to keep outlines, drafts, and other text.
scripts/ (or src/) This would be the location to keep your R scripts for different analyses or plotting, and potentially a separate folder for your functions (more on that later).
You may want additional directories or subdirectories depending on your project needs, but these should form the backbone of your working directory.
For this course, we will need a data/ folder to store our raw data, a data_output/ folder for when we learn how to export data as CSV files, and a fig_output/ folder for the figures that we will save.
Challenge: create your project directory structure
Create an RStudio project for this workshop called e.g. bioc-intro, as described above under Getting set up, if you haven’t already done so. Under the Files tab on the right of the screen, click on New Folder and create a folder named data within your newly created working directory (e.g., ~/bioc-intro/data). (Alternatively, type dir.create("data") at your R console.) Repeat these operations to create data_output/ and fig_output/ folders.
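Equivalently, the whole structure can be created from the R console with dir.create():

```r
## Create the course sub-directories inside the current working
## directory; showWarnings = FALSE silences the warning raised if a
## folder already exists
dir.create("data", showWarnings = FALSE)
dir.create("data_output", showWarnings = FALSE)
dir.create("fig_output", showWarnings = FALSE)
```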
Your working directory should now look like this:
The working directory
The working directory is an important concept to understand. It is the place where R looks for and saves files. When you write code for your project, it should refer to files relative to the root of your working directory, and only need files within this structure.
Using RStudio projects makes this easy and ensures that your working directory is set properly. If you need to check it, you can use getwd(). If for some reason your working directory is not what it should be, you can change it in the RStudio interface by navigating in the file browser to where your working directory should be, clicking on the blue gear icon More, and selecting Set As Working Directory.
Alternatively you can use setwd("/path/to/working/directory")
to
reset your working directory. However, your scripts should not include
this line because it will fail on someone else’s computer.
Example
The schema below represents the working directory bioc-intro
with the
data
and fig_output
sub-directories, and 2 files in the latter:
bioc-intro/
├── data/
└── fig_output/
    ├── fig1.pdf
    └── fig2.png
If we were in the working directory, we could refer to the fig1.pdf
file using the relative path fig_output/fig1.pdf
or the
absolute path /home/user/bioc-intro/fig_output/fig1.pdf
.
If we were in the data
directory, we would use the relative path
../fig_output/fig1.pdf
or the same absolute path
/home/user/bioc-intro/fig_output/fig1.pdf
.
Interacting with R
The basis of programming is that we write down instructions for the computer to follow, and then we tell the computer to follow those instructions. We write, or code, instructions in R because it is a common language that both the computer and we can understand. We call the instructions commands and we tell the computer to follow the instructions by executing (also called running) those commands.
There are two main ways of interacting with R: by using the
console or by using scripts (plain text files that contain
your code). The console pane (in RStudio, the bottom left panel) is
the place where commands written in the R language can be typed and
executed immediately by the computer. It is also where the results
will be shown for commands that have been executed. You can type
commands directly into the console and press Enter
to execute those
commands, but they will be forgotten when you close the session.
Because we want our code and workflow to be reproducible, it is better to type the commands we want in the script editor, and save the script. This way, there is a complete record of what we did, and anyone (including our future selves!) can easily replicate the results on their computer. Note, however, that merely typing the commands in the script does not automatically run them - they still need to be sent to the console for execution.
RStudio allows you to execute commands directly from the script editor
by using the Ctrl
+ Enter
shortcut (on Macs, Cmd
+ Return
will
work, too). The command on the current line in the script (indicated
by the cursor) or all of the commands in the currently selected text
will be sent to the console and executed when you press Ctrl
+
Enter
. You can find other keyboard shortcuts in this RStudio
cheatsheet about the RStudio
IDE.
At some point in your analysis you may want to check the content of a variable or the structure of an object, without necessarily keeping a record of it in your script. You can type these commands and execute them directly in the console. RStudio provides the Ctrl + 1 and Ctrl + 2 shortcuts to jump between the script and the console panes.
If R is ready to accept commands, the R console shows a >
prompt. If
it receives a command (by typing, copy-pasting or sent from the script
editor using Ctrl
+ Enter
), R will try to execute it, and when
ready, will show the results and come back with a new >
prompt to
wait for new commands.
If R is still waiting for you to enter more input because the command isn’t complete yet, the console will show a + prompt. It means that you haven’t finished entering a complete command. This is because you have
not ‘closed’ a parenthesis or quotation, i.e. you don’t have the same
number of left-parentheses as right-parentheses, or the same number of
opening and closing quotation marks. When this happens, and you
thought you finished typing your command, click inside the console
window and press Esc
; this will cancel the incomplete command and
return you to the >
prompt.
How to learn more during and after the course?
The material we cover during this course will give you an initial taste of how you can use R to analyse data for your own research. However, you will need to learn more to do advanced operations such as cleaning your dataset, using statistical methods, or creating beautiful graphics3. The best way to become proficient and efficient at R, as with any other tool, is to use it to address your actual research questions. As a beginner, it can feel daunting to have to write a script from scratch, and given that many people make their code available online, modifying existing code to suit your purpose might make it easier for you to get started.
Seeking help
Use the built-in RStudio help interface to search for more information on R functions
One of the fastest ways to get help is to use the RStudio help interface. This panel can by default be found in the lower right-hand corner of RStudio. As seen in the screenshot, by typing the word “Mean”, RStudio also gives a number of suggestions that you might be interested in. The description is then shown in the display window.
I know the name of the function I want to use, but I’m not sure how to use it
If you need help with a specific function, let’s say barplot()
, you
can type:
?barplot
If you just need to remind yourself of the names of the arguments, you can use:
args(lm)
I want to use a function that does X, there must be a function for it but I don’t know which one…
If you are looking for a function to do a particular task, you can use the
help.search()
function, which is called by the double question mark ??
.
However, this only looks through the installed packages for help pages with a
match to your search request.
??kruskal
If you can’t find what you are looking for, you can use the rdocumentation.org website that searches through the help files across all packages available.
Finally, a generic Google or internet search “R <task>” will often either send you to the appropriate package documentation or a helpful forum where someone else has already asked your question.
I am stuck… I get an error message that I don’t understand
Start by googling the error message. However, this doesn’t always work very well, because package developers often rely on the error catching provided by R. You then end up with general error messages that might not be very helpful for diagnosing a problem (e.g. “subscript out of bounds”). If the message is very generic, you might also include the name of the function or package you’re using in your query.
However, you should check Stack Overflow. Search using the [r]
tag. Most
questions have already been answered, but the challenge is to use the right
words in the search to find the
answers:
http://stackoverflow.com/questions/tagged/r
The Introduction to R can also be dense for people with little programming experience but it is a good place to understand the underpinnings of the R language.
The R FAQ is dense and technical but it is full of useful information.
Asking for help
The key to receiving help from someone is for them to rapidly grasp your problem. You should make it as easy as possible to pinpoint where the issue might be.
Try to use the correct words to describe your problem. For instance, a package is not the same thing as a library. Most people will understand what you meant, but others have really strong feelings about the difference in meaning. The key point is that it can make things confusing for people trying to help you. Be as precise as possible when describing your problem.
If possible, try to reduce what doesn’t work to a simple reproducible example. If you can reproduce the problem using a very small data frame instead of your 50000-row, 10000-column one, provide the small one with the description of your problem. When appropriate, try to generalize what you are doing so even people who are not in your field can understand the question. For instance, instead of using a subset of your real dataset, create a small (3 columns, 5 rows) generic one. For more information on how to write a reproducible example see this article by Hadley Wickham.
To share an object with someone else, if it’s relatively small, you
can use the function dput()
. It will output R code that can be used
to recreate the exact same object as the one in memory:
## iris is an example data frame that comes with R and head() is a
## function that returns the first part of the data frame
dput(head(iris))
structure(list(Sepal.Length = c(5.1, 4.9, 4.7, 4.6, 5, 5.4),
Sepal.Width = c(3.5, 3, 3.2, 3.1, 3.6, 3.9), Petal.Length = c(1.4,
1.4, 1.3, 1.5, 1.4, 1.7), Petal.Width = c(0.2, 0.2, 0.2,
0.2, 0.2, 0.4), Species = structure(c(1L, 1L, 1L, 1L, 1L,
1L), .Label = c("setosa", "versicolor", "virginica"), class = "factor")), row.names = c(NA,
6L), class = "data.frame")
If the object is larger, provide the raw file (i.e., your CSV file) with your script up to the point of the error (and after removing everything that is not relevant to your issue). Alternatively, in particular if your question is not related to a data frame, you can save any R object to a file[^export]:
saveRDS(iris, file="/tmp/iris.rds")
The content of this file is however not human readable and cannot be
posted directly on Stack Overflow. Instead, it can be sent to someone
by email who can read it with the readRDS()
command (here it is
assumed that the downloaded file is in a Downloads
folder in the
user’s home directory):
some_data <- readRDS(file="~/Downloads/iris.rds")
Last, but certainly not least, always include the output of sessionInfo()
as it provides critical information about your platform, the versions of R and
the packages that you are using, and other information that can be very helpful
to understand your problem.
sessionInfo()
R version 4.1.3 (2022-03-10)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Monterey 12.2.1
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.1-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.1-arm64/Resources/lib/libRlapack.dylib
locale:
[1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] knitr_1.39
loaded via a namespace (and not attached):
[1] compiler_4.1.3 magrittr_2.0.3 tools_4.1.3 stringi_1.7.6 highr_0.9
[6] stringr_1.4.0 xfun_0.31 evaluate_0.15
Where to ask for help?
- The person sitting next to you during the course. Don’t hesitate to talk to your neighbour during the workshop, compare your answers, and ask for help.
- Your friendly colleagues: if you know someone with more experience than you, they might be able and willing to help you.
- Stack Overflow: if your question hasn’t been answered before and is well crafted, chances are you will get an answer in less than 5 min. Remember to follow their guidelines on how to ask a good question.
- The R-help mailing list: it is read by a lot of people (including most of the R core team), a lot of people post to it, but the tone can be pretty dry, and it is not always very welcoming to new users. If your question is valid, you are likely to get an answer very fast but don’t expect that it will come with smiley faces. Also, here more than anywhere else, be sure to use correct vocabulary (otherwise you might get an answer pointing to the misuse of your words rather than answering your question). You will also have more success if your question is about a base function rather than a specific package.
- If your question is about a specific package, see if there is a mailing list for it. Usually it’s included in the DESCRIPTION file of the package, which can be accessed using packageDescription("name-of-package"). You may also want to try to email the author of the package directly, or open an issue on the code repository (e.g., GitHub).
- There are also some topic-specific mailing lists (GIS, phylogenetics, etc…); the complete list is here.
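For example, using the base stats package as a stand-in (any installed package name works):

```r
## Full DESCRIPTION metadata for an installed package
packageDescription("stats")

## A single field, such as the maintainer contact, can be requested
packageDescription("stats", fields = "Maintainer")
```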
More resources
-
The Posting Guide for the R mailing lists.
-
How to ask for R help useful guidelines
-
This blog post by Jon Skeet has quite comprehensive advice on how to ask programming questions.
-
The reprex package is very helpful to create reproducible examples when asking for help. The rOpenSci community call “How to ask questions so they get answered” (Github link and video recording) includes a presentation of the reprex package and of its philosophy.
R packages
Loading packages
As we have seen above, R packages play a fundamental role in R. To make use of a package’s functionality, assuming it is installed, we first need to load it. This is done with the library() function. Below, we load ggplot2.
library("ggplot2")
Installing packages
The default package repository is The Comprehensive R Archive
Network (CRAN), and any package that is available on CRAN can be
installed with the install.packages()
function. Below, for example,
we install the dplyr
package that we will learn about later.
install.packages("dplyr")
This command will install the dplyr
package as well as all its
dependencies, i.e. all the packages that it relies on to function.
Bioconductor packages are managed and installed using a dedicated package,
namely BiocManager
, that can be installed from CRAN with
install.packages("BiocManager")
Individual packages such as SummarizedExperiment
(we will use it
later), DESeq2
(for RNA-Seq analysis), and many more can then be
installed with BiocManager::install
.
BiocManager::install("SummarizedExperiment")
BiocManager::install("DESeq2")
-
As opposed to using R directly from the command line console. Other software exists that interfaces and integrates with R, but RStudio is particularly well suited for beginners while also providing numerous advanced features. ↩
-
i.e. add-ons that provide R with new functionality, such as bioinformatics data analysis. ↩
-
We will introduce most of these (except statistics) here, but will only manage to scratch the surface of the wealth of what is possible to do with R. ↩
Key Points
Start using R and RStudio
Introduction to R
Overview
Teaching: XX min
Exercises: XX min
Questions
First commands in R
Objectives
Define the following terms as they relate to R: object, assign, call, function, arguments, options.
Assign values to objects in R.
Learn how to name objects
Use comments to annotate your scripts.
Solve simple arithmetic operations in R.
Call functions and use arguments to change their default options.
Inspect the content of vectors and manipulate their content.
Subset and extract values from vectors.
Analyze vectors with missing data.
Creating objects in R
You can get output from R simply by typing math in the console:
3 + 5
[1] 8
12 / 7
[1] 1.714286
However, to do useful and interesting things, we need to assign values to
objects. To create an object, we need to give it a name followed by the
assignment operator <-
, and the value we want to give it:
weight_kg <- 55
<-
is the assignment operator. It assigns values on the right to
objects on the left. So, after executing x <- 3
, the value of x
is
3
. The arrow can be read as 3 goes into x
. For historical
reasons, you can also use =
for assignments, but not in every
context. Because of the
slight
differences
in syntax, it is good practice to always use <-
for assignments.
In RStudio, typing Alt + - (push Alt at the same time as the - key) will write <- in a single keystroke on a PC, while typing Option + - (push Option at the same time as the - key) does the same on a Mac.
Naming variables
Objects can be given any name such as x
, current_temperature
, or
subject_id
. You want your object names to be explicit and not too
long. They cannot start with a number (2x
is not valid, but x2
is). R is case sensitive (e.g., weight_kg
is different from
Weight_kg
). There are some names that cannot be used because they
are the names of fundamental functions in R (e.g., if
, else
,
for
, see
here
for a complete list). In general, even if it’s allowed, it’s best to
not use other function names (e.g., c
, T
, mean
, data
, df
,
weights
). If in doubt, check the help to see if the name is already
in use. It’s also best to avoid dots (.
) within an object name as in
my.dataset
. There are many functions in R with dots in their names
for historical reasons, but because dots have a special meaning in R
(for methods) and other programming languages, it’s best to avoid
them. It is also recommended to use nouns for object names, and verbs
for function names. It’s important to be consistent in the styling of
your code (where you put spaces, how you name objects, etc.). Using a
consistent coding style makes your code clearer to read for your
future self and your collaborators. In R, some popular style guides
are Google’s, the
tidyverse’s style and the Bioconductor
style
guide. The
tidyverse’s is very comprehensive and may seem overwhelming at
first. You can install the
lintr
package to
automatically check for issues in the styling of your code.
Objects vs. variables: What are known as objects in R are known as variables in many other programming languages. Depending on the context, object and variable can have drastically different meanings. However, in this lesson, the two words are used synonymously. For more information see: https://cran.r-project.org/doc/manuals/r-release/R-lang.html#Objects
When assigning a value to an object, R does not print anything. You can force R to print the value by using parentheses or by typing the object name:
weight_kg <- 55 # doesn't print anything
(weight_kg <- 55) # but putting parentheses around the call prints the value of `weight_kg`
[1] 55
weight_kg # and so does typing the name of the object
[1] 55
Now that R has weight_kg
in memory, we can do arithmetic with it. For
instance, we may want to convert this weight into pounds (weight in pounds is 2.2 times the weight in kg):
2.2 * weight_kg
[1] 121
We can also change an object’s value by assigning it a new one:
weight_kg <- 57.5
2.2 * weight_kg
[1] 126.5
This means that assigning a value to one object does not change the values of
other objects. For example, let’s store the animal’s weight in pounds in a new
object, weight_lb
:
weight_lb <- 2.2 * weight_kg
and then change weight_kg
to 100.
weight_kg <- 100
Challenge:
What do you think is the current content of the object
weight_lb
? 126.5 or 220?
Comments
The comment character in R is #
, anything to the right of a #
in a
script will be ignored by R. It is useful to leave notes, and
explanations in your scripts.
RStudio makes it easy to comment or uncomment a paragraph: after selecting the lines you want to comment, press Ctrl + Shift + C on your keyboard. If you only want to comment out one line, you can put the cursor at any location on that line (i.e. no need to select the whole line), then press Ctrl + Shift + C.
Challenge
What are the values after each statement in the following?
mass <- 47.5            # mass?
age  <- 122             # age?
mass <- mass * 2.0      # mass?
age  <- age - 20        # age?
mass_index <- mass/age  # mass_index?
Functions and their arguments
Functions are “canned scripts” that automate more complicated sets of commands,
including operations, assignments, etc. Many functions are predefined, or can be
made available by importing R packages (more on that later). A function
usually gets one or more inputs called arguments. Functions often (but not
always) return a value. A typical example would be the function sqrt()
. The
input (the argument) must be a number, and the return value (in fact, the
output) is the square root of that number. Executing a function (‘running it’)
is called calling the function. An example of a function call is:
b <- sqrt(a)
Here, the value of a
is given to the sqrt()
function, the sqrt()
function
calculates the square root, and returns the value which is then assigned to
the object b
. This function is very simple, because it takes just one argument.
The return ‘value’ of a function need not be numerical (like that of sqrt()
),
and it also does not need to be a single item: it can be a set of things, or
even a dataset. We’ll see that when we read data files into R.
Arguments can be anything, not only numbers or filenames, but also other objects. Exactly what each argument means differs per function, and must be looked up in the documentation (see below). Some functions take arguments which may either be specified by the user, or, if left out, take on a default value: these are called options. Options are typically used to alter the way the function operates, such as whether it ignores ‘bad values’, or what symbol to use in a plot. However, if you want something specific, you can specify a value of your choice which will be used instead of the default.
Let’s try a function that can take multiple arguments: round()
.
round(3.14159)
[1] 3
Here, we’ve called round()
with just one argument, 3.14159
, and it has
returned the value 3
. That’s because the default is to round to the nearest
whole number. If we want more digits we can see how to do that by getting
information about the round
function. We can use args(round)
or look at the
help for this function using ?round
.
args(round)
function (x, digits = 0)
NULL
?round
We see that if we want a different number of digits, we can
type digits=2
or however many we want.
round(3.14159, digits = 2)
[1] 3.14
If you provide the arguments in the exact same order as they are defined you don’t have to name them:
round(3.14159, 2)
[1] 3.14
And if you do name the arguments, you can switch their order:
round(digits = 2, x = 3.14159)
[1] 3.14
It’s good practice to put the non-optional arguments (like the number you’re rounding) first in your function call, and to specify the names of all optional arguments. If you don’t, someone reading your code might have to look up the definition of a function with unfamiliar arguments to understand what you’re doing. By specifying the name of the arguments you are also safeguarding against possible future changes in the function interface, which may potentially add new arguments in between the existing ones.
Vectors and data types
A vector is the most common and basic data type in R, and is pretty much
the workhorse of R. A vector is composed by a series of values, such as
numbers or characters. We can assign a series of values to a vector using
the c()
function. For example we can create a vector of animal weights and assign
it to a new object weight_g
:
weight_g <- c(50, 60, 65, 82)
weight_g
[1] 50 60 65 82
A vector can also contain characters:
molecules <- c("dna", "rna", "protein")
molecules
[1] "dna" "rna" "protein"
The quotes around “dna”, “rna”, etc. are essential here. Without the
quotes R will assume there are objects called dna
, rna
and
protein
. As these objects don’t exist in R’s memory, there will be
an error message.
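A quick sketch of the difference (the error line is shown commented out, since running it would stop the script):

```r
## With quotes: a character vector of three strings
molecules <- c("dna", "rna", "protein")

## Without quotes, R looks for objects named dna, rna and protein,
## which don't exist in memory:
## molecules <- c(dna, rna, protein)
## Error: object 'dna' not found
```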
There are many functions that allow you to inspect the content of a
vector. length()
tells you how many elements are in a particular vector:
length(weight_g)
[1] 4
length(molecules)
[1] 3
An important feature of a vector is that all of the elements are the
same type of data. The function class()
indicates the class (the
type of element) of an object:
class(weight_g)
[1] "numeric"
class(molecules)
[1] "character"
The function str()
provides an overview of the structure of an
object and its elements. It is a useful function when working with
large and complex objects:
str(weight_g)
num [1:4] 50 60 65 82
str(molecules)
chr [1:3] "dna" "rna" "protein"
You can use the c()
function to add other elements to your vector:
weight_g <- c(weight_g, 90) # add to the end of the vector
weight_g <- c(30, weight_g) # add to the beginning of the vector
weight_g
[1] 30 50 60 65 82 90
In the first line, we take the original vector weight_g
, add the
value 90
to the end of it, and save the result back into
weight_g
. Then we add the value 30
to the beginning, again saving
the result back into weight_g
.
We can do this over and over again to grow a vector, or to assemble a dataset. As we program, this can be useful for adding results that we are collecting or calculating.
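As a small sketch of this pattern, results can be accumulated one at a time (for long vectors, pre-allocating with a constructor is more efficient, but this works well for small cases):

```r
results <- c()                 ## start with an empty vector
for (i in 1:3) {
  results <- c(results, i^2)   ## append each new result to the end
}
results
#> [1] 1 4 9
```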
An atomic vector is the simplest R data type and is a linear
vector of a single type. Above, we saw 2 of the 6 main atomic
vector types that R uses: "character"
and "numeric"
(or
"double"
). These are the basic building blocks that all R objects
are built from. The other 4 atomic vector types are:
- "logical" for TRUE and FALSE (the boolean data type)
- "integer" for integer numbers (e.g., 2L; the L indicates to R that it’s an integer)
- "complex" to represent complex numbers with real and imaginary parts (e.g., 1 + 4i) and that’s all we’re going to say about them
- "raw" for bitstreams that we won’t discuss further
You can check the type of your vector using the typeof()
function
and inputting your vector as the argument.
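For instance (re-creating the two vectors from above so the chunk is self-contained):

```r
weight_g <- c(30, 50, 60, 65, 82, 90)
molecules <- c("dna", "rna", "protein")

typeof(weight_g)   ## numeric values are stored as "double" by default
#> [1] "double"
typeof(molecules)
#> [1] "character"
typeof(2L)         ## the L suffix makes it an integer
#> [1] "integer"
typeof(TRUE)
#> [1] "logical"
```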
Vectors are one of the many data structures that R uses. Other
important ones are lists (list
), matrices (matrix
), data frames
(data.frame
), factors (factor
) and arrays (array
).
Challenge:
We’ve seen that atomic vectors can be of type character, numeric (or double), integer, and logical. But what happens if we try to mix these types in a single vector?
Solution
R implicitly converts them to all be the same type
Challenge:
What will happen in each of these examples? (hint: use
class()
to check the data type of your objects):

num_char <- c(1, 2, 3, "a")
num_logical <- c(1, 2, 3, TRUE)
char_logical <- c("a", "b", "c", TRUE)
tricky <- c(1, 2, 3, "4")
Solution
class(num_char)
[1] "character"
class(num_logical)
[1] "numeric"
class(char_logical)
[1] "character"
class(tricky)
[1] "character"
Challenge:
Why do you think it happens?
Solution
Vectors can be of only one data type. R tries to convert (coerce) the content of this vector to find a common denominator that doesn’t lose any information.
Homework Challenge:
How many values in
combined_logical
are "TRUE"
(as a character) in the following example:

num_logical <- c(1, 2, 3, TRUE)
char_logical <- c("a", "b", "c", TRUE)
combined_logical <- c(num_logical, char_logical)
Homework Challenge:
In R, we call converting objects from one class into another class coercion. These conversions happen according to a hierarchy, whereby some types get preferentially coerced into other types. Can you draw a diagram that represents the hierarchy of how these data types are coerced?
Solution
logical → numeric → character ← logical
Subsetting vectors
If we want to extract one or several values from a vector, we must provide one or several indices in square brackets. For instance:
molecules <- c("dna", "rna", "peptide", "protein")
molecules[2]
[1] "rna"
molecules[c(3, 2)]
[1] "peptide" "rna"
We can also repeat the indices to create an object with more elements than the original one:
more_molecules <- molecules[c(1, 2, 3, 2, 1, 4)]
more_molecules
[1] "dna" "rna" "peptide" "rna" "dna" "protein"
R indices start at 1. Programming languages like Fortran, MATLAB, Julia, and R start counting at 1, because that’s what human beings typically do. Languages in the C family (including C++, Java, Perl, and Python) count from 0 because that’s simpler for computers to do.
Finally, it is also possible to get all the elements of a vector except some specified elements using negative indices:
molecules ## all molecules
[1] "dna" "rna" "peptide" "protein"
molecules[-1] ## all but the first one
[1] "rna" "peptide" "protein"
molecules[-c(1, 3)] ## all but 1st/3rd ones
[1] "rna" "protein"
molecules[c(-1, -3)] ## all but 1st/3rd ones
[1] "rna" "protein"
Conditional subsetting
Another common way of subsetting is by using a logical vector. TRUE
will
select the element with the same index, while FALSE
will not:
weight_g <- c(21, 34, 39, 54, 55)
weight_g[c(TRUE, FALSE, TRUE, TRUE, FALSE)]
[1] 21 39 54
Typically, these logical vectors are not typed by hand, but are the output of other functions or logical tests. For instance, if you wanted to select only the values above 50:
## will return logicals with TRUE for the indices that meet
## the condition
weight_g > 50
[1] FALSE FALSE FALSE TRUE TRUE
## so we can use this to select only the values above 50
weight_g[weight_g > 50]
[1] 54 55
You can combine multiple tests using &
(both conditions are true,
AND) or |
(at least one of the conditions is true, OR):
weight_g[weight_g < 30 | weight_g > 50]
[1] 21 54 55
weight_g[weight_g >= 30 & weight_g == 21]
numeric(0)
Here, <
stands for “less than”, >
for “greater than”, >=
for
“greater than or equal to”, and ==
for “equal to”. The double equal
sign ==
is a test for numerical equality between the left and right
hand sides, and should not be confused with the single =
sign, which
performs variable assignment (similar to <-
).
A common task is to search for certain strings in a vector. One could
use the “or” operator |
to test for equality to multiple values, but
this can quickly become tedious. The function %in%
allows you to
test whether each element of a vector is found among a set of search values:
molecules <- c("dna", "rna", "protein", "peptide")
molecules[molecules == "rna" | molecules == "dna"] # returns both rna and dna
[1] "dna" "rna"
molecules %in% c("rna", "dna", "metabolite", "peptide", "glycerol")
[1] TRUE TRUE FALSE TRUE
molecules[molecules %in% c("rna", "dna", "metabolite", "peptide", "glycerol")]
[1] "dna" "rna" "peptide"
Homework Challenge:
Can you figure out why
"four" > "five"
returns TRUE
?
Solution
"four" > "five"
[1] TRUE
When using
>
or <
on strings, R compares their alphabetical order. Here "four"
comes after "five"
, and therefore is greater than it.
Names
It is possible to name each element of a vector. The code chunk below shows an initial vector without any names, how names are set, and how they are retrieved.
x <- c(1, 5, 3, 5, 10)
names(x) ## no names
NULL
names(x) <- c("A", "B", "C", "D", "E")
names(x) ## now we have names
[1] "A" "B" "C" "D" "E"
When a vector has names, it is possible to access elements by their name, in addition to their index.
x[c(1, 3)]
A C
1 3
x[c("A", "C")]
A C
1 3
Missing data
As R was designed to analyze datasets, it includes the concept of
missing data (which is uncommon in other programming
languages). Missing data are represented in vectors as NA
.
When doing operations on numbers, most functions will return NA
if
the data you are working with include missing values. This feature
makes it harder to overlook the cases where you are dealing with
missing data. You can add the argument na.rm = TRUE
to calculate
the result while ignoring the missing values.
heights <- c(2, 4, 4, NA, 6)
mean(heights)
[1] NA
max(heights)
[1] NA
mean(heights, na.rm = TRUE)
[1] 4
max(heights, na.rm = TRUE)
[1] 6
If your data include missing values, you may want to become familiar
with the functions is.na()
, na.omit()
, and complete.cases()
. See
below for examples.
## Extract those elements which are not missing values.
heights[!is.na(heights)]
[1] 2 4 4 6
## Returns the object with incomplete cases removed.
## The returned object is an atomic vector of type `"numeric"`
## (or `"double"`).
na.omit(heights)
[1] 2 4 4 6
attr(,"na.action")
[1] 4
attr(,"class")
[1] "omit"
## Extract those elements which are complete cases.
## The returned object is an atomic vector of type `"numeric"`
## (or `"double"`).
heights[complete.cases(heights)]
[1] 2 4 4 6
Homework Challenge:
- Using this vector of heights in inches, create a new vector with the NAs removed.
heights <- c(63, 69, 60, 65, NA, 68, 61, 70, 61, 59, 64, 69, 63, 63, NA, 72, 65, 64, 70, 63, 65)
- Use the function
median()
to calculate the median of the heights
vector.
- Use R to figure out how many people in the set are taller than 67 inches.
Solution
heights_no_na <- heights[!is.na(heights)] ## or heights_no_na <- na.omit(heights)
median(heights, na.rm = TRUE)
[1] 64
heights_above_67 <- heights_no_na[heights_no_na > 67]
length(heights_above_67)
[1] 6
Generating vectors
Constructors {-}
There exist functions to generate vectors of different types. To
generate a vector of numerics, one can use the numeric()
constructor, providing the length of the output vector as a
parameter. The values will be initialised with 0.
numeric(3)
[1] 0 0 0
numeric(10)
[1] 0 0 0 0 0 0 0 0 0 0
Note that if we ask for a vector of numerics of length 0, we obtain exactly that:
numeric(0)
numeric(0)
There are similar constructors for characters and logicals, named
character()
and logical()
respectively.
Homework Challenge:
What are the defaults for character and logical vectors?
Solution
character(2) ## the empty character
[1] "" ""
logical(2) ## FALSE
[1] FALSE FALSE
Replicate elements
The rep
function allows repeating a value a certain number of
times. If we want to initialise a vector of numerics of length 5 with
the value -1, for example, we could do the following:
rep(-1, 5)
[1] -1 -1 -1 -1 -1
Similarly, to generate a vector populated with missing values, which is often a good way to start without making assumptions about the data to be collected:
rep(NA, 5)
[1] NA NA NA NA NA
rep
can take vectors of any length as input (above, we used vectors
of length 1) and any type. For example, if we want to repeat the
values 1, 2 and 3 five times, we would do the following:
rep(c(1, 2, 3), 5)
[1] 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3
Homework Challenge:
What if we wanted to repeat the values 1, 2 and 3 five times, but obtain five 1s, five 2s and five 3s in that order? There are two possibilities - see
?rep
or ?sort
for help.
Solution
rep(c(1, 2, 3), each = 5)
[1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
sort(rep(c(1, 2, 3), 5))
[1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
Sequence generation
Another very useful function is seq
, to generate a sequence of
numbers. For example, to generate a sequence of integers from 1 to 20
by steps of 2, one would use:
seq(from = 1, to = 20, by = 2)
[1] 1 3 5 7 9 11 13 15 17 19
The default value of by
is 1 and, given that generating a
sequence from one value to another with steps of 1 is frequently done,
there’s a shortcut:
seq(1, 5, 1)
[1] 1 2 3 4 5
seq(1, 5) ## default by
[1] 1 2 3 4 5
1:5
[1] 1 2 3 4 5
To generate a sequence of numbers from 1 to 20 of final length of 3, one would use:
seq(from = 1, to = 20, length.out = 3)
[1] 1.0 10.5 20.0
Random samples and permutations
A last group of useful functions are those that generate random
data. The first one, sample
, generates a random permutation of
another vector. For example, to draw a random order for a 10-student
oral exam, I first assign each student a number from 1 to ten (for
instance based on the alphabetical order of their names) and then:
sample(1:10)
[1] 9 4 7 1 2 5 3 10 6 8
Without further arguments, sample
will return a permutation of all
elements of the vector. If I want a random sample of a certain size, I
would set this value as second argument. Below, I sample 5 random
letters from the alphabet contained in the pre-defined letters
vector:
sample(letters, 5)
[1] "s" "a" "u" "x" "j"
If I wanted an output larger than the input vector, or to be able to
draw some elements multiple times, I would need to set the replace
argument to TRUE
:
sample(1:5, 10, replace = TRUE)
[1] 2 1 5 5 1 1 5 5 2 2
Homework Challenge:
When trying the functions above out, you will have realised that the samples are indeed random and that one doesn’t get the same permutation twice. To be able to reproduce these random draws, one can set the random number generation seed manually with
set.seed()
before drawing the random sample.
Test this feature with your neighbour. First draw two random permutations of
1:10
independently and observe that you get different results.Now set the seed with, for example,
set.seed(123)
and repeat the random draw. Observe that you now get the same random draws.Repeat by setting a different seed.
Solution
Different permutations
sample(1:10)
[1] 9 1 4 3 6 2 5 8 10 7
sample(1:10)
[1] 4 9 7 6 1 10 8 3 2 5
Same permutations with seed 123
set.seed(123)
sample(1:10)
[1] 3 10 2 8 6 9 1 7 5 4
set.seed(123)
sample(1:10)
[1] 3 10 2 8 6 9 1 7 5 4
A different seed
set.seed(1)
sample(1:10)
[1] 9 4 7 1 2 5 3 10 6 8
set.seed(1)
sample(1:10)
[1] 9 4 7 1 2 5 3 10 6 8
Homework reading: Drawing samples from a normal distribution
The last function we are going to see is rnorm
, which draws a random
sample from a normal distribution. Two normal distributions of means 0
and 100 and standard deviations 1 and 5, noted N(0, 1) and
N(100, 5), are shown below.
The three arguments, n
, mean
and sd
, define the size of the
sample and the parameters of the normal distribution, i.e. the mean
and the standard deviation. The defaults of the latter two are 0 and 1.
rnorm(5)
[1] 0.69641761 0.05351568 -1.31028350 -2.12306606 -0.20807859
rnorm(5, 2, 2)
[1] 1.3744268 -0.1164714 2.8344472 1.3690969 3.6510983
rnorm(5, 100, 5)
[1] 106.45636 96.87448 95.62427 100.71678 107.12595
Now that we have learned how to write scripts, and the basics of R’s data structures, we are ready to start working with larger data, and learn about data frames.
Key Points
How to interact with R
Starting with data
Overview
Teaching: XX min
Exercises: XX minQuestions
First data analysis in R
Objectives
Describe what a data frame is.
Load external data from a .csv file into a data frame.
Summarize the contents of a data frame.
Describe what a factor is.
Convert between strings and factors.
Reorder and rename factors.
Format dates.
Export and save data.
Presentation of the gene expression data
We are going to use part of the data published by Blackmore et al. (2017), The effect of upper-respiratory infection on transcriptomic changes in the CNS. The goal of the study was to determine the effect of an upper-respiratory infection on changes in RNA transcription occurring in the cerebellum and spinal cord post infection. Gender-matched eight-week-old C57BL/6 mice were inoculated with saline or with Influenza A by intranasal route and transcriptomic changes in the cerebellum and spinal cord tissues were evaluated by RNA-seq at days 0 (non-infected), 4 and 8.
The dataset is stored as a comma separated value (CSV) file. Each row holds information for a single RNA expression measurement, and the columns represent:
Column | Description |
---|---|
gene | The name of the gene that was measured |
sample | The name of the sample the gene expression was measured in |
expression | The value of the gene expression |
organism | The organism/species - here all data stem from mice |
age | The age of the mouse (all mice were 8 weeks here) |
sex | The sex of the mouse |
infection | The infection state of the mouse, i.e. infected with Influenza A or not infected. |
strain | The Influenza A strain; C57BL/6 in all cases. |
time | The duration of the infection (in days). |
tissue | The tissue that was used for the gene expression experiment, i.e. cerebellum or spinal cord. |
mouse | The mouse unique identifier. |
We have already downloaded the data and it can be found in the directory
course-data/data/GSE96870/
.
You are now ready to load the data:
rna <- read.csv("course-data/data/GSE96870/rnaseq.csv")
This statement doesn’t produce any output because, as you might recall, assignments don’t display anything. If we want to check that our data has been loaded, we can see the contents of the data frame by typing its name
rna
Wow… that was a lot of output. At least it means the data loaded
properly. Let’s check the top (the first 6 lines) of this data frame
using the function head()
:
head(rna)
gene sample expression organism age sex infection strain time
1 Asl GSM2545336 1170 Mus musculus 8 Female InfluenzaA C57BL/6 8
2 Apod GSM2545336 36194 Mus musculus 8 Female InfluenzaA C57BL/6 8
3 Cyp2d22 GSM2545336 4060 Mus musculus 8 Female InfluenzaA C57BL/6 8
4 Klk6 GSM2545336 287 Mus musculus 8 Female InfluenzaA C57BL/6 8
5 Fcrls GSM2545336 85 Mus musculus 8 Female InfluenzaA C57BL/6 8
6 Slc2a4 GSM2545336 782 Mus musculus 8 Female InfluenzaA C57BL/6 8
tissue mouse ENTREZID
1 Cerebellum 14 109900
2 Cerebellum 14 11815
3 Cerebellum 14 56448
4 Cerebellum 14 19144
5 Cerebellum 14 80891
6 Cerebellum 14 20528
product
1 argininosuccinate lyase, transcript variant X1
2 apolipoprotein D, transcript variant 3
3 cytochrome P450, family 2, subfamily d, polypeptide 22, transcript variant 2
4 kallikrein related-peptidase 6, transcript variant 2
5 Fc receptor-like S, scavenger receptor, transcript variant X1
6 solute carrier family 2 (facilitated glucose transporter), member 4
ensembl_gene_id external_synonym chromosome_name gene_biotype
1 ENSMUSG00000025533 2510006M18Rik 5 protein_coding
2 ENSMUSG00000022548 <NA> 16 protein_coding
3 ENSMUSG00000061740 2D22 15 protein_coding
4 ENSMUSG00000050063 Bssp 7 protein_coding
5 ENSMUSG00000015852 2810439C17Rik 3 protein_coding
6 ENSMUSG00000018566 Glut-4 11 protein_coding
phenotype_description
1 abnormal circulating amino acid level
2 abnormal lipid homeostasis
3 abnormal skin morphology
4 abnormal cytokine level
5 decreased CD8-positive alpha-beta T cell number
6 abnormal circulating glucose level
hsapiens_homolog_associated_gene_name
1 ASL
2 APOD
3 CYP2D6
4 KLK6
5 FCRL2
6 SLC2A4
## Try also
## View(rna)
Note
read.csv()
assumes that fields are delineated by commas; however, in
several countries, the comma is used as a decimal separator and the
semicolon (;) is used as a field delineator. If you want to read in
these types of files in R, you can use the read.csv2()
function. It
behaves exactly like read.csv()
but uses different parameters for
the decimal and the field separators. If you are working with another
format, both can be specified by the user. Check out the help for
read.csv()
by typing ?read.csv
to learn more. There is also the
read.delim()
function for reading tab separated data files. It is important to
note that all of these functions are actually wrapper functions for
the main read.table()
function with different arguments. As such,
the data above could have also been loaded by using read.table()
with the separation argument as ,
. The code is as follows:
rna <- read.table(file = "course-data/data/GSE96870/rnaseq.csv",
sep = ",",
header = TRUE)
The header argument has to be set to TRUE to read the column
headers; by default, read.table() has the header argument set to
FALSE.
What are data frames?
Data frames are the de facto data structure for most tabular data, and what we use for statistics and plotting.
A data frame can be created by hand, but most commonly they are
generated by the functions read.csv()
or read.table()
; in other
words, when importing spreadsheets from your hard drive (or the web).
A data frame is the representation of data in the format of a table where the columns are vectors that all have the same length. Because columns are vectors, each column must contain a single type of data (e.g., characters, integers, factors). For example, here is a figure depicting a data frame comprising a numeric, a character, and a logical vector.
We can see this when inspecting the structure of a data frame
with the function str()
:
str(rna)
'data.frame': 32428 obs. of 19 variables:
$ gene : chr "Asl" "Apod" "Cyp2d22" "Klk6" ...
$ sample : chr "GSM2545336" "GSM2545336" "GSM2545336" "GSM2545336" ...
$ expression : int 1170 36194 4060 287 85 782 1619 288 43217 1071 ...
$ organism : chr "Mus musculus" "Mus musculus" "Mus musculus" "Mus musculus" ...
$ age : int 8 8 8 8 8 8 8 8 8 8 ...
$ sex : chr "Female" "Female" "Female" "Female" ...
$ infection : chr "InfluenzaA" "InfluenzaA" "InfluenzaA" "InfluenzaA" ...
$ strain : chr "C57BL/6" "C57BL/6" "C57BL/6" "C57BL/6" ...
$ time : int 8 8 8 8 8 8 8 8 8 8 ...
$ tissue : chr "Cerebellum" "Cerebellum" "Cerebellum" "Cerebellum" ...
$ mouse : int 14 14 14 14 14 14 14 14 14 14 ...
$ ENTREZID : int 109900 11815 56448 19144 80891 20528 97827 118454 18823 14696 ...
$ product : chr "argininosuccinate lyase, transcript variant X1" "apolipoprotein D, transcript variant 3" "cytochrome P450, family 2, subfamily d, polypeptide 22, transcript variant 2" "kallikrein related-peptidase 6, transcript variant 2" ...
$ ensembl_gene_id : chr "ENSMUSG00000025533" "ENSMUSG00000022548" "ENSMUSG00000061740" "ENSMUSG00000050063" ...
$ external_synonym : chr "2510006M18Rik" NA "2D22" "Bssp" ...
$ chromosome_name : chr "5" "16" "15" "7" ...
$ gene_biotype : chr "protein_coding" "protein_coding" "protein_coding" "protein_coding" ...
$ phenotype_description : chr "abnormal circulating amino acid level" "abnormal lipid homeostasis" "abnormal skin morphology" "abnormal cytokine level" ...
$ hsapiens_homolog_associated_gene_name: chr "ASL" "APOD" "CYP2D6" "KLK6" ...
Inspecting data.frame
Objects
We already saw how the functions head()
and str()
can be useful to
check the content and the structure of a data frame. Here is a
non-exhaustive list of functions to get a sense of the
content/structure of the data. Let’s try them out!
Size:
dim(rna)
- returns a vector with the number of rows in the first element, and the number of columns as the second element (the dimensions of the object)
nrow(rna)
- returns the number of rows
ncol(rna)
- returns the number of columns
Content:
head(rna)
- shows the first 6 rows
tail(rna)
- shows the last 6 rows
Names:
names(rna)
- returns the column names (synonym of colnames()
for data.frame
objects)
rownames(rna)
- returns the row names
Summary:
str(rna)
- structure of the object and information about the class, length and content of each column
summary(rna)
- summary statistics for each column
Note: most of these functions are “generic”; they can be used on other types of
objects besides data.frame
.
Challenge:
Based on the output of
str(rna)
, can you answer the following questions?
- What is the class of the object
rna
?
- How many rows and how many columns are in this object?
- How many genes (as defined by the
gene
variable) have been measured in this experiment?
Solution
- class: data frame
- how many rows: 32428, how many columns: 19
- how many genes: 1477
Indexing and subsetting data frames
Our rna
data frame has rows and columns (it has 2 dimensions), if we
want to extract some specific data from it, we need to specify the
“coordinates” we want from it. Row numbers come first, followed by
column numbers. However, note that different ways of specifying these
coordinates lead to results with different classes.
# first element in the first column of the data frame (as a vector)
rna[1, 1]
# first element in the 6th column (as a vector)
rna[1, 6]
# first column of the data frame (as a vector)
rna[, 1]
# first column of the data frame (as a data.frame)
rna[1]
# first three elements in the 7th column (as a vector)
rna[1:3, 7]
# the 3rd row of the data frame (as a data.frame)
rna[3, ]
# equivalent to head_rna <- head(rna)
head_rna <- rna[1:6, ]
head_rna
:
is a special function that creates numeric vectors of integers in
increasing or decreasing order, test 1:10
and 10:1
for
instance. See section \@ref(sec:genvec) for details.
You can also exclude certain indices of a data frame using the “-
” sign:
rna[, -1] ## The whole data frame, except the first column
rna[-c(7:32428), ] ## Equivalent to head(rna)
Data frames can be subset by calling indices (as shown previously), but also by calling their column names directly:
rna["gene"] # Result is a data.frame
rna[, "gene"] # Result is a vector
rna[["gene"]] # Result is a vector
rna$gene # Result is a vector
In RStudio, you can use the autocompletion feature to get the full and correct names of the columns.
Challenge
Create a
data.frame
(rna_200
) containing only the data in row 200 of therna
dataset.Notice how
nrow()
gave you the number of rows in a data.frame
?
Use that number to pull out just that last row in the initial
rna
data frame.Compare that with what you see as the last row using
tail()
to make sure it’s meeting expectations.
Pull out that last row using
nrow()
instead of the row number.
Create a new data frame (
rna_last
) from that last row.
Use
nrow()
to extract the row that is in the middle of the rna
dataframe. Store the content of this row in an object named rna_middle
.Combine
nrow()
with the-
notation above to reproduce the behavior of head(rna)
, keeping just the first through 6th rows of the rna dataset.Solution
## 1.
rna_200 <- rna[200, ]
## 2.
## Saving `n_rows` to improve readability and reduce duplication
n_rows <- nrow(rna)
rna_last <- rna[n_rows, ]
## 3.
rna_middle <- rna[n_rows / 2, ]
## 4.
rna_head <- rna[-(7:n_rows), ]
Factors
Factors represent categorical data. They are stored as integers associated with labels and they can be ordered or unordered. While factors look (and often behave) like character vectors, they are actually treated as integer vectors by R. So you need to be very careful when treating them as strings.
Once created, factors can only contain a pre-defined set of values, known as levels. By default, R always sorts levels in alphabetical order. For instance, if you have a factor with 2 levels:
sex <- factor(c("male", "female", "female", "male", "female"))
R will assign 1
to the level "female"
and 2
to the level
"male"
(because f
comes before m
, even though the first element
in this vector is "male"
). You can see this by using the function
levels()
and you can find the number of levels using nlevels()
:
levels(sex)
[1] "female" "male"
nlevels(sex)
[1] 2
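The warning above, that factors are stored as integers, can be made visible with as.integer(); it also explains a classic pitfall when a factor contains numbers:

```r
sex <- factor(c("male", "female", "female", "male", "female"))
as.integer(sex)   ## the underlying codes: "female" is 1, "male" is 2
#> [1] 2 1 1 2 1

## Pitfall: converting a factor of numbers returns the codes, not the values
f <- factor(c(10, 5, 10))
as.integer(f)
#> [1] 2 1 2
as.numeric(as.character(f))   ## convert via character to recover the values
#> [1] 10  5 10
```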
Sometimes, the order of the factors does not matter, other times you
might want to specify the order because it is meaningful (e.g., “low”,
“medium”, “high”), it improves your visualization, or it is required
by a particular type of analysis. Here, one way to reorder our levels
in the sex
vector would be:
sex ## current order
[1] male female female male female
Levels: female male
sex <- factor(sex, levels = c("male", "female"))
sex ## after re-ordering
[1] male female female male female
Levels: male female
In R’s memory, these factors are represented by integers (1, 2, 3),
but are more informative than integers because factors are self
describing: "female"
, "male"
is more descriptive than 1
,
2
. Which one is “male”? You wouldn’t be able to tell just from the
integer data. Factors, on the other hand, have this information built
in. It is particularly helpful when there are many levels (like the
species names in our example dataset).
Converting to factors {-}
If you need to convert a factor to a character vector, you use
as.character(x)
.
as.character(sex)
[1] "male" "female" "female" "male" "female"
Renaming factors {-}
When your data is stored as a factor, you can use the plot()
function to get a quick glance at the number of observations
represented by each factor level. Let’s look at the number of males
and females in our data.
plot(sex)
If we want to rename these factor levels, it is sufficient to change them:
levels(sex)
[1] "male" "female"
levels(sex) <- c("M", "F")
sex
[1] M F F M F
Levels: M F
plot(sex)
Homework Challenge:
- Rename “female” and “male” to “Female” and “Male” respectively.
Challenge:
We have seen how data frames are created when using
read.csv()
, but they can also be created by hand with the data.frame()
function. There are a few mistakes in this hand-crafted data.frame
. Can you spot and fix them? Don’t hesitate to experiment!

animal_data <- data.frame(
  animal = c(dog, cat, sea cucumber, sea urchin),
  feel = c("furry", "squishy", "spiny"),
  weight = c(45, 8 1.1, 0.8))
Solution
- missing quotations around the names of the animals
- missing one entry in the “feel” column (probably for one of the furry animals)
- missing one comma in the weight column
Homework Challenge:
Can you predict the class for each of the columns in the following example?
Check your guesses using
str(country_climate)
:
Are they what you expected? Why? Why not?
Try again by adding
stringsAsFactors = TRUE
after the last variable when creating the data frame. What is happening now? stringsAsFactors
can also be set when reading text-based spreadsheets into R using read.csv()
.

country_climate <- data.frame(
  country = c("Canada", "Panama", "South Africa", "Australia"),
  climate = c("cold", "hot", "temperate", "hot/temperate"),
  temperature = c(10, 30, 18, "15"),
  northern_hemisphere = c(TRUE, TRUE, FALSE, "FALSE"),
  has_kangaroo = c(FALSE, FALSE, FALSE, 1)
)
The automatic conversion of data type is sometimes a blessing, sometimes an annoyance. Be aware that it exists, learn the rules, and double check that data you import in R are of the correct type within your data frame. If not, use it to your advantage to detect mistakes that might have been introduced during data entry (a letter in a column that should only contain numbers for instance).
Learn more in this RStudio tutorial
Matrices
Before proceeding, now that we have learnt about dataframes, let’s
recap package installation and learn about a new data type, namely the
matrix
. Like a data.frame
, a matrix has two dimensions, rows and
columns. But the major difference is that all cells in a matrix
must
be of the same type: numeric
, character
, logical
, … In that
respect, matrices are closer to a vector
than a data.frame
.
The default constructor for a matrix is matrix
. It takes a vector of
values to populate the matrix and the number of rows and/or
columns. The values are arranged along the columns, as illustrated
below.
m <- matrix(1:9, ncol = 3, nrow = 3)
m
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
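If the values should instead be arranged along the rows, the matrix() function has a byrow argument that can be set to TRUE:

```r
m2 <- matrix(1:9, ncol = 3, nrow = 3, byrow = TRUE)  ## fill row by row
m2
#>      [,1] [,2] [,3]
#> [1,]    1    2    3
#> [2,]    4    5    6
#> [3,]    7    8    9
```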
Homework Challenge:
Using the function
installed.packages()
, create a character
matrix containing the information about all packages currently installed on your computer. Explore it.
Solution:
## create the matrix
ip <- installed.packages()
head(ip)
## try also View(ip)
## number of packages
nrow(ip)
## names of all installed packages
rownames(ip)
## type of information we have about each package
colnames(ip)
It is often useful to create large random data matrices as test
data. The exercise below asks you to create such a matrix with random
data drawn from a normal distribution of mean 0 and standard deviation
1, which can be done with the rnorm()
function.
Homework Challenge:
Construct a matrix of dimension 1000 by 3 of normally distributed data (mean 0, standard deviation 1)
Solution
set.seed(123)
m <- matrix(rnorm(3000), ncol = 3)
dim(m)
[1] 1000 3
head(m)
            [,1]        [,2]       [,3]
[1,] -0.56047565 -0.99579872 -0.5116037
[2,] -0.23017749 -1.03995504  0.2369379
[3,]  1.55870831 -0.01798024 -0.5415892
[4,]  0.07050839 -0.13217513  1.2192276
[5,]  0.12928774 -2.54934277  0.1741359
[6,]  1.71506499  1.04057346 -0.6152683
Summary of R objects
So far, we have seen several types of R object varying in the number of dimensions and whether they could store a single or multiple data types:
- vector: one dimension (they have a length), single type of data.
- matrix: two dimensions, single type of data.
- data.frame: two dimensions, one type per column.
Lists
A data type that we haven’t seen yet, but that is useful to know, and follows from the summary that we have just seen are lists:
list
: one dimension, every item can be of a different data type.
Below, let’s create a list containing a vector of numbers, characters, a matrix, a dataframe and another list:
l <- list(1:10, ## numeric
letters, ## character
installed.packages(), ## a matrix
cars, ## a data.frame
list(1, 2, 3)) ## a list
length(l)
[1] 5
str(l)
List of 5
$ : int [1:10] 1 2 3 4 5 6 7 8 9 10
$ : chr [1:26] "a" "b" "c" "d" ...
$ : chr [1:399, 1:16] "abind" "annotate" "AnnotationDbi" "AnnotationHub" ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:399] "abind" "annotate" "AnnotationDbi" "AnnotationHub" ...
.. ..$ : chr [1:16] "Package" "LibPath" "Version" "Priority" ...
$ :'data.frame': 50 obs. of 2 variables:
..$ speed: num [1:50] 4 4 7 7 8 9 10 10 10 11 ...
..$ dist : num [1:50] 2 10 4 22 16 10 18 26 34 17 ...
$ :List of 3
..$ : num 1
..$ : num 2
..$ : num 3
List subsetting is done using `[]` to subset a new sub-list, or `[[]]`
to extract a single element of that list (using indices or names, if
the list is named).
l[[1]] ## first element
[1] 1 2 3 4 5 6 7 8 9 10
l[1:2] ## a list of length 2
[[1]]
[1] 1 2 3 4 5 6 7 8 9 10
[[2]]
[1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s"
[20] "t" "u" "v" "w" "x" "y" "z"
l[1] ## a list of length 1
[[1]]
[1] 1 2 3 4 5 6 7 8 9 10
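Lists can also be named, in which case elements can be extracted with the `$` operator, as for data frames. A minimal sketch (the names `counts` and `label` are chosen purely for illustration):

```r
## a named list mixing a numeric vector and a character string
nl <- list(counts = 1:3, label = "experiment 1")
nl$counts      ## extract by name, equivalent to nl[["counts"]]
nl[["label"]]
```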
Exporting and saving data
Exporting tabular data {-}
We have seen how to read a text-based spreadsheet into R using the
`read.table` family of functions. To export a `data.frame` to a
text-based spreadsheet, we can use the `write.table` set of functions
(`write.csv`, `write.csv2`, …). They all take the variable to be
exported and the file to export to. For example, to export the
`rna` data to an `rna.csv` file in the `data_output` directory, we
would execute:
write.csv(rna, file = "data_output/rna.csv")
This new csv file can now be shared with other collaborators who aren’t familiar with R.
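By default, `write.csv` also writes the row names as an unnamed first column, which collaborators rarely expect. This can be turned off with the `row.names` argument:

```r
## omit row names to get a clean, purely rectangular CSV file
write.csv(rna, file = "data_output/rna.csv", row.names = FALSE)
```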
Saving data {-}
Exporting data to a spreadsheet has several limitations, such as those
described in the first chapter: possible inconsistencies between `,`
and `.` as decimal separators, and the lack of variable type
definitions. Furthermore, exporting data to a spreadsheet is only
relevant for rectangular data such as data frames and matrices.
A more general way to save data, that is specific to R and is
guaranteed to work on any operating system, is to use the `save`
function. Saving an object will generate a binary representation of
the object on disk, an R Data file (`rda` extension), that is
guaranteed to produce the same object once loaded back into R using
the `load` function.
save(rna, file = "data_output/rna.rda")
rm(rna)
load("data_output/rna.rda")
head(rna)
Note how the `load` function loads the object in the file directly
into the global environment.
There are also the `saveRDS` and `readRDS` functions that save R
objects to binary files (using the `rds` extension here) and read
these back into R. From a user's perspective, the main difference is
that `load` loads an object into the global environment, while
`readRDS` reads the data from disk and returns it. It is thus
necessary to store the output of `readRDS` in a variable:
saveRDS(rna, file = "data_output/rna.rds")
rm(rna)
rna <- readRDS("data_output/rna.rds")
head(rna)
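Because `saveRDS` works for any R object, it is also a convenient way to store non-rectangular data that couldn't be exported to a spreadsheet, such as a list. A minimal sketch (the object and file names are chosen for illustration):

```r
## a list cannot be written as a CSV, but serialises without loss
results <- list(coefficients = c(a = 1.2, b = -0.5),
                genes = c("Asl", "Apod"))
saveRDS(results, file = "data_output/results.rds")
## the round-trip returns an identical object
identical(readRDS("data_output/results.rds"), results)
```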
To conclude, when it comes to saving data from R that will be loaded again in R, saving and loading is the preferred approach. If tabular data need to be shared with somebody that is not using R, then exporting to a text-based spreadsheet is a good alternative.
¹ Either the number of rows or columns is enough, as the other one can be deduced from the length of the values. Try out what happens if the values and the number of rows/columns don't add up.
Key Points
Tabular data in R
Manipulating and analyzing data with dplyr
Overview
Teaching: XX min
Exercises: XX min
Questions
Data analysis in R using the tidyverse meta-package
Objectives
Describe the purpose of the `dplyr` and `tidyr` packages.
Describe several of their functions that are extremely useful to manipulate data.
Describe the concept of a wide and a long table format, and see how to reshape a data frame from one format to the other one.
Data Manipulation using dplyr
and tidyr
Bracket subsetting is handy, but it can be cumbersome and difficult to read, especially for complicated operations.
Some packages can greatly facilitate our task when we manipulate data.
Packages in R are basically sets of additional functions that let you
do more stuff. The functions we've been using so far, like `str()` or
`data.frame()`, come built into R; loading packages can give you access to other
specific functions. Before you use a package for the first time you need to install
it on your machine, and then you should load it in every subsequent
R session when you need it.
- The package `dplyr` provides powerful tools for data manipulation tasks. It is built to work directly with data frames, with many manipulation tasks optimized.
- As we will see later on, sometimes we want a data frame to be reshaped to be able to do some specific analyses or for visualization. The package `tidyr` addresses this common problem of reshaping data and provides tools for manipulating data in a tidy way.
To learn more about `dplyr` and `tidyr` after the workshop, you may
want to check out this handy data transformation with `dplyr`
cheatsheet and this one about `tidyr`.
- The `tidyverse` package is an "umbrella-package" that installs several useful packages for data analysis which work well together, such as `tidyr`, `dplyr`, `ggplot2`, `tibble`, etc. These packages help us to work and interact with the data. They allow us to do many things with our data, such as subsetting, transforming, visualizing, etc.
To install and load the tidyverse
package type:
# no need to run this command as it's already been installed for you
# BiocManager::install("tidyverse")
## load the tidyverse packages, incl. dplyr
library("tidyverse")
Loading data with tidyverse
Instead of `read.csv()`, we will read in our data using the `read_csv()`
function from the tidyverse package `readr`.
rna <- read_csv("course-data/data/GSE96870/rnaseq.csv")
## view the data
rna
# A tibble: 32,428 × 19
gene sample expression organism age sex infection strain time tissue
<chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 Asl GSM254… 1170 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
2 Apod GSM254… 36194 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
3 Cyp2d22 GSM254… 4060 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
4 Klk6 GSM254… 287 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
5 Fcrls GSM254… 85 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
6 Slc2a4 GSM254… 782 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
7 Exd2 GSM254… 1619 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
8 Gjc2 GSM254… 288 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
9 Plp1 GSM254… 43217 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
10 Gnb4 GSM254… 1071 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
# … with 32,418 more rows, and 9 more variables: mouse <dbl>, ENTREZID <dbl>,
# product <chr>, ensembl_gene_id <chr>, external_synonym <chr>,
# chromosome_name <chr>, gene_biotype <chr>, phenotype_description <chr>,
# hsapiens_homolog_associated_gene_name <chr>
Notice that the class of the data is now referred to as a "tibble".
Tibbles tweak some of the behaviors of the data frame objects we introduced previously. The data structure is very similar to a data frame. For our purposes the only differences are that:
- It displays the data type of each column under its name. Note that `<dbl>` is a data type defined to hold numeric values with decimal points.
- It only prints the first few rows of data and only as many columns as fit on one screen.
We are now going to learn some of the most common `dplyr` functions:
- `select()`: subset columns
- `filter()`: subset rows on conditions
- `mutate()`: create new columns by using information from other columns
- `group_by()` and `summarize()`: create summary statistics on grouped data
- `arrange()`: sort results
- `count()`: count discrete values
Selecting columns and filtering rows
To select columns of a data frame, use `select()`. The first argument
to this function is the data frame (`rna`), and the subsequent
arguments are the columns to keep.
select(rna, gene, sample, tissue, expression)
# A tibble: 32,428 × 4
gene sample tissue expression
<chr> <chr> <chr> <dbl>
1 Asl GSM2545336 Cerebellum 1170
2 Apod GSM2545336 Cerebellum 36194
3 Cyp2d22 GSM2545336 Cerebellum 4060
4 Klk6 GSM2545336 Cerebellum 287
5 Fcrls GSM2545336 Cerebellum 85
6 Slc2a4 GSM2545336 Cerebellum 782
7 Exd2 GSM2545336 Cerebellum 1619
8 Gjc2 GSM2545336 Cerebellum 288
9 Plp1 GSM2545336 Cerebellum 43217
10 Gnb4 GSM2545336 Cerebellum 1071
# … with 32,418 more rows
To select all columns except certain ones, put a `-` in front of the variable to exclude it.
select(rna, -tissue, -organism)
# A tibble: 32,428 × 17
gene sample expression age sex infection strain time mouse ENTREZID
<chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 Asl GSM2545… 1170 8 Fema… Influenz… C57BL… 8 14 109900
2 Apod GSM2545… 36194 8 Fema… Influenz… C57BL… 8 14 11815
3 Cyp2d22 GSM2545… 4060 8 Fema… Influenz… C57BL… 8 14 56448
4 Klk6 GSM2545… 287 8 Fema… Influenz… C57BL… 8 14 19144
5 Fcrls GSM2545… 85 8 Fema… Influenz… C57BL… 8 14 80891
6 Slc2a4 GSM2545… 782 8 Fema… Influenz… C57BL… 8 14 20528
7 Exd2 GSM2545… 1619 8 Fema… Influenz… C57BL… 8 14 97827
8 Gjc2 GSM2545… 288 8 Fema… Influenz… C57BL… 8 14 118454
9 Plp1 GSM2545… 43217 8 Fema… Influenz… C57BL… 8 14 18823
10 Gnb4 GSM2545… 1071 8 Fema… Influenz… C57BL… 8 14 14696
# … with 32,418 more rows, and 7 more variables: product <chr>,
# ensembl_gene_id <chr>, external_synonym <chr>, chromosome_name <chr>,
# gene_biotype <chr>, phenotype_description <chr>,
# hsapiens_homolog_associated_gene_name <chr>
This will select all the variables in `rna` except `tissue` and `organism`.
To choose rows based on specific criteria, use `filter()`:
filter(rna, sex == "Male")
# A tibble: 14,740 × 19
gene sample expression organism age sex infection strain time tissue
<chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 Asl GSM254… 626 Mus mus… 8 Male Influenz… C57BL… 4 Cereb…
2 Apod GSM254… 13021 Mus mus… 8 Male Influenz… C57BL… 4 Cereb…
3 Cyp2d22 GSM254… 2171 Mus mus… 8 Male Influenz… C57BL… 4 Cereb…
4 Klk6 GSM254… 448 Mus mus… 8 Male Influenz… C57BL… 4 Cereb…
5 Fcrls GSM254… 180 Mus mus… 8 Male Influenz… C57BL… 4 Cereb…
6 Slc2a4 GSM254… 313 Mus mus… 8 Male Influenz… C57BL… 4 Cereb…
7 Exd2 GSM254… 2366 Mus mus… 8 Male Influenz… C57BL… 4 Cereb…
8 Gjc2 GSM254… 310 Mus mus… 8 Male Influenz… C57BL… 4 Cereb…
9 Plp1 GSM254… 53126 Mus mus… 8 Male Influenz… C57BL… 4 Cereb…
10 Gnb4 GSM254… 1355 Mus mus… 8 Male Influenz… C57BL… 4 Cereb…
# … with 14,730 more rows, and 9 more variables: mouse <dbl>, ENTREZID <dbl>,
# product <chr>, ensembl_gene_id <chr>, external_synonym <chr>,
# chromosome_name <chr>, gene_biotype <chr>, phenotype_description <chr>,
# hsapiens_homolog_associated_gene_name <chr>
filter(rna, sex == "Male" & infection == "NonInfected")
# A tibble: 4,422 × 19
gene sample expression organism age sex infection strain time tissue
<chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 Asl GSM254… 535 Mus mus… 8 Male NonInfec… C57BL… 0 Cereb…
2 Apod GSM254… 13668 Mus mus… 8 Male NonInfec… C57BL… 0 Cereb…
3 Cyp2d22 GSM254… 2008 Mus mus… 8 Male NonInfec… C57BL… 0 Cereb…
4 Klk6 GSM254… 1101 Mus mus… 8 Male NonInfec… C57BL… 0 Cereb…
5 Fcrls GSM254… 375 Mus mus… 8 Male NonInfec… C57BL… 0 Cereb…
6 Slc2a4 GSM254… 249 Mus mus… 8 Male NonInfec… C57BL… 0 Cereb…
7 Exd2 GSM254… 3126 Mus mus… 8 Male NonInfec… C57BL… 0 Cereb…
8 Gjc2 GSM254… 791 Mus mus… 8 Male NonInfec… C57BL… 0 Cereb…
9 Plp1 GSM254… 98658 Mus mus… 8 Male NonInfec… C57BL… 0 Cereb…
10 Gnb4 GSM254… 2437 Mus mus… 8 Male NonInfec… C57BL… 0 Cereb…
# … with 4,412 more rows, and 9 more variables: mouse <dbl>, ENTREZID <dbl>,
# product <chr>, ensembl_gene_id <chr>, external_synonym <chr>,
# chromosome_name <chr>, gene_biotype <chr>, phenotype_description <chr>,
# hsapiens_homolog_associated_gene_name <chr>
Now let's imagine we are interested in the human homologs of the mouse
genes analysed in this dataset. This information can be found in the
last column of the `rna` tibble, named `hsapiens_homolog_associated_gene_name`.
Some mouse genes have no human homologs. These can be retrieved using a `filter()`
in the chain, and the `is.na()` function, which determines whether something is an `NA`.
rna_NA <- filter(rna, is.na(hsapiens_homolog_associated_gene_name))
select(rna_NA, gene, hsapiens_homolog_associated_gene_name)
# A tibble: 4,290 × 2
gene hsapiens_homolog_associated_gene_name
<chr> <chr>
1 Prodh <NA>
2 Tssk5 <NA>
3 Vmn2r1 <NA>
4 Gm10654 <NA>
5 Hexa <NA>
6 Sult1a1 <NA>
7 Gm6277 <NA>
8 Tmem198b <NA>
9 Adam1a <NA>
10 Ebp <NA>
# … with 4,280 more rows
If we want to keep only mouse genes that have a human homolog, we can insert a `!`
symbol that negates the result, so we're asking for every row where
`hsapiens_homolog_associated_gene_name` is not an `NA`.
rna_no_NA <- filter(rna, !is.na(hsapiens_homolog_associated_gene_name))
select(rna_no_NA, gene, hsapiens_homolog_associated_gene_name)
# A tibble: 28,138 × 2
gene hsapiens_homolog_associated_gene_name
<chr> <chr>
1 Asl ASL
2 Apod APOD
3 Cyp2d22 CYP2D6
4 Klk6 KLK6
5 Fcrls FCRL2
6 Slc2a4 SLC2A4
7 Exd2 EXD2
8 Gjc2 GJC2
9 Plp1 PLP1
10 Gnb4 GNB4
# … with 28,128 more rows
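Conditions in `filter()` can also be combined with the logical OR operator `|`, or tested against a set of values with `%in%`; the sketch below assumes the `rna` tibble loaded above:

```r
## keep rows measured at time 4 OR time 8 (the infected timepoints)
rna_infected_times <- filter(rna, time == 4 | time == 8)
## equivalently, test membership in a set of values
rna_infected_times <- filter(rna, time %in% c(4, 8))
```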
Pipes
What if you want to select and filter at the same time? There are three ways to do this: use intermediate steps, nested functions, or pipes.
With intermediate steps, you create a temporary data frame and use that as input to the next function, like this:
rna2 <- filter(rna, sex == "Male")
rna3 <- select(rna2, gene, sample, tissue, expression)
rna3
# A tibble: 14,740 × 4
gene sample tissue expression
<chr> <chr> <chr> <dbl>
1 Asl GSM2545340 Cerebellum 626
2 Apod GSM2545340 Cerebellum 13021
3 Cyp2d22 GSM2545340 Cerebellum 2171
4 Klk6 GSM2545340 Cerebellum 448
5 Fcrls GSM2545340 Cerebellum 180
6 Slc2a4 GSM2545340 Cerebellum 313
7 Exd2 GSM2545340 Cerebellum 2366
8 Gjc2 GSM2545340 Cerebellum 310
9 Plp1 GSM2545340 Cerebellum 53126
10 Gnb4 GSM2545340 Cerebellum 1355
# … with 14,730 more rows
This is readable, but can clutter up your workspace with lots of intermediate objects that you have to name individually. With multiple steps, that can be hard to keep track of.
You can also nest functions (i.e. one function inside of another), like this:
rna3 <- select(filter(rna, sex == "Male"), gene, sample, tissue, expression)
rna3
# A tibble: 14,740 × 4
gene sample tissue expression
<chr> <chr> <chr> <dbl>
1 Asl GSM2545340 Cerebellum 626
2 Apod GSM2545340 Cerebellum 13021
3 Cyp2d22 GSM2545340 Cerebellum 2171
4 Klk6 GSM2545340 Cerebellum 448
5 Fcrls GSM2545340 Cerebellum 180
6 Slc2a4 GSM2545340 Cerebellum 313
7 Exd2 GSM2545340 Cerebellum 2366
8 Gjc2 GSM2545340 Cerebellum 310
9 Plp1 GSM2545340 Cerebellum 53126
10 Gnb4 GSM2545340 Cerebellum 1355
# … with 14,730 more rows
This is handy, but can be difficult to read if too many functions are nested, as R evaluates the expression from the inside out (in this case, filtering, then selecting).
The last option, pipes, is a more recent addition to R. Pipes let you take the output of one function and send it directly to the next, which is useful when you need to do many things to the same dataset.
Pipes in R look like `%>%` and are made available via the `magrittr` package,
installed automatically with `dplyr`. If you use RStudio, you can type the pipe with
Ctrl + Shift + M if you have a PC or Cmd + Shift + M if you have a Mac.
In the code below, we use the pipe to send the `rna` dataset first through
`filter()` to keep rows where `sex` is Male, then through `select()`
to keep only the `gene`, `sample`, `tissue`, and `expression` columns.
The pipe `%>%` takes the object on its left and passes it directly as the first
argument to the function on its right, so we don't need to explicitly include the data frame
as an argument to the `filter()` and `select()` functions any more.
rna %>%
filter(sex == "Male") %>%
select(gene, sample, tissue, expression)
# A tibble: 14,740 × 4
gene sample tissue expression
<chr> <chr> <chr> <dbl>
1 Asl GSM2545340 Cerebellum 626
2 Apod GSM2545340 Cerebellum 13021
3 Cyp2d22 GSM2545340 Cerebellum 2171
4 Klk6 GSM2545340 Cerebellum 448
5 Fcrls GSM2545340 Cerebellum 180
6 Slc2a4 GSM2545340 Cerebellum 313
7 Exd2 GSM2545340 Cerebellum 2366
8 Gjc2 GSM2545340 Cerebellum 310
9 Plp1 GSM2545340 Cerebellum 53126
10 Gnb4 GSM2545340 Cerebellum 1355
# … with 14,730 more rows
Some may find it helpful to read the pipe like the word "then". For instance,
in the above example, we took the data frame `rna`, then we `filter`ed
for rows with `sex == "Male"`, then we `select`ed columns `gene`, `sample`,
`tissue`, and `expression`.
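As an aside relative to the text above, recent versions of R (4.1 and later) also provide a native pipe operator, `|>`, which behaves very similarly for simple chains like this one:

```r
## same result using R's native pipe (requires R >= 4.1)
rna |>
    filter(sex == "Male") |>
    select(gene, sample, tissue, expression)
```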
The dplyr
functions by themselves are somewhat
simple, but by combining them into linear workflows with the pipe, we can accomplish
more complex manipulations of data frames.
If we want to create a new object with this smaller version of the data, we can assign it a new name:
rna3 <- rna %>%
filter(sex == "Male") %>%
select(gene, sample, tissue, expression)
rna3
# A tibble: 14,740 × 4
gene sample tissue expression
<chr> <chr> <chr> <dbl>
1 Asl GSM2545340 Cerebellum 626
2 Apod GSM2545340 Cerebellum 13021
3 Cyp2d22 GSM2545340 Cerebellum 2171
4 Klk6 GSM2545340 Cerebellum 448
5 Fcrls GSM2545340 Cerebellum 180
6 Slc2a4 GSM2545340 Cerebellum 313
7 Exd2 GSM2545340 Cerebellum 2366
8 Gjc2 GSM2545340 Cerebellum 310
9 Plp1 GSM2545340 Cerebellum 53126
10 Gnb4 GSM2545340 Cerebellum 1355
# … with 14,730 more rows
Homework Challenge:
Using pipes, subset the `rna` data to keep genes with an expression higher than 50000 in female mice at time 0, and retain only the columns `gene`, `sample`, `time`, `expression` and `age`.
Solution
rna %>%
      filter(expression > 50000,
             sex == "Female",
             time == 0) %>%
      select(gene, sample, time, expression, age)
# A tibble: 9 × 5
  gene   sample      time expression   age
  <chr>  <chr>      <dbl>      <dbl> <dbl>
1 Plp1   GSM2545337     0     101241     8
2 Atp1b1 GSM2545337     0      53260     8
3 Plp1   GSM2545338     0      96534     8
4 Atp1b1 GSM2545338     0      50614     8
5 Plp1   GSM2545348     0     102790     8
6 Atp1b1 GSM2545348     0      59544     8
7 Plp1   GSM2545353     0      71237     8
8 Glul   GSM2545353     0      52451     8
9 Atp1b1 GSM2545353     0      61451     8
Mutate
Frequently you’ll want to create new columns based on the values of existing
columns, for example to do unit conversions, or to find the ratio of values in two
columns. For this we’ll use mutate()
.
To create a new column of time in hours:
rna %>%
mutate(time_hours = time * 24) %>%
select(time, time_hours)
# A tibble: 32,428 × 2
time time_hours
<dbl> <dbl>
1 8 192
2 8 192
3 8 192
4 8 192
5 8 192
6 8 192
7 8 192
8 8 192
9 8 192
10 8 192
# … with 32,418 more rows
You can also create a second new column based on the first new column within the same call of mutate()
:
rna %>%
mutate(time_hours = time * 24,
time_mn = time_hours * 60) %>%
select(time, time_hours, time_mn)
# A tibble: 32,428 × 3
time time_hours time_mn
<dbl> <dbl> <dbl>
1 8 192 11520
2 8 192 11520
3 8 192 11520
4 8 192 11520
5 8 192 11520
6 8 192 11520
7 8 192 11520
8 8 192 11520
9 8 192 11520
10 8 192 11520
# … with 32,418 more rows
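`mutate()` can also be combined with a condition, for example using `ifelse()`, to create a categorical column; the cutoff of 1000 below is purely illustrative:

```r
## flag genes as highly or lowly expressed (threshold chosen for illustration)
rna %>%
    mutate(expression_level = ifelse(expression > 1000, "high", "low")) %>%
    select(gene, expression, expression_level)
```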
Homework Challenge
Create a new data frame from the `rna` data that meets the following criteria: contains only the `gene`, `chromosome_name`, `phenotype_description`, `sample`, and `expression` columns and a new column giving the log expression of the gene. This data frame must only contain genes located on autosomes and associated with a phenotype_description.
Hint: think about how the commands should be ordered to produce this data frame!
Solution
rna %>%
      filter(chromosome_name != "X",
             chromosome_name != "Y") %>%
      mutate(log_expression = log(expression)) %>%
      select(gene, chromosome_name, phenotype_description, sample, log_expression) %>%
      filter(!is.na(phenotype_description))
# A tibble: 21,054 × 5
   gene    chromosome_name phenotype_description           sample log_expression
   <chr>   <chr>           <chr>                           <chr>           <dbl>
 1 Asl     5               abnormal circulating amino aci… GSM25…           7.06
 2 Apod    16              abnormal lipid homeostasis      GSM25…          10.5
 3 Cyp2d22 15              abnormal skin morphology        GSM25…           8.31
 4 Klk6    7               abnormal cytokine level         GSM25…           5.66
 5 Fcrls   3               decreased CD8-positive alpha-b… GSM25…           4.44
 6 Slc2a4  11              abnormal circulating glucose l… GSM25…           6.66
 7 Gjc2    11              Purkinje cell degeneration      GSM25…           5.66
 8 Gnb4    3               decreased anxiety-related resp… GSM25…           6.98
 9 Tnc     4               abnormal CNS synaptic transmis… GSM25…           5.39
10 Trf     9               abnormal circulating phosphate… GSM25…           9.18
# … with 21,044 more rows
Split-apply-combine data analysis
Many data analysis tasks can be approached using the
split-apply-combine paradigm: split the data into groups, apply some
analysis to each group, and then combine the results. dplyr
makes this very easy through the use of the group_by()
function.
rna %>%
group_by(gene)
# A tibble: 32,428 × 19
# Groups: gene [1,474]
gene sample expression organism age sex infection strain time tissue
<chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 Asl GSM254… 1170 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
2 Apod GSM254… 36194 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
3 Cyp2d22 GSM254… 4060 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
4 Klk6 GSM254… 287 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
5 Fcrls GSM254… 85 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
6 Slc2a4 GSM254… 782 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
7 Exd2 GSM254… 1619 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
8 Gjc2 GSM254… 288 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
9 Plp1 GSM254… 43217 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
10 Gnb4 GSM254… 1071 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
# … with 32,418 more rows, and 9 more variables: mouse <dbl>, ENTREZID <dbl>,
# product <chr>, ensembl_gene_id <chr>, external_synonym <chr>,
# chromosome_name <chr>, gene_biotype <chr>, phenotype_description <chr>,
# hsapiens_homolog_associated_gene_name <chr>
The `group_by()` function doesn't perform any data processing; it
groups the data into subsets: in the example above, our initial
tibble of 32,428 observations is split into
1,474 groups based on the `gene` variable.
We could similarly decide to group the tibble by the samples:
rna %>%
group_by(sample)
# A tibble: 32,428 × 19
# Groups: sample [22]
gene sample expression organism age sex infection strain time tissue
<chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 Asl GSM254… 1170 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
2 Apod GSM254… 36194 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
3 Cyp2d22 GSM254… 4060 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
4 Klk6 GSM254… 287 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
5 Fcrls GSM254… 85 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
6 Slc2a4 GSM254… 782 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
7 Exd2 GSM254… 1619 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
8 Gjc2 GSM254… 288 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
9 Plp1 GSM254… 43217 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
10 Gnb4 GSM254… 1071 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
# … with 32,418 more rows, and 9 more variables: mouse <dbl>, ENTREZID <dbl>,
# product <chr>, ensembl_gene_id <chr>, external_synonym <chr>,
# chromosome_name <chr>, gene_biotype <chr>, phenotype_description <chr>,
# hsapiens_homolog_associated_gene_name <chr>
Here our initial tibble of 32,428 observations is split into
22 groups based on the `sample` variable.
Once the data have been grouped, subsequent operations will be applied on each group independently.
The summarize()
function
group_by()
is often used together with summarize()
, which
collapses each group into a single-row summary of that group.
group_by()
takes as arguments the column names that contain the
categorical variables for which you want to calculate the summary
statistics. So to compute the mean expression
by gene:
rna %>%
group_by(gene) %>%
summarize(mean_expression = mean(expression))
# A tibble: 1,474 × 2
gene mean_expression
<chr> <dbl>
1 Aamp 4751.
2 Abca12 4.55
3 Abcc8 2498.
4 Abhd14a 525.
5 Abi2 4909.
6 Abi3bp 1002.
7 Abl2 2124.
8 Acadl 2053.
9 Acap3 3536.
10 Acbd4 1431.
# … with 1,464 more rows
We could also want to calculate the mean expression levels of all genes in each sample:
rna %>%
group_by(sample) %>%
summarize(mean_expression = mean(expression))
# A tibble: 22 × 2
sample mean_expression
<chr> <dbl>
1 GSM2545336 2062.
2 GSM2545337 1766.
3 GSM2545338 1668.
4 GSM2545339 1696.
5 GSM2545340 1682.
6 GSM2545341 1638.
7 GSM2545342 1594.
8 GSM2545343 2107.
9 GSM2545344 1712.
10 GSM2545345 1700.
# … with 12 more rows
But we can also group by multiple columns:
rna %>%
group_by(gene, infection, time) %>%
summarize(mean_expression = mean(expression))
`summarise()` has grouped output by 'gene', 'infection'. You can override using
the `.groups` argument.
# A tibble: 4,422 × 4
# Groups: gene, infection [2,948]
gene infection time mean_expression
<chr> <chr> <dbl> <dbl>
1 Aamp InfluenzaA 4 4870
2 Aamp InfluenzaA 8 4763.
3 Aamp NonInfected 0 4603.
4 Abca12 InfluenzaA 4 4.25
5 Abca12 InfluenzaA 8 4.14
6 Abca12 NonInfected 0 5.29
7 Abcc8 InfluenzaA 4 2609.
8 Abcc8 InfluenzaA 8 2292.
9 Abcc8 NonInfected 0 2576.
10 Abhd14a InfluenzaA 4 547.
# … with 4,412 more rows
Once the data is grouped, you can also summarize multiple variables at the same
time (and not necessarily on the same variable). For instance, we could add a
column indicating the median expression
by gene and by condition:
rna %>%
group_by(gene, infection, time) %>%
summarize(mean_expression = mean(expression),
median_expression = median(expression))
`summarise()` has grouped output by 'gene', 'infection'. You can override using
the `.groups` argument.
# A tibble: 4,422 × 5
# Groups: gene, infection [2,948]
gene infection time mean_expression median_expression
<chr> <chr> <dbl> <dbl> <dbl>
1 Aamp InfluenzaA 4 4870 4708
2 Aamp InfluenzaA 8 4763. 4813
3 Aamp NonInfected 0 4603. 4717
4 Abca12 InfluenzaA 4 4.25 4.5
5 Abca12 InfluenzaA 8 4.14 4
6 Abca12 NonInfected 0 5.29 5
7 Abcc8 InfluenzaA 4 2609. 2424.
8 Abcc8 InfluenzaA 8 2292. 2224
9 Abcc8 NonInfected 0 2576. 2578
10 Abhd14a InfluenzaA 4 547. 523
# … with 4,412 more rows
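The `summarise()` message shown above can be silenced by setting the `.groups` argument of `summarize()` explicitly, for instance to drop all grouping from the result:

```r
## return an ungrouped tibble and suppress the grouping message
rna %>%
    group_by(gene, infection, time) %>%
    summarize(mean_expression = mean(expression),
              median_expression = median(expression),
              .groups = "drop")
```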
Challenge
Calculate the mean expression level of gene “Dok3” by timepoints.
Solution
rna %>% filter(gene == "Dok3") %>% group_by(time) %>% summarize(mean = mean(expression))
# A tibble: 3 × 2 time mean <dbl> <dbl> 1 0 169 2 4 156. 3 8 61
Counting
When working with data, we often want to know the number of observations found
for each factor or combination of factors. For this task, `dplyr` provides
`count()`. For example, if we wanted to count the number of rows of data for
infected and non-infected samples, we would do:
rna %>%
count(infection)
# A tibble: 2 × 2
infection n
<chr> <int>
1 InfluenzaA 22110
2 NonInfected 10318
The `count()` function is shorthand for something we've already seen: grouping by a variable, and summarizing it by counting the number of observations in that group. In other words, `rna %>% count(infection)` is equivalent to:
rna %>%
group_by(infection) %>%
summarise(n = n())
# A tibble: 2 × 2
infection n
<chr> <int>
1 InfluenzaA 22110
2 NonInfected 10318
The previous example shows the use of `count()` to count the number of rows/observations
for one factor (i.e., `infection`).
If we wanted to count combinations of factors, such as `infection` and `time`,
we would specify the first and the second factor as the arguments of `count()`:
rna %>%
count(infection, time)
# A tibble: 3 × 3
infection time n
<chr> <dbl> <int>
1 InfluenzaA 4 11792
2 InfluenzaA 8 10318
3 NonInfected 0 10318
which is equivalent to this:
rna %>%
group_by(infection, time) %>%
summarize(n = n())
`summarise()` has grouped output by 'infection'. You can override using the
`.groups` argument.
# A tibble: 3 × 3
# Groups: infection [2]
infection time n
<chr> <dbl> <int>
1 InfluenzaA 4 11792
2 InfluenzaA 8 10318
3 NonInfected 0 10318
It is sometimes useful to sort the result to facilitate the comparisons.
We can use arrange()
to sort the table.
For instance, we might want to arrange the table above by time:
rna %>%
count(infection, time) %>%
arrange(time)
# A tibble: 3 × 3
infection time n
<chr> <dbl> <int>
1 NonInfected 0 10318
2 InfluenzaA 4 11792
3 InfluenzaA 8 10318
or by counts:
rna %>%
count(infection, time) %>%
arrange(n)
# A tibble: 3 × 3
infection time n
<chr> <dbl> <int>
1 InfluenzaA 8 10318
2 NonInfected 0 10318
3 InfluenzaA 4 11792
To sort in descending order, we need to add the desc()
function:
rna %>%
count(infection, time) %>%
arrange(desc(n))
# A tibble: 3 × 3
infection time n
<chr> <dbl> <int>
1 InfluenzaA 4 11792
2 InfluenzaA 8 10318
3 NonInfected 0 10318
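`arrange()` also accepts several columns, sorting by the first one and breaking ties with the next; for example, sorting by infection status and then by descending counts within each status:

```r
## sort by infection first, then by descending count within each status
rna %>%
    count(infection, time) %>%
    arrange(infection, desc(n))
```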
Challenge
- How many genes were analysed in each sample?
- Use `group_by()` and `summarize()` to evaluate the sequencing depth (the sum of all counts) in each sample. Which sample has the highest sequencing depth?
- Pick one sample and evaluate the number of genes by biotype.
- Identify genes associated with the "abnormal DNA methylation" phenotype description, and calculate their mean expression (in log) at time 0, time 4 and time 8.
Solution
## 1.
rna %>%
      count(sample)
# A tibble: 22 × 2
   sample          n
   <chr>       <int>
 1 GSM2545336   1474
 2 GSM2545337   1474
 3 GSM2545338   1474
 4 GSM2545339   1474
 5 GSM2545340   1474
 6 GSM2545341   1474
 7 GSM2545342   1474
 8 GSM2545343   1474
 9 GSM2545344   1474
10 GSM2545345   1474
# … with 12 more rows
## 2.
rna %>%
      group_by(sample) %>%
      summarize(seq_depth = sum(expression)) %>%
      arrange(desc(seq_depth))
# A tibble: 22 × 2
   sample     seq_depth
   <chr>          <dbl>
 1 GSM2545350   3255566
 2 GSM2545352   3216163
 3 GSM2545343   3105652
 4 GSM2545336   3039671
 5 GSM2545380   3036098
 6 GSM2545353   2953249
 7 GSM2545348   2913678
 8 GSM2545362   2913517
 9 GSM2545351   2782464
10 GSM2545349   2758006
# … with 12 more rows
## 3.
rna %>%
      filter(sample == "GSM2545336") %>%
      count(gene_biotype) %>%
      arrange(desc(n))
# A tibble: 13 × 2
   gene_biotype                           n
   <chr>                              <int>
 1 protein_coding                      1321
 2 lncRNA                                69
 3 processed_pseudogene                  59
 4 miRNA                                  7
 5 snoRNA                                 5
 6 TEC                                    4
 7 polymorphic_pseudogene                 2
 8 unprocessed_pseudogene                 2
 9 IG_C_gene                              1
10 scaRNA                                 1
11 transcribed_processed_pseudogene       1
12 transcribed_unitary_pseudogene         1
13 transcribed_unprocessed_pseudogene     1
## 4.
rna %>%
      filter(phenotype_description == "abnormal DNA methylation") %>%
      group_by(gene, time) %>%
      summarize(mean_expression = mean(log(expression)))
`summarise()` has grouped output by 'gene'. You can override using the `.groups` argument.
# A tibble: 6 × 3
# Groups:   gene [2]
  gene   time mean_expression
  <chr> <dbl>           <dbl>
1 Xist      0            6.95
2 Xist      4            6.34
3 Xist      8            7.13
4 Zdbf2     0            6.27
5 Zdbf2     4            6.27
6 Zdbf2     8            6.19
Reshaping data
In the rna
tibble, the rows contain expression values (the unit) that are
associated with a combination of 2 other variables: gene
and sample
.
All the other columns correspond to variables describing either the sample (organism, age, sex,…) or the gene (gene_biotype, ENTREZ_ID, product…). The variables that don’t change with genes or with samples will have the same value in all the rows.
rna %>%
arrange(gene)
# A tibble: 32,428 × 19
gene sample expression organism age sex infection strain time tissue
<chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 Aamp GSM25453… 5621 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
2 Aamp GSM25453… 4049 Mus mus… 8 Fema… NonInfec… C57BL… 0 Cereb…
3 Aamp GSM25453… 3797 Mus mus… 8 Fema… NonInfec… C57BL… 0 Cereb…
4 Aamp GSM25453… 4375 Mus mus… 8 Fema… Influenz… C57BL… 4 Cereb…
5 Aamp GSM25453… 4095 Mus mus… 8 Male Influenz… C57BL… 4 Cereb…
6 Aamp GSM25453… 3867 Mus mus… 8 Male Influenz… C57BL… 8 Cereb…
7 Aamp GSM25453… 3578 Mus mus… 8 Fema… Influenz… C57BL… 8 Cereb…
8 Aamp GSM25453… 5097 Mus mus… 8 Male NonInfec… C57BL… 0 Cereb…
9 Aamp GSM25453… 4202 Mus mus… 8 Fema… Influenz… C57BL… 4 Cereb…
10 Aamp GSM25453… 4701 Mus mus… 8 Male Influenz… C57BL… 4 Cereb…
# … with 32,418 more rows, and 9 more variables: mouse <dbl>, ENTREZID <dbl>,
# product <chr>, ensembl_gene_id <chr>, external_synonym <chr>,
# chromosome_name <chr>, gene_biotype <chr>, phenotype_description <chr>,
# hsapiens_homolog_associated_gene_name <chr>
This structure is called a long-format
, as one column contains all the values,
and other column(s) list(s) the context of the value.
In certain cases, the long-format
is not really “human-readable”, and another format,
a wide-format
is preferred, as a more compact way of representing the data.
This is typically the case with gene expression values that scientists are used to
viewing as matrices, where rows represent genes and columns represent samples.
In this format, it therefore becomes straightforward to explore the relationship between the gene expression levels within, and between, the samples.
# A tibble: 1,474 × 23
gene GSM2545336 GSM2545337 GSM2545338 GSM2545339 GSM2545340 GSM2545341
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Asl 1170 361 400 586 626 988
2 Apod 36194 10347 9173 10620 13021 29594
3 Cyp2d22 4060 1616 1603 1901 2171 3349
4 Klk6 287 629 641 578 448 195
5 Fcrls 85 233 244 237 180 38
6 Slc2a4 782 231 248 265 313 786
7 Exd2 1619 2288 2235 2513 2366 1359
8 Gjc2 288 595 568 551 310 146
9 Plp1 43217 101241 96534 58354 53126 27173
10 Gnb4 1071 1791 1867 1430 1355 798
# … with 1,464 more rows, and 16 more variables: GSM2545342 <dbl>,
# GSM2545343 <dbl>, GSM2545344 <dbl>, GSM2545345 <dbl>, GSM2545346 <dbl>,
# GSM2545347 <dbl>, GSM2545348 <dbl>, GSM2545349 <dbl>, GSM2545350 <dbl>,
# GSM2545351 <dbl>, GSM2545352 <dbl>, GSM2545353 <dbl>, GSM2545354 <dbl>,
# GSM2545362 <dbl>, GSM2545363 <dbl>, GSM2545380 <dbl>
To convert the gene expression values from rna
into a wide-format,
we need to create a new table where the values of the sample
column would
become the names of column variables.
The key point here is that we are still following a tidy data structure, but we have reshaped the data according to the observations of interest: expression levels per gene instead of recording them per gene and per sample.
The opposite transformation would be to transform column names into values of a new variable.
We can do both of these transformations with two tidyr
functions,
pivot_longer()
and pivot_wider()
(see
here for
details).
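Before applying these functions to the full `rna` table, here is a minimal sketch of the round trip on a small made-up tibble (the `toy`, `toy_wide` and `toy_long` names are ours, for illustration only):

```r
library(tidyverse)

# A tiny long-format tibble: one row per gene/sample combination
toy <- tribble(
  ~gene, ~sample, ~expression,
  "A",   "s1",    1,
  "A",   "s2",    2,
  "B",   "s1",    3,
  "B",   "s2",    4
)

# Long to wide: the values of `sample` become column names,
# producing one row per gene with columns s1 and s2
toy_wide <- toy %>%
  pivot_wider(names_from = sample, values_from = expression)

# Wide back to long: column names become values of a new `sample` variable
toy_long <- toy_wide %>%
  pivot_longer(names_to = "sample", values_to = "expression", -gene)
```

The same pattern, with `sample` and `expression` replaced by the relevant columns, is what we apply to `rna` below.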
Pivoting the data into a wider format
Let’s first select the 3 first columns of rna
and use pivot_wider()
to transform data in a wide-format.
rna_exp <- rna %>%
select(gene, sample, expression)
rna_exp
# A tibble: 32,428 × 3
gene sample expression
<chr> <chr> <dbl>
1 Asl GSM2545336 1170
2 Apod GSM2545336 36194
3 Cyp2d22 GSM2545336 4060
4 Klk6 GSM2545336 287
5 Fcrls GSM2545336 85
6 Slc2a4 GSM2545336 782
7 Exd2 GSM2545336 1619
8 Gjc2 GSM2545336 288
9 Plp1 GSM2545336 43217
10 Gnb4 GSM2545336 1071
# … with 32,418 more rows
`pivot_wider` takes three main arguments:

- the data to be transformed;
- `names_from`: the column whose values will become new column names;
- `values_from`: the column whose values will fill the new columns.
rna_wide <- rna_exp %>%
pivot_wider(names_from = sample,
values_from = expression)
rna_wide
# A tibble: 1,474 × 23
gene GSM2545336 GSM2545337 GSM2545338 GSM2545339 GSM2545340 GSM2545341
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Asl 1170 361 400 586 626 988
2 Apod 36194 10347 9173 10620 13021 29594
3 Cyp2d22 4060 1616 1603 1901 2171 3349
4 Klk6 287 629 641 578 448 195
5 Fcrls 85 233 244 237 180 38
6 Slc2a4 782 231 248 265 313 786
7 Exd2 1619 2288 2235 2513 2366 1359
8 Gjc2 288 595 568 551 310 146
9 Plp1 43217 101241 96534 58354 53126 27173
10 Gnb4 1071 1791 1867 1430 1355 798
# … with 1,464 more rows, and 16 more variables: GSM2545342 <dbl>,
# GSM2545343 <dbl>, GSM2545344 <dbl>, GSM2545345 <dbl>, GSM2545346 <dbl>,
# GSM2545347 <dbl>, GSM2545348 <dbl>, GSM2545349 <dbl>, GSM2545350 <dbl>,
# GSM2545351 <dbl>, GSM2545352 <dbl>, GSM2545353 <dbl>, GSM2545354 <dbl>,
# GSM2545362 <dbl>, GSM2545363 <dbl>, GSM2545380 <dbl>
Pivoting data into a longer format
In the opposite situation we are using the column names and turning them into a pair of new variables. One variable represents the column names as values, and the other variable contains the values previously associated with the column names.
`pivot_longer()` takes four main arguments:

- the data to be transformed;
- `names_to`: the new column name we wish to create and populate with the current column names;
- `values_to`: the new column name we wish to create and populate with current values;
- the names of the columns to be used to populate the `names_to` and `values_to` variables (or to drop).
To recreate `rna_long` from `rna_wide` we would create a key called `sample` and a value called `expression`, and use all columns except `gene` for the key variable. Here we drop the `gene` column with a minus sign.
Notice that the new variable names need to be quoted here.
rna_long <- rna_wide %>%
pivot_longer(names_to = "sample",
values_to = "expression",
-gene)
rna_long
# A tibble: 32,428 × 3
gene sample expression
<chr> <chr> <dbl>
1 Asl GSM2545336 1170
2 Asl GSM2545337 361
3 Asl GSM2545338 400
4 Asl GSM2545339 586
5 Asl GSM2545340 626
6 Asl GSM2545341 988
7 Asl GSM2545342 836
8 Asl GSM2545343 535
9 Asl GSM2545344 586
10 Asl GSM2545345 597
# … with 32,418 more rows
We could also have used a specification for what columns to
include. This can be useful if you have a large number of identifying
columns, and it’s easier to specify what to gather than what to leave
alone. Here the starts_with()
function can help to retrieve sample
names without having to list them all!
Another possibility would be to use the :
operator!
rna_wide %>%
pivot_longer(names_to = "sample",
values_to = "expression",
cols = starts_with("GSM"))
# A tibble: 32,428 × 3
gene sample expression
<chr> <chr> <dbl>
1 Asl GSM2545336 1170
2 Asl GSM2545337 361
3 Asl GSM2545338 400
4 Asl GSM2545339 586
5 Asl GSM2545340 626
6 Asl GSM2545341 988
7 Asl GSM2545342 836
8 Asl GSM2545343 535
9 Asl GSM2545344 586
10 Asl GSM2545345 597
# … with 32,418 more rows
rna_wide %>%
pivot_longer(names_to = "sample",
values_to = "expression",
GSM2545336:GSM2545380)
# A tibble: 32,428 × 3
gene sample expression
<chr> <chr> <dbl>
1 Asl GSM2545336 1170
2 Asl GSM2545337 361
3 Asl GSM2545338 400
4 Asl GSM2545339 586
5 Asl GSM2545340 626
6 Asl GSM2545341 988
7 Asl GSM2545342 836
8 Asl GSM2545343 535
9 Asl GSM2545344 586
10 Asl GSM2545345 597
# … with 32,418 more rows
Challenge
Subset genes located on X and Y chromosomes from the `rna` data frame and spread the data frame with `sex` as columns, `chromosome_name` as rows, and the mean expression of genes located in each chromosome as the values, as in the following tibble:

You will need to summarize before reshaping!
Let’s first calculate the mean expression level of X and Y linked genes from male and female samples…
rna %>%
    filter(chromosome_name == "Y" | chromosome_name == "X") %>%
    group_by(sex, chromosome_name) %>%
    summarize(mean = mean(expression))
`summarise()` has grouped output by 'sex'. You can override using the `.groups` argument.
# A tibble: 4 × 3
# Groups:   sex [2]
  sex    chromosome_name  mean
  <chr>  <chr>           <dbl>
1 Female X               3504.
2 Female Y                  3
3 Male   X               2497.
4 Male   Y               2117.
And pivot the table to wide format
rna_1 <- rna %>%
    filter(chromosome_name == "Y" | chromosome_name == "X") %>%
    group_by(sex, chromosome_name) %>%
    summarize(mean = mean(expression)) %>%
    pivot_wider(names_from = sex, values_from = mean)
`summarise()` has grouped output by 'sex'. You can override using the `.groups` argument.
rna_1
# A tibble: 2 × 3
  chromosome_name Female  Male
  <chr>            <dbl> <dbl>
1 X                3504. 2497.
2 Y                   3  2117.
Now take that data frame and transform it with `pivot_longer()` so each row is a unique `chromosome_name` by `gender` combination.

Solution
rna_1 %>%
    pivot_longer(names_to = "gender",
                 values_to = "mean",
                 -chromosome_name)
# A tibble: 4 × 3
  chromosome_name gender  mean
  <chr>           <chr>  <dbl>
1 X               Female 3504.
2 X               Male   2497.
3 Y               Female    3
4 Y               Male   2117.
Homework Challenge
- Use the `rna` dataset to create an expression table where each row represents the mean expression levels of genes and columns represent the different timepoints.
- Use the previous table containing mean expression levels per timepoint and create a new column containing fold-changes between timepoint 8 and timepoint 0, and fold-changes between timepoint 8 and timepoint 4. Convert this table into a long-format table gathering the fold-changes calculated.
Solution
Let’s first calculate the mean expression by gene and by time
rna %>%
group_by(gene, time) %>%
summarize(mean_exp = mean(expression))
`summarise()` has grouped output by 'gene'. You can override using the
`.groups` argument.
# A tibble: 4,422 × 3
# Groups: gene [1,474]
gene time mean_exp
<chr> <dbl> <dbl>
1 Aamp 0 4603.
2 Aamp 4 4870
3 Aamp 8 4763.
4 Abca12 0 5.29
5 Abca12 4 4.25
6 Abca12 8 4.14
7 Abcc8 0 2576.
8 Abcc8 4 2609.
9 Abcc8 8 2292.
10 Abhd14a 0 591.
# … with 4,412 more rows
before using the pivot_wider() function
rna_time <- rna %>%
group_by(gene, time) %>%
summarize(mean_exp = mean(expression)) %>%
pivot_wider(names_from = time,
values_from = mean_exp)
`summarise()` has grouped output by 'gene'. You can override using the
`.groups` argument.
rna_time
# A tibble: 1,474 × 4
# Groups: gene [1,474]
gene `0` `4` `8`
<chr> <dbl> <dbl> <dbl>
1 Aamp 4603. 4870 4763.
2 Abca12 5.29 4.25 4.14
3 Abcc8 2576. 2609. 2292.
4 Abhd14a 591. 547. 432.
5 Abi2 4881. 4903. 4945.
6 Abi3bp 1175. 1061. 762.
7 Abl2 2170. 2078. 2131.
8 Acadl 2059. 2099 1995.
9 Acap3 3745 3446. 3431.
10 Acbd4 1219. 1410. 1668.
# … with 1,464 more rows
Notice that this generates a tibble with some column names starting with a number. If we wanted to select the column corresponding to a timepoint, we could not use the column names directly… What happens when we select column 4?
rna %>%
group_by(gene, time) %>%
summarize(mean_exp = mean(expression)) %>%
pivot_wider(names_from = time,
values_from = mean_exp) %>%
select(gene, 4)
`summarise()` has grouped output by 'gene'. You can override using the
`.groups` argument.
# A tibble: 1,474 × 2
# Groups: gene [1,474]
gene `8`
<chr> <dbl>
1 Aamp 4763.
2 Abca12 4.14
3 Abcc8 2292.
4 Abhd14a 432.
5 Abi2 4945.
6 Abi3bp 762.
7 Abl2 2131.
8 Acadl 1995.
9 Acap3 3431.
10 Acbd4 1668.
# … with 1,464 more rows
To select the timepoint 4, we would have to quote the column name with backticks:
rna %>%
group_by(gene, time) %>%
summarize(mean_exp = mean(expression)) %>%
pivot_wider(names_from = time,
values_from = mean_exp) %>%
select(gene, `4`)
`summarise()` has grouped output by 'gene'. You can override using the
`.groups` argument.
# A tibble: 1,474 × 2
# Groups: gene [1,474]
gene `4`
<chr> <dbl>
1 Aamp 4870
2 Abca12 4.25
3 Abcc8 2609.
4 Abhd14a 547.
5 Abi2 4903.
6 Abi3bp 1061.
7 Abl2 2078.
8 Acadl 2099
9 Acap3 3446.
10 Acbd4 1410.
# … with 1,464 more rows
Another possibility would be to rename the columns, choosing names that don’t start with a number:
rna_time <- rna %>%
group_by(gene, time) %>%
summarize(mean_exp = mean(expression)) %>%
pivot_wider(names_from = time,
values_from = mean_exp) %>%
rename("time0" = `0`, "time4" = `4`, "time8" = `8`)
`summarise()` has grouped output by 'gene'. You can override using the
`.groups` argument.
Calculate the fold changes:
rna_time %>%
mutate(time_8_vs_0 = time8 / time0, time_8_vs_4 = time8 / time4)
# A tibble: 1,474 × 6
# Groups: gene [1,474]
gene time0 time4 time8 time_8_vs_0 time_8_vs_4
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Aamp 4603. 4870 4763. 1.03 0.978
2 Abca12 5.29 4.25 4.14 0.784 0.975
3 Abcc8 2576. 2609. 2292. 0.889 0.878
4 Abhd14a 591. 547. 432. 0.731 0.791
5 Abi2 4881. 4903. 4945. 1.01 1.01
6 Abi3bp 1175. 1061. 762. 0.649 0.719
7 Abl2 2170. 2078. 2131. 0.982 1.03
8 Acadl 2059. 2099 1995. 0.969 0.950
9 Acap3 3745 3446. 3431. 0.916 0.996
10 Acbd4 1219. 1410. 1668. 1.37 1.18
# … with 1,464 more rows
And use the pivot_longer() function:
rna_time %>%
mutate(time_8_vs_0 = time8 / time0, time_8_vs_4 = time8 / time4) %>%
pivot_longer(names_to = "comparisons",
values_to = "Fold_changes",
time_8_vs_0:time_8_vs_4)
# A tibble: 2,948 × 6
# Groups: gene [1,474]
gene time0 time4 time8 comparisons Fold_changes
<chr> <dbl> <dbl> <dbl> <chr> <dbl>
1 Aamp 4603. 4870 4763. time_8_vs_0 1.03
2 Aamp 4603. 4870 4763. time_8_vs_4 0.978
3 Abca12 5.29 4.25 4.14 time_8_vs_0 0.784
4 Abca12 5.29 4.25 4.14 time_8_vs_4 0.975
5 Abcc8 2576. 2609. 2292. time_8_vs_0 0.889
6 Abcc8 2576. 2609. 2292. time_8_vs_4 0.878
7 Abhd14a 591. 547. 432. time_8_vs_0 0.731
8 Abhd14a 591. 547. 432. time_8_vs_4 0.791
9 Abi2 4881. 4903. 4945. time_8_vs_0 1.01
10 Abi2 4881. 4903. 4945. time_8_vs_4 1.01
# … with 2,938 more rows
Exporting data
Now that you have learned how to use dplyr
to extract information from
or summarize your raw data, you may want to export these new data sets to share
them with your collaborators or for archival purposes.
Similar to the read_csv()
function used for reading CSV files into R, there is
a write_csv()
function that generates CSV files from data frames.
Before using write_csv()
, we are going to create a new folder, data_output
,
in our working directory (if we haven’t already created it) that will store this generated dataset. We don’t want
to write generated datasets in the same directory as our raw data.
It’s good practice to keep them separate. The data
folder should only contain
the raw, unaltered data, and should be left alone to make sure we don’t delete
or modify it. In contrast, our script will generate the contents of the data_output
directory, so even if the files it contains are deleted, we can always
re-generate them.
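The folder can also be created from within R itself; this is a small convenience sketch (assuming the working directory is the project root), and you can equally create the folder in your file browser:

```r
# Create the data_output folder if it does not already exist;
# dir.exists() avoids a warning on repeated runs
if (!dir.exists("data_output")) {
    dir.create("data_output")
}
```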
Let’s use write_csv()
to save the rna_wide table that we have created previously.
write_csv(rna_wide, file = "data_output/rna_wide.csv")
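As an optional sanity check, an exported file can be read back with `read_csv()` to confirm the round trip. Here is a minimal, self-contained sketch using a temporary file and a made-up two-row table (the `tmp` and `check` names are ours, for illustration only):

```r
library(readr)

# Write a small made-up table to a temporary CSV file, then read it back
tmp <- tempfile(fileext = ".csv")
write_csv(data.frame(gene = c("A", "B"), GSM1 = c(1, 2)), file = tmp)
check <- read_csv(tmp, show_col_types = FALSE)

# The round trip preserves the number of rows and columns
nrow(check)
ncol(check)
```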
Key Points
Tabular data in R using the tidyverse meta-package
Data visualization
Overview
Teaching: XX min
Exercises: XX minQuestions
Visualization in R
Objectives
Produce scatter plots, boxplots, line plots, etc. using ggplot.
Set universal plot settings.
Describe what faceting is and apply faceting in ggplot.
Modify the aesthetics of an existing ggplot plot (including axis labels and color).
Build complex and customized plots from data in a data frame.
Data Visualization
We start by loading the required packages. ggplot2
is included
in the tidyverse
package.
library("tidyverse")
If not still in the workspace, load the data we saved in the previous lesson.
rna <- read_csv("course-data/data/GSE96870/rnaseq.csv")
The Data Visualization Cheat Sheet will cover the basics and more
advanced features of ggplot2
and, in addition to serving as a reminder, will help you get an
overview of the many data representations available in the
package. The following
video tutorials (part 1
and 2) by Thomas Lin
Pedersen are also very instructive.
Plotting with ggplot2
ggplot2
is a plotting package that makes it simple to create
complex plots from data in a data frame. It provides a more
programmatic interface for specifying what variables to plot, how they
are displayed, and general visual properties. The theoretical
foundation that supports ggplot2
is the Grammar of Graphics
(@Wilkinson:2005). Using this approach, we only need minimal changes
if the underlying data change or if we decide to change from a bar
plot to a scatterplot. This helps in creating publication quality
plots with minimal amounts of adjustments and tweaking.
There is a book about ggplot2
(@ggplot2book) that provides a good
overview, but it is outdated. The 3rd edition is in preparation and
will be freely available online. The
ggplot2
webpage
(https://ggplot2.tidyverse.org)
provides ample documentation.
ggplot2
functions like data in the ‘long’ format, i.e., a column for
every dimension, and a row for every observation. Well-structured data
will save you lots of time when making figures with ggplot2
.
ggplot graphics are built step by step by adding new elements. Adding layers in this fashion allows for extensive flexibility and customization of plots.
The idea behind the Grammar of Graphics is that you can build every graph from the same 3 components: (1) a data set, (2) a coordinate system, and (3) geoms, i.e. visual marks that represent data points 1.
To build a ggplot, we will use the following basic template that can be used for different types of plots:
ggplot(data = <DATA>, mapping = aes(<MAPPINGS>)) + <GEOM_FUNCTION>()
- use the
ggplot()
function and bind the plot to a specific data frame using thedata
argument
ggplot(data = rna)
- define a mapping (using the aesthetic (
aes
) function), by selecting the variables to be plotted and specifying how to present them in the graph, e.g. as x/y positions or characteristics such as size, shape, color, etc.
ggplot(data = rna, mapping = aes(x = expression))
-
add ‘geoms’ – geometries, or graphical representations of the data in the plot (points, lines, bars).
ggplot2
offers many different geoms; we will use some common ones today, including:

- `geom_point()` for scatter plots, dot plots, etc.
- `geom_histogram()` for histograms
- `geom_boxplot()` for, well, boxplots!
- `geom_line()` for trend lines, time series, etc.
To add a geom(etry) to the plot use the +
operator. Let’s use geom_histogram()
first:
ggplot(data = rna, mapping = aes(x = expression)) +
geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The +
in the ggplot2
package is particularly useful because it
allows you to modify existing ggplot
objects. This means you can
easily set up plot templates and conveniently explore different types
of plots, so the above plot can also be generated with code like this:
# Assign plot to a variable
rna_plot <- ggplot(data = rna,
mapping = aes(x = expression))
# Draw the plot
rna_plot + geom_histogram()
Homework Challenge
You have probably noticed an automatic message that appears when drawing the histogram:
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Change the arguments
bins
orbinwidth
ofgeom_histogram()
to change the number or width of the bins.Solution
# change bins
ggplot(rna, aes(x = expression)) +
    geom_histogram(bins = 15)
# change binwidth
ggplot(rna, aes(x = expression)) +
    geom_histogram(binwidth = 2000)
We can observe here that the data are skewed to the right. We can
apply a log2 transformation to obtain a more symmetric distribution. Note
that we add here a small constant value (+1
) to avoid having -Inf
values returned for expression values equal to 0.
rna <- rna %>%
mutate(expression_log = log2(expression + 1))
If we now draw the histogram of the log2-transformed expressions, the distribution is indeed closer to a normal distribution.
ggplot(rna, aes(x = expression_log)) + geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
From now on we will work on the log-transformed expression values.
Homework Challenge
Another way to visualize this transformation is to consider the scale of the observations. For example, it may be worth changing the scale of the axis to better distribute the observations in the space of the plot. Changing the scale of the axes is done similarly to adding/modifying other components (i.e., by incrementally adding commands). Try making this modification:
- Represent the un-transformed expression on the log10 scale; see
scale_x_log10()
. Compare it with the previous graph. Why do you now have warning messages appearing?Solution
ggplot(data = rna, mapping = aes(x = expression)) +
    geom_histogram() +
    scale_x_log10()
Warning: Transformation introduced infinite values in continuous x-axis
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 507 rows containing non-finite values (stat_bin).
Notes
- Anything you put in the
ggplot()
function can be seen by any geom layers that you add (i.e., these are global plot settings). This includes the x- and y-axis mapping you set up inaes()
. - You can also specify mappings for a given geom independently of the
mappings defined globally in the
ggplot()
function. - The
+
sign used to add new layers must be placed at the end of the line containing the previous layer. If, instead, the+
sign is added at the beginning of the line containing the new layer,ggplot2
will not add the new layer and will return an error message.
# This is the correct syntax for adding layers
rna_plot +
geom_histogram()
# This will not add the new layer and will return an error message
rna_plot
+ geom_histogram()
Building your plots iteratively
We will now draw a scatter plot with two continuous variables and the geom_point()
function.
This graph will represent the log2 fold changes of expression comparing time 8 versus time 0, and time 4 versus time 0. To this end, we first need to compute the means of the log-transformed expression values by gene and time, and then the log fold changes by subtracting the mean log expressions between time 8 and time 0 and between time 4 and time 0. Note that we also include here the gene biotype that we will use later on to represent the genes. We will save the fold changes in a new data frame called rna_fc.
rna_fc <- rna %>% select(gene, time,
gene_biotype, expression_log) %>%
group_by(gene, time, gene_biotype) %>%
summarize(mean_exp = mean(expression_log)) %>%
pivot_wider(names_from = time,
values_from = mean_exp) %>%
mutate(time_8_vs_0 = `8` - `0`, time_4_vs_0 = `4` - `0`)
`summarise()` has grouped output by 'gene', 'time'. You can override using the
`.groups` argument.
We can then build a ggplot with the newly created dataset rna_fc
. Building plots with ggplot2
is typically an iterative process. We start by
defining the dataset we’ll use, lay out the axes, and choose a geom:
ggplot(data = rna_fc, mapping = aes(x = time_4_vs_0, y = time_8_vs_0)) +
geom_point()
Then, we start modifying this plot to extract more information from it. For
instance, we can add transparency (alpha
) to avoid overplotting:
ggplot(data = rna_fc, mapping = aes(x = time_4_vs_0, y = time_8_vs_0)) +
geom_point(alpha = 0.3)
We can also add colors for all the points:
ggplot(data = rna_fc, mapping = aes(x = time_4_vs_0, y = time_8_vs_0)) +
geom_point(alpha = 0.3, color = "blue")
Or to color each gene in the plot differently, you could use a
vector as an input to the argument color. ggplot2
will
provide a different color corresponding to different values in the
vector. Here is an example where we color with gene_biotype
:
ggplot(data = rna_fc, mapping = aes(x = time_4_vs_0, y = time_8_vs_0)) +
geom_point(alpha = 0.3, aes(color = gene_biotype))
We can also specify the colors directly inside the mapping provided in
the ggplot()
function. This will be seen by any geom layers and the
mapping will be determined by the x- and y-axis set up in aes()
.
ggplot(data = rna_fc, mapping = aes(x = time_4_vs_0, y = time_8_vs_0,
color = gene_biotype)) +
geom_point(alpha = 0.3)
Finally, we could also add a diagonal line with the geom_abline()
function:
ggplot(data = rna_fc, mapping = aes(x = time_4_vs_0, y = time_8_vs_0,
color = gene_biotype)) +
geom_point(alpha = 0.3) +
geom_abline(intercept = 0)
Notice that we can change the geom layer from geom_point
to geom_jitter
and colors will be still determined by gene_biotype
.
ggplot(data = rna_fc, mapping = aes(x = time_4_vs_0, y = time_8_vs_0,
color = gene_biotype)) +
geom_jitter(alpha = 0.3) +
geom_abline(intercept = 0)
Challenge
Use what you just learned to create a scatter plot of
expression_log
oversample
from therna
dataset with the time showing in different colors. Is this a good way to show this type of data?Solution
ggplot(data = rna,
       mapping = aes(y = expression_log, x = sample)) +
    geom_point(aes(color = time))
Boxplot
We can use boxplots to visualize the distribution of gene expressions within each sample:
ggplot(data = rna,
mapping = aes(y = expression_log, x = sample)) +
geom_boxplot()
By adding points to the boxplot, we can get a better idea of the number of measurements and of their distribution:
ggplot(data = rna,
mapping = aes(y = expression_log, x = sample)) +
geom_jitter(alpha = 0.2, color = "tomato") +
geom_boxplot(alpha = 0)
Challenge
Note how the boxplot layer is in front of the jitter layer? What do you need to change in the code to put the boxplot below the points?
Solution
We should switch the order of these two geoms:
ggplot(data = rna,
       mapping = aes(y = expression_log, x = sample)) +
    geom_boxplot(alpha = 0) +
    geom_jitter(alpha = 0.2, color = "tomato")
You may notice that the values on the x-axis are still not properly readable. Let’s change the orientation of the labels and adjust them vertically and horizontally so they don’t overlap. You can use a 90-degree angle, or experiment to find the appropriate angle for diagonally oriented labels:
ggplot(data = rna,
mapping = aes(y = expression_log, x = sample)) +
geom_jitter(alpha = 0.2, color = "tomato") +
geom_boxplot(alpha = 0) +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))
Challenge
Add color to the data points on your boxplot according to the duration of the infection (
time
).Hint: Check the class for
time
. Consider changing the class oftime
from integer to factor directly in the ggplot mapping. Why does this change how R makes the graph?Solution
# time as integer
ggplot(data = rna,
       mapping = aes(y = expression_log, x = sample)) +
    geom_jitter(alpha = 0.2, aes(color = time)) +
    geom_boxplot(alpha = 0) +
    theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))
# time as factor
ggplot(data = rna,
       mapping = aes(y = expression_log, x = sample)) +
    geom_jitter(alpha = 0.2, aes(color = as.factor(time))) +
    geom_boxplot(alpha = 0) +
    theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))
Homework Challenge
Boxplots are useful summaries, but hide the shape of the distribution. For example, if the distribution is bimodal, we would not see it in a boxplot. An alternative to the boxplot is the violin plot, where the shape (of the density of points) is drawn.
- Replace the box plot with a violin plot; see
geom_violin()
.- Fill in the violins according to the time with the argument
fill
.- Modify the violin plot to fill in the violins by
sex
.Solution
ggplot(data = rna,
       mapping = aes(y = expression_log, x = sample)) +
    geom_violin(aes(fill = as.factor(time))) +
    theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))

ggplot(data = rna,
       mapping = aes(y = expression_log, x = sample)) +
    geom_violin(aes(fill = sex)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))
Line plots
Let’s calculate the mean expression per duration of the infection for the 10 genes having the highest log fold changes comparing time 8 versus time 0.
First, we need to select the genes and create a subset of rna
called sub_rna
containing the 10 selected genes, then we need to group the data and calculate the mean gene expression within each group:
rna_fc <- rna_fc %>% arrange(desc(time_8_vs_0))
genes_selected <- rna_fc$gene[1:10]
sub_rna <- rna %>%
filter(gene %in% genes_selected)
mean_exp_by_time <- sub_rna %>%
group_by(gene,time) %>%
summarize(mean_exp = mean(expression_log))
`summarise()` has grouped output by 'gene'. You can override using the
`.groups` argument.
We can build the line plot with duration of the infection on the x-axis and the mean expression on the y-axis:
ggplot(data = mean_exp_by_time, mapping = aes(x = time, y = mean_exp)) +
geom_line()
Unfortunately, this does not work because we plotted data for all the genes
together. We need to tell ggplot to draw a line for each gene by modifying
the aesthetic function to include group = gene
:
ggplot(data = mean_exp_by_time,
mapping = aes(x = time, y = mean_exp, group = gene)) +
geom_line()
We will be able to distinguish genes in the plot if we add colors
(using color
also automatically groups the data):
ggplot(data = mean_exp_by_time,
mapping = aes(x = time, y = mean_exp, color = gene)) +
geom_line()
Faceting
ggplot2
has a special technique called faceting that allows
the user to split one plot into multiple (sub) plots based on a factor
included in the dataset. These different subplots inherit the same
properties (axes limits, ticks, …) to facilitate their direct
comparison. We will use it to make a line plot across time for each
gene:
ggplot(data = mean_exp_by_time,
mapping = aes(x = time, y = mean_exp)) + geom_line() +
facet_wrap(~ gene)
Here both the x- and y-axis have the same scale for all the subplots. You can change this default behavior by modifying scales
in order to allow a free scale for the y-axis:
ggplot(data = mean_exp_by_time,
mapping = aes(x = time, y = mean_exp)) +
geom_line() +
facet_wrap(~ gene, scales = "free_y")
Now we would like to split the line in each plot by the sex of the mice. To do that we need to calculate the mean expression in the data frame grouped by gene
, time
, and sex
:
mean_exp_by_time_sex <- sub_rna %>%
group_by(gene, time, sex) %>%
summarize(mean_exp = mean(expression_log))
`summarise()` has grouped output by 'gene', 'time'. You can override using the
`.groups` argument.
We can now make the faceted plot by splitting further by sex using color
(within a single plot):
ggplot(data = mean_exp_by_time_sex,
mapping = aes(x = time, y = mean_exp, color = sex)) +
geom_line() +
facet_wrap(~ gene, scales = "free_y")
Usually plots with a white background look more readable when printed. We can set
the background to white using the function theme_bw()
. Additionally, we can remove
the grid:
ggplot(data = mean_exp_by_time_sex,
mapping = aes(x = time, y = mean_exp, color = sex)) +
geom_line() +
facet_wrap(~ gene, scales = "free_y") +
theme_bw() +
theme(panel.grid = element_blank())
Homework Challenge
Use what you just learned to create a plot that depicts how the average expression of each chromosome changes through the duration of infection.
Solution
mean_exp_by_chromosome <- rna %>%
    group_by(chromosome_name, time) %>%
    summarize(mean_exp = mean(expression_log))

`summarise()` has grouped output by 'chromosome_name'. You can override using
the `.groups` argument.

ggplot(data = mean_exp_by_chromosome,
       mapping = aes(x = time, y = mean_exp)) +
    geom_line() +
    facet_wrap(~ chromosome_name, scales = "free_y")
The facet_wrap
geometry extracts plots into an arbitrary number of dimensions
to allow them to cleanly fit on one page. On the other hand, the facet_grid
geometry allows you to explicitly specify how you want your plots to be
arranged via formula notation (rows ~ columns
; a .
can be used as
a placeholder that indicates only one row or column).
Let’s modify the previous plot to compare how the mean gene expression of males and females has changed through time:
# One column, facet by rows
ggplot(data = mean_exp_by_time_sex,
mapping = aes(x = time, y = mean_exp, color = gene)) +
geom_line() +
facet_grid(sex ~ .)
# One row, facet by column
ggplot(data = mean_exp_by_time_sex,
mapping = aes(x = time, y = mean_exp, color = gene)) +
geom_line() +
facet_grid(. ~ sex)
ggplot2
themes
In addition to theme_bw()
, which changes the plot background to white, ggplot2
comes with several other themes which can be useful to quickly change the look
of your visualization. The complete list of themes is available
at http://docs.ggplot2.org/current/ggtheme.html. theme_minimal()
and
theme_light()
are popular, and theme_void()
can be useful as a starting
point to create a new hand-crafted theme.
The
ggthemes package
provides a wide variety of options (including an Excel 2003 theme).
The ggplot2
extensions website provides a list
of packages that extend the capabilities of ggplot2
, including additional
themes.
Customisation
Let’s come back to the faceted plot of mean expression by time and gene, colored by sex.
Take a look at the ggplot2
cheat
sheet,
and think of ways you could improve the plot.
Now, we can change names of axes to something more informative than ‘time’ and ‘mean_exp’, and add a title to the figure:
ggplot(data = mean_exp_by_time_sex,
mapping = aes(x = time, y = mean_exp, color = sex)) +
geom_line() +
facet_wrap(~ gene, scales = "free_y") +
theme_bw() +
theme(panel.grid = element_blank()) +
labs(title = "Mean gene expression by duration of the infection",
x = "Duration of the infection (in days)",
y = "Mean gene expression")
The axes have more informative names, but their readability can be improved by increasing the font size:
ggplot(data = mean_exp_by_time_sex,
mapping = aes(x = time, y = mean_exp, color = sex)) +
geom_line() +
facet_wrap(~ gene, scales = "free_y") +
theme_bw() +
theme(panel.grid = element_blank()) +
labs(title = "Mean gene expression by duration of the infection",
x = "Duration of the infection (in days)",
y = "Mean gene expression") +
theme(text = element_text(size = 16))
Note that it is also possible to change the fonts of your plots. If you are on Windows, you may have to install the extrafont package, and follow the instructions included in its README.
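For the font families that R already knows about ("serif", "sans", "mono"), no extra package is needed. As a minimal sketch, reusing the plot built above:

```r
## Switch the whole plot to a serif font family
## (no extrafont needed for the built-in families)
ggplot(data = mean_exp_by_time_sex,
       mapping = aes(x = time, y = mean_exp, color = sex)) +
  geom_line() +
  facet_wrap(~ gene, scales = "free_y") +
  theme_bw() +
  theme(text = element_text(size = 16, family = "serif"))
```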
We can further customize the color of the x- and y-axis text, the color of the grid, and so on. We can also, for example, move the legend to the top by setting legend.position to "top".
ggplot(data = mean_exp_by_time_sex,
mapping = aes(x = time, y = mean_exp, color = sex)) +
geom_line() +
facet_wrap(~ gene, scales = "free_y") +
theme_bw() +
theme(panel.grid = element_blank()) +
labs(title = "Mean gene expression by duration of the infection",
x = "Duration of the infection (in days)",
y = "Mean gene expression") +
theme(text = element_text(size = 16),
axis.text.x = element_text(colour = "royalblue4", size = 12),
axis.text.y = element_text(colour = "royalblue4", size = 12),
panel.grid = element_line(colour="lightsteelblue1"),
legend.position = "top")
If you like the changes you created better than the default theme, you can save them as an object to be able to easily apply them to other plots you may create. Here is an example with the histogram we have previously created.
blue_theme <- theme(axis.text.x = element_text(colour = "royalblue4",
size = 12),
axis.text.y = element_text(colour = "royalblue4",
size = 12),
text = element_text(size = 16),
panel.grid = element_line(colour="lightsteelblue1"))
ggplot(rna, aes(x = expression_log)) +
geom_histogram(bins = 20) +
blue_theme
Challenge
With all of this information in hand, try to create the ugliest plot you can think of. You can use one of the plots generated in this exercise as a starting point. Use the RStudio ggplot2 cheatsheet for inspiration. Here are some ideas:
- See if you can change the thickness of the lines.
- Can you find a way to change the name of the legend? What about its labels? (hint: look for a ggplot function starting with scale_)
- Try using a different color palette or manually specifying the colors for the lines (see http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/).
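A sketch for the first two ideas (note that linewidth replaced size for line thickness in recent ggplot2 versions):

```r
## Thicker lines, and a legend renamed and recolored by hand
ggplot(data = mean_exp_by_time_sex,
       mapping = aes(x = time, y = mean_exp, color = sex)) +
  geom_line(linewidth = 2) +
  facet_wrap(~ gene, scales = "free_y") +
  scale_color_manual(name = "Sex of the mice",
                     values = c("hotpink", "chartreuse"))
```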
Composing plots
Faceting is a great tool for splitting one plot into multiple subplots, but sometimes you may want to produce a single figure that contains multiple independent plots, i.e. plots that are based on different variables or even different data frames.
Let’s start by creating the two plots that we want to arrange next to each other:
The first graph counts the number of unique genes per chromosome. We first need to reorder the levels of chromosome_name and filter the unique genes per chromosome. We also change the scale of the y-axis to a log10 scale for better readability.
rna$chromosome_name <- factor(rna$chromosome_name,
levels = c(1:19,"X","Y"))
count_gene_chromosome <- rna %>% select(chromosome_name, gene) %>%
distinct() %>% ggplot() +
geom_bar(aes(x = chromosome_name), fill = "seagreen",
position = "dodge", stat = "count") +
labs(y = "log10(n genes)", x = "chromosome") +
scale_y_log10()
count_gene_chromosome
Below, we also remove the legend altogether by setting legend.position to "none".
exp_boxplot_sex <- ggplot(rna, aes(y=expression_log, x = as.factor(time),
color=sex)) +
geom_boxplot(alpha = 0) +
labs(y = "Mean gene exp",
x = "time") + theme(legend.position = "none")
exp_boxplot_sex
The patchwork package provides an elegant approach to combining figures: the + operator arranges figures (typically side by side), | explicitly arranges them side by side, and / stacks them on top of each other.
install.packages("patchwork")
library("patchwork")
count_gene_chromosome + exp_boxplot_sex
## or count_gene_chromosome | exp_boxplot_sex
count_gene_chromosome / exp_boxplot_sex
We can further control the layout of the final composition with plot_layout to create more complex layouts:
count_gene_chromosome + exp_boxplot_sex + plot_layout(ncol = 1)
count_gene_chromosome +
(count_gene_chromosome + exp_boxplot_sex) +
exp_boxplot_sex +
plot_layout(ncol = 1)
The last plot can also be created using the | and / composers:
count_gene_chromosome /
(count_gene_chromosome | exp_boxplot_sex) /
exp_boxplot_sex
Learn more about patchwork on its webpage or in this video.
Another option is the gridExtra package, which allows combining separate ggplots into a single figure using grid.arrange():
install.packages("gridExtra")
library(gridExtra)
grid.arrange(count_gene_chromosome, exp_boxplot_sex, ncol = 2)
In addition to the ncol and nrow arguments, used to make simple arrangements, there are tools for constructing more complex layouts.
Exporting plots
After creating your plot, you can save it to a file in your favorite format. The Export tab in the Plot pane in RStudio will save your plots at low resolution, which will not be accepted by many journals and will not scale well for posters.
Instead, use the ggsave() function, which allows you to easily change the dimensions and resolution of your plot by adjusting the appropriate arguments (width, height and dpi).
Make sure you have the fig_output/ folder in your working directory.
my_plot <- ggplot(data = mean_exp_by_time_sex,
mapping = aes(x = time, y = mean_exp, color = sex)) +
geom_line() +
facet_wrap(~ gene, scales = "free_y") +
labs(title = "Mean gene expression by duration of the infection",
x = "Duration of the infection (in days)",
y = "Mean gene expression") +
guides(color=guide_legend(title="Gender")) +
theme_bw() +
theme(axis.text.x = element_text(colour = "royalblue4", size = 12),
axis.text.y = element_text(colour = "royalblue4", size = 12),
text = element_text(size = 16),
panel.grid = element_line(colour="lightsteelblue1"),
legend.position = "top")
ggsave("fig_output/mean_exp_by_time_sex.png", my_plot, width = 15,
height = 10)
# This also works for grid.arrange() plots
combo_plot <- grid.arrange(count_gene_chromosome, exp_boxplot_sex,
ncol = 2, widths = c(4, 6))
ggsave("fig_output/combo_plot_chromosome_sex.png", combo_plot,
width = 10, dpi = 300)
Note: the width and height parameters also determine the font size in the saved plot.
Other packages for visualisation
ggplot2
is a very powerful package that fits very nicely in our
tidy data and tidy tools pipeline. There are other visualization
packages in R that shouldn’t be ignored.
Base graphics {-}
The default graphics system that comes with R, often called base R graphics, is simple and fast. It is based on the painter's or canvas model, where different outputs are directly overlaid on top of each other (see figure \@ref(fig:paintermodel)). This is a fundamental difference with ggplot2 (and with lattice, described below), which return dedicated objects that are rendered on screen or in a file, and that can even be updated.
par(mfrow = c(1, 3))
plot(1:20, main = "First layer, produced with plot(1:20)")
plot(1:20, main = "A horizontal red line, added with abline(h = 10)")
abline(h = 10, col = "red")
plot(1:20, main = "A rectangle, added with rect(5, 5, 15, 15)")
abline(h = 10, col = "red")
rect(5, 5, 15, 15, lwd = 3)
Another main difference is that base graphics' plotting functions try to do the right thing based on their input type, i.e. they adapt their behaviour to the class of their input. This is again very different from ggplot2, which only accepts data frames as input and requires plots to be constructed bit by bit.
par(mfrow = c(2, 2))
boxplot(rnorm(100),
main = "Boxplot of rnorm(100)")
boxplot(matrix(rnorm(100), ncol = 10),
main = "Boxplot of matrix(rnorm(100), ncol = 10)")
hist(rnorm(100))
hist(matrix(rnorm(100), ncol = 10))
The out-of-the-box approach of base graphics can be very efficient for simple, standard figures that can be produced quickly with a single function call such as plot, hist or boxplot. The defaults are however not always the most appealing, and tuning figures, especially when they become more complex (for example to produce facets), can become lengthy and cumbersome.
The lattice package {-}
The lattice package is similar to ggplot2 in that it uses data frames as input, returns graphical objects and supports faceting. lattice however isn't based on the grammar of graphics and has a more convoluted interface. A good reference for the lattice package is @latticebook.
-
Source: Data Visualization Cheat Sheet. ↩
Key Points
Visualization in R
Joining tables
Overview
Teaching: XX min
Exercises: XX minQuestions
Join tables in R
Objectives
Understand the need and concept of table joins
Understand the different types of joins
Understand the importance of keys in joins and the implications of using non-unique keys
Joining tables
In many real life situations, data are spread across multiple tables. Usually this occurs because different types of information about a subject, e.g. a patient, are collected from different sources.
It may be desirable for some analyses to combine data from two or more tables into a single data frame based on a column that would be common to all the tables, for example, an attribute that uniquely identifies the subjects.
The dplyr package provides a set of join functions for combining two data frames based on matches within specified columns.
For further reading, please refer to the chapter about table joins in R for Data Science.
The Data Transformation Cheat Sheet also provides a short overview on table joins.
Combining tables
We are going to illustrate joins using a common example from the bioinformatics world, where annotations about genes are scattered across different tables that share one or more columns.
The data we are going to use are available in the following package.
if (!require("rWSBIM1207"))
BiocManager::install("UCLouvain-CBIO/rWSBIM1207")
library("rWSBIM1207")
data(jdf)
The data is composed of several tables.
The first table, jdf1, contains the proteins' unique UniProt1 accession numbers (uniprot variable), the most likely sub-cellular localisation of these proteins (organelle variable), as well as the protein identifiers (entry).
jdf1
# A tibble: 25 × 3
uniprot organelle entry
<chr> <chr> <chr>
1 P26039 Actin cytoskeleton TLN1_MOUSE
2 Q99PL5 Endoplasmic reticulum/Golgi apparatus RRBP1_MOUSE
3 Q6PB66 Mitochondrion LPPRC_MOUSE
4 P11276 Extracellular matrix FINC_MOUSE
5 Q6PR54 Nucleus - Chromatin RIF1_MOUSE
6 Q05793 Extracellular matrix PGBM_MOUSE
7 P19096 Cytosol FAS_MOUSE
8 Q9JKF1 Plasma membrane IQGA1_MOUSE
9 Q9QZQ1-2 Plasma membrane AFAD_MOUSE
10 Q6NS46 Nucleus - Non-chromatin RRP5_MOUSE
# … with 15 more rows
The second table, jdf2, contains the name of the gene that codes for the protein (gene_name variable), a description of the gene (description variable), the UniProt accession number (this is the common variable that can be used to join the tables) and the species the protein information comes from (organism variable).
jdf2
# A tibble: 25 × 4
gene_name description uniprot organism
<chr> <chr> <chr> <chr>
1 Iqgap1 Ras GTPase-activating-like protein IQGAP1 Q9JKF1 Mmus
2 Hspa5 78 kDa glucose-regulated protein P20029 Mmus
3 Pdcd11 Protein RRP5 homolog Q6NS46 Mmus
4 Tfrc Transferrin receptor protein 1 Q62351 Mmus
5 Hspd1 60 kDa heat shock protein, mitochondrial P63038 Mmus
6 Tln1 Talin-1 P26039 Mmus
7 Smc1a Structural maintenance of chromosomes protein 1A Q9CU62 Mmus
8 Lamc1 Laminin subunit gamma-1 P02468 Mmus
9 Hsp90b1 Endoplasmin P08113 Mmus
10 Mia3 Melanoma inhibitory activity protein 3 Q8BI84 Mmus
# … with 15 more rows
We now want to join these two tables into a single one containing all variables.
We are going to use the full_join function of dplyr to do so. The function will automatically find the common variable (in this case uniprot) to match observations from the first and second table.
library("dplyr")
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
full_join(jdf1, jdf2)
Joining, by = "uniprot"
# A tibble: 25 × 6
uniprot organelle entry gene_name description organism
<chr> <chr> <chr> <chr> <chr> <chr>
1 P26039 Actin cytoskeleton TLN1… Tln1 Talin-1 Mmus
2 Q99PL5 Endoplasmic reticulum/Golgi ap… RRBP… Rrbp1 Ribosome-b… Mmus
3 Q6PB66 Mitochondrion LPPR… Lrpprc Leucine-ri… Mmus
4 P11276 Extracellular matrix FINC… Fn1 Fibronectin Mmus
5 Q6PR54 Nucleus - Chromatin RIF1… Rif1 Telomere-a… Mmus
6 Q05793 Extracellular matrix PGBM… Hspg2 Basement m… Mmus
7 P19096 Cytosol FAS_… Fasn Fatty acid… Mmus
8 Q9JKF1 Plasma membrane IQGA… Iqgap1 Ras GTPase… Mmus
9 Q9QZQ1-2 Plasma membrane AFAD… Mllt4 Isoform 1 … Mmus
10 Q6NS46 Nucleus - Non-chromatin RRP5… Pdcd11 Protein RR… Mmus
# … with 15 more rows
In these examples, each observation of the jdf1 and jdf2 tables is uniquely identified by its UniProt accession number. Such variables are called keys. Keys are used to match observations across different tables.
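Before joining, it can be worth checking that a candidate key really is unique. One way to do so, sketched here with dplyr verbs:

```r
## Count occurrences of each accession number; zero remaining rows
## means uniprot uniquely identifies the observations of jdf1
jdf1 %>%
  count(uniprot) %>%
  filter(n > 1)
```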
Now let's look at a third table, jdf3. It also contains a UniProt column, but its name is capitalised differently!
jdf3
# A tibble: 25 × 4
gene_name description UniProt organism
<chr> <chr> <chr> <chr>
1 Iqgap1 Ras GTPase-activating-like protein IQGAP1 Q9JKF1 Mmus
2 Hspa5 78 kDa glucose-regulated protein P20029 Mmus
3 Pdcd11 Protein RRP5 homolog Q6NS46 Mmus
4 Tfrc Transferrin receptor protein 1 Q62351 Mmus
5 Hspd1 60 kDa heat shock protein, mitochondrial P63038 Mmus
6 Tln1 Talin-1 P26039 Mmus
7 Smc1a Structural maintenance of chromosomes protein 1A Q9CU62 Mmus
8 Lamc1 Laminin subunit gamma-1 P02468 Mmus
9 Hsp90b1 Endoplasmin P08113 Mmus
10 Mia3 Melanoma inhibitory activity protein 3 Q8BI84 Mmus
# … with 15 more rows
In case the variable names don't match, we can manually set the variables to use for the matching. These variables can be set using the by argument, as shown below with the jdf1 (as above) and jdf3 tables, where the UniProt accession number is encoded using a different capitalisation.
names(jdf3)
[1] "gene_name" "description" "UniProt" "organism"
full_join(jdf1, jdf3, by = c("uniprot" = "UniProt"))
# A tibble: 25 × 6
uniprot organelle entry gene_name description organism
<chr> <chr> <chr> <chr> <chr> <chr>
1 P26039 Actin cytoskeleton TLN1… Tln1 Talin-1 Mmus
2 Q99PL5 Endoplasmic reticulum/Golgi ap… RRBP… Rrbp1 Ribosome-b… Mmus
3 Q6PB66 Mitochondrion LPPR… Lrpprc Leucine-ri… Mmus
4 P11276 Extracellular matrix FINC… Fn1 Fibronectin Mmus
5 Q6PR54 Nucleus - Chromatin RIF1… Rif1 Telomere-a… Mmus
6 Q05793 Extracellular matrix PGBM… Hspg2 Basement m… Mmus
7 P19096 Cytosol FAS_… Fasn Fatty acid… Mmus
8 Q9JKF1 Plasma membrane IQGA… Iqgap1 Ras GTPase… Mmus
9 Q9QZQ1-2 Plasma membrane AFAD… Mllt4 Isoform 1 … Mmus
10 Q6NS46 Nucleus - Non-chromatin RRP5… Pdcd11 Protein RR… Mmus
# … with 15 more rows
As can be seen above, the variable name of the first table is retained in the joined one.
Challenge
Using the
full_join
function, join tablesjdf4
andjdf5
. What has happened for observationsP26039
andP02468
?Solution
full_join(jdf4, jdf5)
Joining, by = "uniprot"
# A tibble: 14 × 6
   uniprot  organelle                       entry gene_name description organism
   <chr>    <chr>                           <chr> <chr>     <chr>       <chr>
 1 P26039   Actin cytoskeleton              TLN1… <NA>      <NA>        <NA>
 2 Q99PL5   Endoplasmic reticulum/Golgi ap… RRBP… <NA>      <NA>        <NA>
 3 Q6PB66   Mitochondrion                   LPPR… <NA>      <NA>        <NA>
 4 P11276   Extracellular matrix            FINC… <NA>      <NA>        <NA>
 5 Q6PR54   Nucleus - Chromatin             RIF1… <NA>      <NA>        <NA>
 6 Q05793   Extracellular matrix            PGBM… <NA>      <NA>        <NA>
 7 P19096   Cytosol                         FAS_… Fasn      Fatty acid… Mmus
 8 Q9JKF1   Plasma membrane                 IQGA… <NA>      <NA>        <NA>
 9 Q9QZQ1-2 Plasma membrane                 AFAD… <NA>      <NA>        <NA>
10 Q6NS46   Nucleus - Non-chromatin         RRP5… <NA>      <NA>        <NA>
11 P02468   <NA>                            <NA>  Lamc1     Laminin su… Mmus
12 P08113   <NA>                            <NA>  Hsp90b1   Endoplasmin Mmus
13 Q8BI84   <NA>                            <NA>  Mia3      Melanoma i… Mmus
14 Q6P5D8   <NA>                            <NA>  Smchd1    Structural… Mmus
P26039 and P02468 are only present in jdf4 and jdf5 respectively, and their values for the variables of the other table have been encoded as missing.
Different types of joins
Above, we have used the full_join function, which fully joins two tables and keeps all observations, adding missing values if necessary. Sometimes, we want to be selective, and keep only the observations that are present in one or both tables.
- An inner join keeps observations that are present in both tables.
- A left join keeps observations that are present in the left (first) table, dropping those that are only present in the other.
- A right join keeps observations that are present in the right (second) table, dropping those that are only present in the other.
- A full join keeps all observations.
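These four behaviours can be illustrated on two minimal tables (x and y are hypothetical, not part of the lesson data):

```r
library("dplyr")
x <- tibble(key = c("a", "b", "c"), val_x = 1:3)
y <- tibble(key = c("b", "c", "d"), val_y = 4:6)
nrow(inner_join(x, y, by = "key")) ## 2: keys "b" and "c" are in both tables
nrow(left_join(x, y, by = "key"))  ## 3: all keys of x are kept
nrow(right_join(x, y, by = "key")) ## 3: all keys of y are kept
nrow(full_join(x, y, by = "key"))  ## 4: the union of all keys
```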
Challenge
Join tables
jdf4
andjdf5
, keeping only observations injdf4
.Solution
left_join(jdf4, jdf5)
Joining, by = "uniprot"
# A tibble: 10 × 6
   uniprot  organelle                       entry gene_name description organism
   <chr>    <chr>                           <chr> <chr>     <chr>       <chr>
 1 P26039   Actin cytoskeleton              TLN1… <NA>      <NA>        <NA>
 2 Q99PL5   Endoplasmic reticulum/Golgi ap… RRBP… <NA>      <NA>        <NA>
 3 Q6PB66   Mitochondrion                   LPPR… <NA>      <NA>        <NA>
 4 P11276   Extracellular matrix            FINC… <NA>      <NA>        <NA>
 5 Q6PR54   Nucleus - Chromatin             RIF1… <NA>      <NA>        <NA>
 6 Q05793   Extracellular matrix            PGBM… <NA>      <NA>        <NA>
 7 P19096   Cytosol                         FAS_… Fasn      Fatty acid… Mmus
 8 Q9JKF1   Plasma membrane                 IQGA… <NA>      <NA>        <NA>
 9 Q9QZQ1-2 Plasma membrane                 AFAD… <NA>      <NA>        <NA>
10 Q6NS46   Nucleus - Non-chromatin         RRP5… <NA>      <NA>        <NA>
Challenge
Join tables
jdf4
andjdf5
, keeping only observations injdf5
.Solution
right_join(jdf4, jdf5)
Joining, by = "uniprot"
# A tibble: 5 × 6
  uniprot organelle entry     gene_name description                     organism
  <chr>   <chr>     <chr>     <chr>     <chr>                           <chr>
1 P19096  Cytosol   FAS_MOUSE Fasn      Fatty acid synthase             Mmus
2 P02468  <NA>      <NA>      Lamc1     Laminin subunit gamma-1         Mmus
3 P08113  <NA>      <NA>      Hsp90b1   Endoplasmin                     Mmus
4 Q8BI84  <NA>      <NA>      Mia3      Melanoma inhibitory activity p… Mmus
5 Q6P5D8  <NA>      <NA>      Smchd1    Structural maintenance of chro… Mmus
Challenge
Join tables
jdf4
andjdf5
, keeping observations observed in both tables.Solution
inner_join(jdf4, jdf5)
Joining, by = "uniprot"
# A tibble: 1 × 6
  uniprot organelle entry     gene_name description         organism
  <chr>   <chr>     <chr>     <chr>     <chr>               <chr>
1 P19096  Cytosol   FAS_MOUSE Fasn      Fatty acid synthase Mmus
Multiple matches
Sometimes, keys aren't unique. In the jdf6 table below, we see that the accession number Q99PL5 is repeated twice. According to this table, the ribosome-binding protein 1 localises in the endoplasmic reticulum and in the Golgi apparatus.
jdf6
# A tibble: 5 × 4
uniprot organelle entry isoform
<chr> <chr> <chr> <dbl>
1 P26039 Actin cytoskeleton TLN1_MOUSE 1
2 Q99PL5 Endoplasmic reticulum RRBP1_MOUSE 1
3 Q99PL5 Golgi apparatus RRBP1_MOUSE 2
4 Q6PB66 Mitochondrion LPPRC_MOUSE 1
5 P11276 Extracellular matrix FINC_MOUSE 1
If we now want to join jdf6 and jdf2, the variables of the latter will be duplicated.
inner_join(jdf6, jdf2)
Joining, by = "uniprot"
# A tibble: 5 × 7
uniprot organelle entry isoform gene_name description organism
<chr> <chr> <chr> <dbl> <chr> <chr> <chr>
1 P26039 Actin cytoskeleton TLN1_MOU… 1 Tln1 Talin-1 Mmus
2 Q99PL5 Endoplasmic reticulum RRBP1_MO… 1 Rrbp1 Ribosome-b… Mmus
3 Q99PL5 Golgi apparatus RRBP1_MO… 2 Rrbp1 Ribosome-b… Mmus
4 Q6PB66 Mitochondrion LPPRC_MO… 1 Lrpprc Leucine-ri… Mmus
5 P11276 Extracellular matrix FINC_MOU… 1 Fn1 Fibronectin Mmus
In the case above, the repetition is useful, as it completes jdf6 with correct information from jdf2. One needs however to be careful when duplicated keys exist in both tables. Let's now use jdf7 for the join. It also has two entries for the UniProt accession number Q99PL5.
jdf7
# A tibble: 5 × 6
gene_name description uniprot organism isoform_num measure
<chr> <chr> <chr> <chr> <dbl> <dbl>
1 Rrbp1 Ribosome-binding protein 1 Q99PL5 Mmus 1 102
2 Rrbp1 Ribosome-binding protein 1 Q99PL5 Mmus 2 3
3 Iqgap1 Ras GTPase-activating-like pro… Q9JKF1 Mmus 1 13
4 Hspa5 78 kDa glucose-regulated prote… P20029 Mmus 1 54
5 Pdcd11 Protein RRP5 homolog Q6NS46 Mmus 1 28
Let's create an inner join between jdf6 and jdf7 (both having duplicated Q99PL5 entries):
inner_join(jdf6, jdf7)
Joining, by = "uniprot"
# A tibble: 4 × 9
uniprot organelle entry isoform gene_name description organism isoform_num
<chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
1 Q99PL5 Endoplasmic … RRBP… 1 Rrbp1 Ribosome-b… Mmus 1
2 Q99PL5 Endoplasmic … RRBP… 1 Rrbp1 Ribosome-b… Mmus 2
3 Q99PL5 Golgi appara… RRBP… 2 Rrbp1 Ribosome-b… Mmus 1
4 Q99PL5 Golgi appara… RRBP… 2 Rrbp1 Ribosome-b… Mmus 2
# … with 1 more variable: measure <dbl>
Challenge
Interpret the result of the inner join above, where both tables have duplicated keys.
Solution
jdf6 has two entries, one for each possible sub-cellular localisation of the protein. jdf7 also has two entries, referring to two different quantitative measurements (variable measure). When joining on a duplicated key, you get all possible combinations. In this case, we obtain wrong information: the proteins in the ER and in the GA both appear with the values 102 and 3.
inner_join(jdf6, jdf7)
Joining, by = "uniprot"
# A tibble: 4 × 9
  uniprot organelle     entry isoform gene_name description organism isoform_num
  <chr>   <chr>         <chr>   <dbl> <chr>     <chr>       <chr>          <dbl>
1 Q99PL5  Endoplasmic … RRBP…       1 Rrbp1     Ribosome-b… Mmus               1
2 Q99PL5  Endoplasmic … RRBP…       1 Rrbp1     Ribosome-b… Mmus               2
3 Q99PL5  Golgi appara… RRBP…       2 Rrbp1     Ribosome-b… Mmus               1
4 Q99PL5  Golgi appara… RRBP…       2 Rrbp1     Ribosome-b… Mmus               2
# … with 1 more variable: measure <dbl>
Matching across multiple keys
So far, we have matched tables using a single key (possibly with different names in the two tables). Sometimes, it is necessary to match tables using multiple keys. A typical example is when multiple variables are needed to discriminate different rows in a table.
Following up from the last example, we see that the duplicated UniProt accession numbers in the jdf6 and jdf7 tables refer to different isoforms of the same RRBP1 gene.
jdf6
# A tibble: 5 × 4
uniprot organelle entry isoform
<chr> <chr> <chr> <dbl>
1 P26039 Actin cytoskeleton TLN1_MOUSE 1
2 Q99PL5 Endoplasmic reticulum RRBP1_MOUSE 1
3 Q99PL5 Golgi apparatus RRBP1_MOUSE 2
4 Q6PB66 Mitochondrion LPPRC_MOUSE 1
5 P11276 Extracellular matrix FINC_MOUSE 1
jdf7
# A tibble: 5 × 6
gene_name description uniprot organism isoform_num measure
<chr> <chr> <chr> <chr> <dbl> <dbl>
1 Rrbp1 Ribosome-binding protein 1 Q99PL5 Mmus 1 102
2 Rrbp1 Ribosome-binding protein 1 Q99PL5 Mmus 2 3
3 Iqgap1 Ras GTPase-activating-like pro… Q9JKF1 Mmus 1 13
4 Hspa5 78 kDa glucose-regulated prote… P20029 Mmus 1 54
5 Pdcd11 Protein RRP5 homolog Q6NS46 Mmus 1 28
To uniquely identify isoforms, we should consider two keys:
- the UniProt accession number (named uniprot in both tables)
- the isoform number (called isoform and isoform_num respectively)
Because the isoform status was encoded using different names (which is, of course, a source of confusion), jdf6 and jdf7 are automatically joined based on the shared uniprot key only. If the isoform status had been encoded the same way in both tables, the join would have been done automatically on both keys. Here, we need to join using both keys and must explicitly name the variables used for the join.
inner_join(jdf6, jdf7, by = c("uniprot" = "uniprot", "isoform" = "isoform_num"))
# A tibble: 2 × 8
uniprot organelle entry isoform gene_name description organism measure
<chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
1 Q99PL5 Endoplasmic reti… RRBP… 1 Rrbp1 Ribosome-b… Mmus 102
2 Q99PL5 Golgi apparatus RRBP… 2 Rrbp1 Ribosome-b… Mmus 3
We now see that isoform 1 localises to the ER and has a measured value of 102, while isoform 2, which localises to the GA, has a measured value of 3. Ideally, the isoform variables would be named identically in the two tables to enable an automatic join on the two keys. An alternative is to rename isoform_num from jdf7 so that both key names are present in both tables, enabling an automatic join. This can be done easily using the rename function from the dplyr package.
jdf7 %>% rename(isoform = isoform_num)
# A tibble: 5 × 6
gene_name description uniprot organism isoform measure
<chr> <chr> <chr> <chr> <dbl> <dbl>
1 Rrbp1 Ribosome-binding protein 1 Q99PL5 Mmus 1 102
2 Rrbp1 Ribosome-binding protein 1 Q99PL5 Mmus 2 3
3 Iqgap1 Ras GTPase-activating-like protein… Q9JKF1 Mmus 1 13
4 Hspa5 78 kDa glucose-regulated protein P20029 Mmus 1 54
5 Pdcd11 Protein RRP5 homolog Q6NS46 Mmus 1 28
inner_join(jdf6,
jdf7 %>%
rename(isoform = isoform_num))
Joining, by = c("uniprot", "isoform")
# A tibble: 2 × 8
uniprot organelle entry isoform gene_name description organism measure
<chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
1 Q99PL5 Endoplasmic reti… RRBP… 1 Rrbp1 Ribosome-b… Mmus 102
2 Q99PL5 Golgi apparatus RRBP… 2 Rrbp1 Ribosome-b… Mmus 3
Row and column binding
There are two other important functions in R, rbind and cbind, that can be used to combine two data frames. cbind binds two data frames by columns; both must have the same number of rows.
d2
a b
1 4 4
2 5 5
d3
v1 v2 v3
1 1 3 5
2 2 4 6
cbind(d2, d3)
a b v1 v2 v3
1 4 4 1 3 5
2 5 5 2 4 6
rbind binds two data frames by rows; both must have the same number of columns, and the same column names!
d1
x y
1 1 1
2 2 2
3 3 3
d2
a b
1 4 4
2 5 5
Using rbind(d1, d2) would produce an error because the two data frames do not have the same column names (even though they have the same number of columns). If we change the names of d2, it works!
names(d2) <- names(d1)
d1
x y
1 1 1
2 2 2
3 3 3
d2
x y
1 4 4
2 5 5
rbind(d1, d2)
x y
1 1 1
2 2 2
3 3 3
4 4 4
5 5 5
Note that beyond the dimensions and column names that are required to match, the real meaning of rbind is to bind data frames that contain observations for the same set of variables: there is more to it than matching column names!
Note: rbind and cbind are base R functions. The tidyverse alternatives from the dplyr package are bind_rows and bind_cols, and they work similarly.
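One practical difference worth knowing: unlike rbind, bind_rows matches columns by name and fills the missing ones with NA instead of failing. A minimal sketch:

```r
library("dplyr")
d1 <- data.frame(x = 1:2, y = 1:2)
d2 <- data.frame(x = 3, z = 9)
## rbind(d1, d2) would fail; bind_rows() returns columns x, y and z,
## with NA where a data frame did not provide a value
bind_rows(d1, d2)
```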
-
UniProt is the protein information database. Its mission is to provide the scientific community with a comprehensive, high-quality and freely accessible resource of protein sequence and functional information. ↩
Key Points
Join tables of data in R
Next steps
Overview
Teaching: XX min
Exercises: XX minQuestions
SummarizedExperiment
Objectives
Introduce the notion of data containers
Give an overview of the
SummarizedExperiment
, extensively used in omics analyses
Next steps
Data in bioinformatics is often complex. For example, an experiment can have multiple types of files e.g. counts, sample information and gene information. To deal with this, developers define specialised data containers (termed classes) that match the properties of the data they need to handle.
This aspect is central to the Bioconductor1 project which uses the same core data infrastructure across packages. This certainly contributed to Bioconductor’s success. Bioconductor package developers are advised to make use of existing infrastructure to provide coherence, interoperability and stability to the project as a whole. You will learn more about Bioconductor in the next episode.
SummarizedExperiment
To illustrate a Bioconductor omics data container, we’ll present the SummarizedExperiment
class. This is a core structure for omics data such as genomics, transcriptomics, proteomics, methylation and cytometry. The image below from this website shows how a large number of Bioconductor packages make use of SummarizedExperiment.
What is it
The figure below represents the anatomy of SummarizedExperiment.
- assay(), assays(): A matrix-like object, or a list of matrix-like objects, of identical dimension
  - rows: genes, genomic coordinates, etc.
  - columns: samples, cells, etc.
- colData(): Annotations on each column, as a DataFrame, e.g. a description of each sample
- rowData() and / or rowRanges(): Annotations on each row
  - e.g. rowRanges(): coordinates of genes / exons in transcripts / etc.
  - e.g. rowData(): p-values and log-fold changes of each gene after differential expression analysis
- metadata(): A list of unstructured metadata describing the overall content of the object
Benefits of SummarizedExperiment format
-
Coordination of samples and features. As soon as the data for a project are distributed in multiple tables or files, the alignment of data records or the consistency of identifiers is precarious. The coordinated nature of the SummarizedExperiment container overcomes this by guaranteeing that the dimensions of the different slots always match during data manipulation (i.e. the columns in the expression data match the rows in the sample metadata, and the rows in the expression data match the rows in the feature metadata). For example, if we had to exclude one sample from the assay, it would be automatically removed from the sample metadata in the same operation. The metadata slots can grow additional co-variates (columns) without affecting the other structures.
-
Interoperability between packages. Many R packages make use of the SummarizedExperiment format. For the user, this makes analysis easier, as less data wrangling is required when the output of one package can work as the input for another. And with many packages using the SummarizedExperiment format, there are fewer formats to learn. If you follow a training focused on RNA sequencing analysis, you may learn to use the Bioconductor DESeq2 package to do some differential expression analyses. DESeq2's whole analysis is handled in a SummarizedExperiment. Or, if you perform RNA sequencing analysis following tidy principles with the tidybulk package, you can input your data in SummarizedExperiment format. You can see packages making use of SummarizedExperiment (Depends/Imports) on the SummarizedExperiment homepage, some shown in the screenshot below.
When would you use it
You may encounter a SummarizedExperiment object
- using a Bioconductor package function that imports your omics experiment files and produces a SummarizedExperiment object. The tximeta package, for transcript-level quantification, is one example of this.
- using processed data provided in SummarizedExperiment format. The Recount3 project provides RNA-sequencing gene, exon, and exon-exon junction counts for 8,679 and 10,088 different studies for human and mouse respectively in SummarizedExperiment format.
- through creating one yourself to store your data or share with a collaborator
Exploring a SummarizedExperiment
Let's explore a SummarizedExperiment object in R. We'll load in an example. This is the same RNA data we've been working with, but it also contains genomic coordinates/ranges. Ranges are an optional part of a SummarizedExperiment; when they are attached, the object is called a RangedSummarizedExperiment.
se <- readRDS("course-data/data/GSE96870/se2.rds")
se
class: RangedSummarizedExperiment
dim: 1474 22
metadata(0):
assays(1): counts
rownames(1474): Asl Apod ... Lmx1a Pbx1
rowData names(11): gene ENTREZID ... phenotype_description
hsapiens_homolog_associated_gene_name
colnames(22): GSM2545336 GSM2545337 ... GSM2545363 GSM2545380
colData names(10): title geo_accession ... tissue mouse
We can access the expression matrix with the assay function:
head(assay(se))
GSM2545336 GSM2545337 GSM2545338 GSM2545339 GSM2545340 GSM2545341
Asl 1170 361 400 586 626 988
Apod 36194 10347 9173 10620 13021 29594
Cyp2d22 4060 1616 1603 1901 2171 3349
Klk6 287 629 641 578 448 195
Fcrls 85 233 244 237 180 38
Slc2a4 782 231 248 265 313 786
GSM2545342 GSM2545343 GSM2545344 GSM2545345 GSM2545346 GSM2545347
Asl 836 535 586 597 938 1035
Apod 24959 13668 13230 15868 27769 34301
Cyp2d22 3122 2008 2254 2277 2985 3452
Klk6 186 1101 537 567 327 233
Fcrls 68 375 199 177 89 67
Slc2a4 528 249 266 357 654 693
GSM2545348 GSM2545349 GSM2545350 GSM2545351 GSM2545352 GSM2545353
Asl 494 481 666 937 803 541
Apod 11258 11812 15816 29242 20415 13682
Cyp2d22 1883 2014 2417 3678 2920 2216
Klk6 742 881 828 250 798 710
Fcrls 300 233 231 81 303 285
Slc2a4 271 304 349 715 513 320
GSM2545354 GSM2545362 GSM2545363 GSM2545380
Asl 473 748 576 1192
Apod 11088 15916 11166 38148
Cyp2d22 1821 2842 2011 4019
Klk6 894 501 598 259
Fcrls 248 179 184 68
Slc2a4 248 350 317 796
dim(assay(se))
[1] 1474 22
We can access the sample metadata using the colData function:
colData(se)
DataFrame with 22 rows and 10 columns
title geo_accession organism age sex
<character> <character> <character> <character> <factor>
GSM2545336 CNS_RNA-seq_10C GSM2545336 Mus musculus 8 weeks Female
GSM2545337 CNS_RNA-seq_11C GSM2545337 Mus musculus 8 weeks Female
GSM2545338 CNS_RNA-seq_12C GSM2545338 Mus musculus 8 weeks Female
GSM2545339 CNS_RNA-seq_13C GSM2545339 Mus musculus 8 weeks Female
GSM2545340 CNS_RNA-seq_14C GSM2545340 Mus musculus 8 weeks Male
... ... ... ... ... ...
GSM2545353 CNS_RNA-seq_3C GSM2545353 Mus musculus 8 weeks Female
GSM2545354 CNS_RNA-seq_4C GSM2545354 Mus musculus 8 weeks Male
GSM2545362 CNS_RNA-seq_5C GSM2545362 Mus musculus 8 weeks Female
GSM2545363 CNS_RNA-seq_6C GSM2545363 Mus musculus 8 weeks Male
GSM2545380 CNS_RNA-seq_9C GSM2545380 Mus musculus 8 weeks Female
infection strain time tissue mouse
<factor> <character> <factor> <factor> <factor>
GSM2545336 InfluenzaA C57BL/6 Day8 Cerebellum 14
GSM2545337 NonInfected C57BL/6 Day0 Cerebellum 9
GSM2545338 NonInfected C57BL/6 Day0 Cerebellum 10
GSM2545339 InfluenzaA C57BL/6 Day4 Cerebellum 15
GSM2545340 InfluenzaA C57BL/6 Day4 Cerebellum 18
... ... ... ... ... ...
GSM2545353 NonInfected C57BL/6 Day0 Cerebellum 4
GSM2545354 NonInfected C57BL/6 Day0 Cerebellum 2
GSM2545362 InfluenzaA C57BL/6 Day4 Cerebellum 20
GSM2545363 InfluenzaA C57BL/6 Day4 Cerebellum 12
GSM2545380 InfluenzaA C57BL/6 Day8 Cerebellum 19
dim(colData(se))
[1] 22 10
We can also access the feature metadata using the rowData function:
head(rowData(se))
DataFrame with 6 rows and 11 columns
gene ENTREZID product gbkey
<character> <character> <character> <character>
Asl Asl 109900 argininosuccinate ly.. mRNA
Apod Apod 11815 apolipoprotein D, tr.. mRNA
Cyp2d22 Cyp2d22 56448 cytochrome P450, fam.. mRNA
Klk6 Klk6 19144 kallikrein related-p.. mRNA
Fcrls Fcrls 80891 Fc receptor-like S, .. mRNA
Slc2a4 Slc2a4 20528 solute carrier famil.. mRNA
external_gene_name ensembl_gene_id external_synonym chromosome_name
<character> <character> <character> <character>
Asl Asl ENSMUSG00000025533 2510006M18Rik 5
Apod Apod ENSMUSG00000022548 NA 16
Cyp2d22 Cyp2d22 ENSMUSG00000061740 2D22 15
Klk6 Klk6 ENSMUSG00000050063 Bssp 7
Fcrls Fcrls ENSMUSG00000015852 2810439C17Rik 3
Slc2a4 Slc2a4 ENSMUSG00000018566 Glut-4 11
gene_biotype phenotype_description
<character> <character>
Asl protein_coding abnormal circulating..
Apod protein_coding abnormal lipid homeo..
Cyp2d22 protein_coding abnormal skin morpho..
Klk6 protein_coding abnormal cytokine le..
Fcrls protein_coding decreased CD8-positi..
Slc2a4 protein_coding abnormal circulating..
hsapiens_homolog_associated_gene_name
<character>
Asl ASL
Apod APOD
Cyp2d22 CYP2D6
Klk6 KLK6
Fcrls FCRL2
Slc2a4 SLC2A4
dim(rowData(se))
[1] 1474 11
If genomic coordinates are present, we can access them using the rowRanges function:
rowRanges(se)
GRanges object with 1474 ranges and 11 metadata columns:
seqnames ranges strand | gene ENTREZID
<Rle> <IRanges> <Rle> | <character> <character>
Asl 5 130024208-130024330 - | Asl 109900
Apod 16 31314552-31314808 - | Apod 11815
Cyp2d22 15 82380078-82380260 - | Cyp2d22 56448
Klk6 7 43824544-43824595 + | Klk6 19144
Fcrls 3 87263678-87263765 - | Fcrls 80891
... ... ... ... . ... ...
Mgst3 1 167393751-167393797 - | Mgst3 66447
Lrrc52 1 167466093-167466780 - | Lrrc52 240899
Rxrg 1 167598362-167598800 + | Rxrg 20183
Lmx1a 1 167688300-167688991 + | Lmx1a 110648
Pbx1 1 168431314-168432169 - | Pbx1 18514
product gbkey external_gene_name
<character> <character> <character>
Asl argininosuccinate ly.. mRNA Asl
Apod apolipoprotein D, tr.. mRNA Apod
Cyp2d22 cytochrome P450, fam.. mRNA Cyp2d22
Klk6 kallikrein related-p.. mRNA Klk6
Fcrls Fc receptor-like S, .. mRNA Fcrls
... ... ... ...
Mgst3 microsomal glutathio.. mRNA Mgst3
Lrrc52 leucine rich repeat .. mRNA Lrrc52
Rxrg retinoid X receptor .. mRNA Rxrg
Lmx1a LIM homeobox transcr.. mRNA Lmx1a
Pbx1 pre B cell leukemia .. mRNA Pbx1
ensembl_gene_id external_synonym chromosome_name gene_biotype
<character> <character> <character> <character>
Asl ENSMUSG00000025533 2510006M18Rik 5 protein_coding
Apod ENSMUSG00000022548 <NA> 16 protein_coding
Cyp2d22 ENSMUSG00000061740 2D22 15 protein_coding
Klk6 ENSMUSG00000050063 Bssp 7 protein_coding
Fcrls ENSMUSG00000015852 2810439C17Rik 3 protein_coding
... ... ... ... ...
Mgst3 ENSMUSG00000026688 2010012L10Rik 1 protein_coding
Lrrc52 ENSMUSG00000040485 4930413P14Rik 1 protein_coding
Rxrg ENSMUSG00000015843 Nr2b3 1 protein_coding
Lmx1a ENSMUSG00000026686 Lmx1.1 1 protein_coding
Pbx1 ENSMUSG00000052534 2310056B04Rik 1 protein_coding
phenotype_description hsapiens_homolog_associated_gene_name
<character> <character>
Asl abnormal circulating.. ASL
Apod abnormal lipid homeo.. APOD
Cyp2d22 abnormal skin morpho.. CYP2D6
Klk6 abnormal cytokine le.. KLK6
Fcrls decreased CD8-positi.. FCRL2
... ... ...
Mgst3 decreased mean corpu.. MGST3
Lrrc52 abnormal sperm physi.. LRRC52
Rxrg abnormal bone minera.. RXRG
Lmx1a abnormal bony labyri.. LMX1A
Pbx1 abnormal adrenal gla.. PBX1
-------
seqinfo: 182 sequences from an unspecified genome; no seqlengths
rowRanges gives us the same information as rowData but also includes the coordinates.
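Since rowRanges returns a GRanges object, the usual GenomicRanges accessors apply to it. As a small sketch (assuming the se object loaded above), we can pull out the start coordinates and the strand of the first few genes:

```r
## rowRanges(se) is a GRanges object, so GenomicRanges accessors work on it
head(start(rowRanges(se)))   # start coordinates of the first genes
head(strand(rowRanges(se)))  # strand of the first genes
```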
Creating a SummarizedExperiment
Often a package will create a SummarizedExperiment for you, but here we will demonstrate how one can be created. The three tables that make up a SummarizedExperiment object are:
- An expression matrix
- A table describing the samples
- A table describing the genes
You would import these tables into R or generate them in R. Here we will create them from the se object that we have.
count_matrix <- assay(se)
sample_metadata <- colData(se)
gene_metadata <- rowRanges(se) # or rowData(se)
We will create a SummarizedExperiment from these tables using the SummarizedExperiment constructor. We need to provide inputs for the arguments assays (count matrix), colData (sample metadata) and rowRanges or rowData (gene metadata). Because assays can accept multiple assays (e.g. raw counts, log-normalised counts) we put them in a list (SimpleList) and give each assay a name (we’ll use counts). We can see that we need to do this in the help page for the constructor, ?SummarizedExperiment.
# BiocManager::install("SummarizedExperiment")
library("SummarizedExperiment")
se_created <- SummarizedExperiment(assays = SimpleList(counts=count_matrix),
colData = sample_metadata,
rowRanges = gene_metadata) # or rowData =
se_created
class: RangedSummarizedExperiment
dim: 1474 22
metadata(0):
assays(1): counts
rownames(1474): Asl Apod ... Lmx1a Pbx1
rowData names(11): gene ENTREZID ... phenotype_description
hsapiens_homolog_associated_gene_name
colnames(22): GSM2545336 GSM2545337 ... GSM2545363 GSM2545380
colData names(10): title geo_accession ... tissue mouse
If we want to save a SummarizedExperiment object we can use the saveRDS function that we saw in a previous episode.
saveRDS(se_created, file = "data_output/se_created.rds")
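The saved object can be loaded back into a (new) R session with readRDS, just as we loaded se at the start of this episode. The path below assumes the file saved above:

```r
## Read the saved SummarizedExperiment back into R
se_reloaded <- readRDS("data_output/se_created.rds")
```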
Subsetting a SummarizedExperiment
A SummarizedExperiment can be subset just like a data frame, with numeric, character or logical indices. There are two dimensions: the first dimension is features (genes) and the second dimension is samples.
To subset to the first 5 features we could do:
se[1:5, ]
class: RangedSummarizedExperiment
dim: 5 22
metadata(0):
assays(1): counts
rownames(5): Asl Apod Cyp2d22 Klk6 Fcrls
rowData names(11): gene ENTREZID ... phenotype_description
hsapiens_homolog_associated_gene_name
colnames(22): GSM2545336 GSM2545337 ... GSM2545363 GSM2545380
colData names(10): title geo_accession ... tissue mouse
To subset to the first 3 samples we could do:
se[, 1:3]
class: RangedSummarizedExperiment
dim: 1474 3
metadata(0):
assays(1): counts
rownames(1474): Asl Apod ... Lmx1a Pbx1
rowData names(11): gene ENTREZID ... phenotype_description
hsapiens_homolog_associated_gene_name
colnames(3): GSM2545336 GSM2545337 GSM2545338
colData names(10): title geo_accession ... tissue mouse
Below, we create a new instance of class SummarizedExperiment that contains only the first 5 features for the first 3 samples.
se1 <- se[1:5, 1:3]
se1
class: RangedSummarizedExperiment
dim: 5 3
metadata(0):
assays(1): counts
rownames(5): Asl Apod Cyp2d22 Klk6 Fcrls
rowData names(11): gene ENTREZID ... phenotype_description
hsapiens_homolog_associated_gene_name
colnames(3): GSM2545336 GSM2545337 GSM2545338
colData names(10): title geo_accession ... tissue mouse
assay(se1)
GSM2545336 GSM2545337 GSM2545338
Asl 1170 361 400
Apod 36194 10347 9173
Cyp2d22 4060 1616 1603
Klk6 287 629 641
Fcrls 85 233 244
colData(se1)
DataFrame with 3 rows and 10 columns
title geo_accession organism age sex
<character> <character> <character> <character> <factor>
GSM2545336 CNS_RNA-seq_10C GSM2545336 Mus musculus 8 weeks Female
GSM2545337 CNS_RNA-seq_11C GSM2545337 Mus musculus 8 weeks Female
GSM2545338 CNS_RNA-seq_12C GSM2545338 Mus musculus 8 weeks Female
infection strain time tissue mouse
<factor> <character> <factor> <factor> <factor>
GSM2545336 InfluenzaA C57BL/6 Day8 Cerebellum 14
GSM2545337 NonInfected C57BL/6 Day0 Cerebellum 9
GSM2545338 NonInfected C57BL/6 Day0 Cerebellum 10
rowData(se1)
DataFrame with 5 rows and 11 columns
gene ENTREZID product gbkey
<character> <character> <character> <character>
Asl Asl 109900 argininosuccinate ly.. mRNA
Apod Apod 11815 apolipoprotein D, tr.. mRNA
Cyp2d22 Cyp2d22 56448 cytochrome P450, fam.. mRNA
Klk6 Klk6 19144 kallikrein related-p.. mRNA
Fcrls Fcrls 80891 Fc receptor-like S, .. mRNA
external_gene_name ensembl_gene_id external_synonym chromosome_name
<character> <character> <character> <character>
Asl Asl ENSMUSG00000025533 2510006M18Rik 5
Apod Apod ENSMUSG00000022548 NA 16
Cyp2d22 Cyp2d22 ENSMUSG00000061740 2D22 15
Klk6 Klk6 ENSMUSG00000050063 Bssp 7
Fcrls Fcrls ENSMUSG00000015852 2810439C17Rik 3
gene_biotype phenotype_description
<character> <character>
Asl protein_coding abnormal circulating..
Apod protein_coding abnormal lipid homeo..
Cyp2d22 protein_coding abnormal skin morpho..
Klk6 protein_coding abnormal cytokine le..
Fcrls protein_coding decreased CD8-positi..
hsapiens_homolog_associated_gene_name
<character>
Asl ASL
Apod APOD
Cyp2d22 CYP2D6
Klk6 KLK6
Fcrls FCRL2
We can also use the colData() function to subset on something from the sample metadata, or the rowData() function to subset on something from the feature metadata.
For example, here we keep only miRNAs and the non-infected samples:
se1 <- se[rowData(se)$gene_biotype == "miRNA",
colData(se)$infection == "NonInfected"]
se1
class: RangedSummarizedExperiment
dim: 7 7
metadata(0):
assays(1): counts
rownames(7): Mir1901 Mir378a ... Mir128-1 Mir7682
rowData names(11): gene ENTREZID ... phenotype_description
hsapiens_homolog_associated_gene_name
colnames(7): GSM2545337 GSM2545338 ... GSM2545353 GSM2545354
colData names(10): title geo_accession ... tissue mouse
assay(se1)
GSM2545337 GSM2545338 GSM2545343 GSM2545348 GSM2545349 GSM2545353
Mir1901 45 44 74 55 68 33
Mir378a 11 7 9 4 12 4
Mir133b 4 6 5 4 6 7
Mir30c-2 10 6 16 12 8 17
Mir149 1 2 0 0 0 0
Mir128-1 4 1 2 2 1 2
Mir7682 2 0 4 1 3 5
GSM2545354
Mir1901 60
Mir378a 8
Mir133b 3
Mir30c-2 15
Mir149 2
Mir128-1 1
Mir7682 5
colData(se1)
DataFrame with 7 rows and 10 columns
title geo_accession organism age sex
<character> <character> <character> <character> <factor>
GSM2545337 CNS_RNA-seq_11C GSM2545337 Mus musculus 8 weeks Female
GSM2545338 CNS_RNA-seq_12C GSM2545338 Mus musculus 8 weeks Female
GSM2545343 CNS_RNA-seq_20C GSM2545343 Mus musculus 8 weeks Male
GSM2545348 CNS_RNA-seq_27C GSM2545348 Mus musculus 8 weeks Female
GSM2545349 CNS_RNA-seq_28C GSM2545349 Mus musculus 8 weeks Male
GSM2545353 CNS_RNA-seq_3C GSM2545353 Mus musculus 8 weeks Female
GSM2545354 CNS_RNA-seq_4C GSM2545354 Mus musculus 8 weeks Male
infection strain time tissue mouse
<factor> <character> <factor> <factor> <factor>
GSM2545337 NonInfected C57BL/6 Day0 Cerebellum 9
GSM2545338 NonInfected C57BL/6 Day0 Cerebellum 10
GSM2545343 NonInfected C57BL/6 Day0 Cerebellum 11
GSM2545348 NonInfected C57BL/6 Day0 Cerebellum 8
GSM2545349 NonInfected C57BL/6 Day0 Cerebellum 7
GSM2545353 NonInfected C57BL/6 Day0 Cerebellum 4
GSM2545354 NonInfected C57BL/6 Day0 Cerebellum 2
rowData(se1)
DataFrame with 7 rows and 11 columns
gene ENTREZID product gbkey
<character> <character> <character> <character>
Mir1901 Mir1901 100316686 microRNA 1901 precursor_RNA
Mir378a Mir378a 723889 microRNA 378a precursor_RNA
Mir133b Mir133b 723817 microRNA 133b precursor_RNA
Mir30c-2 Mir30c-2 723964 microRNA 30c-2 precursor_RNA
Mir149 Mir149 387167 microRNA 149 precursor_RNA
Mir128-1 Mir128-1 387147 microRNA 128-1 precursor_RNA
Mir7682 Mir7682 102466847 microRNA 7682 precursor_RNA
external_gene_name ensembl_gene_id external_synonym chromosome_name
<character> <character> <character> <character>
Mir1901 Mir1901 ENSMUSG00000084565 Mirn1901 18
Mir378a Mir378a ENSMUSG00000105200 Mirn378 18
Mir133b Mir133b ENSMUSG00000065480 mir 133b 1
Mir30c-2 Mir30c-2 ENSMUSG00000065567 mir 30c-2 1
Mir149 Mir149 ENSMUSG00000065470 Mirn149 1
Mir128-1 Mir128-1 ENSMUSG00000065520 Mirn128 1
Mir7682 Mir7682 ENSMUSG00000106406 mmu-mir-7682 1
gene_biotype phenotype_description
<character> <character>
Mir1901 miRNA NA
Mir378a miRNA abnormal mitochondri..
Mir133b miRNA no abnormal phenotyp..
Mir30c-2 miRNA NA
Mir149 miRNA increased circulatin..
Mir128-1 miRNA no abnormal phenotyp..
Mir7682 miRNA NA
hsapiens_homolog_associated_gene_name
<character>
Mir1901 NA
Mir378a MIR378A
Mir133b MIR133B
Mir30c-2 MIR30C2
Mir149 NA
Mir128-1 MIR128-1
Mir7682 NA
We can subset to a genomic region of interest using the subsetByOverlaps function.
# Define a region of interest, e.g. positions 1 to 10,000,000 of chromosome 1
roi <- GRanges(seqnames = "1", ranges = IRanges(start = 1, end = 10000000))
# Subset se object for only rows in the region of interest
subsetByOverlaps(se, roi)
class: RangedSummarizedExperiment
dim: 20 22
metadata(0):
assays(1): counts
rownames(20): Sox17 Rp1 ... Snord87 Tcf24
rowData names(11): gene ENTREZID ... phenotype_description
hsapiens_homolog_associated_gene_name
colnames(22): GSM2545336 GSM2545337 ... GSM2545363 GSM2545380
colData names(10): title geo_accession ... tissue mouse
Challenge
- Extract the gene expression levels of the first 3 genes in samples at time 0 and at time 8.
- Extract all genes on chromosome Y between 1 and 10,000,000 on the negative strand.
Solution
assay(se)[1:3, colData(se)$time != "Day4"]
           GSM2545336 GSM2545337 GSM2545338 GSM2545341 GSM2545342 GSM2545343
Asl              1170        361        400        988        836        535
Apod            36194      10347       9173      29594      24959      13668
Cyp2d22          4060       1616       1603       3349       3122       2008
           GSM2545346 GSM2545347 GSM2545348 GSM2545349 GSM2545351 GSM2545353
Asl               938       1035        494        481        937        541
Apod            27769      34301      11258      11812      29242      13682
Cyp2d22          2985       3452       1883       2014       3678       2216
           GSM2545354 GSM2545380
Asl               473       1192
Apod            11088      38148
Cyp2d22          1821       4019
# Equivalent to assay(se)[1:3, colData(se)$time == "Day0" | colData(se)$time == "Day8"]
roi <- GRanges(seqnames = "Y", ranges = IRanges(start = 1, end = 10000000), strand = "-")
subsetByOverlaps(se, roi)
class: RangedSummarizedExperiment
dim: 3 22
metadata(0):
assays(1): counts
rownames(3): Ddx3y Uty Gm4017
rowData names(11): gene ENTREZID ... phenotype_description
  hsapiens_homolog_associated_gene_name
colnames(22): GSM2545336 GSM2545337 ... GSM2545363 GSM2545380
colData names(10): title geo_accession ... tissue mouse
Adding variables to metadata
We can also add information to the sample metadata. Suppose that you want to add the center where the samples were collected:
colData(se)$center <- rep("University of Illinois", nrow(colData(se)))
colData(se)
DataFrame with 22 rows and 11 columns
title geo_accession organism age sex
<character> <character> <character> <character> <factor>
GSM2545336 CNS_RNA-seq_10C GSM2545336 Mus musculus 8 weeks Female
GSM2545337 CNS_RNA-seq_11C GSM2545337 Mus musculus 8 weeks Female
GSM2545338 CNS_RNA-seq_12C GSM2545338 Mus musculus 8 weeks Female
GSM2545339 CNS_RNA-seq_13C GSM2545339 Mus musculus 8 weeks Female
GSM2545340 CNS_RNA-seq_14C GSM2545340 Mus musculus 8 weeks Male
... ... ... ... ... ...
GSM2545353 CNS_RNA-seq_3C GSM2545353 Mus musculus 8 weeks Female
GSM2545354 CNS_RNA-seq_4C GSM2545354 Mus musculus 8 weeks Male
GSM2545362 CNS_RNA-seq_5C GSM2545362 Mus musculus 8 weeks Female
GSM2545363 CNS_RNA-seq_6C GSM2545363 Mus musculus 8 weeks Male
GSM2545380 CNS_RNA-seq_9C GSM2545380 Mus musculus 8 weeks Female
infection strain time tissue mouse
<factor> <character> <factor> <factor> <factor>
GSM2545336 InfluenzaA C57BL/6 Day8 Cerebellum 14
GSM2545337 NonInfected C57BL/6 Day0 Cerebellum 9
GSM2545338 NonInfected C57BL/6 Day0 Cerebellum 10
GSM2545339 InfluenzaA C57BL/6 Day4 Cerebellum 15
GSM2545340 InfluenzaA C57BL/6 Day4 Cerebellum 18
... ... ... ... ... ...
GSM2545353 NonInfected C57BL/6 Day0 Cerebellum 4
GSM2545354 NonInfected C57BL/6 Day0 Cerebellum 2
GSM2545362 InfluenzaA C57BL/6 Day4 Cerebellum 20
GSM2545363 InfluenzaA C57BL/6 Day4 Cerebellum 12
GSM2545380 InfluenzaA C57BL/6 Day8 Cerebellum 19
center
<character>
GSM2545336 University of Illinois
GSM2545337 University of Illinois
GSM2545338 University of Illinois
GSM2545339 University of Illinois
GSM2545340 University of Illinois
... ...
GSM2545353 University of Illinois
GSM2545354 University of Illinois
GSM2545362 University of Illinois
GSM2545363 University of Illinois
GSM2545380 University of Illinois
This illustrates that the metadata slots can grow indefinitely without affecting the other structures!
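rowData can be extended in exactly the same way. As a small sketch, we could store the mean count of each gene alongside the existing feature metadata (mean_count is a name we choose here):

```r
## Add a feature-level variable computed from the assay
rowData(se)$mean_count <- rowMeans(assay(se))
head(rowData(se)$mean_count)
```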
tidySummarizedExperiment
You may be wondering whether we can use tidyverse commands to interact with SummarizedExperiment objects. We can, with the tidySummarizedExperiment package.
Remember what our SummarizedExperiment object looks like.
se
class: RangedSummarizedExperiment
dim: 1474 22
metadata(0):
assays(1): counts
rownames(1474): Asl Apod ... Lmx1a Pbx1
rowData names(11): gene ENTREZID ... phenotype_description
hsapiens_homolog_associated_gene_name
colnames(22): GSM2545336 GSM2545337 ... GSM2545363 GSM2545380
colData names(11): title geo_accession ... mouse center
Load tidyverse and tidySummarizedExperiment and then take a look at the se object again.
# BiocManager::install("tidySummarizedExperiment")
library("tidyverse")
library("tidySummarizedExperiment")
se
# A SummarizedExperiment-tibble abstraction: 32,428 × 22
# Features=1474 | Samples=22 | Assays=counts
.feature .sample counts title geo_accession organism age sex infection
<chr> <chr> <int> <chr> <chr> <chr> <chr> <fct> <fct>
1 Asl GSM2545336 1170 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
2 Apod GSM2545336 36194 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
3 Cyp2d22 GSM2545336 4060 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
4 Klk6 GSM2545336 287 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
5 Fcrls GSM2545336 85 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
6 Slc2a4 GSM2545336 782 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
7 Exd2 GSM2545336 1619 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
8 Gjc2 GSM2545336 288 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
9 Plp1 GSM2545336 43217 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
10 Gnb4 GSM2545336 1071 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
# … with 40 more rows, and 21 more variables: strain <chr>, time <fct>,
# tissue <fct>, mouse <fct>, center <chr>, gene <chr>, ENTREZID <chr>,
# product <chr>, gbkey <chr>, external_gene_name <chr>,
# ensembl_gene_id <chr>, external_synonym <chr>, chromosome_name <chr>,
# gene_biotype <chr>, phenotype_description <chr>,
# hsapiens_homolog_associated_gene_name <chr>, seqnames <fct>, start <int>,
# end <int>, width <int>, strand <fct>
It’s still a SummarizedExperiment object, but now we can view it as a tibble. Note that the first line of the output says exactly this: it’s a SummarizedExperiment-tibble abstraction. The second line of the output shows the number of features and samples.
Reverting to standard SummarizedExperiment view
If we want to revert to the standard SummarizedExperiment view we can do that.
options("restore_SummarizedExperiment_show" = TRUE)
se
class: RangedSummarizedExperiment
dim: 1474 22
metadata(0):
assays(1): counts
rownames(1474): Asl Apod ... Lmx1a Pbx1
rowData names(11): gene ENTREZID ... phenotype_description
  hsapiens_homolog_associated_gene_name
colnames(22): GSM2545336 GSM2545337 ... GSM2545363 GSM2545380
colData names(11): title geo_accession ... mouse center
If we want to return to the tidy SummarizedExperiment view we can.
options("restore_SummarizedExperiment_show" = FALSE)
se
# A SummarizedExperiment-tibble abstraction: 32,428 × 22
# Features=1474 | Samples=22 | Assays=counts
   .feature .sample    counts title geo_accession organism age   sex   infection
   <chr>    <chr>       <int> <chr> <chr>         <chr>    <chr> <fct> <fct>
 1 Asl      GSM2545336   1170 CNS_… GSM2545336    Mus mus… 8 we… Fema… Influenz…
 2 Apod     GSM2545336  36194 CNS_… GSM2545336    Mus mus… 8 we… Fema… Influenz…
 3 Cyp2d22  GSM2545336   4060 CNS_… GSM2545336    Mus mus… 8 we… Fema… Influenz…
 4 Klk6     GSM2545336    287 CNS_… GSM2545336    Mus mus… 8 we… Fema… Influenz…
 5 Fcrls    GSM2545336     85 CNS_… GSM2545336    Mus mus… 8 we… Fema… Influenz…
 6 Slc2a4   GSM2545336    782 CNS_… GSM2545336    Mus mus… 8 we… Fema… Influenz…
 7 Exd2     GSM2545336   1619 CNS_… GSM2545336    Mus mus… 8 we… Fema… Influenz…
 8 Gjc2     GSM2545336    288 CNS_… GSM2545336    Mus mus… 8 we… Fema… Influenz…
 9 Plp1     GSM2545336  43217 CNS_… GSM2545336    Mus mus… 8 we… Fema… Influenz…
10 Gnb4     GSM2545336   1071 CNS_… GSM2545336    Mus mus… 8 we… Fema… Influenz…
# … with 40 more rows, and 21 more variables: strain <chr>, time <fct>,
#   tissue <fct>, mouse <fct>, center <chr>, gene <chr>, ENTREZID <chr>,
#   product <chr>, gbkey <chr>, external_gene_name <chr>,
#   ensembl_gene_id <chr>, external_synonym <chr>, chromosome_name <chr>,
#   gene_biotype <chr>, phenotype_description <chr>,
#   hsapiens_homolog_associated_gene_name <chr>, seqnames <fct>, start <int>,
#   end <int>, width <int>, strand <fct>
We can now use tidyverse commands to interact with the SummarizedExperiment object.
We can use filter to filter rows using a condition, e.g. to remove a failed sample.
se %>%
filter(.sample != "GSM2545336")
# A SummarizedExperiment-tibble abstraction: 30,954 × 21
# Features=1474 | Samples=21 | Assays=counts
.feature .sample counts title geo_accession organism age sex infection
<chr> <chr> <int> <chr> <chr> <chr> <chr> <fct> <fct>
1 Asl GSM2545337 361 CNS_… GSM2545337 Mus mus… 8 we… Fema… NonInfec…
2 Apod GSM2545337 10347 CNS_… GSM2545337 Mus mus… 8 we… Fema… NonInfec…
3 Cyp2d22 GSM2545337 1616 CNS_… GSM2545337 Mus mus… 8 we… Fema… NonInfec…
4 Klk6 GSM2545337 629 CNS_… GSM2545337 Mus mus… 8 we… Fema… NonInfec…
5 Fcrls GSM2545337 233 CNS_… GSM2545337 Mus mus… 8 we… Fema… NonInfec…
6 Slc2a4 GSM2545337 231 CNS_… GSM2545337 Mus mus… 8 we… Fema… NonInfec…
7 Exd2 GSM2545337 2288 CNS_… GSM2545337 Mus mus… 8 we… Fema… NonInfec…
8 Gjc2 GSM2545337 595 CNS_… GSM2545337 Mus mus… 8 we… Fema… NonInfec…
9 Plp1 GSM2545337 101241 CNS_… GSM2545337 Mus mus… 8 we… Fema… NonInfec…
10 Gnb4 GSM2545337 1791 CNS_… GSM2545337 Mus mus… 8 we… Fema… NonInfec…
# … with 40 more rows, and 21 more variables: strain <chr>, time <fct>,
# tissue <fct>, mouse <fct>, center <chr>, gene <chr>, ENTREZID <chr>,
# product <chr>, gbkey <chr>, external_gene_name <chr>,
# ensembl_gene_id <chr>, external_synonym <chr>, chromosome_name <chr>,
# gene_biotype <chr>, phenotype_description <chr>,
# hsapiens_homolog_associated_gene_name <chr>, seqnames <fct>, start <int>,
# end <int>, width <int>, strand <fct>
We can use select to specify the columns we want to view.
se %>%
select(.sample)
tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
# A tibble: 32,428 × 1
.sample
<chr>
1 GSM2545336
2 GSM2545336
3 GSM2545336
4 GSM2545336
5 GSM2545336
6 GSM2545336
7 GSM2545336
8 GSM2545336
9 GSM2545336
10 GSM2545336
# … with 32,418 more rows
se %>%
count(.sample)
tidySummarizedExperiment says: A data frame is returned for independent data analysis.
# A tibble: 22 × 2
.sample n
<chr> <int>
1 GSM2545336 1474
2 GSM2545337 1474
3 GSM2545338 1474
4 GSM2545339 1474
5 GSM2545340 1474
6 GSM2545341 1474
7 GSM2545342 1474
8 GSM2545343 1474
9 GSM2545344 1474
10 GSM2545345 1474
# … with 12 more rows
We can use mutate to add metadata info.
se %>%
mutate(center = "University of Melbourne")
# A SummarizedExperiment-tibble abstraction: 32,428 × 22
# Features=1474 | Samples=22 | Assays=counts
.feature .sample counts title geo_accession organism age sex infection
<chr> <chr> <int> <chr> <chr> <chr> <chr> <fct> <fct>
1 Asl GSM2545336 1170 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
2 Apod GSM2545336 36194 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
3 Cyp2d22 GSM2545336 4060 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
4 Klk6 GSM2545336 287 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
5 Fcrls GSM2545336 85 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
6 Slc2a4 GSM2545336 782 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
7 Exd2 GSM2545336 1619 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
8 Gjc2 GSM2545336 288 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
9 Plp1 GSM2545336 43217 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
10 Gnb4 GSM2545336 1071 CNS_… GSM2545336 Mus mus… 8 we… Fema… Influenz…
# … with 40 more rows, and 21 more variables: strain <chr>, time <fct>,
# tissue <fct>, mouse <fct>, center <chr>, gene <chr>, ENTREZID <chr>,
# product <chr>, gbkey <chr>, external_gene_name <chr>,
# ensembl_gene_id <chr>, external_synonym <chr>, chromosome_name <chr>,
# gene_biotype <chr>, phenotype_description <chr>,
# hsapiens_homolog_associated_gene_name <chr>, seqnames <fct>, start <int>,
# end <int>, width <int>, strand <fct>
We can also combine commands with the tidyverse pipe %>%. For example, we could combine group_by and summarise to get the total counts for each sample.
se %>%
group_by(.sample) %>%
summarise(total_counts=sum(counts))
tidySummarizedExperiment says: A data frame is returned for independent data analysis.
# A tibble: 22 × 2
.sample total_counts
<chr> <int>
1 GSM2545336 3039671
2 GSM2545337 2602360
3 GSM2545338 2458618
4 GSM2545339 2500082
5 GSM2545340 2479024
6 GSM2545341 2413723
7 GSM2545342 2349728
8 GSM2545343 3105652
9 GSM2545344 2524137
10 GSM2545345 2506038
# … with 12 more rows
Some other useful tidyverse commands we haven’t seen before are slice, distinct and unite.
We can use slice to choose rows by number, e.g. to choose the first row.
se %>%
slice(1)
# A SummarizedExperiment-tibble abstraction: 1 × 1
# Features=1 | Samples=1 | Assays=counts
.feature .sample counts title geo_accession organism age sex infection
<chr> <chr> <int> <chr> <chr> <chr> <chr> <fct> <fct>
1 Asl GSM2545336 1170 CNS_R… GSM2545336 Mus mus… 8 we… Fema… Influenz…
# … with 21 more variables: strain <chr>, time <fct>, tissue <fct>,
# mouse <fct>, center <chr>, gene <chr>, ENTREZID <chr>, product <chr>,
# gbkey <chr>, external_gene_name <chr>, ensembl_gene_id <chr>,
# external_synonym <chr>, chromosome_name <chr>, gene_biotype <chr>,
# phenotype_description <chr>, hsapiens_homolog_associated_gene_name <chr>,
# seqnames <fct>, start <int>, end <int>, width <int>, strand <fct>
We can use distinct to see what unique information we have.
se %>%
distinct(.sample, infection, sex)
tidySummarizedExperiment says: A data frame is returned for independent data analysis.
# A tibble: 22 × 3
.sample sex infection
<chr> <fct> <fct>
1 GSM2545336 Female InfluenzaA
2 GSM2545337 Female NonInfected
3 GSM2545338 Female NonInfected
4 GSM2545339 Female InfluenzaA
5 GSM2545340 Male InfluenzaA
6 GSM2545341 Male InfluenzaA
7 GSM2545342 Female InfluenzaA
8 GSM2545343 Male NonInfected
9 GSM2545344 Female InfluenzaA
10 GSM2545345 Male InfluenzaA
# … with 12 more rows
We can use unite to combine multiple columns into a single column.
se %>%
unite("group", c(infection, time))
tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
# A SummarizedExperiment-tibble abstraction: 32,428 × 22
# Features=1474 | Samples=22 | Assays=counts
.feature .sample counts title geo_accession organism age sex group strain
<chr> <chr> <int> <chr> <chr> <chr> <chr> <fct> <chr> <chr>
1 Asl GSM254… 1170 CNS_… GSM2545336 Mus mus… 8 we… Fema… Infl… C57BL…
2 Apod GSM254… 36194 CNS_… GSM2545336 Mus mus… 8 we… Fema… Infl… C57BL…
3 Cyp2d22 GSM254… 4060 CNS_… GSM2545336 Mus mus… 8 we… Fema… Infl… C57BL…
4 Klk6 GSM254… 287 CNS_… GSM2545336 Mus mus… 8 we… Fema… Infl… C57BL…
5 Fcrls GSM254… 85 CNS_… GSM2545336 Mus mus… 8 we… Fema… Infl… C57BL…
6 Slc2a4 GSM254… 782 CNS_… GSM2545336 Mus mus… 8 we… Fema… Infl… C57BL…
7 Exd2 GSM254… 1619 CNS_… GSM2545336 Mus mus… 8 we… Fema… Infl… C57BL…
8 Gjc2 GSM254… 288 CNS_… GSM2545336 Mus mus… 8 we… Fema… Infl… C57BL…
9 Plp1 GSM254… 43217 CNS_… GSM2545336 Mus mus… 8 we… Fema… Infl… C57BL…
10 Gnb4 GSM254… 1071 CNS_… GSM2545336 Mus mus… 8 we… Fema… Infl… C57BL…
# … with 40 more rows, and 19 more variables: tissue <fct>, mouse <fct>,
# center <chr>, gene <chr>, ENTREZID <chr>, product <chr>, gbkey <chr>,
# external_gene_name <chr>, ensembl_gene_id <chr>, external_synonym <chr>,
# chromosome_name <chr>, gene_biotype <chr>, phenotype_description <chr>,
# hsapiens_homolog_associated_gene_name <chr>, seqnames <fct>, start <int>,
# end <int>, width <int>, strand <fct>
We can treat se as a normal tibble for plotting. Here we plot the distribution of counts per sample.
se %>%
ggplot(aes(counts + 1, group=.sample, color=infection)) +
geom_density() +
scale_x_log10() +
theme_bw()
To work with genomic coordinates (ranges) with tidy methods, performing subsetting etc, we can use the plyranges package.
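For example, a minimal plyranges sketch (assuming the package is installed) that keeps only the genes on the negative strand could look like:

```r
# BiocManager::install("plyranges")
library("plyranges")

## Filter the ranges of se with a dplyr-style verb from plyranges
rowRanges(se) %>%
    filter(strand == "-")
```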
Challenge
- Extract the miRNA genes from the NonInfected samples, this time using tidyverse commands.
- Extract the gene expression levels of the first 3 genes in samples at time 0 and at time 8, this time using tidyverse commands.
- Create a bar plot of the total counts per sample using the se object. Hint: use geom_col to plot the summarised counts.
Solution
se %>%
    filter(gene_biotype == "miRNA" & infection == "NonInfected")
# A SummarizedExperiment-tibble abstraction: 49 × 7
# Features=7 | Samples=7 | Assays=counts
   .feature .sample    counts title geo_accession organism age   sex   infection
   <chr>    <chr>       <int> <chr> <chr>         <chr>    <chr> <fct> <fct>
 1 Mir1901  GSM2545337     45 CNS_… GSM2545337    Mus mus… 8 we… Fema… NonInfec…
 2 Mir378a  GSM2545337     11 CNS_… GSM2545337    Mus mus… 8 we… Fema… NonInfec…
 3 Mir133b  GSM2545337      4 CNS_… GSM2545337    Mus mus… 8 we… Fema… NonInfec…
 4 Mir30c-2 GSM2545337     10 CNS_… GSM2545337    Mus mus… 8 we… Fema… NonInfec…
 5 Mir149   GSM2545337      1 CNS_… GSM2545337    Mus mus… 8 we… Fema… NonInfec…
 6 Mir128-1 GSM2545337      4 CNS_… GSM2545337    Mus mus… 8 we… Fema… NonInfec…
 7 Mir7682  GSM2545337      2 CNS_… GSM2545337    Mus mus… 8 we… Fema… NonInfec…
 8 Mir1901  GSM2545338     44 CNS_… GSM2545338    Mus mus… 8 we… Fema… NonInfec…
 9 Mir378a  GSM2545338      7 CNS_… GSM2545338    Mus mus… 8 we… Fema… NonInfec…
10 Mir133b  GSM2545338      6 CNS_… GSM2545338    Mus mus… 8 we… Fema… NonInfec…
# … with 39 more rows, and 21 more variables: strain <chr>, time <fct>,
#   tissue <fct>, mouse <fct>, center <chr>, gene <chr>, ENTREZID <chr>,
#   product <chr>, gbkey <chr>, external_gene_name <chr>,
#   ensembl_gene_id <chr>, external_synonym <chr>, chromosome_name <chr>,
#   gene_biotype <chr>, phenotype_description <chr>,
#   hsapiens_homolog_associated_gene_name <chr>, seqnames <fct>, start <int>,
#   end <int>, width <int>, strand <fct>
se %>%
    filter(time == "Day0" | time == "Day8") %>%
    group_by(.sample) %>%
    slice(1:3)
tidySummarizedExperiment says: A data frame is returned for independent data analysis.
# A tibble: 42 × 30
# Groups:   .sample [14]
   .feature .sample    counts title geo_accession organism age   sex   infection
   <chr>    <chr>       <int> <chr> <chr>         <chr>    <chr> <fct> <fct>
 1 Asl      GSM2545336   1170 CNS_… GSM2545336    Mus mus… 8 we… Fema… Influenz…
 2 Apod     GSM2545336  36194 CNS_… GSM2545336    Mus mus… 8 we… Fema… Influenz…
 3 Cyp2d22  GSM2545336   4060 CNS_… GSM2545336    Mus mus… 8 we… Fema… Influenz…
 4 Asl      GSM2545337    361 CNS_… GSM2545337    Mus mus… 8 we… Fema… NonInfec…
 5 Apod     GSM2545337  10347 CNS_… GSM2545337    Mus mus… 8 we… Fema… NonInfec…
 6 Cyp2d22  GSM2545337   1616 CNS_… GSM2545337    Mus mus… 8 we… Fema… NonInfec…
 7 Asl      GSM2545338    400 CNS_… GSM2545338    Mus mus… 8 we… Fema… NonInfec…
 8 Apod     GSM2545338   9173 CNS_… GSM2545338    Mus mus… 8 we… Fema… NonInfec…
 9 Cyp2d22  GSM2545338   1603 CNS_… GSM2545338    Mus mus… 8 we… Fema… NonInfec…
10 Asl      GSM2545341    988 CNS_… GSM2545341    Mus mus… 8 we… Male  Influenz…
# … with 32 more rows, and 21 more variables: strain <chr>, time <fct>,
#   tissue <fct>, mouse <fct>, center <chr>, gene <chr>, ENTREZID <chr>,
#   product <chr>, gbkey <chr>, external_gene_name <chr>,
#   ensembl_gene_id <chr>, external_synonym <chr>, chromosome_name <chr>,
#   gene_biotype <chr>, phenotype_description <chr>,
#   hsapiens_homolog_associated_gene_name <chr>, seqnames <fct>, start <int>,
#   end <int>, width <int>, strand <fct>
se %>%
    group_by(.sample) %>%
    summarise(total_counts = sum(counts)) %>%
    ggplot(aes(x = .sample, y = total_counts)) +
    geom_col() +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))
tidySummarizedExperiment says: A data frame is returned for independent data analysis.
Interactive exploration
iSEE is a Bioconductor package that provides an interactive graphical user interface for exploring data stored in SummarizedExperiment objects.
# We won't use it here, but to install and use it:
# BiocManager::install('iSEE')
# library(iSEE)
# iSEE(se)
Extensions to SummarizedExperiment
- SingleCellExperiment container extends SummarizedExperiment for single-cell data. It follows similar conventions: rows represent features (genes, transcripts, genomic regions) and columns represent cells. It provides methods for storing dimensionality reduction results and data for alternative feature sets (e.g., synthetic spike-in transcripts, antibody-derived tags). It is the central data structure for Bioconductor single-cell packages.
The Orchestrating Single-Cell Analysis with Bioconductor book is a great resource for single cell analysis that describes using SingleCellExperiment.
Just as tidySummarizedExperiment allows interacting with SummarizedExperiment objects using tidyverse commands, the tidySingleCellExperiment package does the same for SingleCellExperiment objects.
There are also extensions to SummarizedExperiment for other data types:
- SpatialExperiment for spatial data
- TreeSummarizedExperiment for microbiome and other data with hierarchical structures
- QFeatures for proteomics and metabolomics mass spectrometry data
- MultiAssayExperiment for multi-omics data
The Bioconductor project was initiated by Robert Gentleman, one of the two creators of the R language. Bioconductor provides tools dedicated to omics data analysis. Bioconductor uses the R statistical programming language, and is open source and open development. ↩
Key Points
SummarizedExperiment represents an efficient way to store and handle omics data.
SummarizedExperiment is used in many Bioconductor packages.
We can use tidySummarizedExperiment to interact with SummarizedExperiment objects using tidyverse commands.
Bioconductor
Overview
Teaching: XX min
Exercises: XX min
Questions
What is Bioconductor?
How can I use Bioconductor effectively for my analysis?
Objectives
Give an overview of the Bioconductor project including its website
Introduce concepts of reproducibility, coherence, interoperability and stability
Bioconductor
In the previous lesson we already learned a little bit about the Bioconductor[^Bioconductor] project. In this class we will formalize our understanding of it.
Important: Remember that Bioconductor packages are installed via the function BiocManager::install().
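As a sketch of this in practice, the snippet below installs BiocManager itself from CRAN once, then uses it for every subsequent package (the package names are just examples):

```r
# Install BiocManager once from CRAN, if it is not already available
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

# Then use BiocManager::install() for all packages,
# whether they live on Bioconductor...
BiocManager::install("SummarizedExperiment")
# ...or on CRAN, which Bioconductor mirrors
BiocManager::install("ggplot2")
```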
Bioconductor is a repository that collects open-source software facilitating rigorous and reproducible analysis of data from current and emerging biological assays in R. In addition, Bioconductor supports development, education and a thriving community. The broad goals of the Bioconductor project are:
- To provide widespread access to a broad range of powerful statistical and graphical methods for the analysis of genomic data.
- To facilitate the inclusion of biological metadata in the analysis of genomic data, e.g. literature data from PubMed, annotation data from Entrez genes.
- To provide a common software platform that enables the rapid development and deployment of extensible, scalable, and interoperable software.
- To further scientific understanding by producing high-quality documentation and reproducible research.
- To train researchers on computational and statistical methods for the analysis of genomic data.
One of the best ways to explore the Bioconductor project is its website.
The Bioconductor website
The website tells us that there are over 2,000 packages. This is obviously way too many packages to explore individually. So how would you find potentially useful packages for the analysis of your dataset?
Exploring different topics with biocViews
In Bioconductor, each package is classified as belonging to different categories. These different categories are called biocViews and they are structured as follows:
- Software: Packages that provide functions for statistical or graphical methods.
- AnnotationData: Packages that store annotations (e.g. genome annotation) and their respective access functions.
- ExperimentData: Packages that store example datasets.
- Workflow: Packages that assemble html tutorials using multiple packages for an analysis.
Each category is further divided into additional sub-categories, which are divided into sub-sub-categories that refer to specific assays, techniques, and research fields.
BiocViews can help tremendously with identifying packages that could be of use to you in the analysis of your dataset. These are easily searchable once you have navigated to the page that lists all packages.
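If you prefer to search from within R, BiocManager also offers a simple pattern-based lookup (the pattern below is just an example):

```r
# List packages in the current Bioconductor release whose
# name matches a regular expression
BiocManager::available("^TxDb.Hsapiens")
```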
Challenge
In your groups, identify a research topic of interest and then use the search function to find packages related to this topic. Think about the following:
- What keywords define your topic of interest?
- What type of packages are good for beginners and why?
- What other information on this website may be useful?
Exploiting the detailed documentation of the Bioconductor project
The other part of the Bioconductor website crucial for anyone’s learning journey is the help page. This page collects some outstanding Bioconductor learning resources such as
- Comprehensive books introducing coverage of a research field
- Courses and conference materials
- Videos
- Community resources and tutorials.
- Support site
Most importantly it introduces the concept of vignettes, which are part of Bioconductor’s mission to enhance reproducibility through rigorous documentation. In Bioconductor almost every package (certainly every software package) has to include a vignette. Vignettes are small tutorials that explain common use cases of a package. For example, let’s explore the vignettes available for SummarizedExperiment:
browseVignettes(package = "SummarizedExperiment")
This should open a separate browser tab listing the available vignettes. By clicking on the html link you will open a nicely formatted tutorial that you should easily be able to follow. Vignettes are a great place to start when trying to get familiar with a new package.
Core Bioconductor principles
Bioconductor is organized around some core principles:
- interoperability with existing infrastructure to facilitate reuse and avoid replication
- coherence in coding, documentation and use of existing infrastructure
- rigorous documentation
- reproducibility
- stability ensuring that there are limited clashes due to versions
While it is not absolutely necessary for the end-user to remember these, the core principles explain some of the idiosyncrasies of Bioconductor that you may come across.
S4 classes
Challenge
Use the function str on the SummarizedExperiment object you worked with during the last lesson. What oddity do you notice? Hint: compare the output to the output of the str function applied to rna.
Solution
We can see certain elements of the object starting with an @.
S4 programming enables object-oriented programming in R and thus ensures coherence, stability, and interoperability. Object-oriented programming is a programming model that organizes software design around data, or objects, rather than functions and logic. An object can be defined as a data field that has unique attributes and behavior. In S4, classes are defined with specific accessor functions (called methods).
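To make the idea concrete, here is a minimal, hypothetical S4 class (not part of Bioconductor) with one slot and one accessor method:

```r
# A toy S4 class with a single slot holding a count matrix
setClass("MiniAssay", slots = c(counts = "matrix"))

# An accessor generic and method: the recommended way to reach a slot
setGeneric("assayCounts", function(object) standardGeneric("assayCounts"))
setMethod("assayCounts", "MiniAssay", function(object) object@counts)

m <- new("MiniAssay", counts = matrix(1:4, nrow = 2))
assayCounts(m)  # preferred: uses the accessor method
m@counts        # works, but relies on the internal class structure
```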
# These two statements will result in the same output.
se@colData
DataFrame with 22 rows and 10 columns
title geo_accession organism age sex
<character> <character> <character> <character> <factor>
GSM2545336 CNS_RNA-seq_10C GSM2545336 Mus musculus 8 weeks Female
GSM2545337 CNS_RNA-seq_11C GSM2545337 Mus musculus 8 weeks Female
GSM2545338 CNS_RNA-seq_12C GSM2545338 Mus musculus 8 weeks Female
GSM2545339 CNS_RNA-seq_13C GSM2545339 Mus musculus 8 weeks Female
GSM2545340 CNS_RNA-seq_14C GSM2545340 Mus musculus 8 weeks Male
... ... ... ... ... ...
GSM2545353 CNS_RNA-seq_3C GSM2545353 Mus musculus 8 weeks Female
GSM2545354 CNS_RNA-seq_4C GSM2545354 Mus musculus 8 weeks Male
GSM2545362 CNS_RNA-seq_5C GSM2545362 Mus musculus 8 weeks Female
GSM2545363 CNS_RNA-seq_6C GSM2545363 Mus musculus 8 weeks Male
GSM2545380 CNS_RNA-seq_9C GSM2545380 Mus musculus 8 weeks Female
infection strain time tissue mouse
<factor> <character> <factor> <factor> <factor>
GSM2545336 InfluenzaA C57BL/6 Day8 Cerebellum 14
GSM2545337 NonInfected C57BL/6 Day0 Cerebellum 9
GSM2545338 NonInfected C57BL/6 Day0 Cerebellum 10
GSM2545339 InfluenzaA C57BL/6 Day4 Cerebellum 15
GSM2545340 InfluenzaA C57BL/6 Day4 Cerebellum 18
... ... ... ... ... ...
GSM2545353 NonInfected C57BL/6 Day0 Cerebellum 4
GSM2545354 NonInfected C57BL/6 Day0 Cerebellum 2
GSM2545362 InfluenzaA C57BL/6 Day4 Cerebellum 20
GSM2545363 InfluenzaA C57BL/6 Day4 Cerebellum 12
GSM2545380 InfluenzaA C57BL/6 Day8 Cerebellum 19
colData(se) #colData is a method that allows access to the column meta data
DataFrame with 22 rows and 10 columns
title geo_accession organism age sex
<character> <character> <character> <character> <factor>
GSM2545336 CNS_RNA-seq_10C GSM2545336 Mus musculus 8 weeks Female
GSM2545337 CNS_RNA-seq_11C GSM2545337 Mus musculus 8 weeks Female
GSM2545338 CNS_RNA-seq_12C GSM2545338 Mus musculus 8 weeks Female
GSM2545339 CNS_RNA-seq_13C GSM2545339 Mus musculus 8 weeks Female
GSM2545340 CNS_RNA-seq_14C GSM2545340 Mus musculus 8 weeks Male
... ... ... ... ... ...
GSM2545353 CNS_RNA-seq_3C GSM2545353 Mus musculus 8 weeks Female
GSM2545354 CNS_RNA-seq_4C GSM2545354 Mus musculus 8 weeks Male
GSM2545362 CNS_RNA-seq_5C GSM2545362 Mus musculus 8 weeks Female
GSM2545363 CNS_RNA-seq_6C GSM2545363 Mus musculus 8 weeks Male
GSM2545380 CNS_RNA-seq_9C GSM2545380 Mus musculus 8 weeks Female
infection strain time tissue mouse
<factor> <character> <factor> <factor> <factor>
GSM2545336 InfluenzaA C57BL/6 Day8 Cerebellum 14
GSM2545337 NonInfected C57BL/6 Day0 Cerebellum 9
GSM2545338 NonInfected C57BL/6 Day0 Cerebellum 10
GSM2545339 InfluenzaA C57BL/6 Day4 Cerebellum 15
GSM2545340 InfluenzaA C57BL/6 Day4 Cerebellum 18
... ... ... ... ... ...
GSM2545353 NonInfected C57BL/6 Day0 Cerebellum 4
GSM2545354 NonInfected C57BL/6 Day0 Cerebellum 2
GSM2545362 InfluenzaA C57BL/6 Day4 Cerebellum 20
GSM2545363 InfluenzaA C57BL/6 Day4 Cerebellum 12
GSM2545380 InfluenzaA C57BL/6 Day8 Cerebellum 19
Challenge
Why is it bad practice to use the @ to access parts of an object?
Solution
Using accessor methods to reach certain parts of an object enhances the readability of your code. Moreover, if the underlying class structure changes (e.g. the name of a slot), the method will still work.
If you want to know more about S4 classes, you can find more information in these slides on their implementation in the Bioconductor project.
The release cycle
Bioconductor has two releases each year, typically in April and October, in which all packages are updated to their next version in a way that keeps them interoperable (i.e. they do not clash when you load more than one). The releases coincide with the releases of new R versions, which also happen twice a year. This has two significant implications:
1) To ensure that packages on Bioconductor work flawlessly, always use BiocManager::install to install a package, even when it is technically not a Bioconductor package. Bioconductor mirrors most other R package repositories such as CRAN, so the package will most likely be available. This avoids clashes with packages being ahead of the Bioconductor release.
2) You will need to update your Bioconductor packages twice a year (after updating R) to have all the latest versions. Refer to this guide for updating R and RStudio
and this guide for updating Bioconductor.
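Two BiocManager helpers make this easy to check; a quick sketch:

```r
# Which Bioconductor release is this R installation using?
BiocManager::version()

# Are all installed packages consistent with that release?
# (flags packages that are out-of-date or too new)
BiocManager::valid()
```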
Working with annotations
Bioconductor provides extensive annotation resources. These can be gene centric, or genome centric. Annotations can be provided in packages curated by Bioconductor, or obtained from web-based resources.
Gene centric AnnotationDbi packages include:
- Organism level: e.g. org.Mm.eg.db.
- Platform level: e.g. hgu133plus2.db, hgu133plus2.probes, hgu133plus2.cdf.
- System-biology level: e.g. GO.db

Genome centric GenomicFeatures packages include:

- Transcriptome level: e.g. TxDb.Hsapiens.UCSC.hg19.knownGene, EnsDb.Hsapiens.v75.
- Generic genome features: can be generated via GenomicFeatures.

One web-based resource accesses biomart, via the biomaRt package:

- Query the web-based ‘biomart’ resource for genes, sequences, SNPs, etc.
The most popular annotation packages have been modified so that they can make use of a new set of methods to more easily access their contents. These four methods are named columns, keytypes, keys and select. They can currently be used with all chip, organism, and TxDb packages, along with the popular GO.db package.
An extremely common kind of annotation package is the so-called platform-based or chip-based package type. A package of this type maps the manufacturer’s labels for a series of probes or probesets to a wide range of gene-based features, and loads a ChipDb object. Below is a set of examples showing how you might use the four standard methods to interact with an object of this type.
First we need to load the package:
suppressPackageStartupMessages({
library(hgu95av2.db)
})
If we list the contents of this package, we can see that one of the many things loaded is an object named after the package “hgu95av2.db”:
ls("package:hgu95av2.db")
[1] "hgu95av2" "hgu95av2_dbconn" "hgu95av2_dbfile"
[4] "hgu95av2_dbInfo" "hgu95av2_dbschema" "hgu95av2.db"
[7] "hgu95av2ACCNUM" "hgu95av2ALIAS2PROBE" "hgu95av2CHR"
[10] "hgu95av2CHRLENGTHS" "hgu95av2CHRLOC" "hgu95av2CHRLOCEND"
[13] "hgu95av2ENSEMBL" "hgu95av2ENSEMBL2PROBE" "hgu95av2ENTREZID"
[16] "hgu95av2ENZYME" "hgu95av2ENZYME2PROBE" "hgu95av2GENENAME"
[19] "hgu95av2GO" "hgu95av2GO2ALLPROBES" "hgu95av2GO2PROBE"
[22] "hgu95av2MAP" "hgu95av2MAPCOUNTS" "hgu95av2OMIM"
[25] "hgu95av2ORGANISM" "hgu95av2ORGPKG" "hgu95av2PATH"
[28] "hgu95av2PATH2PROBE" "hgu95av2PFAM" "hgu95av2PMID"
[31] "hgu95av2PMID2PROBE" "hgu95av2PROSITE" "hgu95av2REFSEQ"
[34] "hgu95av2SYMBOL" "hgu95av2UNIPROT"
We can look at this object to learn more about it:
hgu95av2.db
ChipDb object:
| DBSCHEMAVERSION: 2.1
| Db type: ChipDb
| Supporting package: AnnotationDbi
| DBSCHEMA: HUMANCHIP_DB
| ORGANISM: Homo sapiens
| SPECIES: Human
| MANUFACTURER: Affymetrix
| CHIPNAME: Affymetrix HG_U95Av2 Array
| MANUFACTURERURL: http://www.affymetrix.com
| EGSOURCEDATE: 2021-Apr14
| EGSOURCENAME: Entrez Gene
| EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
| CENTRALID: ENTREZID
| TAXID: 9606
| GOSOURCENAME: Gene Ontology
| GOSOURCEURL: http://current.geneontology.org/ontology/go-basic.obo
| GOSOURCEDATE: 2021-02-01
| GOEGSOURCEDATE: 2021-Apr14
| GOEGSOURCENAME: Entrez Gene
| GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
| KEGGSOURCENAME: KEGG GENOME
| KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes
| KEGGSOURCEDATE: 2011-Mar15
| GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens)
| GPSOURCEURL:
| GPSOURCEDATE: 2021-Feb16
| ENSOURCEDATE: 2021-Feb16
| ENSOURCENAME: Ensembl
| ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
| UPSOURCENAME: Uniprot
| UPSOURCEURL: http://www.UniProt.org/
| UPSOURCEDATE: Mon Apr 26 21:53:12 2021
Please see: help('select') for usage information
If we want to know what kinds of data are retrievable via select, we should use the columns method like this:
columns(hgu95av2.db)
[1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
[6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
[11] "GENETYPE" "GO" "GOALL" "IPI" "MAP"
[16] "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM"
[21] "PMID" "PROBEID" "PROSITE" "REFSEQ" "SYMBOL"
[26] "UCSCKG" "UNIPROT"
If we are further curious to know more about those values for columns, we can consult the help pages. Asking about any of these values will pull up a manual page describing the different fields and what they mean.
help("SYMBOL")
If we are curious about what kinds of fields we could potentially use as keys to query the database, we can use the keytypes method. In a perfect world, this method would return values very similar to those returned by columns, but in reality some kinds of values make poor keys, so this list is often shorter.
keytypes(hgu95av2.db)
[1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
[6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
[11] "GENETYPE" "GO" "GOALL" "IPI" "MAP"
[16] "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM"
[21] "PMID" "PROBEID" "PROSITE" "REFSEQ" "SYMBOL"
[26] "UCSCKG" "UNIPROT"
If we want to extract some sample keys of a particular type, we can use the keys method.
head(keys(hgu95av2.db, keytype="SYMBOL"))
[1] "A1BG" "A2M" "A2MP1" "NAT1" "NAT2" "NATP"
And finally, if we have some keys, we can use select to extract the corresponding annotations. By using appropriate argument values with select we can specify which keys we want to look up values for (keys), what we want returned (columns) and the type of keys that we are passing in (keytype).
#1st get some example keys
k <- head(keys(hgu95av2.db,keytype="PROBEID"))
# then call select
select(hgu95av2.db, keys=k, columns=c("SYMBOL","GENENAME"), keytype="PROBEID")
'select()' returned 1:1 mapping between keys and columns
PROBEID SYMBOL
1 1000_at MAPK3
2 1001_at TIE1
3 1002_f_at CYP2C19
4 1003_s_at CXCR5
5 1004_at CXCR5
6 1005_at DUSP1
GENENAME
1 mitogen-activated protein kinase 3
2 tyrosine kinase with immunoglobulin like and EGF like domains 1
3 cytochrome P450 family 2 subfamily C member 19
4 C-X-C motif chemokine receptor 5
5 C-X-C motif chemokine receptor 5
6 dual specificity phosphatase 1
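When you only need a single column back, the related mapIds method (from the same AnnotationDbi interface) is a convenient shortcut; a sketch reusing the keys from above:

```r
# mapIds() returns a named vector rather than a data frame;
# 'k' is the vector of probe ids selected earlier
mapIds(hgu95av2.db, keys = k, column = "SYMBOL", keytype = "PROBEID")
```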
An organism level package (an ‘org’ package) uses a central gene identifier (e.g. Entrez Gene id) and contains mappings between this identifier and other kinds of identifiers (e.g. GenBank or Uniprot accession numbers, RefSeq ids, etc.). The name of an org package is always of the form org.<Ab>.<id>.db (e.g. org.Hs.eg.db).
Challenge
Display the OrgDb object for the org.Hs.eg.db package. Use the columns method to discover which sorts of annotations can be extracted from it. Finally, use the keys method to extract UNIPROT identifiers and then pass those keys to the select method in such a way that you extract the gene symbol and KEGG pathway information for each. Use the help system as needed to learn which values to pass to columns in order to achieve this.
Solution
library(org.Hs.eg.db)
uniKeys <- head(keys(org.Hs.eg.db, keytype="UNIPROT"))
cols <- c("SYMBOL", "PATH")
select(org.Hs.eg.db, keys=uniKeys, columns=cols, keytype="UNIPROT")
A TxDb package connects a set of genomic coordinates to various transcript-oriented features. The package can also contain identifiers for features such as genes and transcripts, and the internal schema describes the relationships between these different elements. All TxDb-containing packages follow a specific naming scheme that tells where the data came from as well as which build of the genome it uses.
Challenge
Display the TxDb object for the TxDb.Hsapiens.UCSC.hg19.knownGene package. As before, use the columns and keytypes methods to discover which sorts of annotations can be extracted from it. Use the keys method to extract just a few gene identifiers and then pass those keys to the select method in such a way that you extract the transcript ids and transcript starts for each.
Solution
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
keys <- head(keys(txdb, keytype="GENEID"))
cols <- c("TXID", "TXSTART")
select(txdb, keys=keys, columns=cols, keytype="GENEID")
Key Points
Bioconductor collates interoperable packages for analyses of biological data.
Using biocViews can help you to find packages for a specific topic.