Clear the global environment
rm(list = ls())
ls()
## character(0)
Check the working directory
getwd()
Set your working directory
setwd("/Users/allison/Desktop/R_workshop")
Read the data in from a csv file
Before generating and saving a file for use in R, check that you’ve prepared the data (ie no spaces in column headers)
count_table <- read.csv("GSE62902.csv", row.names = 1)
Does count_table have the column headers that we would expect?
Note that the first line includes sample dosages. This is followed by gene count data.
head(count_table)
## GSM1535917 GSM1535918 GSM1535919 GSM1535920 GSM1535921
## CLASS:DOSE 0 0 0 0 0
## ENSMUSG00000000001 2289 2171 2124 2538 2397
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 21 10 22 23 14
## ENSMUSG00000000031 7 8 5 8 7
## ENSMUSG00000000037 2 5 4 0 1
## GSM1535922 GSM1535923 GSM1535924 GSM1535925 GSM1535926
## CLASS:DOSE 0.01 0.01 0.01 0.01 0.01
## ENSMUSG00000000001 4707.00 2797.00 3235.00 1955.00 5983.00
## ENSMUSG00000000003 0.00 0.00 0.00 0.00 0.00
## ENSMUSG00000000028 39.00 10.00 21.00 13.00 115.00
## ENSMUSG00000000031 18.00 3.00 102.00 8.00 51.00
## ENSMUSG00000000037 7.00 0.00 7.00 3.00 4.00
## GSM1535927 GSM1535928 GSM1535929 GSM1535930 GSM1535931
## CLASS:DOSE 0.03 0.03 0.03 0.03 0.03
## ENSMUSG00000000001 1918.00 3639.00 1932.00 2108.00 1988.00
## ENSMUSG00000000003 0.00 0.00 0.00 0.00 0.00
## ENSMUSG00000000028 10.00 38.00 10.00 25.00 15.00
## ENSMUSG00000000031 8.00 7.00 5.00 6.00 5.00
## ENSMUSG00000000037 0.00 5.00 1.00 4.00 5.00
## GSM1535932 GSM1535933 GSM1535934 GSM1535935 GSM1535936
## CLASS:DOSE 0.1 0.1 0.1 0.1 0.1
## ENSMUSG00000000001 1792.0 2553.0 2318.0 3179.0 6153.0
## ENSMUSG00000000003 0.0 0.0 0.0 0.0 0.0
## ENSMUSG00000000028 13.0 11.0 9.0 23.0 68.0
## ENSMUSG00000000031 5.0 5.0 29.0 12.0 30.0
## ENSMUSG00000000037 0.0 2.0 1.0 4.0 7.0
## GSM1535937 GSM1535938 GSM1535939 GSM1535940 GSM1535941
## CLASS:DOSE 0.3 0.3 0.3 0.3 0.3
## ENSMUSG00000000001 1899.0 3351.0 3552.0 2201.0 2607.0
## ENSMUSG00000000003 0.0 0.0 0.0 0.0 0.0
## ENSMUSG00000000028 17.0 17.0 15.0 10.0 24.0
## ENSMUSG00000000031 3.0 5.0 28.0 4.0 9.0
## ENSMUSG00000000037 0.0 1.0 1.0 0.0 3.0
## GSM1535942 GSM1535943 GSM1535944 GSM1535945 GSM1535946
## CLASS:DOSE 1 1 1 1 1
## ENSMUSG00000000001 1867 5080 1886 2232 2425
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 13 38 13 15 18
## ENSMUSG00000000031 4 15 5 12 6
## ENSMUSG00000000037 0 9 2 5 2
## GSM1535947 GSM1535948 GSM1535949 GSM1535950 GSM1535951
## CLASS:DOSE 3 3 3 3 3
## ENSMUSG00000000001 2054 7355 4184 2303 4723
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 17 66 22 21 44
## ENSMUSG00000000031 9 8 8 5 15
## ENSMUSG00000000037 4 0 4 0 2
## GSM1535952 GSM1535953 GSM1535954 GSM1535955 GSM1535956
## CLASS:DOSE 10 10 10 10 10
## ENSMUSG00000000001 4957 2775 2330 3296 4234
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 41 24 13 34 49
## ENSMUSG00000000031 2 2 4 19 17
## ENSMUSG00000000037 4 1 0 4 1
## GSM1535957 GSM1535958 GSM1535959 GSM1535960 GSM1535961
## CLASS:DOSE 30 30 30 30 30
## ENSMUSG00000000001 3892 2492 3946 2998 1647
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 28 26 42 27 27
## ENSMUSG00000000031 3 5 7 4 4
## ENSMUSG00000000037 1 6 7 0 3
Does count_table have the dimensions that we would expect?
Dimensions: number of observations (rows) and number of variables (columns)
dim(count_table)
## [1] 39185 45
Check the end of the file
Note that alignment data is displayed in the last 5 lines
tail(count_table)
## GSM1535917 GSM1535918 GSM1535919 GSM1535920 GSM1535921
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 1881529 2139904 2267213 2133773 3056039
## ambiguous 268077 278079 300837 315918 336902
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 14393970 15681645 19793013 13586315 24741421
## GSM1535922 GSM1535923 GSM1535924 GSM1535925 GSM1535926
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 3826438 2211512 2882115 1909008 6072296
## ambiguous 620854 310604 388395 298379 1085602
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 30687082 17719778 15677024 17128247 46683508
## GSM1535927 GSM1535928 GSM1535929 GSM1535930 GSM1535931
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 1990442 3347184 1670467 1983181 1850848
## ambiguous 282453 410756 242448 277864 221780
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 14290771 29441212 11623407 13246953 15561240
## GSM1535932 GSM1535933 GSM1535934 GSM1535935 GSM1535936
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 1655690 2198800 2554621 4414797 7281279
## ambiguous 201846 292347 309782 346757 1035381
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 14108358 20777353 22232776 24170388 59345763
## GSM1535937 GSM1535938 GSM1535939 GSM1535940 GSM1535941
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 1705077 2845113 3524726 1833957 2289010
## ambiguous 224102 453790 510255 228289 333476
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 18356821 23115670 26822489 17220541 18679634
## GSM1535942 GSM1535943 GSM1535944 GSM1535945 GSM1535946
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 1713330 4488003 1955263 2038621 2274344
## ambiguous 241165 711036 256837 286070 335366
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 13771007 44241402 16048447 15476269 19474907
## GSM1535947 GSM1535948 GSM1535949 GSM1535950 GSM1535951
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 1750551 4843348 3980377 2081716 4132162
## ambiguous 272524 844200 506803 282948 594368
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 13801317 45983595 30603154 15495996 32696230
## GSM1535952 GSM1535953 GSM1535954 GSM1535955 GSM1535956
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 3619972 2574902 2104853 2717125 3386082
## ambiguous 616838 359478 312609 472196 619067
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 26251445 16111737 14519451 18152232 19240502
## GSM1535957 GSM1535958 GSM1535959 GSM1535960 GSM1535961
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 2674043 1984283 3275230 2458641 1227618
## ambiguous 434983 346925 607267 381430 274526
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 12978842 12023315 20324243 16801245 8231611
Does count_table contain a data type that we would expect?
Get information about the structure of the data
class(count_table)
## [1] "data.frame"
str(count_table)
## 'data.frame': 39185 obs. of 45 variables:
## $ GSM1535917: int 0 2289 0 21 7 2 30854 1005 88 244 ...
## $ GSM1535918: int 0 2171 0 10 8 5 34567 1183 77 218 ...
## $ GSM1535919: int 0 2124 0 22 5 4 38467 1043 87 171 ...
## $ GSM1535920: int 0 2538 0 23 8 0 34859 1417 75 171 ...
## $ GSM1535921: int 0 2397 0 14 7 1 46227 1004 91 232 ...
## $ GSM1535922: num 0.01 4707 0 39 18 ...
## $ GSM1535923: num 0.01 2797 0 10 3 ...
## $ GSM1535924: num 0.01 3235 0 21 102 ...
## $ GSM1535925: num 0.01 1955 0 13 8 ...
## $ GSM1535926: num 0.01 5983 0 115 51 ...
## $ GSM1535927: num 0.03 1918 0 10 8 ...
## $ GSM1535928: num 0.03 3639 0 38 7 ...
## $ GSM1535929: num 0.03 1932 0 10 5 ...
## $ GSM1535930: num 0.03 2108 0 25 6 ...
## $ GSM1535931: num 0.03 1988 0 15 5 ...
## $ GSM1535932: num 0.1 1792 0 13 5 ...
## $ GSM1535933: num 0.1 2553 0 11 5 ...
## $ GSM1535934: num 0.1 2318 0 9 29 ...
## $ GSM1535935: num 0.1 3179 0 23 12 ...
## $ GSM1535936: num 0.1 6153 0 68 30 ...
## $ GSM1535937: num 0.3 1899 0 17 3 ...
## $ GSM1535938: num 0.3 3351 0 17 5 ...
## $ GSM1535939: num 0.3 3552 0 15 28 ...
## $ GSM1535940: num 0.3 2201 0 10 4 ...
## $ GSM1535941: num 0.3 2607 0 24 9 ...
## $ GSM1535942: int 1 1867 0 13 4 0 25177 1131 68 188 ...
## $ GSM1535943: int 1 5080 0 38 15 9 65724 1926 182 783 ...
## $ GSM1535944: int 1 1886 0 13 5 2 25666 1163 73 271 ...
## $ GSM1535945: int 1 2232 0 15 12 5 31012 1095 77 290 ...
## $ GSM1535946: int 1 2425 0 18 6 2 35724 1131 129 278 ...
## $ GSM1535947: int 3 2054 0 17 9 4 25174 999 136 367 ...
## $ GSM1535948: int 3 7355 0 66 8 0 92435 3405 300 726 ...
## $ GSM1535949: int 3 4184 0 22 8 4 51213 1437 288 382 ...
## $ GSM1535950: int 3 2303 0 21 5 0 29885 1129 129 303 ...
## $ GSM1535951: int 3 4723 0 44 15 2 51362 1991 283 478 ...
## $ GSM1535952: int 10 4957 0 41 2 4 52105 2601 225 461 ...
## $ GSM1535953: int 10 2775 0 24 2 1 38548 955 166 329 ...
## $ GSM1535954: int 10 2330 0 13 4 0 31631 1120 114 283 ...
## $ GSM1535955: int 10 3296 0 34 19 4 39873 1792 201 364 ...
## $ GSM1535956: int 10 4234 0 49 17 1 51206 2581 290 426 ...
## $ GSM1535957: int 30 3892 0 28 3 1 29703 1377 234 421 ...
## $ GSM1535958: int 30 2492 0 26 5 6 20184 1258 193 352 ...
## $ GSM1535959: int 30 3946 0 42 7 7 38755 1672 334 422 ...
## $ GSM1535960: int 30 2998 0 27 4 0 33372 1092 128 350 ...
## $ GSM1535961: int 30 1647 0 27 4 3 17373 679 97 189 ...
Does the data frame contain any NA values?
anyNA(count_table)
## [1] FALSE
Find the number of NA values in the data frame
sum(is.na(count_table))
## [1] 0
List rows that have NA
count_table[!complete.cases(count_table), ]
## [1] GSM1535917 GSM1535918 GSM1535919 GSM1535920 GSM1535921 GSM1535922
## [7] GSM1535923 GSM1535924 GSM1535925 GSM1535926 GSM1535927 GSM1535928
## [13] GSM1535929 GSM1535930 GSM1535931 GSM1535932 GSM1535933 GSM1535934
## [19] GSM1535935 GSM1535936 GSM1535937 GSM1535938 GSM1535939 GSM1535940
## [25] GSM1535941 GSM1535942 GSM1535943 GSM1535944 GSM1535945 GSM1535946
## [31] GSM1535947 GSM1535948 GSM1535949 GSM1535950 GSM1535951 GSM1535952
## [37] GSM1535953 GSM1535954 GSM1535955 GSM1535956 GSM1535957 GSM1535958
## [43] GSM1535959 GSM1535960 GSM1535961
## <0 rows> (or 0-length row.names)
One can look at specific elements within a data frame
For example, what is the count for the gene in row 2, column 1
head(count_table, n = 3)
## GSM1535917 GSM1535918 GSM1535919 GSM1535920 GSM1535921
## CLASS:DOSE 0 0 0 0 0
## ENSMUSG00000000001 2289 2171 2124 2538 2397
## ENSMUSG00000000003 0 0 0 0 0
## GSM1535922 GSM1535923 GSM1535924 GSM1535925 GSM1535926
## CLASS:DOSE 0.01 0.01 0.01 0.01 0.01
## ENSMUSG00000000001 4707.00 2797.00 3235.00 1955.00 5983.00
## ENSMUSG00000000003 0.00 0.00 0.00 0.00 0.00
## GSM1535927 GSM1535928 GSM1535929 GSM1535930 GSM1535931
## CLASS:DOSE 0.03 0.03 0.03 0.03 0.03
## ENSMUSG00000000001 1918.00 3639.00 1932.00 2108.00 1988.00
## ENSMUSG00000000003 0.00 0.00 0.00 0.00 0.00
## GSM1535932 GSM1535933 GSM1535934 GSM1535935 GSM1535936
## CLASS:DOSE 0.1 0.1 0.1 0.1 0.1
## ENSMUSG00000000001 1792.0 2553.0 2318.0 3179.0 6153.0
## ENSMUSG00000000003 0.0 0.0 0.0 0.0 0.0
## GSM1535937 GSM1535938 GSM1535939 GSM1535940 GSM1535941
## CLASS:DOSE 0.3 0.3 0.3 0.3 0.3
## ENSMUSG00000000001 1899.0 3351.0 3552.0 2201.0 2607.0
## ENSMUSG00000000003 0.0 0.0 0.0 0.0 0.0
## GSM1535942 GSM1535943 GSM1535944 GSM1535945 GSM1535946
## CLASS:DOSE 1 1 1 1 1
## ENSMUSG00000000001 1867 5080 1886 2232 2425
## ENSMUSG00000000003 0 0 0 0 0
## GSM1535947 GSM1535948 GSM1535949 GSM1535950 GSM1535951
## CLASS:DOSE 3 3 3 3 3
## ENSMUSG00000000001 2054 7355 4184 2303 4723
## ENSMUSG00000000003 0 0 0 0 0
## GSM1535952 GSM1535953 GSM1535954 GSM1535955 GSM1535956
## CLASS:DOSE 10 10 10 10 10
## ENSMUSG00000000001 4957 2775 2330 3296 4234
## ENSMUSG00000000003 0 0 0 0 0
## GSM1535957 GSM1535958 GSM1535959 GSM1535960 GSM1535961
## CLASS:DOSE 30 30 30 30 30
## ENSMUSG00000000001 3892 2492 3946 2998 1647
## ENSMUSG00000000003 0 0 0 0 0
count_table[2, 1]
## [1] 2289
For rows 1:2
count_table[1:2, ]
## GSM1535917 GSM1535918 GSM1535919 GSM1535920 GSM1535921
## CLASS:DOSE 0 0 0 0 0
## ENSMUSG00000000001 2289 2171 2124 2538 2397
## GSM1535922 GSM1535923 GSM1535924 GSM1535925 GSM1535926
## CLASS:DOSE 0.01 0.01 0.01 0.01 0.01
## ENSMUSG00000000001 4707.00 2797.00 3235.00 1955.00 5983.00
## GSM1535927 GSM1535928 GSM1535929 GSM1535930 GSM1535931
## CLASS:DOSE 0.03 0.03 0.03 0.03 0.03
## ENSMUSG00000000001 1918.00 3639.00 1932.00 2108.00 1988.00
## GSM1535932 GSM1535933 GSM1535934 GSM1535935 GSM1535936
## CLASS:DOSE 0.1 0.1 0.1 0.1 0.1
## ENSMUSG00000000001 1792.0 2553.0 2318.0 3179.0 6153.0
## GSM1535937 GSM1535938 GSM1535939 GSM1535940 GSM1535941
## CLASS:DOSE 0.3 0.3 0.3 0.3 0.3
## ENSMUSG00000000001 1899.0 3351.0 3552.0 2201.0 2607.0
## GSM1535942 GSM1535943 GSM1535944 GSM1535945 GSM1535946
## CLASS:DOSE 1 1 1 1 1
## ENSMUSG00000000001 1867 5080 1886 2232 2425
## GSM1535947 GSM1535948 GSM1535949 GSM1535950 GSM1535951
## CLASS:DOSE 3 3 3 3 3
## ENSMUSG00000000001 2054 7355 4184 2303 4723
## GSM1535952 GSM1535953 GSM1535954 GSM1535955 GSM1535956
## CLASS:DOSE 10 10 10 10 10
## ENSMUSG00000000001 4957 2775 2330 3296 4234
## GSM1535957 GSM1535958 GSM1535959 GSM1535960 GSM1535961
## CLASS:DOSE 30 30 30 30 30
## ENSMUSG00000000001 3892 2492 3946 2998 1647
ex1 <- count_table[, c(1, 3)]
names(ex1)
## [1] "GSM1535917" "GSM1535919"
ex2 <- count_table[c(1, 3)]
names(ex2)
## [1] "GSM1535917" "GSM1535919"
ex3 <- count_table[c("GSM1535917", "GSM1535919")]
names(ex3)
## [1] "GSM1535917" "GSM1535919"
Column 1, v1
ex4 <- count_table[1]
head(ex4)
## GSM1535917
## CLASS:DOSE 0
## ENSMUSG00000000001 2289
## ENSMUSG00000000003 0
## ENSMUSG00000000028 21
## ENSMUSG00000000031 7
## ENSMUSG00000000037 2
Column 1, v2
ex5 <- count_table[, 1]
head(ex5)
## [1] 0 2289 0 21 7 2
Column 1, v2. Add argument drop=FALSE
ex6 <- count_table[, 1, drop = FALSE]
head(ex6)
## GSM1535917
## CLASS:DOSE 0
## ENSMUSG00000000001 2289
## ENSMUSG00000000003 0
## ENSMUSG00000000028 21
## ENSMUSG00000000031 7
## ENSMUSG00000000037 2
Examine the end of the file where there are 5 rows of alignment data
tail(count_table)
## GSM1535917 GSM1535918 GSM1535919 GSM1535920 GSM1535921
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 1881529 2139904 2267213 2133773 3056039
## ambiguous 268077 278079 300837 315918 336902
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 14393970 15681645 19793013 13586315 24741421
## GSM1535922 GSM1535923 GSM1535924 GSM1535925 GSM1535926
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 3826438 2211512 2882115 1909008 6072296
## ambiguous 620854 310604 388395 298379 1085602
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 30687082 17719778 15677024 17128247 46683508
## GSM1535927 GSM1535928 GSM1535929 GSM1535930 GSM1535931
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 1990442 3347184 1670467 1983181 1850848
## ambiguous 282453 410756 242448 277864 221780
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 14290771 29441212 11623407 13246953 15561240
## GSM1535932 GSM1535933 GSM1535934 GSM1535935 GSM1535936
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 1655690 2198800 2554621 4414797 7281279
## ambiguous 201846 292347 309782 346757 1035381
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 14108358 20777353 22232776 24170388 59345763
## GSM1535937 GSM1535938 GSM1535939 GSM1535940 GSM1535941
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 1705077 2845113 3524726 1833957 2289010
## ambiguous 224102 453790 510255 228289 333476
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 18356821 23115670 26822489 17220541 18679634
## GSM1535942 GSM1535943 GSM1535944 GSM1535945 GSM1535946
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 1713330 4488003 1955263 2038621 2274344
## ambiguous 241165 711036 256837 286070 335366
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 13771007 44241402 16048447 15476269 19474907
## GSM1535947 GSM1535948 GSM1535949 GSM1535950 GSM1535951
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 1750551 4843348 3980377 2081716 4132162
## ambiguous 272524 844200 506803 282948 594368
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 13801317 45983595 30603154 15495996 32696230
## GSM1535952 GSM1535953 GSM1535954 GSM1535955 GSM1535956
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 3619972 2574902 2104853 2717125 3386082
## ambiguous 616838 359478 312609 472196 619067
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 26251445 16111737 14519451 18152232 19240502
## GSM1535957 GSM1535958 GSM1535959 GSM1535960 GSM1535961
## ENSMUSG00000099334 0 0 0 0 0
## no_feature 2674043 1984283 3275230 2458641 1227618
## ambiguous 434983 346925 607267 381430 274526
## too_low_aQual 0 0 0 0 0
## not_aligned 0 0 0 0 0
## alignment_not_unique 12978842 12023315 20324243 16801245 8231611
Make a subset of the data to include just the alignment data by specifying the rows
One way to find the row number for the start of the alignment data is with the which function
which(row.names(count_table) == "no_feature")
## [1] 39181
Get the row number for the last row
nrow(count_table)
## [1] 39185
Make a subset of the alignment data by specifying these rows
alignment_subset <- count_table[39181:39185, ]
Check that the subset has the dimensions that we would expect (5 rows and 45 columns)
dim(alignment_subset)
## [1] 5 45
We are not going to use this alignment data for now. Let’s remove it from the count table
count_table <- count_table[-(39181:39185), ]
dim(count_table)
## [1] 39180 45
Perhaps though we’d like to use it somewhere else later. Let’s save it to a text file.
Help options: use write.table() with tab to show arguments, help() and help.search() functions
Alternative - saving objects, workspace
write.table(alignment_subset, "Sample_alignment_data.txt", sep = "\t", quote = FALSE)
Let’s take another look at the top of the file
head(count_table, n = 3)
## GSM1535917 GSM1535918 GSM1535919 GSM1535920 GSM1535921
## CLASS:DOSE 0 0 0 0 0
## ENSMUSG00000000001 2289 2171 2124 2538 2397
## ENSMUSG00000000003 0 0 0 0 0
## GSM1535922 GSM1535923 GSM1535924 GSM1535925 GSM1535926
## CLASS:DOSE 0.01 0.01 0.01 0.01 0.01
## ENSMUSG00000000001 4707.00 2797.00 3235.00 1955.00 5983.00
## ENSMUSG00000000003 0.00 0.00 0.00 0.00 0.00
## GSM1535927 GSM1535928 GSM1535929 GSM1535930 GSM1535931
## CLASS:DOSE 0.03 0.03 0.03 0.03 0.03
## ENSMUSG00000000001 1918.00 3639.00 1932.00 2108.00 1988.00
## ENSMUSG00000000003 0.00 0.00 0.00 0.00 0.00
## GSM1535932 GSM1535933 GSM1535934 GSM1535935 GSM1535936
## CLASS:DOSE 0.1 0.1 0.1 0.1 0.1
## ENSMUSG00000000001 1792.0 2553.0 2318.0 3179.0 6153.0
## ENSMUSG00000000003 0.0 0.0 0.0 0.0 0.0
## GSM1535937 GSM1535938 GSM1535939 GSM1535940 GSM1535941
## CLASS:DOSE 0.3 0.3 0.3 0.3 0.3
## ENSMUSG00000000001 1899.0 3351.0 3552.0 2201.0 2607.0
## ENSMUSG00000000003 0.0 0.0 0.0 0.0 0.0
## GSM1535942 GSM1535943 GSM1535944 GSM1535945 GSM1535946
## CLASS:DOSE 1 1 1 1 1
## ENSMUSG00000000001 1867 5080 1886 2232 2425
## ENSMUSG00000000003 0 0 0 0 0
## GSM1535947 GSM1535948 GSM1535949 GSM1535950 GSM1535951
## CLASS:DOSE 3 3 3 3 3
## ENSMUSG00000000001 2054 7355 4184 2303 4723
## ENSMUSG00000000003 0 0 0 0 0
## GSM1535952 GSM1535953 GSM1535954 GSM1535955 GSM1535956
## CLASS:DOSE 10 10 10 10 10
## ENSMUSG00000000001 4957 2775 2330 3296 4234
## ENSMUSG00000000003 0 0 0 0 0
## GSM1535957 GSM1535958 GSM1535959 GSM1535960 GSM1535961
## CLASS:DOSE 30 30 30 30 30
## ENSMUSG00000000001 3892 2492 3946 2998 1647
## ENSMUSG00000000003 0 0 0 0 0
The dosages for the samples are in the first row.
We can use this information to extract samples that were treated with specific doses.
Let’s save the dosage information to a new variable dosage
Use () to save and view dosage
(dosage <- count_table[1, ])
## GSM1535917 GSM1535918 GSM1535919 GSM1535920 GSM1535921 GSM1535922
## CLASS:DOSE 0 0 0 0 0 0.01
## GSM1535923 GSM1535924 GSM1535925 GSM1535926 GSM1535927 GSM1535928
## CLASS:DOSE 0.01 0.01 0.01 0.01 0.03 0.03
## GSM1535929 GSM1535930 GSM1535931 GSM1535932 GSM1535933 GSM1535934
## CLASS:DOSE 0.03 0.03 0.03 0.1 0.1 0.1
## GSM1535935 GSM1535936 GSM1535937 GSM1535938 GSM1535939 GSM1535940
## CLASS:DOSE 0.1 0.1 0.3 0.3 0.3 0.3
## GSM1535941 GSM1535942 GSM1535943 GSM1535944 GSM1535945 GSM1535946
## CLASS:DOSE 0.3 1 1 1 1 1
## GSM1535947 GSM1535948 GSM1535949 GSM1535950 GSM1535951 GSM1535952
## CLASS:DOSE 3 3 3 3 3 10
## GSM1535953 GSM1535954 GSM1535955 GSM1535956 GSM1535957 GSM1535958
## CLASS:DOSE 10 10 10 10 30 30
## GSM1535959 GSM1535960 GSM1535961
## CLASS:DOSE 30 30 30
Let’s look at the max and min dosages observed in this dataset
max(dosage)
## [1] 30
min(dosage)
## [1] 0
How many samples have a dose greater than 0?
length(dosage[dosage > 0])
## [1] 40
One can use the which function to return the indices for those samples that pass this criteria
which(dosage > 0)
## [1] 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## [26] 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
The samples with these indices can be included in a new subset
ex7 <- count_table[which(dosage > 0)]
dim(ex7)
## [1] 39180 40
Remove the dosage information from count_table
count_table <- count_table[-1, ]
count_table now includes only the gene counts (39,179 genes, 45 samples)
head(count_table, n = 3)
## GSM1535917 GSM1535918 GSM1535919 GSM1535920 GSM1535921
## ENSMUSG00000000001 2289 2171 2124 2538 2397
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 21 10 22 23 14
## GSM1535922 GSM1535923 GSM1535924 GSM1535925 GSM1535926
## ENSMUSG00000000001 4707 2797 3235 1955 5983
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 39 10 21 13 115
## GSM1535927 GSM1535928 GSM1535929 GSM1535930 GSM1535931
## ENSMUSG00000000001 1918 3639 1932 2108 1988
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 10 38 10 25 15
## GSM1535932 GSM1535933 GSM1535934 GSM1535935 GSM1535936
## ENSMUSG00000000001 1792 2553 2318 3179 6153
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 13 11 9 23 68
## GSM1535937 GSM1535938 GSM1535939 GSM1535940 GSM1535941
## ENSMUSG00000000001 1899 3351 3552 2201 2607
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 17 17 15 10 24
## GSM1535942 GSM1535943 GSM1535944 GSM1535945 GSM1535946
## ENSMUSG00000000001 1867 5080 1886 2232 2425
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 13 38 13 15 18
## GSM1535947 GSM1535948 GSM1535949 GSM1535950 GSM1535951
## ENSMUSG00000000001 2054 7355 4184 2303 4723
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 17 66 22 21 44
## GSM1535952 GSM1535953 GSM1535954 GSM1535955 GSM1535956
## ENSMUSG00000000001 4957 2775 2330 3296 4234
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 41 24 13 34 49
## GSM1535957 GSM1535958 GSM1535959 GSM1535960 GSM1535961
## ENSMUSG00000000001 3892 2492 3946 2998 1647
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 28 26 42 27 27
tail(count_table, n = 3)
## GSM1535917 GSM1535918 GSM1535919 GSM1535920 GSM1535921
## ENSMUSG00000099332 0 0 0 0 0
## ENSMUSG00000099333 0 0 0 0 0
## ENSMUSG00000099334 0 0 0 0 0
## GSM1535922 GSM1535923 GSM1535924 GSM1535925 GSM1535926
## ENSMUSG00000099332 0 0 0 0 0
## ENSMUSG00000099333 0 0 0 0 0
## ENSMUSG00000099334 0 0 0 0 0
## GSM1535927 GSM1535928 GSM1535929 GSM1535930 GSM1535931
## ENSMUSG00000099332 0 0 0 0 0
## ENSMUSG00000099333 0 0 0 0 0
## ENSMUSG00000099334 0 0 0 0 0
## GSM1535932 GSM1535933 GSM1535934 GSM1535935 GSM1535936
## ENSMUSG00000099332 0 0 0 0 0
## ENSMUSG00000099333 0 0 0 0 0
## ENSMUSG00000099334 0 0 0 0 0
## GSM1535937 GSM1535938 GSM1535939 GSM1535940 GSM1535941
## ENSMUSG00000099332 0 0 0 0 0
## ENSMUSG00000099333 0 0 0 0 0
## ENSMUSG00000099334 0 0 0 0 0
## GSM1535942 GSM1535943 GSM1535944 GSM1535945 GSM1535946
## ENSMUSG00000099332 0 0 0 0 0
## ENSMUSG00000099333 0 0 0 0 0
## ENSMUSG00000099334 0 0 0 0 0
## GSM1535947 GSM1535948 GSM1535949 GSM1535950 GSM1535951
## ENSMUSG00000099332 0 0 0 0 0
## ENSMUSG00000099333 0 0 0 0 0
## ENSMUSG00000099334 0 0 0 0 0
## GSM1535952 GSM1535953 GSM1535954 GSM1535955 GSM1535956
## ENSMUSG00000099332 0 0 0 0 0
## ENSMUSG00000099333 0 0 0 0 0
## ENSMUSG00000099334 0 0 0 0 0
## GSM1535957 GSM1535958 GSM1535959 GSM1535960 GSM1535961
## ENSMUSG00000099332 0 0 0 0 0
## ENSMUSG00000099333 0 0 0 0 0
## ENSMUSG00000099334 0 0 0 0 0
dim(count_table)
## [1] 39179 45
Let’s remove the alignment and example subsets from our environment
ls()
## [1] "alignment_subset" "count_table" "dosage" "ex1"
## [5] "ex2" "ex3" "ex4" "ex5"
## [9] "ex6" "ex7"
rm(alignment_subset, ex1, ex2, ex3, ex4, ex5, ex6, ex7)
ls()
## [1] "count_table" "dosage"
Here, we will look at the highest dosage, 30.
Use the which function to extract this data
counts_max_dosage <- count_table[which(dosage == 30)]
names(counts_max_dosage)
## [1] "GSM1535957" "GSM1535958" "GSM1535959" "GSM1535960" "GSM1535961"
Calculate the mean values for the counts for the 5 replicates at this dose
We will use the apply function to calculate the mean and will save it in a new column $D30_Mean
counts_max_dosage$D30_Mean <- apply(X = counts_max_dosage, MARGIN = 1, FUN = mean)
head(counts_max_dosage, n = 2)
## GSM1535957 GSM1535958 GSM1535959 GSM1535960 GSM1535961
## ENSMUSG00000000001 3892 2492 3946 2998 1647
## ENSMUSG00000000003 0 0 0 0 0
## D30_Mean
## ENSMUSG00000000001 2995
## ENSMUSG00000000003 0
Sort the data frame based on D30_Mean (descending)
counts_max_dosage <- counts_max_dosage[order(-counts_max_dosage$D30_Mean), ]
head(counts_max_dosage)
## GSM1535957 GSM1535958 GSM1535959 GSM1535960 GSM1535961
## ENSMUSG00000029368 2059002 1661675 2888826 1728760 1762623
## ENSMUSG00000032310 384256 272379 438091 332903 247789
## ENSMUSG00000064351 372392 227740 454665 278111 250970
## ENSMUSG00000064370 257128 197622 399601 246018 185116
## ENSMUSG00000002985 206007 152134 260532 180727 131441
## ENSMUSG00000064341 191441 137252 295935 163176 107644
## D30_Mean
## ENSMUSG00000029368 2020177.2
## ENSMUSG00000032310 335083.6
## ENSMUSG00000064351 316775.6
## ENSMUSG00000064370 257097.0
## ENSMUSG00000002985 186168.2
## ENSMUSG00000064341 179089.6
Add a new column D30_order that shows the row order
counts_max_dosage$D30_order <- seq.int(nrow(counts_max_dosage))
head(counts_max_dosage)
## GSM1535957 GSM1535958 GSM1535959 GSM1535960 GSM1535961
## ENSMUSG00000029368 2059002 1661675 2888826 1728760 1762623
## ENSMUSG00000032310 384256 272379 438091 332903 247789
## ENSMUSG00000064351 372392 227740 454665 278111 250970
## ENSMUSG00000064370 257128 197622 399601 246018 185116
## ENSMUSG00000002985 206007 152134 260532 180727 131441
## ENSMUSG00000064341 191441 137252 295935 163176 107644
## D30_Mean D30_order
## ENSMUSG00000029368 2020177.2 1
## ENSMUSG00000032310 335083.6 2
## ENSMUSG00000064351 316775.6 3
## ENSMUSG00000064370 257097.0 4
## ENSMUSG00000002985 186168.2 5
## ENSMUSG00000064341 179089.6 6
The Ensembl gene IDs are shown in the row names, let’s include them as a column
counts_max_dosage$gene_id <- row.names(counts_max_dosage)
Remove the 5 samples from the data frame
counts_max_dosage <- counts_max_dosage[-(1:5)]
names(counts_max_dosage)
## [1] "D30_Mean" "D30_order" "gene_id"
str(counts_max_dosage)
## 'data.frame': 39179 obs. of 3 variables:
## $ D30_Mean : num 2020177 335084 316776 257097 186168 ...
## $ D30_order: int 1 2 3 4 5 6 7 8 9 10 ...
## $ gene_id : chr "ENSMUSG00000029368" "ENSMUSG00000032310" "ENSMUSG00000064351" "ENSMUSG00000064370" ...
Perhaps we’d like to get more information about the top 10 genes in this subset.
We can retrieve annotations for the genes using the biomaRt package
head(counts_max_dosage, n = 3)
## D30_Mean D30_order gene_id
## ENSMUSG00000029368 2020177.2 1 ENSMUSG00000029368
## ENSMUSG00000032310 335083.6 2 ENSMUSG00000032310
## ENSMUSG00000064351 316775.6 3 ENSMUSG00000064351
Install biomaRt from bioconductor: https://bioconductor.org/packages/release/bioc/html/biomaRt.html
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("biomaRt")
Once the package is installed, load the package into our environment
library(biomaRt)
See the versions of loaded packages
sessionInfo()
We are going to use some additional packages later on in the workshop, so let’s install these packages
install.packages("ggplot2")
install.packages("plotly")
install.packages("ggthemes")
We will only annotate the top 10 genes here
Save the first 10 rows to a new data frame called top10
top10 <- counts_max_dosage[1:10, ]
We will use the mouse mm10 annotations
mm10 <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")
Here we will add the mgi gene symbol and chromosome (getBM: https://www.rdocumentation.org/packages/biomaRt/versions/2.28.0/topics/getBM)
gene_annotation <- getBM(attributes = c("ensembl_gene_id", "mgi_symbol", "chromosome_name"), filters = "ensembl_gene_id", values = top10$gene_id, mart = mm10)
head(gene_annotation)
## ensembl_gene_id mgi_symbol chromosome_name
## 1 ENSMUSG00000002985 Apoe 7
## 2 ENSMUSG00000024164 C3 17
## 3 ENSMUSG00000025479 Cyp2e1 7
## 4 ENSMUSG00000029368 Alb 5
## 5 ENSMUSG00000032310 Cyp1a2 9
## 6 ENSMUSG00000032315 Cyp1a1 9
Merge the gene annotations with the top10 data frame
top10_annotated <- merge(top10, gene_annotation, by.x = "gene_id", by.y = "ensembl_gene_id")
head(top10_annotated)
## gene_id D30_Mean D30_order mgi_symbol chromosome_name
## 1 ENSMUSG00000002985 186168.2 5 Apoe 7
## 2 ENSMUSG00000024164 146606.8 9 C3 17
## 3 ENSMUSG00000025479 140608.2 10 Cyp2e1 7
## 4 ENSMUSG00000029368 2020177.2 1 Alb 5
## 5 ENSMUSG00000032310 335083.6 2 Cyp1a2 9
## 6 ENSMUSG00000032315 148246.2 8 Cyp1a1 9
Order the data frame based on D30_order (ascending)
Gene with highest count is Alb
top10_annotated <- top10_annotated[order(top10_annotated$D30_order), ]
head(top10_annotated)
## gene_id D30_Mean D30_order mgi_symbol chromosome_name
## 4 ENSMUSG00000029368 2020177.2 1 Alb 5
## 5 ENSMUSG00000032310 335083.6 2 Cyp1a2 9
## 9 ENSMUSG00000064351 316775.6 3 mt-Co1 MT
## 10 ENSMUSG00000064370 257097.0 4 mt-Cytb MT
## 1 ENSMUSG00000002985 186168.2 5 Apoe 7
## 7 ENSMUSG00000064341 179089.6 6 mt-Nd1 MT
We didn’t discuss categorical variables here but one needs to be careful if converting from factors to numeric
As an example, if we start with a numeric vector, say the doses that are greater than 1
dose_numeric <- dosage[dosage > 1]
dose_numeric
## [1] 3 3 3 3 3 10 10 10 10 10 30 30 30 30 30
str(dose_numeric)
## num [1:15] 3 3 3 3 3 10 10 10 10 10 ...
We then decide that we’d like these values to be factors
We can observe how R stores factors
dose_factor <- as.factor(dose_numeric)
str(dose_factor)
## Factor w/ 3 levels "3","10","30": 1 1 1 1 1 2 2 2 2 2 ...
If we’d like to reverse this conversion and revert to numeric values, we can see that we don’t get the original vector
as.numeric(dose_factor)
## [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
One way to get back to the original vector
as.numeric(as.character(dose_factor))
## [1] 3 3 3 3 3 10 10 10 10 10 30 30 30 30 30
Watch for “#” at the beginning of lines
The stringsAsFactors argument of data.frame() has a default value of FALSE starting with R 4.0.0 (documented in data.frame() help)