#Title: Honors Thesis R Script #Author: Woo Jeong ###Package Preparation library(DESeq2) library(annotables) library(RColorBrewer) library(ggrepel) library(annotables) library(dplyr) library(tidyverse) library(pheatmap) library(ggpubr) library(openxlsx) library(writexl) library(DEGreport) library(VennDiagram) library(org.Hs.eg.db) library(DOSE) library(pathview) library(clusterProfiler) library(AnnotationHub) library(ensembldb) library(tidyverse) ###Data Importation data1 <- read.delim("Day0_1.txt", header=TRUE, row.names=1) data2 <- read.delim("Day0_2.txt", header=TRUE, row.names=1) data3 <- read.delim("Day2_1.txt", header=TRUE, row.names=1) data4 <- read.delim("Day2_2.txt", header=TRUE, row.names=1) data5 <- read.delim("Day4_1.txt", header=TRUE, row.names=1) data6 <- read.delim("Day4_2.txt", header=TRUE, row.names=1) data7 <- read.delim("Day8_1.txt", header=TRUE, row.names=1) data8 <- read.delim("Day8_2.txt", header=TRUE, row.names=1) data9 <- read.delim("Day14.before.sorting_1.txt", header=TRUE, row.names=1) data10 <- read.delim("Day14.before.sorting_2.txt", header=TRUE, row.names=1) data11 <- read.delim("Day14.after.sorting_1.txt", header=TRUE, row.names=1) data12 <- read.delim("Day14.after.sorting_2.txt", header=TRUE, row.names=1) ###Raw Data & Metadata Generation #merge data by column bind (cbind) hpsc.ec.raw <- cbind(data1, data2, data3, data4, data5, data6, data7, data8, data9, data10, data11, data12) #name the columns of the raw data colnames(hpsc.ec.raw) <- c("D.0_1", "D.0_2", "D.2_1", "D.2_2", "D.4_1", "D.4_2", "D.8_1", "D.8_2", "D14.not.sorted_1", "D14.not.sorted_2", "D 14.sorted_1", "D14.sorted_2") #metadata timepoints <- factor(c("Day.0", "Day.0", "Day.2", "Day.2", "Day.4", "Day.4", "Day.8", "Day.8", "Day14.not.sorted", "Day14.not.sorted", "Day14.sorted", "Day14.sorted")) meta.condition <- data.frame(timepoints=as.factor(timepoints)) #name the rows of the metadata as the columns of the raw data (timepoints) rownames(meta.condition) <- colnames(hpsc.ec.raw) #check if row(meta)=col(raw)? all(rownames(meta.condition) == colnames(hpsc.ec.raw)) ###Normalization #DESeq2 Object Generation hpsc.ec.object <- DESeqDataSetFromMatrix(countData = hpsc.ec.raw, colData = meta.condition, design = ~timepoints) #normalization: calculation hpsc.ec.object <- estimateSizeFactors(hpsc.ec.object) sizeFactors(hpsc.ec.object) #normalization: extraction normalized_hpsc.ec.object_counts <- counts(hpsc.ec.object, normalized=TRUE) ###Fitting Raw Data to DESeq2 Generalized Linear Model ##Diagnostic 1: Mean-Variance Plot #calculating the mean for each gene (each row) mean_counts <- apply(hpsc.ec.raw[,1:12],1,mean) #calculating the variance for each gene (each row) variance_counts <- apply(hpsc.ec.raw[,1:12], 1, var) #Dispersion: creating data frame with mean and var for each gene df <- data.frame(mean_counts, variance_counts) #Dispersion: ggplot ggplot(df, aes(x=mean_counts, y=variance_counts)) + geom_point() + geom_abline(slope=1, intercept=0, color="blue") + geom_smooth(method="lm", formula=y~x, color="red") + scale_y_log10() + scale_x_log10() + xlab("Mean counts per gene") + ylab("Variance per gene") ##Diagnostic 2: Dispersion-Mean Plot #Dispersion Calculation hpsc.ec.object <- estimateDispersions(hpsc.ec.object) #Plot Dispersion Estimates plotDispEsts(hpsc.ec.object) ###Unsupervised Clustering Analysis ##Hierarchial Clustering Analysis #unsupervised clustering analysis: log transformation vsd_hpsc.ec <- vst(hpsc.ec.object, blind=TRUE) ##Hierarchical clustering with correlation heatmaps #extract the vst matrix from the object vsd_mat_hpsc.ec <- assay(vsd_hpsc.ec) #compute pairwise correlation values #showing correlation between each samples vsd_cor_hpsc.ec <- cor(vsd_mat_hpsc.ec) #plot heatmap pheatmap(vsd_cor_hpsc.ec, cluster_rows = F, cluster_cols = F) ##Principal Component Analysis (PCA) #plot PCA plotPCA(vsd_hpsc.ec, intgroup="timepoints") #PCA details plotPCA(vsd_hpsc.ec, intgroup="timepoints", returnData = TRUE) ###Differential Expression (DE) Analysis by Wald Test #full model hpsc.ec.object <- DESeqDataSetFromMatrix(countData = hpsc.ec.raw, colData = meta.condition, design = ~timepoints) ##Day0 vs. Day2 #contrast: Day0 vs. Day2 hpsc.ec.object_res_Day2_0 <- results(hpsc.ec.object, contrast = c("timepoints", "Day.2", "Day.0"), alpha = 0.05, lfcThreshold = 1) #summary mcols(hpsc.ec.object_res_Day2_0) summary(hpsc.ec.object_res_Day2_0) #MA plot plotMA(hpsc.ec.object_res_Day2_0, ylim = c(-8,8)) #shrink contrast hpsc.ec.object_res_Day2_0_shrink <- lfcShrink(hpsc.ec.object, contrast = c("timepoints", "Day.2", "Day.0"), res=hpsc.ec.object_res_Day2_0) #summary mcols(hpsc.ec.object_res_Day2_0_shrink) #shrink MA plot plotMA(hpsc.ec.object_res_Day2_0_shrink, ylim = c(-8,8), main="Day0 vs Day2") #DE genes summary summary(hpsc.ec.object_res_Day2_0_shrink) #DESeq2 shrink data frame hpsc.ec.object_result_Day2_0 <- data.frame(hpsc.ec.object_res_Day2_0_shrink) hpsc.ec.object_result_Day2_0$symbol <- rownames(hpsc.ec.object_result_Day2_0) hpsc.ec.object_result_Day2_0 <- left_join(x=hpsc.ec.object_result_Day2_0, y=grch38[,c("ensgene", "symbol", "description")], by = "symbol") #DESeq2 shrink DE genes hpsc.ec.object_result_Day2_0_sig_original <- subset(hpsc.ec.object_result_Day2_0, padj < 0.05) hpsc.ec.object_result_Day2_0_sig_original <- subset(hpsc.ec.object_result_Day2_0_sig_original, abs(log2FoldChange) > 1) #in order of padj hpsc.ec.object_result_Day2_0_sig <- hpsc.ec.object_result_Day2_0_sig_original %>% arrange(padj) #upregulated hpsc.ec.object_result_Day2_0_upregulated <- hpsc.ec.object_result_Day2_0_sig %>% filter(log2FoldChange > 0) %>% arrange(desc(log2FoldChange)) #downregulated hpsc.ec.object_result_Day2_0_downregulated <- hpsc.ec.object_result_Day2_0_sig %>% filter(log2FoldChange < 0) %>% arrange(desc(log2FoldChange)) #volcano data.frame from all results volcano_Day2_0 <- data.frame(hpsc.ec.object_result_Day2_0) %>% mutate(threshold = padj < 0.05) %>% arrange(padj) #volcano plot ggplot(volcano_Day2_0) + geom_point(aes(x = log2FoldChange, y = -log10(padj), color = threshold)) + geom_text_repel(data=head(volcano_Day2_0, 50), aes(x=log2FoldChange, y=-log10(padj), label=symbol)) + xlab("log2 fold change") + ylab("-log10 adjusted p-value") + xlim(c(-10,10)) + ylim(c(0,200)) + ggtitle("Day0 vs Day2") + scale_color_manual(values=c("black", "red")) + theme(legend.position = "none", plot.title = element_text(size = rel(1.5), hjust = 0.5), axis.title = element_text(size = rel(1.25))) #make a color of our selection heat_colors <- brewer.pal(6, "YlOrRd") ###DESeq2 visualization - heatmap heatmap_general_Day2_0 <- normalized_hpsc.ec.object_counts[hpsc.ec.object_result_Day2_0_sig$symbol,] pheatmap(heatmap_general_Day2_0, cluster_rows = T, cluster_cols = F, show_rownames = F, show_colnames = T, scale = "row", main = "Day0 vs Day2", treeheight_row = 0, treeheight_col = 0, col = heat_colors, border_color= NA, fontsize = 10, fontsize_row = 10) #subset normalized counts to significant genes sig_norm_counts_hpsc.ec_Day2_0 <- normalized_hpsc.ec.object_counts[hpsc.ec.object_result_Day2_0_sig$symbol,c(1:4)] #heatmap plot: general scheme pheatmap(sig_norm_counts_hpsc.ec_Day2_0, cluster_rows = T, cluster_cols = F, show_rownames = F, show_colnames = T, scale = "row", main = "Day0 vs Day2", treeheight_row = 0, treeheight_col = 0, drop_levels = T) #heatmap: top20 genes by padj top_genes_Day2_0 <- sig_norm_counts_hpsc.ec_Day2_0[c(1:20),] pheatmap(top_genes_Day2_0, cluster_rows = T, cluster_cols = F, show_rownames = T, show_colnames = T, scale = "row", main = "Day0 vs Day2", treeheight_row = 0, treeheight_col = 0, fontsize_row = 11) #Expression plotL top20 genes by padj top20_Day2_0 <- data.frame(sig_norm_counts_hpsc.ec_Day2_0)[1:20,] %>% rownames_to_column(var = "symbol") top20_Day2_0 <- gather(top20_Day2_0, key = "samplename", value = "normalized_counts", 2:5) top20_Day2_0 <- inner_join(top20_Day2_0, rownames_to_column(meta.condition, var = "samplename"), by = "samplename") ggplot(top20_Day2_0) + geom_point(aes(x = symbol, y = normalized_counts, color = timepoints)) + scale_y_log10() + xlab("Genes") + ylab("Normalized Counts") + ggtitle("Top 20 Significant DE Genes: Day0 vs Day2") + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + theme(plot.title = element_text(hjust = 0.5)) ##DAY0 vs DAY4 #contrast:Day4&Day0 hpsc.ec.object_res_Day4_0 <- results(hpsc.ec.object, contrast = c("timepoints", "Day.4", "Day.0"), alpha = 0.05, lfcThreshold = 1) #summary mcols(hpsc.ec.object_res_Day4_0) summary(hpsc.ec.object_res_Day4_0) #MA plot plotMA(hpsc.ec.object_res_Day4_0, ylim = c(-8,8)) #shrink contrast:Day4&0 hpsc.ec.object_res_Day4_0_shrink <- lfcShrink(hpsc.ec.object, contrast = c("timepoints", "Day.4", "Day.0"), res=hpsc.ec.object_res_Day4_0) #summary mcols(hpsc.ec.object_res_Day4_0_shrink) #shrink MA plot plotMA(hpsc.ec.object_res_Day4_0_shrink, ylim = c(-8,8), main="Day0 vs Day4") #DE genes summary summary(hpsc.ec.object_res_Day4_0_shrink) #DESeq2 shrink data frame hpsc.ec.object_result_Day4_0 <- data.frame(hpsc.ec.object_res_Day4_0_shrink) hpsc.ec.object_result_Day4_0$symbol <- rownames(hpsc.ec.object_result_Day4_0) hpsc.ec.object_result_Day4_0 <- left_join(x=hpsc.ec.object_result_Day4_0, y=grch38[,c("ensgene", "symbol", "description")], by = "symbol") #DESeq2 shrink DE genes hpsc.ec.object_result_Day4_0_sig_original <- subset(hpsc.ec.object_result_Day4_0, padj < 0.05) hpsc.ec.object_result_Day4_0_sig_original <- subset(hpsc.ec.object_result_Day4_0_sig_original, abs(log2FoldChange) > 1) #in order of padj hpsc.ec.object_result_Day4_0_sig <- hpsc.ec.object_result_Day4_0_sig_original %>% arrange(padj) #upregulated hpsc.ec.object_result_Day4_0_upregulated <- hpsc.ec.object_result_Day4_0_sig %>% filter(log2FoldChange > 0) #downregulated hpsc.ec.object_result_Day4_0_downregulated <- hpsc.ec.object_result_Day4_0_sig %>% filter(log2FoldChange < 0) #volcano data.frame from all results volcano_Day4_0 <- data.frame(hpsc.ec.object_result_Day4_0) %>% mutate(threshold = padj < 0.05) %>% arrange(padj) #volcano plot pdf("Volcano Plot Day0 vs. Day4.pdf") ggplot(volcano_Day4_0) + geom_point(aes(x = log2FoldChange, y = -log10(padj), color = threshold)) + geom_text_repel(data=head(volcano_Day4_0, 30), aes(x=log2FoldChange, y=-log10(padj), label=symbol)) + xlab("log2 fold change") + ylab("-log10 adjusted p-value") + xlim(c(-10,10)) + ylim(c(0,200)) + ggtitle("Day0 vs Day4") + scale_color_manual(values=c("black", "red")) + theme(legend.position = "none", plot.title = element_text(size = rel(1.5), hjust = 0.5), axis.title = element_text(size = rel(1.25))) #DESeq2 visualization - heatmap heatmap_general_Day4_0 <- normalized_hpsc.ec.object_counts[hpsc.ec.object_result_Day4_0_sig$symbol,] pheatmap(heatmap_general_Day4_0, cluster_rows = T, cluster_cols = F, show_rownames = F, show_colnames = T, scale = "row", main = "Day0 vs Day4", treeheight_row = 0, treeheight_col = 0, col = heat_colors) #subset normalized counts to significant genes sig_norm_counts_hpsc.ec_Day4_0 <- normalized_hpsc.ec.object_counts[hpsc.ec.object_result_Day4_0_sig$symbol,c(1:2,5:6)] #heatmap plot: general scheme pheatmap(sig_norm_counts_hpsc.ec_Day4_0, cluster_rows = T, cluster_cols = F, show_rownames = F, show_colnames = T, scale = "row", main = "Day0 vs Day4", treeheight_row = 0, treeheight_col = 0) #topgenes top_genes_Day4_0 <- sig_norm_counts_hpsc.ec_Day4_0[c(1:20),] pheatmap(top_genes_Day4_0, cluster_rows = T, cluster_cols = F, show_rownames = T, show_colnames = T, scale = "row", main = "Day0 vs Day4", treeheight_row = 0, treeheight_col = 0) #Expression plot top20_Day4_0 <- data.frame(sig_norm_counts_hpsc.ec_Day4_0)[1:20,] %>% rownames_to_column(var = "symbol") top20_Day4_0 <- gather(top20_Day4_0, key = "samplename", value = "normalized_counts", 2:5) top20_Day4_0 <- inner_join(top20_Day4_0, rownames_to_column(meta.condition, var = "samplename"), by = "samplename") ggplot(top20_Day4_0) + geom_point(aes(x = symbol, y = normalized_counts, color = timepoints)) + scale_y_log10() + xlab("Genes") + ylab("Normalized Counts") + ggtitle("Top 20 Significant DE Genes: Day0 vs Day4") + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + theme(plot.title = element_text(hjust = 0.5)) ##Day0 vs. Day8 #contrast:Day8&Day0 hpsc.ec.object_res_Day8_0 <- results(hpsc.ec.object, contrast = c("timepoints", "Day.8", "Day.0"), alpha = 0.05, lfcThreshold = 1) #summary mcols(hpsc.ec.object_res_Day8_0) summary(hpsc.ec.object_res_Day8_0) #MA plot plotMA(hpsc.ec.object_res_Day8_0, ylim = c(-8,8)) #shrink contrast:Day4&0 hpsc.ec.object_res_Day8_0_shrink <- lfcShrink(hpsc.ec.object, contrast = c("timepoints", "Day.8", "Day.0"), res=hpsc.ec.object_res_Day8_0) #summary mcols(hpsc.ec.object_res_Day8_0_shrink) summary(hpsc.ec.object_res_Day8_0_shrink) #shrink MA plot plotMA(hpsc.ec.object_res_Day8_0_shrink, ylim = c(-8,8), main="Day0 vs Day8") abline(h=c(-1, 1),col="dodgerblue",lwd=2) #DESeq2 shrink data frame hpsc.ec.object_result_Day8_0 <- data.frame(hpsc.ec.object_res_Day8_0_shrink) hpsc.ec.object_result_Day8_0$symbol <- rownames(hpsc.ec.object_result_Day8_0) hpsc.ec.object_result_Day8_0 <- left_join(x=hpsc.ec.object_result_Day8_0, y=grch38[,c("ensgene", "symbol", "description")], by = "symbol") #DESeq2 shrink DE genes hpsc.ec.object_result_Day8_0_sig_original <- subset(hpsc.ec.object_result_Day8_0, padj < 0.05) hpsc.ec.object_result_Day8_0_sig_original <- subset(hpsc.ec.object_result_Day8_0_sig_original, abs(log2FoldChange) > 1) #in order of padj hpsc.ec.object_result_Day8_0_sig <- hpsc.ec.object_result_Day8_0_sig_original %>% arrange(padj) #upregulated hpsc.ec.object_result_Day8_0_upregulated <- hpsc.ec.object_result_Day8_0_sig %>% filter(log2FoldChange > 0) #downregulated hpsc.ec.object_result_Day8_0_downregulated <- hpsc.ec.object_result_Day8_0_sig %>% filter(log2FoldChange < 0) #volcano data.frame from all results volcano_Day8_0 <- data.frame(hpsc.ec.object_result_Day8_0) %>% mutate(threshold = padj < 0.05) %>% arrange(padj) #volcano plot pdf("Volcano Plot Day0 vs. Day8.pdf") ggplot(volcano_Day8_0) + geom_point(aes(x = log2FoldChange, y = -log10(padj), color = threshold)) + geom_text_repel(data=head(volcano_Day8_0, 50), aes(x=log2FoldChange, y=-log10(padj), label=symbol)) + xlab("log2 fold change") + ylab("-log10 adjusted p-value") + xlim(c(-10,10)) + ylim(c(0,200)) + ggtitle("Day0 vs Day8") + scale_color_manual(values=c("black", "red")) + theme(legend.position = "none", plot.title = element_text(size = rel(1.5), hjust = 0.5), axis.title = element_text(size = rel(1.25))) #heatmap heatmap_general_Day8_0 <- normalized_hpsc.ec.object_counts[hpsc.ec.object_result_Day8_0_sig$symbol,] pheatmap(heatmap_general_Day8_0, cluster_rows = T, cluster_cols = F, show_rownames = F, show_colnames = T, scale = "row", main = "Day0 vs Day8", treeheight_row = 0, treeheight_col = 0, col = heat_colors) #subset normalized counts to significant genes sig_norm_counts_hpsc.ec_Day8_0 <- normalized_hpsc.ec.object_counts[hpsc.ec.object_result_Day8_0_sig$symbol,c(1:2,7:8)] #heatmap plot: general scheme pheatmap(sig_norm_counts_hpsc.ec_Day8_0, cluster_rows = T, cluster_cols = F, show_rownames = F, show_colnames = T, scale = "row", main = "Day0 vs Day8", treeheight_row = 0, treeheight_col = 0) #topgenes top_genes_Day8_0 <- sig_norm_counts_hpsc.ec_Day8_0[c(1:50),] pheatmap(top_genes_Day8_0, cluster_rows = T, cluster_cols = F, show_rownames = T, show_colnames = T, scale = "row", main = "Day0 vs Day8", treeheight_row = 0, treeheight_col = 0) #Expression plot top20_Day8_0 <- data.frame(sig_norm_counts_hpsc.ec_Day8_0)[1:40,] %>% rownames_to_column(var = "symbol") top20_Day8_0 <- gather(top20_Day8_0, key = "samplename", value = "normalized_counts", 2:5) top20_Day8_0 <- inner_join(top20_Day8_0, rownames_to_column(meta.condition, var = "samplename"), by = "samplename") ggplot(top20_Day8_0) + geom_point(aes(x = symbol, y = normalized_counts, color = timepoints)) + scale_y_log10() + xlab("Genes") + ylab("Normalized Counts") + ggtitle("Top 20 Significant DE Genes: Day0 vs Day8") + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + theme(plot.title = element_text(hjust = 0.5)) ##DAY0 vs DAY14unsorted #contrast:Day14unsorted vs. Day0 hpsc.ec.object_res_Day14.unsorted_0 <- results(hpsc.ec.object, contrast = c("timepoints", "Day14.not.sorted", "Day.0"), alpha = 0.05, lfcThreshold = 1) #summary mcols(hpsc.ec.object_res_Day14.unsorted_0) summary(hpsc.ec.object_res_Day14.unsorted_0) #MA plot plotMA(hpsc.ec.object_res_Day14.unsorted_0, ylim = c(-8,8)) #shrink contrast:Day14.unsorted&0 hpsc.ec.object_res_Day14.unsorted_0_shrink <- lfcShrink(hpsc.ec.object, contrast = c("timepoints", "Day14.not.sorted", "Day.0"), res=hpsc.ec.object_res_Day14.unsorted_0) #summary mcols(hpsc.ec.object_res_Day14.unsorted_0_shrink) #shrink MA plot plotMA(hpsc.ec.object_res_Day14.unsorted_0_shrink, ylim = c(-8,8), main="Day0 vs Day14.not.sorted") abline(h=c(-1, 1),col="dodgerblue",lwd=2) #DE genes summary summary(hpsc.ec.object_res_Day14.unsorted_0_shrink) #DESeq2 shrink data frame hpsc.ec.object_result_Day14.unsorted_0 <- data.frame(hpsc.ec.object_res_Day14.unsorted_0_shrink) hpsc.ec.object_result_Day14.unsorted_0$symbol <- rownames(hpsc.ec.object_result_Day14.unsorted_0) hpsc.ec.object_result_Day14.unsorted_0 <- left_join(x=hpsc.ec.object_result_Day14.unsorted_0, y=grch38[,c("ensgene", "symbol", "description")], by = "symbol") #DESeq2 shrink DE genes hpsc.ec.object_result_Day14.unsorted_0_sig_original <- subset(hpsc.ec.object_result_Day14.unsorted_0, padj < 0.05) hpsc.ec.object_result_Day14.unsorted_0_sig_original <- subset(hpsc.ec.object_result_Day14.unsorted_0_sig_original, abs(log2FoldChange) > 1) #in order of padj hpsc.ec.object_result_Day14.unsorted_0_sig <- hpsc.ec.object_result_Day14.unsorted_0_sig_original %>% arrange(padj) #upregulated hpsc.ec.object_result_Day14.unsorted_0_upregulated <- hpsc.ec.object_result_Day14.unsorted_0_sig %>% filter(log2FoldChange > 0) #downregulated hpsc.ec.object_result_Day14.unsorted_0_downregulated <- hpsc.ec.object_result_Day14.unsorted_0_sig %>% filter(log2FoldChange < 0) #volcano data.frame from all results volcano_Day14.unsorted_0 <- data.frame(hpsc.ec.object_result_Day14.unsorted_0) %>% mutate(threshold = padj < 0.05) %>% arrange(padj) #volcano plot ggplot(volcano_Day14.unsorted_0) + geom_point(aes(x = log2FoldChange, y = -log10(padj), color = threshold)) + geom_text_repel(data=head(volcano_Day14.unsorted_0, 50), aes(x=log2FoldChange, y=-log10(padj), label=symbol)) + xlab("log2 fold change") + ylab("-log10 adjusted p-value") + xlim(c(-10,10)) + ylim(c(0,200)) + ggtitle("Day0 vs Day14.unsorted") + scale_color_manual(values=c("black", "red")) + theme(legend.position = "none", plot.title = element_text(size = rel(1.5), hjust = 0.5), axis.title = element_text(size = rel(1.25))) #DESeq2 visualization - heatmap heatmap_general_Day14.unsorted_0 <- normalized_hpsc.ec.object_counts[hpsc.ec.object_result_Day14.unsorted_0_sig$symbol,] pheatmap(heatmap_general_Day14.unsorted_0, cluster_rows = T, cluster_cols = F, show_rownames = F, show_colnames = T, scale = "row", main = "Day0 vs Day14.unsorted", treeheight_row = 0, treeheight_col = 0, col = heat_colors) #subset normalized counts to significant genes sig_norm_counts_hpsc.ec_Day14.unsorted_0 <- normalized_hpsc.ec.object_counts[hpsc.ec.object_result_Day14.unsorted_0_sig$symbol,c(1:2,9:10)] #heatmap plot: general scheme pheatmap(sig_norm_counts_hpsc.ec_Day14.unsorted_0, cluster_rows = T, cluster_cols = F, show_rownames = F, show_colnames = T, scale = "row", main = "Day0 vs Day14.unsorted", treeheight_row = 0, treeheight_col = 0) #topgenes top_genes_Day14.unsorted_0 <- sig_norm_counts_hpsc.ec_Day14.unsorted_0[c(1:50),] pheatmap(top_genes_Day14.unsorted_0, cluster_rows = T, cluster_cols = F, show_rownames = T, show_colnames = T, scale = "row", main = "Day0 vs Day14.unsorted", treeheight_row = 0, treeheight_col = 0) #Expression plot top20_Day14.unsorted_0 <- data.frame(sig_norm_counts_hpsc.ec_Day14.unsorted_0)[1:20,] %>% rownames_to_column(var = "symbol") top20_Day14.unsorted_0 <- gather(top20_Day14.unsorted_0, key = "samplename", value = "normalized_counts", 2:5) top20_Day14.unsorted_0 <- inner_join(top20_Day14.unsorted_0, rownames_to_column(meta.condition, var = "samplename"), by = "samplename") ggplot(top20_Day14.unsorted_0) + geom_point(aes(x = symbol, y = normalized_counts, color = timepoints)) + scale_y_log10() + xlab("Genes") + ylab("Normalized Counts") + ggtitle("Top 20 Significant DE Genes: Day0 vs Day14.unsorted") + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + theme(plot.title = element_text(hjust = 0.5)) ##DAY0 vs DAY14.sorted #contrast:Day14sorted vs Day0 hpsc.ec.object_res_Day14.sorted_0 <- results(hpsc.ec.object, contrast = c("timepoints", "Day14.sorted", "Day.0"), alpha = 0.05, lfcThreshold = 1) #summary mcols(hpsc.ec.object_res_Day14.sorted_0) summary(hpsc.ec.object_res_Day14.sorted_0) #MA plot plotMA(hpsc.ec.object_res_Day14.sorted_0, ylim = c(-8,8)) #shrink contrast:Day4&0 hpsc.ec.object_res_Day14.sorted_0_shrink <- lfcShrink(hpsc.ec.object, contrast = c("timepoints", "Day14.sorted", "Day.0"), res=hpsc.ec.object_res_Day14.sorted_0) #summary mcols(hpsc.ec.object_res_Day14.sorted_0_shrink) #shrink MA plot plotMA(hpsc.ec.object_res_Day14.sorted_0_shrink, ylim = c(-8,8), main="Day0 vs Day14.sorted") abline(h=c(-1, 1),col="dodgerblue",lwd=2) #DE genes summary summary(hpsc.ec.object_res_Day14.sorted_0_shrink) #DESeq2 shrink data frame hpsc.ec.object_result_Day14.sorted_0 <- data.frame(hpsc.ec.object_res_Day14.sorted_0_shrink) hpsc.ec.object_result_Day14.sorted_0$symbol <- rownames(hpsc.ec.object_result_Day14.sorted_0) hpsc.ec.object_result_Day14.sorted_0 <- left_join(x=hpsc.ec.object_result_Day14.sorted_0, y=grch38[,c("ensgene", "symbol", "description")], by = "symbol") #DESeq2 shrink DE genes hpsc.ec.object_result_Day14.sorted_0_sig_original <- subset(hpsc.ec.object_result_Day14.sorted_0, padj < 0.05) hpsc.ec.object_result_Day14.sorted_0_sig_original <- subset(hpsc.ec.object_result_Day14.sorted_0_sig_original, abs(log2FoldChange) > 1) #in order of padj hpsc.ec.object_result_Day14.sorted_0_sig <- hpsc.ec.object_result_Day14.sorted_0_sig_original %>% arrange(padj) #upregulated hpsc.ec.object_result_Day14.sorted_0_upregulated <- hpsc.ec.object_result_Day14.sorted_0_sig %>% filter(log2FoldChange > 0) #downregulated hpsc.ec.object_result_Day14.sorted_0_downregulated <- hpsc.ec.object_result_Day14.sorted_0_sig %>% filter(log2FoldChange < 0) #export write_xlsx(hpsc.ec.object_result_Day14.sorted_0_sig, "DE_Day0vs14.sorted.xlsx") write_xlsx(hpsc.ec.object_result_Day14.sorted_0_upregulated, "DE_Day0vs14.sorted_Upregulate.xlsx") write_xlsx(hpsc.ec.object_result_Day14.sorted_0_downregulated, "DE_Day0vs14.sorted_Downregulate.xlsx") #volcano data.frame from all results volcano_Day14.sorted_0 <- data.frame(hpsc.ec.object_result_Day14.sorted_0) %>% mutate(threshold = padj < 0.05) %>% arrange(padj) #volcano plot ggplot(volcano_Day14.sorted_0) + geom_point(aes(x = log2FoldChange, y = -log10(padj), color = threshold)) + geom_text_repel(data=head(volcano_Day14.sorted_0, 50), aes(x=log2FoldChange, y=-log10(padj), label=symbol)) + xlab("log2 fold change") + ylab("-log10 adjusted p-value") + xlim(c(-10,10)) + ylim(c(0,200)) + ggtitle("Day0 vs Day14.sorted") + scale_color_manual(values=c("black", "red")) + theme(legend.position = "none", plot.title = element_text(size = rel(1.5), hjust = 0.5), axis.title = element_text(size = rel(1.25))) #DESeq2 visualization - heatmap heatmap_general_Day14.sorted_0 <- normalized_hpsc.ec.object_counts[hpsc.ec.object_result_Day14.sorted_0_sig$symbol,] pheatmap(heatmap_general_Day14.sorted_0, cluster_rows = T, cluster_cols = F, show_rownames = F, show_colnames = T, scale = "row", main = "Day0 vs Day14.sorted", treeheight_row = 0, treeheight_col = 0, col = heat_colors) #subset normalized counts to significant genes sig_norm_counts_hpsc.ec_Day14.sorted_0 <- normalized_hpsc.ec.object_counts[hpsc.ec.object_result_Day14.sorted_0_sig$symbol,c(1:2,11:12)] #heatmap plot: general scheme pheatmap(sig_norm_counts_hpsc.ec_Day14.sorted_0, cluster_rows = T, cluster_cols = F, show_rownames = F, show_colnames = T, scale = "row", main = "Day0 vs Day14.sorted", treeheight_row = 0, treeheight_col = 0) #topgenes top_genes_Day14.sorted_0 <- sig_norm_counts_hpsc.ec_Day14.sorted_0[c(1:50),] pheatmap(top_genes_Day14.sorted_0, cluster_rows = T, cluster_cols = F, show_rownames = T, show_colnames = T, scale = "row", main = "Day0 vs Day14.sorted", treeheight_row = 0, treeheight_col = 0) #Expression plot top20_Day14.sorted_0 <- data.frame(sig_norm_counts_hpsc.ec_Day14.sorted_0)[1:20,] %>% rownames_to_column(var = "symbol") top20_Day14.sorted_0 <- gather(top20_Day14.sorted_0, key = "samplename", value = "normalized_counts", 2:5) top20_Day14.sorted_0 <- inner_join(top20_Day14.sorted_0, rownames_to_column(meta.condition, var = "samplename"), by = "samplename") ggplot(top20_Day14.sorted_0) + geom_point(aes(x = symbol, y = normalized_counts, color = timepoints)) + scale_y_log10() + xlab("Genes") + ylab("Normalized Counts") + ggtitle("Top 20 Significant DE Genes: Day0 vs Day14.sorted") + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + theme(plot.title = element_text(hjust = 0.5)) ###GO Enrichment Analysis ##Venn Diagram # area1: Day0 vs Day8 Upregulated Day8_0_up <- hpsc.ec.object_result_Day8_0_upregulated %>% pull(symbol) # area2: Day0 vs Day14.unsorted Upregulated length Day14.unsorted_Day0_up <- hpsc.ec.object_result_Day14.unsorted_0_upregulated %>% pull(symbol) # area3: Day0 vs Day14.sorted Upregulated length Day14.sorted_Day0_up <- hpsc.ec.object_result_Day14.sorted_0_upregulated %>% pull(symbol) #VENNDIAGRAM GroupA_up <- Day8_0_up GroupB_up <- Day14.unsorted_Day0_up GroupC_up <- Day14.sorted_Day0_up v1 <- venn.diagram(list(Day8=GroupA_up, Day14.unsorted=GroupB_up, Day14.sorted=GroupC_up), filename=NULL, fill=rainbow(3), lwd=2, cex=2, cat.cex=1.1) grid.newpage() grid.draw(v1) #area1: Day0 vs Day8 downregulated Day8_0_down <- hpsc.ec.object_result_Day8_0_downregulated %>% pull(symbol) #area2: Day0 vs Day14.unsorted downregulated length Day14.unsorted_Day0_down <- hpsc.ec.object_result_Day14.unsorted_0_downregulated %>% pull(symbol) #area3: Day0 vs Day14.sorted downregulated length Day14.sorted_Day0_down <- hpsc.ec.object_result_Day14.sorted_0_downregulated %>% pull(symbol) #VENNDIAGRAM GroupA_down <- Day8_0_down GroupB_down <- Day14.unsorted_Day0_down GroupC_down <- Day14.sorted_Day0_down v2 <- venn.diagram(list(Day8=GroupA_down, Day14.unsorted=GroupB_down, Day14.sorted=GroupC_down), filename=NULL, fill=rainbow(3), lwd=2, cex=2, cat.cex=1.1) grid.newpage() grid.draw(v2) ###GO Enrichment Analysis based of DE Analysis by Wald Test ##Day8 Upregulated #geneID for gene symbols universe_idx_Day8_up <- grch38$symbol %in% hpsc.ec.object_result_Day8_0$symbol universe_ids_Day8_up <- grch38[universe_idx_Day8_up,] universe_non_duplicates_Day8 <- which(duplicated(universe_ids_Day8_up$symbol) == FALSE) universe_ids_Day8_up <- universe_ids_Day8_up[universe_non_duplicates_Day8,] allOE_genes_Day8_up <- as.character(universe_ids_Day8_up$ensgene) #geneID for sig gene symbols idx_Day8_up <- grch38$symbol %in% hpsc.ec.object_result_Day8_0_upregulated$symbol ids_Day8_up <- grch38[idx_Day8_up, ] non_duplicates_Day8 <- which(duplicated(ids_Day8_up$symbol) == FALSE) ids_Day8_up <- ids_Day8_up[non_duplicates_Day8, ] sigOE_genes_Day8_up <- as.character(ids_Day8_up$ensgene) #background dataset sigOE_genes_Day8_up <- as.character(ids_Day8_up$ensgene) #GO enrichment analysis ego_Day8_up <- enrichGO(gene = sigOE_genes_Day8_up, universe = allOE_genes_Day8_up, keyType = "ENSEMBL", OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH", qvalueCutoff = 0.05, readable = TRUE) cluster_summary_Day8_up <- data.frame(ego_Day8_up) write_xlsx(cluster_summary_Day8_up, "GO Enrichment Analysis Day0 vs. Day8.xlsx") #dotplot dotplot(ego_Day8_up, showCategory=50) #emapplot emapplot(ego_Day8_up, showCategory = 50) #####Day14_unsorted Upregulated #geneID for gene symbols universe_idx_Day14unsorted_up <- grch38$symbol %in% hpsc.ec.object_result_Day14.unsorted_0$symbol universe_ids_Day14unsorted_up <- grch38[universe_idx_Day14unsorted_up,] universe_non_duplicates_Day14unsorted <- which(duplicated(universe_ids_Day14unsorted_up$symbol) == FALSE) universe_ids_Day14unsorted_up <- universe_ids_Day14unsorted_up[universe_non_duplicates_Day14unsorted,] allOE_genes_Day14unsorted_up <- as.character(universe_ids_Day14unsorted_up$ensgene) #remove duplicate IDs idx_Day14unsorted_up <- grch38$symbol %in% hpsc.ec.object_result_Day14.unsorted_0_upregulated$symbol ids_Day14unsorted_up <- grch38[idx_Day14unsorted_up, ] non_duplicates_Day14unsorted <- which(duplicated(ids_Day14unsorted_up$symbol) == FALSE) ids_Day14unsorted_up <- ids_Day14unsorted_up[non_duplicates_Day14unsorted, ] sigOE_genes_Day14unsorted_up <- as.character(ids_Day14unsorted_up$ensgene) #background dataset sigOE_genes_Day14unsorted_up <- as.character(ids_Day14unsorted_up$ensgene) #run GO enrichment analysis ego_Day14unsorted_up <- enrichGO(gene = sigOE_genes_Day14unsorted_up, universe = allOE_genes_Day14unsorted_up, keyType = "ENSEMBL", OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH", qvalueCutoff = 0.05, readable = TRUE) cluster_summary_Day14unsorted_up <- data.frame(ego_Day14unsorted_up) write_xlsx(cluster_summary_Day14unsorted_up, "GO Enrichment Analysis Day0 vs. Day14.unsorted.xlsx") pdf("GO Enrichment Analysis Dot Plot Day0 vs. Day14.sorted.pdf",8,14) dotplot(ego_Day14unsorted_up, showCategory=50) dev.off() pdf("GO Enrichment Analysis Enrichmap Day0 vs. Day14.unsorted.pdf", 24, 32) emapplot(ego_Day14unsorted_up, showCategory = 50) dev.off() ###Day14_sorted Upregulated #gene symbol into IDs universe_idx_Day14sorted_up <- grch38$symbol %in% hpsc.ec.object_result_Day14.sorted_0$symbol universe_ids_Day14sorted_up <- grch38[universe_idx_Day14sorted_up,] universe_non_duplicates_Day14sorted <- which(duplicated(universe_ids_Day14sorted_up$symbol) == FALSE) universe_ids_Day14sorted_up <- universe_ids_Day14sorted_up[universe_non_duplicates_Day14sorted,] allOE_genes_Day14sorted_up <- as.character(universe_ids_Day14sorted_up$ensgene) #remove duplicates idx_Day14sorted_up <- grch38$symbol %in% hpsc.ec.object_result_Day14.sorted_0_upregulated$symbol ids_Day14sorted_up <- grch38[idx_Day14sorted_up, ] non_duplicates_Day14sorted <- which(duplicated(ids_Day14sorted_up$symbol) == FALSE) ids_Day14sorted_up <- ids_Day14sorted_up[non_duplicates_Day14sorted, ] sigOE_genes_Day14sorted_up <- as.character(ids_Day14sorted_up$ensgene) #run GO enrichment analysis ego_Day14sorted_up <- enrichGO(gene = sigOE_genes_Day14sorted_up, universe = allOE_genes_Day14sorted_up, keyType = "ENSEMBL", OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH", qvalueCutoff = 0.05, readable = TRUE) cluster_summary_Day14sorted_up <- data.frame(ego_Day14sorted_up) write_xlsx(cluster_summary_Day14sorted_up, "GO Enrichment Analysis Day0 vs. Day14.sorted.xlsx") pdf("GO Enrichment Analysis Dot Plot Day0 vs. Day14.sorted.pdf",8,14) dotplot(ego_Day14sorted_up, showCategory=50) dev.off() pdf("GO Enrichment Analysis Enrichmap Day0 vs. Day14.unsorted.pdf", 24, 32) emapplot(ego_Day14sorted_up, showCategory = 50) dev.off() ###Day8 DOWNregulated #gene symbols into ID universe_idx_Day8_down <- grch38$symbol %in% hpsc.ec.object_result_Day8_0$symbol universe_ids_Day8_down <- grch38[universe_idx_Day8_down,] universe_non_duplicates_Day8_down <- which(duplicated(universe_ids_Day8_down$symbol) == FALSE) universe_ids_Day8_down <- universe_ids_Day8_down[universe_non_duplicates_Day8_down,] allOE_genes_Day8_down <- as.character(universe_ids_Day8_down$ensgene) #remove duplicate ID idx_Day8_down <- grch38$symbol %in% hpsc.ec.object_result_Day8_0_downregulated$symbol ids_Day8_down <- grch38[idx_Day8_down, ] non_duplicates_Day8_down <- which(duplicated(ids_Day8_down$symbol) == FALSE) ids_Day8_down <- ids_Day8_down[non_duplicates_Day8_down, ] sigOE_genes_Day8_down <- as.character(ids_Day8_down$ensgene) #background dataset (all genes tested) for hypergeometric testing sigOE_genes_Day8_down <- as.character(ids_Day8_down$ensgene) #GO enrichment analysis ego_Day8_down <- enrichGO(gene = sigOE_genes_Day8_down, universe = allOE_genes_Day8_down, keyType = "ENSEMBL", OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH", qvalueCutoff = 0.05, readable = TRUE) cluster_summary_Day8_down <- data.frame(ego_Day8_down) write_xlsx(cluster_summary_Day8_down, "GO Enrichment Analysis Day0 vs. Day8 DOWN.xlsx") pdf("GO Enrichment Analysis Dot Plot Day0 vs. Day8 DOWN.pdf",8,14) dotplot(ego_Day8_down, showCategory=50) dev.off() pdf("GO Enrichment Analysis Enrichmap Day0 vs. Day8.pdf", 24, 32) emapplot(ego_Day8_down, showCategory = 50) dev.off() ##Day14_unsorted DOWNregulated #gene symbol into ID universe_idx_Day14unsorted_down <- grch38$symbol %in% hpsc.ec.object_result_Day14.unsorted_0$symbol universe_ids_Day14unsorted_down <- grch38[universe_idx_Day14unsorted_down,] universe_non_duplicates_Day14unsorted_down <- which(duplicated(universe_ids_Day14unsorted_down$symbol) == FALSE) universe_ids_Day14unsorted_down <- universe_ids_Day14unsorted_down[universe_non_duplicates_Day14unsorted_down,] allOE_genes_Day14unsorted_down <- as.character(universe_ids_Day14unsorted_down$ensgene) #remove duplicate ID idx_Day14unsorted_down <- grch38$symbol %in% hpsc.ec.object_result_Day14.unsorted_0_downregulated$symbol ids_Day14unsorted_down <- grch38[idx_Day14unsorted_down, ] non_duplicates_Day14unsorted_down <- which(duplicated(ids_Day14unsorted_down$symbol) == FALSE) ids_Day14unsorted_down <- ids_Day14unsorted_down[non_duplicates_Day14unsorted_down, ] sigOE_genes_Day14unsorted_down <- as.character(ids_Day14unsorted_down$ensgene) #background dataset (all genes tested) for hypergeometric testing sigOE_genes_Day14unsorted_down <- as.character(ids_Day14unsorted_down$ensgene) #GO Enrichment Analysis ego_Day14unsorted_down <- enrichGO(gene = sigOE_genes_Day14unsorted_down, universe = allOE_genes_Day14unsorted_down, keyType = "ENSEMBL", OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH", qvalueCutoff = 0.05, readable = TRUE) cluster_summary_Day14unsorted_down <- data.frame(ego_Day14unsorted_down) write_xlsx(cluster_summary_Day14unsorted_down, "GO Enrichment Analysis Day0 vs. Day14.unsorted Down.xlsx") pdf("GO Enrichment Analysis Dot Plot Day0 vs. Day14.sorted DOWN.pdf",8,14) dotplot(ego_Day14unsorted_down, showCategory=50) dev.off() pdf("GO Enrichment Analysis Enrichmap Day0 vs. Day14.unsorted DOWN.pdf", 24, 32) emapplot(ego_Day14unsorted_down, showCategory = 50) dev.off() ##Day14_sorted Downregulated #gene symbols into ID universe_idx_Day14sorted_down <- grch38$symbol %in% hpsc.ec.object_result_Day14.sorted_0$symbol universe_ids_Day14sorted_down <- grch38[universe_idx_Day14sorted_down,] universe_non_duplicates_Day14sorted_down <- which(duplicated(universe_ids_Day14sorted_down$symbol) == FALSE) universe_ids_Day14sorted_down <- universe_ids_Day14sorted_down[universe_non_duplicates_Day14sorted_down,] allOE_genes_Day14sorted_down <- as.character(universe_ids_Day14sorted_down$ensgene) #remove duplicate ID idx_Day14sorted_down <- grch38$symbol %in% hpsc.ec.object_result_Day14.sorted_0_downregulated$symbol ids_Day14sorted_down <- grch38[idx_Day14sorted_down, ] non_duplicates_Day14sorted_down <- which(duplicated(ids_Day14sorted_down$symbol) == FALSE) ids_Day14sorted_down <- ids_Day14sorted_down[non_duplicates_Day14sorted_down, ] sigOE_genes_Day14sorted_down <- as.character(ids_Day14sorted_down$ensgene) #GO Enrichment Analysis ego_Day14sorted_down <- enrichGO(gene = sigOE_genes_Day14sorted_down, universe = allOE_genes_Day14sorted_down, keyType = "ENSEMBL", OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH", qvalueCutoff = 0.05, readable = TRUE) cluster_summary_Day14sorted_down <- data.frame(ego_Day14sorted_down) write_xlsx(cluster_summary_Day14sorted_down, "GO Enrichment Analysis Day0 vs. Day14.sorted Down.xlsx") pdf("GO Enrichment Analysis Dot Plot Day0 vs. Day14.sorted DOWN.pdf",8,14) dotplot(ego_Day14sorted_down, showCategory=50) dev.off() pdf("GO Enrichment Analysis Enrichmap Day0 vs. Day14.unsorted DOWN.pdf", 24, 32) emapplot(ego_Day14sorted_down, showCategory = 50) dev.off()