487 words

Computational Genomics Practical (TT 2020)

1 Quiz 5

1.1 Heatmaps (using ggplot2 in R)

Loading Packages

library(R.matlab)
## R.matlab v3.6.2 (2018-09-26) successfully loaded. See ?R.matlab for help.
## 
## Attaching package: 'R.matlab'
## The following objects are masked from 'package:base':
## 
##     getOption, isOpen
library(reshape2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Loading Data

setwd('/Users/tianyishi/Documents/GitHub/ox/content/lab/src/Y2T3W7-genomics')
H3K4me3 <- readMat('H3K4me3_ChIP_seq_glucose_gene_levels.mat')$H3K4me3.ChIP.seq.glucose.gene.levels
MNase <- readMat('MNase_seq_glucose_gene_levels.mat')$MNase.seq.glucose.gene.levels
NET <- readMat('NET_seq_glucose_gene_levels_sense_strand.mat')$NET.seq.glucose.gene.levels.sense.strand
NET[NET < -0.5 | NET > 5] = NA
plot_heatmap <- function(mat, title) {
  return(
    mat %>% 
    melt() %>% 
    ggplot(aes(Var2, Var1, fill=value))+
      geom_tile()+
      scale_x_continuous(expand = c(0,0), breaks = c(1, 501, 1001, 1501), labels = c('-500', 'TSS', '+500', '+1000'))+
      scale_y_continuous(expand = c(0,0))+
      scale_fill_gradient(low="blue", high = "yellow")+
      labs(title=title,
           x='Position relative to TSS (bp)',
           y='Gene number')
  )
}
plot_heatmap(H3K4me3, 'H3K4me3 ChIP-seq Glucose')

plot_heatmap(MNase, "MNase")

plot_heatmap(NET, 'NET')

1.2 Heatmaps in Python (with seaborn)

Plotting above heatmaps using ggplot2 in R is very slow. Using seaborn in Python is faster (plots not displayed):

from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

MNase = np.array(loadmat("MNase_seq_glucose_gene_levels.mat")["MNase_seq_glucose_gene_levels"])
H3K4me3 = np.array(loadmat("H3K4me3_ChIP_seq_glucose_gene_levels.mat")["H3K4me3_ChIP_seq_glucose_gene_levels"])
NET = readMat('NET_seq_glucose_gene_levels_sense_strand.mat')['NET_seq_glucose_gene_levels_sense_strand.mat']

def plot_heatmap(mat, title):
    p = sns.heatmap(mat)
    p.set(
        title=title,
        xlabel='Position relative to TSS (bp)', ylabel='Gene number',
        xticks=[0, 500, 1000, 1500],
        xticklabels=["-500", "TSS", "+500", "+1000"],
        yticks=[0, 999, 1999, 2999, 3999, 4999],
        yticklabels=[1, 1000, 2000, 3000, 4000, 5000],
    )
    return p

plot_heatmap(MNase, "MNase seq Glucose")
plt.show()
plot_heatmap(H3K4me3, 'H3K4me3 ChIP-seq Glucose')
plt.show()
plot_heatmap(NET, 'NET-seq Glucose sense strand')
plt.show()

1.3 Average Gene Profile

xylabs <- labs(x='Position relative to TSS (bp)', y='average NET-seq level')
xaxis <- scale_x_continuous(breaks = c(0, 500, 1000, 1500), labels = c('-500', 'TSS', '+500', '+1000'))

# Average gene profile MNase-seq Glucose
tibble(x=0:1500, y=apply(MNase, 2, mean)) %>% 
  ggplot(aes(x, y))+
  geom_line()+
  xaxis+
  xylabs+
  labs(title='Average gene profile MNase-seq Glucose')

# NET-seq data highest and lowest H3K4me3 comparison
tibble() %>% 
  bind_rows(tibble(x=0:1500, y=apply(NET[1:501,], 2, function(x){mean(x, na.rm = TRUE)})) %>% add_column(`H3K4me3 level`='high')) %>% 
  bind_rows(tibble(x=0:1500, y=apply(NET[4501:5000,], 2, function(x){mean(x, na.rm = TRUE)})) %>% add_column(`H3K4me3 level`='low')) %>%
  ggplot(aes(x, y, color=`H3K4me3 level`))+
    geom_line()+
    xylabs+xaxis+
    labs(title='NET-seq data highest and lowest H3K4me3 comparison')

2 Quiz 8

2.1 Matlab

pos=yeast_gene_positions(:,[1,2,5])
% three colomns correspond to chromosome number, direction and TSS
avgs = []
i = 1
for p = pos.'
  chr = NET_seq_data{p(1) + 16 * (1 - p(2))};
  if p(2)==1
    seq = chr(p(3):(p(3) + 299), :);
  else
    seq = chr((p(3) - 299):p(3), :);
  end
  avgs(i,:) = mean(seq);
  i = i + 1;
end
corr(avgs,yeast_gene_measurements(:,1))

2.2 Using R

setwd('/Users/tianyishi/Documents/GitHub/ox/content/lab/src/Y2T3W7-genomics')
pos <- readMat('yeast_gene_positions.mat')$yeast.gene.positions[,c(1,2,5)]
net <- lapply(readMat('NET_seq_genome_wide_data.mat')$NET.seq.data, as_vector)
clc <- apply(pos, 1, function(x){
  mean(net[[x[1] + (1-x[2])*16]][x[3]:(x[3]+ifelse(x[2]==0, -1, 1)*299)])
})
exp <- readMat('yeast_gene_measurements.mat')$yeast.gene.measurements[,1]

plot(clc, exp)

cor(clc, exp)
## [1] 0.8281284