毕设代码
other-other毕设用到的代码汇总
与最初版本的主要区别:
- 直接用最新的hERV计数结果构建Seurat对象,而不是将新数据覆盖旧数据
- 筛选hERV位点时不根据细胞表达比例,保留除”–no-feature”以外的所有hERV位点,让后面按family聚合时更准确
- 使用
SCTransform+integration流程做数据预处理,尽量和论文对齐,尽量消除测序深度的影响 - 修改“构建hERV位点-家族映射”代码,subfamily仅从gtf文件的gene行提取,不再自己根据subfamily名称合并家族,所有的subfamily和class都从gtf文件中获取,且最后只留这两个assay(和原本的位点assay)
- 为所有的DE分析添加/修改协变量为”nCount_RNA”+”percent_mito”,方法使用MAST
计数
使用的工具:
- parallel-fastq-dump
- STAR
- stellarscope
数据下载:
计数:
构建Seurat对象和数据预处理
# Seurat
library(Seurat)
library(glmGamPoi)
library(MAST)
# tidyverse+画图相关
library(tidyverse)
library(readxl)
library(ggpubr)
library(ggrepel)
library(patchwork)
library(clustree)
library(pheatmap)
# 矩阵/df处理
library(Matrix)
library(data.table)
# GO
library(clusterProfiler)
library(org.Hs.eg.db)
# 并行运算
library(future)
plan("multiprocess", workers = 4) # 启用并行处理(FindIntegrationAnchors步骤可能需要数小时),但需考虑服务器兼容性(例如内存不足可能导致Rsession卡死)
options(future.globals.maxSize = 100000 * 1024^5) # 如果出现有关future的内存报错就执行这句
# 结果图目录
res_dir <- "/public/home/GENE_proc/wth/GSE157827/res/"
构建Seurat对象:
data_root <- "/public/home/GENE_proc/wth/GSE157827/mtx/"
meta_file <- "/public/home/GENE_proc/wth/GSE157827/raw_data/metadata/metadata.xlsx"
sample_file <- "/public/home/GENE_proc/wth/GSE157827/raw_data/metadata/sample_info.xlsx"
# 读取metadata
meta_map <- read_xlsx(meta_file)
sample_info <- read_xlsx(sample_file)
names(meta_map) <- make.names(names(meta_map))
names(sample_info) <- make.names(names(sample_info))
gsm_meta <- meta_map %>% # GSM级别信息
transmute(
GSM = as.character(sample_id),
group = as.character(diagnosis),
orig.ident = as.character(individuals),
tissue = as.character(tissue)
) %>%
distinct(GSM, .keep_all = TRUE)
donor_info <- sample_info %>% # 样本信息
mutate(orig.ident = as.character(sample_id)) %>%
select(-sample_id)
gsm_info <- gsm_meta %>% # 合并
left_join(donor_info, by = "orig.ident")
# 检查有无NA
any(is.na(gsm_info$orig.ident)) # FALSE
sum(is.na(gsm_info[[names(donor_info)[1]]])) # 0
nrow(gsm_info) # 21
gsm2indv <- setNames(gsm_info$orig.ident, gsm_info$GSM) # GSM -> orig.ident
# 读取计数矩阵
normalize_barcode <- function(x) { # 如果已经有`-数字`后缀(比如`-1`)就保留,否则补`-1`
ifelse(grepl("-\\d+$", x), x, paste0(x, "-1"))
}
read_one_gsm <- function(gsm, gsm_info, data_root) {
gene_dir <- file.path(data_root, gsm, "gene")
herv_dir <- file.path(data_root, gsm, "hERV_new")
# 读取矩阵
gene_counts <- ReadMtx(
mtx = file.path(gene_dir, "counts.mtx"),
features = file.path(gene_dir, "features.tsv"),
cells = file.path(gene_dir, "barcodes.tsv"),
unique.features = TRUE
)
herv_counts <- ReadMtx(
mtx = file.path(herv_dir, "counts.mtx"),
features = file.path(herv_dir, "features.tsv"),
cells = file.path(herv_dir, "barcodes.tsv"),
feature.column = 1,
unique.features = TRUE
)
# 取该GSM的样本信息
info <- gsm_info %>% filter(GSM == gsm)
indv <- info$orig.ident
# 使用原论文作者的细胞名格式
gene_bc <- normalize_barcode(colnames(gene_counts))
herv_bc <- normalize_barcode(colnames(herv_counts))
colnames(gene_counts) <- paste(gsm, indv, gene_bc, sep = "_")
colnames(herv_counts) <- paste(gsm, indv, herv_bc, sep = "_")
# 对齐两套assay的细胞集合
common_cells <- intersect(colnames(gene_counts), colnames(herv_counts))
gene_counts <- gene_counts[, common_cells, drop = FALSE]
herv_counts <- herv_counts[, common_cells, drop = FALSE]
# 构建Seurat对象
seu <- CreateSeuratObject(
counts = gene_counts,
assay = "RNA",
project = "GSE157827"
)
seu[["HERV"]] <- CreateAssayObject(counts = herv_counts)
# 添加metadata
seu$GSM <- gsm
seu$orig.ident <- indv
seu$group <- info$group
seu$tissue <- info$tissue
extra_cols <- setdiff(colnames(info), c("GSM", "group", "orig.ident", "tissue"))
for (cc in extra_cols) {
seu[[cc]] <- info[[cc]][1]
}
return(seu)
}
gsm_ids <- list.dirs(data_root, full.names = FALSE, recursive = FALSE)
seu_list <- list()
for (gsm in gsm_ids) {
tmp <- read_one_gsm(gsm, gsm_info = gsm_info, data_root = data_root)
if (!is.null(tmp)) {
seu_list[[gsm]] <- tmp
}
}
seu <- Reduce(function(x, y) merge(x, y), seu_list)
rm(seu_list)
# 合并每个layers
seu <- JoinLayers(seu, assay = "RNA")
Layers(seu, assay = "RNA") # 检查一下layers:"counts"
# 线粒体/核糖体比例
seu <- PercentageFeatureSet(seu, pattern = "^MT-", col.name = "percent_mito")
seu <- PercentageFeatureSet(seu, pattern = "^RP[SL]", col.name = "percent_ribo")
seu$HERV_fraction <- seu$nCount_HERV / (seu$nCount_HERV + seu$nCount_RNA)
# 查看结果情况
seu
colnames(seu@meta.data)
table(seu$orig.ident)
seu$group <- factor(seu$group, levels = c("NC", "AD"))
# 保存数据
saveRDS(
seu,
file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827.rds",
compress = "xz"
)
质控:
seu <- readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827.rds")
feats <- c("nFeature_RNA", "nCount_RNA")
# 过滤前
p_filt_b_1 <- VlnPlot(
seu, features = feats,
pt.size = 0,
ncol = 2,
group.by = "group"
) + NoLegend()
p_filt_b_2 <- VlnPlot(
seu, features = feats,
pt.size = 0,
ncol = 1,
group.by = "orig.ident"
) + NoLegend()
# HERV
feats <- c("nCount_HERV", "nFeature_HERV")
p_filt_b_3 <- VlnPlot(
seu, features = feats,
pt.size = 0,
ncol = 2,
group.by = "group"
) + NoLegend()
p_filt_b_4 <- VlnPlot(
seu, features = feats,
pt.size = 0,
ncol = 1,
group.by = "orig.ident"
) + NoLegend()
p_filt_b_7 <- VlnPlot(
seu, features = "HERV_fraction",
pt.size = 0,
ncol = 1,
group.by = "group"
) + NoLegend()
p_filt_b_8 <- VlnPlot(
seu, features = "HERV_fraction",
pt.size = 0,
ncol = 1,
group.by = "orig.ident"
) + NoLegend()
feats <- c("percent_mito", "percent_ribo")
p_filt_b_5 <- VlnPlot(
seu, features = feats,
pt.size = 0,
ncol = 2,
group.by = "group"
) + NoLegend()
p_filt_b_6 <- VlnPlot(
seu, features = feats,
pt.size = 0,
ncol = 1,
group.by = "orig.ident"
) + NoLegend()
pdf(file.path(res_dir, "before_filter.pdf"), width = 20, height = 20)
p <- (p_filt_b_1 / p_filt_b_2) | (p_filt_b_3 / p_filt_b_4) | (p_filt_b_5 / p_filt_b_6) | (p_filt_b_7 / p_filt_b_8)
print(p)
dev.off()
# 细胞阈值
retained_c_umi_low <- seu$nFeature_RNA > 500
retained_c_umi_high <- seu$nFeature_RNA < 8000
retained_c_count_low <- seu$nCount_RNA > 1000
retained_c_count_high <- seu$nCount_RNA < 40000
retained_c_mito <- seu$percent_mito < 14
retained_c_ribo <- seu$percent_ribo < 3
keep_cells <- retained_c_mito & retained_c_ribo & retained_c_umi_low & retained_c_umi_high
scRNA_cells <- subset(seu, cells = Cells(seu)[keep_cells])
counts_rna <- GetAssayData(scRNA_cells, assay = "RNA", slot = "counts")
feature_rowsum <- Matrix::rowSums(counts_rna > 0)
# 基因阈值
retained_f_low <- feature_rowsum > ncol(scRNA_cells) * 0.005
genes_keep <- names(feature_rowsum)[retained_f_low]
counts_herv <- GetAssayData(scRNA_cells, assay = "HERV", slot = "counts")
# hERV位点:去掉--no-feature
herv_keep <- names(herv_rowsum)
herv_keep <- herv_keep[! herv_keep %in% c("--no-feature", "__no_feature")]
# 过滤
scRNA_cells[["RNA"]] <- subset(scRNA_cells[["RNA"]], features = genes_keep)
scRNA_cells[["HERV"]] <- subset(scRNA_cells[["HERV"]], features = herv_keep)
scRNA_cells
table(scRNA_cells$group)
table(scRNA_cells$orig.ident)
rna_counts <- GetAssayData(scRNA_cells, assay = "RNA", slot = "counts")
scRNA_cells$nCount_RNA <- Matrix::colSums(rna_counts)
scRNA_cells$nFeature_RNA <- Matrix::colSums(rna_counts > 0)
herv_counts <- GetAssayData(scRNA_cells, assay = "HERV", slot = "counts")
scRNA_cells$nCount_HERV <- Matrix::colSums(herv_counts)
scRNA_cells$nFeature_HERV <- Matrix::colSums(herv_counts > 0)
scRNA_cells$HERV_fraction <- scRNA_cells$nCount_HERV / (scRNA_cells$nCount_HERV + scRNA_cells$nCount_RNA)
# 过滤后
feats <- c("nFeature_RNA", "nCount_RNA")
p_filt_a_1 <- VlnPlot(
scRNA_cells, features = feats,
pt.size = 0,
ncol = 2,
group.by = "group"
) + NoLegend()
p_filt_a_2 <- VlnPlot(
scRNA_cells, features = feats,
pt.size = 0,
ncol = 1,
group.by = "orig.ident"
) + NoLegend()
# HERV
feats <- c("nCount_HERV", "nFeature_HERV")
p_filt_a_3 <- VlnPlot(
scRNA_cells, features = feats,
pt.size = 0,
ncol = 2,
group.by = "group"
) + NoLegend()
p_filt_a_4 <- VlnPlot(
scRNA_cells, features = feats,
pt.size = 0,
ncol = 1,
group.by = "orig.ident"
) + NoLegend()
p_filt_a_7 <- VlnPlot(
scRNA_cells, features = "HERV_fraction",
pt.size = 0,
ncol = 1,
group.by = "group"
) + NoLegend()
p_filt_a_8 <- VlnPlot(
scRNA_cells, features = "HERV_fraction",
pt.size = 0,
ncol = 1,
group.by = "orig.ident"
) + NoLegend()
feats <- c("percent_mito", "percent_ribo")
p_filt_a_5 <- VlnPlot(
scRNA_cells, features = feats,
pt.size = 0,
ncol = 2,
group.by = "group"
) + NoLegend()
p_filt_a_6 <- VlnPlot(
scRNA_cells, features = feats,
pt.size = 0,
ncol = 1,
group.by = "orig.ident"
) + NoLegend()
pdf(file.path(res_dir, "after_filter.pdf"), width = 20, height = 20)
p <- (p_filt_a_1 / p_filt_a_2) | (p_filt_a_3 / p_filt_a_4) | (p_filt_a_5 / p_filt_a_6) | (p_filt_a_7 / p_filt_a_8)
print(p)
dev.off()
# 保存数据
saveRDS(
scRNA_cells,
file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_filt.rds",
compress = "xz"
)
SCTransform+integration:
seu <- readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_filt.rds")
DefaultAssay(seu) <- "RNA"
obj.list <- SplitObject(seu, split.by = "orig.ident") # 按样本拆分
obj.list <- lapply(obj.list, function(x) { # 每个样本分别做SCTransform
SCTransform(
x,
assay = "RNA",
new.assay.name = "SCT",
vars.to.regress = "percent_mito",
verbose = FALSE,
method = "glmGamPoi"
)
})
features <- SelectIntegrationFeatures( # 选integration features
object.list = obj.list,
nfeatures = 3000
)
obj.list <- PrepSCTIntegration(
object.list = obj.list,
anchor.features = features,
verbose = FALSE
)
anchors <- FindIntegrationAnchors( # 找anchors
object.list = obj.list,
normalization.method = "SCT",
anchor.features = features,
verbose = FALSE
)
seu <- IntegrateData( # 整合
anchorset = anchors,
normalization.method = "SCT",
verbose = FALSE
)
DefaultAssay(seu) <- "integrated" # 用integrated assay做降维和聚类
seu <- RunPCA(seu, npcs = 50, verbose = FALSE)
pdf(file.path(res_dir, "ElbowPlot_all.pdf"))
p <- ElbowPlot(seu, ndims = 50)
print(p)
dev.off()
pc.use <- 1:30 # 选主成分数量
seu <- RunUMAP(seu, dims = pc.use, verbose = FALSE)
p_dim_group <- DimPlot(seu, group.by = "group")
p_dim_gsm <- DimPlot(seu, group.by = "orig.ident", raster = TRUE)
seu <- FindNeighbors(seu, dims = pc.use, verbose = FALSE)
seu <- FindClusters(seu, resolution = c(0.01, 0.05, 0.1, 0.2, 0.5))
p_cutree <- clustree(seu@meta.data, prefix = "RNA_snn_res.")
resolution <- "RNA_snn_res.0.05" # 选分辨率
Idents(seu) <- seu[[resolution]]
p_dim_cluster <- DimPlot(seu, label = TRUE)
pdf(file.path(res_dir, "data_preprocess.pdf"), width = 16, height = 16)
p <- (p_dim_group | p_dim_gsm) / (p_cutree | p_dim_cluster)
print(p)
dev.off()
细胞类型注释:
DefaultAssay(seu) <- "SCT"
# marker基因
gene_list <- list(
Astro = c("AQP4", "ADGRV1", "GPC5", "RYR3"),
Endo = c("CLDN5", "ABCB1", "EBF1"),
Excit = c("CAMK2A", "CBLN2", "LDB2"),
Inhib = c("GAD1", "LHFPL3", "PCDH15"),
Mic = c("C3", "LRMDA", "DOCK8"),
Oligo = c("MBP", "PLP1", "ST18")
)
# 在各个聚类中的表达量
pdf(file.path(res_dir, "cluster_marker_gene.pdf"), width = 16, height = 12)
p <- DotPlot(seu, features = gene_list) +
theme(axis.text.x = element_text(angle = 60, vjust = 0.8, hjust = 0.8))
print(p)
dev.off()
# 计算每个cluster对每个细胞类型的marker-set的平均表达
score_by_cluster <- function(seu, gene_list, group_col) {
out <- lapply(names(gene_list), function(ct) {
feats <- intersect(gene_list[[ct]], rownames(seu))
if (length(feats) == 0) return(NULL)
avg <- AverageExpression(
seu,
assays = "RNA",
features = feats,
group.by = group_col,
slot = "data",
verbose = FALSE
)$RNA # features x clusters
data.frame(
cluster = colnames(avg),
celltype = ct,
score = colMeans(avg, na.rm = TRUE),
stringsAsFactors = FALSE
)
}) |> bind_rows()
tab <- out |>
pivot_wider(names_from = celltype, values_from = score) |>
arrange(as.numeric(as.character(cluster)))
mat <- as.matrix(tab[, intersect(names(gene_list), colnames(tab)), drop = FALSE])
tab$pred <- colnames(mat)[max.col(mat, ties.method = "first")]
tab
}
score_tab <- score_by_cluster(seu, gene_list, resolution)
View(score_tab)
# 指定细胞类型
cl_Astro <- c("2")
cl_Endo <- c("9")
cl_Mic <- c("7")
cl_Oligo <- c("0")
cl_Inhib <- c("4", "5", "6", "11")
# 其它细胞归为Excit
cl <- as.character(seu[[resolution]][, 1])
seu$celltype <- ifelse(cl %in% cl_Astro, "Astro",
ifelse(cl %in% cl_Endo, "Endo",
ifelse(cl %in% cl_Mic, "Mic",
ifelse(cl %in% cl_Oligo, "Oligo",
ifelse(cl %in% cl_Inhib, "Inhib", "Excit")))))
seu$celltype <- factor(seu$celltype, levels = c("Astro","Endo","Excit","Inhib","Mic","Oligo"))
Idents(seu) <- seu$celltype
table(seu$celltype, seu$group)
# 保存数据
saveRDS(
seu,
file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_celltype.rds",
compress = "xz"
)
# 画图展示
ct_cols <- c(
Astro = "#1b9e77",
Endo = "#d95f02",
Excit = "#7570b3",
Inhib = "#e7298a",
Mic = "#66a61e",
Oligo = "#e6ab02"
)
dim_plot <- DimPlot( # 细胞类型在umap上的分布
seu,
reduction = "umap",
cols = ct_cols,
label = FALSE
)
ct_stat <- as.data.frame(table(seu$celltype)) # 统计各细胞类型的细胞数和比例
colnames(ct_stat) <- c("celltype", "total")
ct_stat$celltype <- factor(ct_stat$celltype, levels = levels(seu@active.ident))
ct_stat$percentage <- 100 * ct_stat$total / sum(ct_stat$total)
celltype_bar <- ggplot(ct_stat, aes(x = celltype, y = percentage, fill = celltype)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_manual(values = ct_cols) +
theme_classic() +
theme(axis.title.x = element_blank(), legend.position = "none") +
ylab("Proportion of total nuclei (%)")
genes_paper <- c( # 各细胞类型marker热图
"SLC1A2","ADGRV1","GPC5","RYR3","GFAP",
"CLDN5","FLT1","ABCB1","EBF1","MT2A",
"RALYL","KCNIP4","CBLN2","LDB2","KCNQ5",
"NXPH1","LHFPL3","PCDH15","GRIK1","ADARB2",
"LRMDA","DOCK8","ARHGAP24","ARHGAP15","PLXDC2",
"ST18","PLP1","CTNNA3","MBP","PIP4K2A"
)
genes_use <- intersect(genes_paper, rownames(seu))
# seu <- ScaleData(seu, features = genes_use, verbose = FALSE)
heatmap <- DoHeatmap(
seu,
features = genes_use,
disp.min = -1.5,
disp.max = 1.5,
angle = 0,
group.colors = unname(ct_cols)
) + scale_fill_gradientn(
colors = c("#f100ef", "black", "#f0e442"),
limits = c(-1.5, 1.5)
)
pdf(file.path(res_dir, "celltype.pdf"), width = 24, height = 16)
p <- dim_plot + celltype_bar / heatmap
print(p)
dev.off()
计算差异基因:
scRNA <- readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_celltype.rds")
DefaultAssay(scRNA) <- "SCT"
scRNA$group <- factor(scRNA$group, levels = c("NC","AD"))
scRNA$celltype <- factor(
scRNA$celltype,
levels = c("Astro","Endo","Excit","Inhib","Mic","Oligo")
)
Idents(scRNA) <- scRNA$celltype
balance_cells <- function(obj, ct){
sub <- subset(obj, idents = ct)
ad <- WhichCells(sub, expression = group == "AD")
nc <- WhichCells(sub, expression = group == "NC")
if(length(ad) > 2000){
n <- min(length(ad), length(nc))
sub[, c(sample(ad, n), sample(nc, n))]
} else {
sub
}
}
cts <- levels(scRNA$celltype)
obj_bal <- merge(x = balance_cells(scRNA, cts[1]),
y = lapply(cts[-1], \(ct) balance_cells(scRNA, ct)))
obj_bal$celltype <- factor(
obj_bal$celltype,
levels = c("Astro","Endo","Excit","Inhib","Mic","Oligo")
)
obj_bal <- JoinLayers(obj_bal, assay = "SCT")
obj_bal$compare <- paste(obj_bal$group, obj_bal$celltype, sep = "_")
scRNA$compare <- paste(scRNA$group, scRNA$celltype, sep = "_")
ct <- levels(obj_bal$celltype)
all_markers <- lapply(ct, function(x){
message("Running: ", x)
FindMarkers(
obj_bal,
group.by = "compare",
ident.1 = paste0("AD_", x),
ident.2 = paste0("NC_", x),
test.use = "MAST",
latent.vars = c("nCount_RNA", "percent_mito")
logfc.threshold = 0.25,
min.pct = 0.1
)
})
names(all_markers) <- ct
lapply(all_markers, nrow)
all_markers_sig <- lapply(all_markers, function(df){
df2 <- subset(
df,
p_val_adj < 0.1
)
return(df2)
})
View(all_markers_sig[["Astro"]])
save(all_markers_sig, file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_all_markers_sig.RData")
hERV按subfamily/class聚合+标准化:
# 构建位点-家族映射
get_attr <- function(x, key) {
pat <- paste0(key, ' "([^"]+)"')
m <- regexec(pat, x, perl = TRUE)
rr <- regmatches(x, m)
out <- rep(NA_character_, length(x))
ok <- lengths(rr) >= 2
out[ok] <- vapply(rr[ok], `[`, character(1), 2)
out
}
gtf_path <- "C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\metadata\\hERV.gtf"
lines <- readLines(gtf_path, warn = FALSE)
lines <- lines[!grepl("^\\s*#", lines)]
lines <- lines[nzchar(lines)]
gtf <- data.table::fread(
text = lines,
sep = "\t",
header = FALSE,
quote = "",
fill = TRUE,
data.table = TRUE
)
gtf <- gtf[, 1:9]
setnames(gtf, c("seqname","source","feature","start","end","score","strand","frame","attribute"))
gtf[, gene_id := get_attr(attribute, "gene_id")]
gtf[, locus := get_attr(attribute, "locus")]
gtf[, intModel := get_attr(attribute, "intModel")]
gtf[, repName := get_attr(attribute, "repName")]
gtf[, repFamily := get_attr(attribute, "repFamily")]
gtf[, geneRegion := get_attr(attribute, "geneRegion")]
gene_dt <- gtf[feature == "gene", .(locus_id = gene_id, intModel)][!is.na(locus_id)] # gene行:intModel(subfamily)
exon_dt <- gtf[feature == "exon", .(locus_id = gene_id, repFamily)][!is.na(locus_id)] # exon行:repFamily(class)
repfam_dt <- exon_dt[!is.na(repFamily), # 每个位点的class:取出现最多的repFamily
.(repFamily = {
x <- repFamily
ux <- unique(x); ux[which.max(tabulate(match(x, ux)))]
}),
by = locus_id]
map <- merge(gene_dt, repfam_dt, by = "locus_id", all = TRUE)
map[, subfamily := intModel]
map[, class := repFamily]
map$subfamily_raw <- map$subfamily
map$subfamily <- gsub("-int", "", map$subfamily) # 去掉结尾的-int标记
herv_map <- unique(map[, .(locus_id, subfamily_raw, subfamily, class)])
write.csv(herv_map, file = "C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\metadata\\hERV_locus2family.csv", row.names = F)
# 聚合
seu <- readRDS("C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\my_data\\GSE157827_celltype.rds")
herv_map <- read.csv("C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\metadata\\hERV_locus2family.csv")
herv_map$locus_id <- gsub("_", "-", herv_map$locus_id)
herv_counts <- GetAssayData(seu, assay = "HERV", slot = "counts")
aggregate_herv_counts <- function(herv_counts, map, group_by = c("subfamily", "class")) {
group_by <- match.arg(group_by)
keep_loci <- intersect(rownames(herv_counts), map$locus_id)
map2 <- map[match(keep_loci, map$locus_id), ]
map2 <- map2[!is.na(map2[[group_by]]) & nzchar(map2[[group_by]]), ]
mat <- herv_counts[map2$locus_id, , drop = FALSE]
grp <- map2[[group_by]]
f <- factor(grp, levels = unique(grp))
G <- sparseMatrix(
i = seq_along(f),
j = as.integer(f),
x = 1,
dims = c(length(f), nlevels(f)),
dimnames = list(rownames(mat), levels(f))
)
out <- Matrix::t(G) %*% mat
out
}
herv_subfamily_counts <- aggregate_herv_counts(herv_counts, herv_map, group_by = "subfamily")
herv_class_counts <- aggregate_herv_counts(herv_counts, herv_map, group_by = "class")
# 将新计数矩阵的细胞与原来的对齐
align_to_cells <- function(mat, cells){
mat <- mat[, cells, drop = FALSE]
mat
}
cells <- colnames(seu)
herv_counts <- align_to_cells(herv_counts, cells)
herv_subfamily_counts <- align_to_cells(herv_subfamily_counts, cells)
herv_class_counts <- align_to_cells(herv_class_counts, cells)
# 加到seu上
replace_assay <- function(seu, assay_name, counts){
counts <- as(counts, "dgCMatrix")
seu[[assay_name]] <- CreateAssayObject(counts = counts)
seu
}
seu <- replace_assay(seu, "HERV_family", herv_subfamily_counts)
seu <- replace_assay(seu, "HERV_class", herv_class_counts)
# 标准化
normalize_herv_by_rna_depth <- function(seu, assay, scale.factor = 10000) {
rna_counts <- GetAssayData(seu, assay = "RNA", slot = "counts")
herv_counts <- GetAssayData(seu, assay = assay, slot = "counts")
lib_rna <- Matrix::colSums(rna_counts)
herv_norm <- t(t(herv_counts) / lib_rna) * scale.factor
herv_norm@x <- log1p(herv_norm@x)
seu <- SetAssayData(seu, assay = assay, slot = "data", new.data = herv_norm)
return(seu)
}
seu <- normalize_herv_by_rna_depth(seu, "HERV")
seu <- normalize_herv_by_rna_depth(seu, "HERV_family")
seu <- normalize_herv_by_rna_depth(seu, "HERV_class")
# 简单验证一下
herv_subfamily_counts[, 1]
sum(herv_subfamily_counts[, 1]) # 都是45
herv_class_counts[, 1]
sum(herv_class_counts[, 1])
herv_counts[, 1]
sum(herv_counts[, 1])
herv_subfamily_counts[, 2]
sum(herv_subfamily_counts[, 2]) # 都是15
herv_class_counts[, 2]
sum(herv_class_counts[, 2])
herv_counts[, 2]
sum(herv_counts[, 2])
herv_data <- GetAssayData(seu, assay = "HERV", slot = "data")
herv_data[, 1]
HERV_family_data <- GetAssayData(seu, assay = "HERV_family", slot = "data")
HERV_family_data[, 1]
HERV_class_data <- GetAssayData(seu, assay = "HERV_class", slot = "data")
HERV_class_data[, 1]
# 保存数据
saveRDS(
seu,
file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_final.rds",
compress = "xz"
)