毕设代码

other-other

毕设用到的代码汇总

与最初版本的主要区别：

直接用最新的hERV计数结果构建Seurat对象，而不是将新数据覆盖旧数据
筛选hERV位点时不根据细胞表达比例，保留除”–no-feature”以外的所有hERV位点，让后面按family聚合时更准确
使用SCTransform+integration流程做数据预处理，尽量和论文对齐，尽量消除测序深度的影响
修改“构建hERV位点-家族映射”代码，subfamily仅从gtf文件的gene行提取，不再自己根据subfamily名称合并家族，所有的subfamily和class都从gtf文件中获取，且最后只留这两个assay（和原本的位点assay）
为所有的DE分析添加/修改协变量为”nCount_RNA”+”percent_mito”，方法使用MAST

计数

使用的工具：

parallel-fastq-dump
STAR
stellarscope

数据下载：

计数：

构建Seurat对象和数据预处理

# Seurat
library(Seurat)
library(glmGamPoi)
library(MAST)
# tidyverse+画图相关
library(tidyverse)
library(readxl)
library(ggpubr)
library(ggrepel)
library(patchwork)
library(clustree)
library(pheatmap)
# 矩阵/df处理
library(Matrix)
library(data.table)
# GO
library(clusterProfiler)
library(org.Hs.eg.db)
# 并行运算
library(future)
plan("multiprocess", workers = 4)  # 启用并行处理（FindIntegrationAnchors步骤可能需要数小时），但需考虑服务器兼容性（例如内存不足可能导致Rsession卡死）
options(future.globals.maxSize = 100000 * 1024^5)  # 如果出现有关future的内存报错就执行这句
# 结果图目录
res_dir <- "/public/home/GENE_proc/wth/GSE157827/res/"

构建Seurat对象：

data_root <- "/public/home/GENE_proc/wth/GSE157827/mtx/"
meta_file <- "/public/home/GENE_proc/wth/GSE157827/raw_data/metadata/metadata.xlsx"
sample_file <- "/public/home/GENE_proc/wth/GSE157827/raw_data/metadata/sample_info.xlsx"
# 读取metadata
meta_map <- read_xlsx(meta_file)
sample_info <- read_xlsx(sample_file)
names(meta_map) <- make.names(names(meta_map))
names(sample_info) <- make.names(names(sample_info))
gsm_meta <- meta_map %>%  # GSM级别信息
  transmute(
    GSM = as.character(sample_id),
    group = as.character(diagnosis),
    orig.ident = as.character(individuals),
    tissue = as.character(tissue)
  ) %>%
  distinct(GSM, .keep_all = TRUE)
donor_info <- sample_info %>%  # 样本信息
  mutate(orig.ident = as.character(sample_id)) %>%
  select(-sample_id)
gsm_info <- gsm_meta %>%  # 合并
  left_join(donor_info, by = "orig.ident")
# 检查有无NA
any(is.na(gsm_info$orig.ident))  # FALSE
sum(is.na(gsm_info[[names(donor_info)[1]]]))  # 0
nrow(gsm_info)  # 21
gsm2indv <- setNames(gsm_info$orig.ident, gsm_info$GSM)  # GSM -> orig.ident
# 读取计数矩阵
normalize_barcode <- function(x) {  # 如果已经有`-数字`后缀（比如`-1`）就保留，否则补`-1`
  ifelse(grepl("-\\d+$", x), x, paste0(x, "-1"))
}
read_one_gsm <- function(gsm, gsm_info, data_root) {
  gene_dir <- file.path(data_root, gsm, "gene")
  herv_dir <- file.path(data_root, gsm, "hERV_new")
  # 读取矩阵
  gene_counts <- ReadMtx(
    mtx = file.path(gene_dir, "counts.mtx"),
    features = file.path(gene_dir, "features.tsv"),
    cells = file.path(gene_dir, "barcodes.tsv"),
    unique.features = TRUE
  )
  herv_counts <- ReadMtx(
    mtx = file.path(herv_dir, "counts.mtx"),
    features = file.path(herv_dir, "features.tsv"),
    cells = file.path(herv_dir, "barcodes.tsv"),
    feature.column = 1,
    unique.features = TRUE
  )
  # 取该GSM的样本信息
  info <- gsm_info %>% filter(GSM == gsm)
  indv <- info$orig.ident
  # 使用原论文作者的细胞名格式
  gene_bc <- normalize_barcode(colnames(gene_counts))
  herv_bc <- normalize_barcode(colnames(herv_counts))
  colnames(gene_counts) <- paste(gsm, indv, gene_bc, sep = "_")
  colnames(herv_counts) <- paste(gsm, indv, herv_bc, sep = "_")
  # 对齐两套assay的细胞集合
  common_cells <- intersect(colnames(gene_counts), colnames(herv_counts))
  gene_counts <- gene_counts[, common_cells, drop = FALSE]
  herv_counts <- herv_counts[, common_cells, drop = FALSE]
  # 构建Seurat对象
  seu <- CreateSeuratObject(
    counts = gene_counts,
    assay = "RNA",
    project = "GSE157827"
  )
  seu[["HERV"]] <- CreateAssayObject(counts = herv_counts)
  # 添加metadata
  seu$GSM <- gsm
  seu$orig.ident <- indv
  seu$group <- info$group
  seu$tissue <- info$tissue
  extra_cols <- setdiff(colnames(info), c("GSM", "group", "orig.ident", "tissue"))
  for (cc in extra_cols) {
    seu[[cc]] <- info[[cc]][1]
  }
  return(seu)
}
gsm_ids <- list.dirs(data_root, full.names = FALSE, recursive = FALSE)
seu_list <- list()
for (gsm in gsm_ids) {
  tmp <- read_one_gsm(gsm, gsm_info = gsm_info, data_root = data_root)
  if (!is.null(tmp)) {
    seu_list[[gsm]] <- tmp
  }
}
seu <- Reduce(function(x, y) merge(x, y), seu_list)
rm(seu_list)
# 合并每个layers
seu <- JoinLayers(seu, assay = "RNA")
Layers(seu, assay = "RNA")  # 检查一下layers："counts"
# 线粒体/核糖体比例
seu <- PercentageFeatureSet(seu, pattern = "^MT-", col.name = "percent_mito")
seu <- PercentageFeatureSet(seu, pattern = "^RP[SL]", col.name = "percent_ribo")
seu$HERV_fraction <- seu$nCount_HERV / (seu$nCount_HERV + seu$nCount_RNA)
# 查看结果情况
seu
colnames(seu@meta.data)
table(seu$orig.ident)
seu$group <- factor(seu$group, levels = c("NC", "AD"))
# 保存数据
saveRDS(
  seu, 
  file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827.rds", 
  compress = "xz"
)

质控：

seu <- readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827.rds")
feats <- c("nFeature_RNA", "nCount_RNA")
# 过滤前
p_filt_b_1 <- VlnPlot(
  seu, features = feats,
  pt.size = 0,
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_b_2 <- VlnPlot(
  seu, features = feats,
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
# HERV
feats <- c("nCount_HERV", "nFeature_HERV")
p_filt_b_3 <- VlnPlot(
  seu, features = feats,
  pt.size = 0,
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_b_4 <- VlnPlot(
  seu, features = feats,
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
p_filt_b_7 <- VlnPlot(
  seu, features = "HERV_fraction",
  pt.size = 0,
  ncol = 1,
  group.by = "group"
) + NoLegend()
p_filt_b_8 <- VlnPlot(
  seu, features = "HERV_fraction",
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
feats <- c("percent_mito", "percent_ribo")
p_filt_b_5 <- VlnPlot(
  seu, features = feats, 
  pt.size = 0, 
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_b_6 <- VlnPlot(
  seu, features = feats, 
  pt.size = 0, 
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
pdf(file.path(res_dir, "before_filter.pdf"), width = 20, height = 20)
p <- (p_filt_b_1 / p_filt_b_2) | (p_filt_b_3 / p_filt_b_4) | (p_filt_b_5 / p_filt_b_6) | (p_filt_b_7 / p_filt_b_8)
print(p)
dev.off()
# 细胞阈值
retained_c_umi_low <- seu$nFeature_RNA > 500
retained_c_umi_high <- seu$nFeature_RNA < 8000
retained_c_count_low  <- seu$nCount_RNA > 1000
retained_c_count_high <- seu$nCount_RNA < 40000
retained_c_mito <- seu$percent_mito < 14
retained_c_ribo <- seu$percent_ribo < 3
keep_cells <- retained_c_mito & retained_c_ribo & retained_c_umi_low & retained_c_umi_high
scRNA_cells <- subset(seu, cells = Cells(seu)[keep_cells])
counts_rna <- GetAssayData(scRNA_cells, assay = "RNA", slot = "counts")
feature_rowsum <- Matrix::rowSums(counts_rna > 0)
# 基因阈值
retained_f_low <- feature_rowsum > ncol(scRNA_cells) * 0.005
genes_keep <- names(feature_rowsum)[retained_f_low]
counts_herv <- GetAssayData(scRNA_cells, assay = "HERV", slot = "counts")
# hERV位点：去掉--no-feature
herv_keep <- names(herv_rowsum)
herv_keep <- herv_keep[! herv_keep %in% c("--no-feature", "__no_feature")]
# 过滤
scRNA_cells[["RNA"]] <- subset(scRNA_cells[["RNA"]], features = genes_keep)
scRNA_cells[["HERV"]] <- subset(scRNA_cells[["HERV"]], features = herv_keep)
scRNA_cells
table(scRNA_cells$group)
table(scRNA_cells$orig.ident)
rna_counts <- GetAssayData(scRNA_cells, assay = "RNA", slot = "counts")
scRNA_cells$nCount_RNA <- Matrix::colSums(rna_counts)
scRNA_cells$nFeature_RNA <- Matrix::colSums(rna_counts > 0)
herv_counts <- GetAssayData(scRNA_cells, assay = "HERV", slot = "counts")
scRNA_cells$nCount_HERV <- Matrix::colSums(herv_counts)
scRNA_cells$nFeature_HERV <- Matrix::colSums(herv_counts > 0)
scRNA_cells$HERV_fraction <- scRNA_cells$nCount_HERV / (scRNA_cells$nCount_HERV + scRNA_cells$nCount_RNA)
# 过滤后
feats <- c("nFeature_RNA", "nCount_RNA")
p_filt_a_1 <- VlnPlot(
  scRNA_cells, features = feats,
  pt.size = 0,
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_a_2 <- VlnPlot(
  scRNA_cells, features = feats,
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
# HERV
feats <- c("nCount_HERV", "nFeature_HERV")
p_filt_a_3 <- VlnPlot(
  scRNA_cells, features = feats,
  pt.size = 0,
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_a_4 <- VlnPlot(
  scRNA_cells, features = feats,
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
p_filt_a_7 <- VlnPlot(
  scRNA_cells, features = "HERV_fraction",
  pt.size = 0,
  ncol = 1,
  group.by = "group"
) + NoLegend()
p_filt_a_8 <- VlnPlot(
  scRNA_cells, features = "HERV_fraction",
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
feats <- c("percent_mito", "percent_ribo")
p_filt_a_5 <- VlnPlot(
  scRNA_cells, features = feats, 
  pt.size = 0, 
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_a_6 <- VlnPlot(
  scRNA_cells, features = feats, 
  pt.size = 0, 
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
pdf(file.path(res_dir, "after_filter.pdf"), width = 20, height = 20)
p <- (p_filt_a_1 / p_filt_a_2) | (p_filt_a_3 / p_filt_a_4) | (p_filt_a_5 / p_filt_a_6) | (p_filt_a_7 / p_filt_a_8)
print(p)
dev.off()
# 保存数据
saveRDS(
    scRNA_cells, 
    file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_filt.rds", 
    compress = "xz"
)

SCTransform+integration：

seu <- readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_filt.rds")
DefaultAssay(seu) <- "RNA"
obj.list <- SplitObject(seu, split.by = "orig.ident")  # 按样本拆分
obj.list <- lapply(obj.list, function(x) {  # 每个样本分别做SCTransform
  SCTransform(
    x,
    assay = "RNA",
    new.assay.name = "SCT",
    vars.to.regress = "percent_mito",
    verbose = FALSE,
    method = "glmGamPoi"
  )
})
features <- SelectIntegrationFeatures(  # 选integration features
  object.list = obj.list,
  nfeatures = 3000
)
obj.list <- PrepSCTIntegration(
  object.list = obj.list,
  anchor.features = features,
  verbose = FALSE
)
anchors <- FindIntegrationAnchors(  # 找anchors
  object.list = obj.list,
  normalization.method = "SCT",
  anchor.features = features,
  verbose = FALSE
)
seu <- IntegrateData(  # 整合
  anchorset = anchors,
  normalization.method = "SCT",
  verbose = FALSE
)
DefaultAssay(seu) <- "integrated"  # 用integrated assay做降维和聚类
seu <- RunPCA(seu, npcs = 50, verbose = FALSE)
pdf(file.path(res_dir, "ElbowPlot_all.pdf"))
p <- ElbowPlot(seu, ndims = 50)
print(p)
dev.off()
pc.use <- 1:30  # 选主成分数量
seu <- RunUMAP(seu, dims = pc.use, verbose = FALSE)
p_dim_group <- DimPlot(seu, group.by = "group")
p_dim_gsm <- DimPlot(seu, group.by = "orig.ident", raster = TRUE)
seu <- FindNeighbors(seu, dims = pc.use, verbose = FALSE)
seu <- FindClusters(seu, resolution = c(0.01, 0.05, 0.1, 0.2, 0.5))
p_cutree <- clustree(seu@meta.data, prefix = "RNA_snn_res.")
resolution <- "RNA_snn_res.0.05"  # 选分辨率
Idents(seu) <- seu[[resolution]]
p_dim_cluster <- DimPlot(seu, label = TRUE)
pdf(file.path(res_dir, "data_preprocess.pdf"), width = 16, height = 16)
p <- (p_dim_group | p_dim_gsm) / (p_cutree | p_dim_cluster)
print(p)
dev.off()

细胞类型注释：

DefaultAssay(seu) <- "SCT"
# marker基因
gene_list <- list(
  Astro = c("AQP4", "ADGRV1", "GPC5", "RYR3"),
  Endo  = c("CLDN5", "ABCB1", "EBF1"),
  Excit = c("CAMK2A", "CBLN2", "LDB2"),
  Inhib = c("GAD1", "LHFPL3", "PCDH15"),
  Mic   = c("C3", "LRMDA", "DOCK8"),
  Oligo = c("MBP", "PLP1", "ST18")
)
# 在各个聚类中的表达量
pdf(file.path(res_dir, "cluster_marker_gene.pdf"), width = 16, height = 12)
p <- DotPlot(seu, features = gene_list) +
  theme(axis.text.x = element_text(angle = 60, vjust = 0.8, hjust = 0.8))
print(p)
dev.off()
# 计算每个cluster对每个细胞类型的marker-set的平均表达
score_by_cluster <- function(seu, gene_list, group_col) {
  out <- lapply(names(gene_list), function(ct) {
    feats <- intersect(gene_list[[ct]], rownames(seu))
    if (length(feats) == 0) return(NULL)
    avg <- AverageExpression(
      seu,
      assays = "RNA",
      features = feats,
      group.by = group_col,
      slot = "data",
      verbose = FALSE
    )$RNA  # features x clusters
    data.frame(
      cluster = colnames(avg),
      celltype = ct,
      score = colMeans(avg, na.rm = TRUE),
      stringsAsFactors = FALSE
    )
  }) |> bind_rows()
  tab <- out |>
    pivot_wider(names_from = celltype, values_from = score) |>
    arrange(as.numeric(as.character(cluster)))
  mat <- as.matrix(tab[, intersect(names(gene_list), colnames(tab)), drop = FALSE])
  tab$pred <- colnames(mat)[max.col(mat, ties.method = "first")]
  tab
}
score_tab <- score_by_cluster(seu, gene_list, resolution)
View(score_tab)
# 指定细胞类型
cl_Astro <- c("2")
cl_Endo  <- c("9")
cl_Mic   <- c("7")
cl_Oligo <- c("0")
cl_Inhib <- c("4", "5", "6", "11") 
# 其它细胞归为Excit
cl <- as.character(seu[[resolution]][, 1])
seu$celltype <- ifelse(cl %in% cl_Astro, "Astro",
                  ifelse(cl %in% cl_Endo,  "Endo",
                    ifelse(cl %in% cl_Mic,   "Mic",
                      ifelse(cl %in% cl_Oligo, "Oligo",
                        ifelse(cl %in% cl_Inhib, "Inhib", "Excit")))))
seu$celltype <- factor(seu$celltype, levels = c("Astro","Endo","Excit","Inhib","Mic","Oligo"))
Idents(seu) <- seu$celltype
table(seu$celltype, seu$group)
# 保存数据
saveRDS(
    seu, 
    file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_celltype.rds",
    compress = "xz"
)
# 画图展示
ct_cols <- c(
  Astro = "#1b9e77",
  Endo  = "#d95f02",
  Excit = "#7570b3",
  Inhib = "#e7298a",
  Mic   = "#66a61e",
  Oligo = "#e6ab02"
)
dim_plot <- DimPlot(  # 细胞类型在umap上的分布
  seu, 
  reduction = "umap", 
  cols = ct_cols, 
  label = FALSE
)
ct_stat <- as.data.frame(table(seu$celltype))  # 统计各细胞类型的细胞数和比例
colnames(ct_stat) <- c("celltype", "total")
ct_stat$celltype <- factor(ct_stat$celltype, levels = levels(seu@active.ident))
ct_stat$percentage <- 100 * ct_stat$total / sum(ct_stat$total)
celltype_bar <- ggplot(ct_stat, aes(x = celltype, y = percentage, fill = celltype)) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_manual(values = ct_cols) +
  theme_classic() +
  theme(axis.title.x = element_blank(), legend.position = "none") +
  ylab("Proportion of total nuclei (%)")
genes_paper <- c(  # 各细胞类型marker热图
  "SLC1A2","ADGRV1","GPC5","RYR3","GFAP",
  "CLDN5","FLT1","ABCB1","EBF1","MT2A",
  "RALYL","KCNIP4","CBLN2","LDB2","KCNQ5",
  "NXPH1","LHFPL3","PCDH15","GRIK1","ADARB2",
  "LRMDA","DOCK8","ARHGAP24","ARHGAP15","PLXDC2",
  "ST18","PLP1","CTNNA3","MBP","PIP4K2A"
)
genes_use <- intersect(genes_paper, rownames(seu))
# seu <- ScaleData(seu, features = genes_use, verbose = FALSE)
heatmap <- DoHeatmap(
  seu,
  features = genes_use,
  disp.min = -1.5,
  disp.max = 1.5,
  angle = 0,
  group.colors = unname(ct_cols)
) + scale_fill_gradientn(
  colors = c("#f100ef", "black", "#f0e442"),
  limits = c(-1.5, 1.5)
)
pdf(file.path(res_dir, "celltype.pdf"), width = 24, height = 16)
p <- dim_plot + celltype_bar / heatmap
print(p)
dev.off()

计算差异基因：

scRNA <- readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_celltype.rds")
DefaultAssay(scRNA) <- "SCT"
scRNA$group <- factor(scRNA$group, levels = c("NC","AD"))
scRNA$celltype <- factor(
  scRNA$celltype,
  levels = c("Astro","Endo","Excit","Inhib","Mic","Oligo")
)
Idents(scRNA) <- scRNA$celltype
balance_cells <- function(obj, ct){
  sub <- subset(obj, idents = ct)
  ad <- WhichCells(sub, expression = group == "AD")
  nc <- WhichCells(sub, expression = group == "NC")
  if(length(ad) > 2000){
    n <- min(length(ad), length(nc))
    sub[, c(sample(ad, n), sample(nc, n))]
  } else {
    sub
  }
}
cts <- levels(scRNA$celltype)
obj_bal <- merge(x = balance_cells(scRNA, cts[1]),
                 y = lapply(cts[-1], \(ct) balance_cells(scRNA, ct)))
obj_bal$celltype <- factor(
  obj_bal$celltype,
  levels = c("Astro","Endo","Excit","Inhib","Mic","Oligo")
)
obj_bal <- JoinLayers(obj_bal, assay = "SCT")
obj_bal$compare <- paste(obj_bal$group, obj_bal$celltype, sep = "_")
scRNA$compare <- paste(scRNA$group, scRNA$celltype, sep = "_")
ct <- levels(obj_bal$celltype)
all_markers <- lapply(ct, function(x){
  message("Running: ", x)
  FindMarkers(
    obj_bal,
    group.by = "compare",
    ident.1 = paste0("AD_", x),
    ident.2 = paste0("NC_", x),
    test.use = "MAST",
    latent.vars = c("nCount_RNA", "percent_mito")
    logfc.threshold = 0.25,
    min.pct = 0.1
  )
})
names(all_markers) <- ct
lapply(all_markers, nrow)
all_markers_sig <- lapply(all_markers, function(df){
  df2 <- subset(
    df, 
    p_val_adj < 0.1
  )
  return(df2)
})
View(all_markers_sig[["Astro"]])
save(all_markers_sig, file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_all_markers_sig.RData")

hERV按subfamily/class聚合+标准化：

# 构建位点-家族映射
get_attr <- function(x, key) {
  pat <- paste0(key, ' "([^"]+)"')
  m <- regexec(pat, x, perl = TRUE)
  rr <- regmatches(x, m)
  out <- rep(NA_character_, length(x))
  ok <- lengths(rr) >= 2
  out[ok] <- vapply(rr[ok], `[`, character(1), 2)
  out
}
gtf_path <- "C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\metadata\\hERV.gtf"
lines <- readLines(gtf_path, warn = FALSE)
lines <- lines[!grepl("^\\s*#", lines)]
lines <- lines[nzchar(lines)]
gtf <- data.table::fread(
  text = lines,
  sep = "\t",
  header = FALSE,
  quote = "",
  fill = TRUE,
  data.table = TRUE
)
gtf <- gtf[, 1:9]
setnames(gtf, c("seqname","source","feature","start","end","score","strand","frame","attribute"))
gtf[, gene_id    := get_attr(attribute, "gene_id")]
gtf[, locus      := get_attr(attribute, "locus")]
gtf[, intModel   := get_attr(attribute, "intModel")]
gtf[, repName    := get_attr(attribute, "repName")]
gtf[, repFamily  := get_attr(attribute, "repFamily")]
gtf[, geneRegion := get_attr(attribute, "geneRegion")]
gene_dt <- gtf[feature == "gene", .(locus_id = gene_id, intModel)][!is.na(locus_id)]  # gene行：intModel（subfamily）
exon_dt <- gtf[feature == "exon", .(locus_id = gene_id, repFamily)][!is.na(locus_id)]  # exon行：repFamily（class）
repfam_dt <- exon_dt[!is.na(repFamily),  # 每个位点的class：取出现最多的repFamily
                     .(repFamily = {
                       x <- repFamily
                       ux <- unique(x); ux[which.max(tabulate(match(x, ux)))]
                     }),
                     by = locus_id]
map <- merge(gene_dt, repfam_dt, by = "locus_id", all = TRUE)
map[, subfamily := intModel]
map[, class := repFamily]
map$subfamily_raw <- map$subfamily
map$subfamily <- gsub("-int", "", map$subfamily)  # 去掉结尾的-int标记
herv_map <- unique(map[, .(locus_id, subfamily_raw, subfamily, class)])
write.csv(herv_map, file = "C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\metadata\\hERV_locus2family.csv", row.names = F)
# 聚合
seu <- readRDS("C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\my_data\\GSE157827_celltype.rds")
herv_map <- read.csv("C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\metadata\\hERV_locus2family.csv")
herv_map$locus_id <- gsub("_", "-", herv_map$locus_id)
herv_counts <- GetAssayData(seu, assay = "HERV", slot = "counts")
aggregate_herv_counts <- function(herv_counts, map, group_by = c("subfamily", "class")) {
  group_by <- match.arg(group_by)
  keep_loci <- intersect(rownames(herv_counts), map$locus_id)
  map2 <- map[match(keep_loci, map$locus_id), ]
  map2 <- map2[!is.na(map2[[group_by]]) & nzchar(map2[[group_by]]), ]
  mat <- herv_counts[map2$locus_id, , drop = FALSE]
  grp <- map2[[group_by]]
  f <- factor(grp, levels = unique(grp))
  G <- sparseMatrix(
    i = seq_along(f),
    j = as.integer(f),
    x = 1,
    dims = c(length(f), nlevels(f)),
    dimnames = list(rownames(mat), levels(f))
  )
  out <- Matrix::t(G) %*% mat
  out
}
herv_subfamily_counts <- aggregate_herv_counts(herv_counts, herv_map, group_by = "subfamily")
herv_class_counts <- aggregate_herv_counts(herv_counts, herv_map, group_by = "class")
# 将新计数矩阵的细胞与原来的对齐
align_to_cells <- function(mat, cells){
  mat <- mat[, cells, drop = FALSE]
  mat
}
cells <- colnames(seu)
herv_counts <- align_to_cells(herv_counts, cells)
herv_subfamily_counts <- align_to_cells(herv_subfamily_counts, cells)
herv_class_counts <- align_to_cells(herv_class_counts, cells)
# 加到seu上
replace_assay <- function(seu, assay_name, counts){
  counts <- as(counts, "dgCMatrix")
  seu[[assay_name]] <- CreateAssayObject(counts = counts)
  seu
}
seu <- replace_assay(seu, "HERV_family", herv_subfamily_counts)
seu <- replace_assay(seu, "HERV_class", herv_class_counts)
# 标准化
normalize_herv_by_rna_depth <- function(seu, assay, scale.factor = 10000) {
  rna_counts <- GetAssayData(seu, assay = "RNA",  slot = "counts")
  herv_counts <- GetAssayData(seu, assay = assay, slot = "counts")
  lib_rna <- Matrix::colSums(rna_counts)
  herv_norm <- t(t(herv_counts) / lib_rna) * scale.factor
  herv_norm@x <- log1p(herv_norm@x)
  seu <- SetAssayData(seu, assay = assay, slot = "data", new.data = herv_norm)
  return(seu)
}
seu <- normalize_herv_by_rna_depth(seu, "HERV")
seu <- normalize_herv_by_rna_depth(seu, "HERV_family")
seu <- normalize_herv_by_rna_depth(seu, "HERV_class")
# 简单验证一下
herv_subfamily_counts[, 1]
sum(herv_subfamily_counts[, 1])  # 都是45
herv_class_counts[, 1]
sum(herv_class_counts[, 1])
herv_counts[, 1]
sum(herv_counts[, 1])
herv_subfamily_counts[, 2]
sum(herv_subfamily_counts[, 2])  # 都是15
herv_class_counts[, 2]
sum(herv_class_counts[, 2])
herv_counts[, 2]
sum(herv_counts[, 2])
herv_data <- GetAssayData(seu, assay = "HERV", slot = "data")
herv_data[, 1]
HERV_family_data <- GetAssayData(seu, assay = "HERV_family", slot = "data")
HERV_family_data[, 1]
HERV_class_data <- GetAssayData(seu, assay = "HERV_class", slot = "data")
HERV_class_data[, 1]
# 保存数据
saveRDS(
    seu, 
    file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_final.rds",
    compress = "xz"
)