使用我的GSE157827数据进行初步分析

other-other

2025.12.31-2026.1.14研究进展

try1

构建Seurat对象

library(Seurat)
library(Matrix)
library(tidyverse)
library(readxl)
# 读取计数矩阵
data_root <- "/public/home/GENE_proc/wth/GSE157827/mtx/"
read_one_srr <- function(srr) {
  gene_dir <- file.path(data_root, srr, "gene")
  herv_dir <- file.path(data_root, srr, "hERV")
  gene_counts <- ReadMtx(
    mtx = file.path(gene_dir, "counts.mtx"),
    features = file.path(gene_dir, "features.tsv"),
    cells = file.path(gene_dir, "barcodes.tsv")
  )
  herv_counts <- ReadMtx(
    mtx = file.path(herv_dir, "counts.mtx"),
    features = file.path(herv_dir, "features.tsv"),
    cells = file.path(herv_dir, "barcodes.tsv"),
    feature.column = 1
  )
  colnames(gene_counts) <- paste(srr, colnames(gene_counts), sep = "_")
  colnames(herv_counts) <- paste(srr, colnames(herv_counts), sep = "_")
  common_cells <- intersect(colnames(gene_counts), colnames(herv_counts))
  gene_counts <- gene_counts[, common_cells, drop = FALSE]
  herv_counts <- herv_counts[, common_cells, drop = FALSE]
  seu <- CreateSeuratObject(
    counts = gene_counts,
    assay = "RNA",
    project = "AD_hERV"
  )
  seu[["HERV"]] <- CreateAssayObject(counts = herv_counts)
  seu$SRR_id <- srr
  return(seu)
}
srr_ids <- list.dirs(data_root, full.names = FALSE, recursive = FALSE)
seu_list <- list()
for (srr in srr_ids) {
  seu <- read_one_srr(srr)
  if (!is.null(seu)) {
    seu_list[[srr]] <- seu
  }
}
seu <- Reduce(function(x, y) merge(x, y), seu_list)
rm(seu_list)
head(seu@meta.data)
# 添加metadata
meta_map <- read_xlsx("/public/home/GENE_proc/wth/GSE157827/raw_data/metadata/metadata.xlsx")
sample_info <- read_xlsx("/public/home/GENE_proc/wth/GSE157827/raw_data/metadata/sample_info.xlsx")
names(meta_map) <- make.names(names(meta_map))
names(sample_info) <- make.names(names(sample_info))
meta_map <- meta_map %>%
  select(individuals, SRR_id, sample_id, tissue)
colnames(meta_map) <- c("sample_id", "SRR_id", "GSM", "tissue")
metadata <- meta_map %>%
  left_join(sample_info, by = "sample_id")
cell_meta <- seu@meta.data %>%
  rownames_to_column("cell")
cell_meta2 <- cell_meta %>%
  left_join(metadata, by = "SRR_id")
sum(is.na(cell_meta2$sample_id))  # 检查是否有映射失败
new_cols <- setdiff(names(cell_meta2), names(seu@meta.data))
meta_to_add <- cell_meta2 %>%
  select(cell, all_of(new_cols)) %>%
  column_to_rownames("cell")
seu <- AddMetaData(seu, metadata = meta_to_add)
# 合并每个layers
seu <- JoinLayers(seu, assay = "RNA")
Layers(seu, assay = "RNA")  # 检查一下layers
# 线粒体/核糖体比例
seu <- PercentageFeatureSet(seu, pattern = "^MT-", col.name = "percent_mito")
seu <- PercentageFeatureSet(seu, pattern = "^RP[SL]", col.name = "percent_ribo")
# 保存数据
saveRDS(
  seu, 
  file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827.rds", 
  compress = "xz"
)

> seu
An object of class Seurat 
93660 features across 304953 samples within 2 assays 
Active assay: RNA (78691 features, 0 variable features)
 1 layer present: counts
 1 other assay present: HERV

质控

看数据分布

scRNA <- readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827.rds")
res_dir <- "/public/home/GENE_proc/wth/GSE157827/res/"
scRNA$orig.ident <- scRNA$sample_id
scRNA$group <- factor(scRNA$diagnosis, levels = c("NC", "AD"))
# 基因数/UMI数
feats <- c("nFeature_RNA", "nCount_RNA")
p_filt_b_1 <- VlnPlot(
  scRNA, features = feats,
  pt.size = 0,
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_b_2 <- VlnPlot(
  scRNA, features = feats,
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
# HERV数
feats <- c("nCount_HERV", "nFeature_HERV")
p_filt_b_3 <- VlnPlot(
  scRNA, features = feats,
  pt.size = 0,
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_b_4 <- VlnPlot(
  scRNA, features = feats,
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
# HERV占整个转录本的比例
scRNA$HERV_fraction <- scRNA$nCount_HERV / (scRNA$nCount_HERV + scRNA$nCount_RNA)
p_filt_b_7 <- VlnPlot(
  scRNA, features = "HERV_fraction",
  pt.size = 0,
  ncol = 1,
  group.by = "group"
) + NoLegend()
p_filt_b_8 <- VlnPlot(
  scRNA, features = "HERV_fraction",
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
# 线粒体/核糖体比例
feats <- c("percent_mito", "percent_ribo")
p_filt_b_5 <- VlnPlot(
  scRNA, features = feats, 
  pt.size = 0, 
  ncol = 2,
  group.by = "group"  # 按AD/NC分组看分布
) + NoLegend()
p_filt_b_6 <- VlnPlot(
  scRNA, features = feats, 
  pt.size = 0, 
  ncol = 1,
  group.by = "orig.ident"  # 按样本看分布
) + NoLegend()
# 拼图展示
pdf(file.path(res_dir, "before_filter.pdf"), width = 20, height = 20)
p <- (p_filt_b_1 / p_filt_b_2) | (p_filt_b_3 / p_filt_b_4) | (p_filt_b_5 / p_filt_b_6) | (p_filt_b_7 / p_filt_b_8)
print(p)
dev.off()

my_GSE157827_1

> summary(scRNA$nFeature_RNA)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
     16     799    1203    1414    1854   10993 
> summary(scRNA$nCount_RNA)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    144    1099    1662    2239    2728   68430 
> summary(scRNA$nCount_HERV)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    5.00   11.00   14.07   19.00  223.00 
> summary(scRNA$nFeature_HERV)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    5.00    9.00   11.46   15.00  147.00 
> summary(scRNA$HERV_fraction)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.000000 0.004047 0.006073 0.006409 0.008417 0.050000 
> summary(scRNA$percent_mito)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   1.382   3.100  10.467   8.076  99.093 
> summary(scRNA$percent_ribo)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.4058  0.5969  0.7382  0.8642 22.7354 

确定过滤阈值

细胞阈值：

# 过滤阈值
retained_c_umi_low <- scRNA$nFeature_RNA > 900
retained_c_umi_high <- scRNA$nFeature_RNA < 6000
retained_c_count_low  <- scRNA$nCount_RNA > 1200
retained_c_count_high <- scRNA$nCount_RNA < 30000
retained_c_mito <- scRNA$percent_mito < 10
retained_c_ribo <- scRNA$percent_ribo < 3
keep_cells <- retained_c_umi_low & retained_c_umi_high & retained_c_mito & retained_c_ribo
# 看看交集
df_fail <- data.frame(
  cell = Cells(scRNA),
  low_nFeature = !retained_c_umi_low,
  high_nFeature = !retained_c_umi_high,
  low_nCount = !retained_c_count_low,
  high_nCount = !retained_c_count_high,
  high_mito = !retained_c_mito,
  high_ribo = !retained_c_ribo
)
pdf(file.path(res_dir, "cell_filter_upset.pdf"), width = 12, height = 6)
ComplexUpset::upset(
  df_fail,
  intersect = c("low_nFeature","high_nFeature","low_nCount","high_nCount","high_mito","high_ribo"),
  base_annotations = list('Intersection size' = intersection_size())
)
dev.off()

my_GSE157827_2

基因阈值：

scRNA_cells <- subset(scRNA, cells = Cells(scRNA)[keep_cells])
counts_rna <- GetAssayData(scRNA_cells, assay = "RNA", slot = "counts")
feature_rowsum <- Matrix::rowSums(counts_rna > 0)
retained_f_low <- feature_rowsum > ncol(scRNA_cells) * 0.005  # 只保留“至少在0.5%细胞里表达过”的基因
genes_keep <- names(feature_rowsum)[retained_f_low]
# rankplot辅助看阈值
rankplot <- data.frame(
  feature_count = sort(feature_rowsum),
  gene = names(sort(feature_rowsum)),
  Rank = seq_along(feature_rowsum)
)
library(ggbreak)
pdf(file.path(res_dir, "gene_filter.pdf"), width = 12, height = 8)
p_gene <- ggplot(rankplot, aes(x = Rank, y = feature_count)) +
  geom_point() +
  scale_y_break(c(10000, 100000)) +
  geom_hline(yintercept = ncol(scRNA_cells) * 0.005, color = "red") +
  geom_text(x = 10000, y = 4000, size = 5,
            label = "Feature cutoff : ncol(scRNA)*0.5%")
print(p_gene)
dev.off()

my_GSE157827_3

hERV阈值：

counts_herv <- GetAssayData(scRNA_cells, assay = "HERV", slot = "counts")
herv_rowsum <- Matrix::rowSums(counts_herv > 0)
herv_keep <- names(herv_rowsum)[herv_rowsum > 50]  # 至少50个细胞中表达过
cat("HERV features before:", nrow(counts_herv), "\n")  # 14969
cat("HERV features kept:", length(herv_keep), "\n")  # 3351
rankplot_herv <- data.frame(
  feature_count = sort(herv_rowsum),
  feature = names(sort(herv_rowsum)),
  Rank = seq_along(herv_rowsum)
)
pdf(file.path(res_dir, "gene_filter_herv.pdf"), width = 10, height = 5)
ggplot(rankplot_herv, aes(x = Rank, y = feature_count)) +
  geom_point(size = 0.4) +
  geom_hline(yintercept = herv_cutoff, color = "red") +
  xlab("Rank") + ylab("Cells with counts>0 (per HERV feature)")
dev.off()

my_GSE157827_4

过滤

scRNA_cells[["RNA"]] <- subset(scRNA_cells[["RNA"]], features = genes_keep)
scRNA_cells[["HERV"]] <- subset(scRNA_cells[["HERV"]], features = herv_keep)
# 重新计算nCount和nFeature
rna_counts <- GetAssayData(scRNA_cells, assay = "RNA", slot = "counts")
scRNA_cells$nCount_RNA <- Matrix::colSums(rna_counts)
scRNA_cells$nFeature_RNA <- Matrix::colSums(rna_counts > 0)
herv_counts <- GetAssayData(scRNA_cells, assay = "HERV", slot = "counts")
scRNA_cells$nCount_HERV <- Matrix::colSums(herv_counts)
scRNA_cells$nFeature_HERV <- Matrix::colSums(herv_counts > 0)
scRNA_cells$HERV_fraction <- scRNA_cells$nCount_HERV / (scRNA_cells$nCount_HERV + scRNA_cells$nCount_RNA)
# 查看过滤后分布
# 基因数/UMI数
feats <- c("nFeature_RNA", "nCount_RNA")
p_filt_a_1 <- VlnPlot(
  scRNA_cells, features = feats,
  pt.size = 0,
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_a_2 <- VlnPlot(
  scRNA_cells, features = feats,
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
# HERV数
feats <- c("nCount_HERV", "nFeature_HERV")
p_filt_a_3 <- VlnPlot(
  scRNA_cells, features = feats,
  pt.size = 0,
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_a_4 <- VlnPlot(
  scRNA_cells, features = feats,
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
p_filt_a_7 <- VlnPlot(
  scRNA_cells, features = "HERV_fraction",
  pt.size = 0,
  ncol = 1,
  group.by = "group"
) + NoLegend()
p_filt_a_8 <- VlnPlot(
  scRNA_cells, features = "HERV_fraction",
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
# 线粒体/核糖体比例
feats <- c("percent_mito", "percent_ribo")
p_filt_a_5 <- VlnPlot(
  scRNA_cells, features = feats, 
  pt.size = 0, 
  ncol = 2,
  group.by = "group"  # 按AD/NC分组看分布
) + NoLegend()
p_filt_a_6 <- VlnPlot(
  scRNA_cells, features = feats, 
  pt.size = 0, 
  ncol = 1,
  group.by = "orig.ident"  # 按样本看分布
) + NoLegend()
# 拼图展示
pdf(file.path(res_dir, "after_filter.pdf"), width = 20, height = 20)
p <- (p_filt_a_1 / p_filt_a_2) | (p_filt_a_3 / p_filt_a_4) | (p_filt_a_5 / p_filt_a_6) | (p_filt_a_7 / p_filt_a_8)
print(p)
dev.off()
# 保存数据
saveRDS(
    scRNA_cells, 
    file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_filt.rds", 
    compress = "xz"
)

my_GSE157827_5

> dim(scRNA_cells)
[1]  18217 189086
> table(scRNA_cells$group)
    NC     AD 
100788  88298 
> table(scRNA_cells$orig.ident)
  AD1  AD10  AD13  AD19   AD2  AD20  AD21   AD4   AD5   AD8   AD9  NC11  NC12 
 9802 19632  1128  4623 14677 11959  3708  3147 11639   592  7391   171 18635 
 NC14  NC15  NC16  NC17  NC18   NC3   NC7 
18437 13877   133 16229 14671  8078 10557 

可以看到似乎有的样本细胞数非常少，不过查看作者的计数矩阵，似乎也是有一些样本的细胞数很少。我的结果中AD8、NC11、NC16在作者的计数矩阵中也是属于比较少的一堆，而且我设置的阈值比作者的更严格，所以细胞数就更少了

> table(author_scRNA$orig.ident)
  NC3   NC7  NC11  NC12  NC14  NC15  NC16  NC17  NC18   AD1   AD2   AD4   AD5 
 6218  6372  1587 15568 10932  8385  4235 14263 11476  6173 15276  5980 12356 
  AD6   AD8   AD9  AD10  AD13  AD19  AD20  AD21 
  311  2405  9789 16013  1608  3614  9169  7786 

与作者的计数矩阵对照

library(Matrix)
seu_ours <- readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_filt.rds")
seu_auth <- readRDS("/public/home/GENE_proc/wth/GSE157827/data/GSE157827_filt.rds")
res_dir <- "/public/home/GENE_proc/wth/GSE157827/res/"
seu_ours$orig.ident <- seu_ours$sample_id
seu_ours$group <- factor(seu_ours$diagnosis, levels = c("NC", "AD"))
meta_ours <- seu_ours@meta.data %>%
  rownames_to_column("cell_ours")
meta_ours <- meta_ours %>%
  mutate(
    barcode = str_replace(cell_ours, "^.*_", ""),
    GSM = as.character(GSM),
    sample = as.character(orig.ident),
    cell_like_auth = paste0(GSM, "_", sample, "_", barcode, "-1"),
    cell_like_auth_no1 = str_replace(cell_like_auth, "-1$", "")
  )
auth_cells_raw <- colnames(seu_auth)
auth_cells_no1 <- str_replace(auth_cells_raw, "-1$", "")
our_ids_no1 <- meta_ours$cell_like_auth_no1
# 看看我的细胞和作者的细胞有多少是相同的
common_no1 <- intersect(our_ids_no1, auth_cells_no1)
cat("ours cells:", length(our_ids_no1), "\n")
cat("auth cells:", length(auth_cells_no1), "\n")
cat("common cells:", length(common_no1), "\n")
cat("overlap % (of ours):", round(length(common_no1)/length(our_ids_no1)*100, 2), "%\n")
cat("overlap % (of auth):", round(length(common_no1)/length(auth_cells_no1)*100, 2), "%\n")

很奇怪，这步的重叠率只有大概25%，即使我拿原始计数矩阵也只有大约30%

在重叠的细胞上比较计数的一致性：

mat_ours <- GetAssayData(seu_ours, assay = "RNA", slot = "counts")
mat_auth <- GetAssayData(seu_auth, assay = "RNA", slot = "counts")
our_map <- setNames(meta_ours$cell_ours, meta_ours$cell_like_auth_no1)
our_cells_common <- unname(our_map[common_no1])
auth_map <- setNames(auth_cells_raw, auth_cells_no1)
auth_cells_common <- unname(auth_map[common_no1])
common_genes <- intersect(rownames(mat_ours), rownames(mat_auth))
mat_ours_sub <- mat_ours[common_genes, our_cells_common, drop = FALSE]
mat_auth_sub <- mat_auth[common_genes, auth_cells_common, drop = FALSE]
cat("common genes:", length(common_genes), "\n")

每个细胞的总UMI：

our_lib  <- Matrix::colSums(mat_ours_sub)
auth_lib <- Matrix::colSums(mat_auth_sub)
cor_lib <- cor(log10(our_lib + 1), log10(auth_lib + 1))
cat("cell library size corr (log10+1):", cor_lib, "\n")  # 0.922925
pdf(file.path(res_dir, "author_our_umi.pdf"), width = 10, height = 10)
plot(log10(auth_lib + 1), log10(our_lib + 1),
     xlab = "author log10(lib+1)", ylab = "ours log10(lib+1)", pch = 16, cex = 0.4)
abline(0, 1, col = "red")
dev.off()

my_GSE157827_6

每个基因的总counts：

our_gene_sum  <- Matrix::rowSums(mat_ours_sub)
auth_gene_sum <- Matrix::rowSums(mat_auth_sub)
cor_gene <- cor(log10(our_gene_sum + 1), log10(auth_gene_sum + 1))
cat("gene total counts corr (log10+1):", cor_gene, "\n")  # 0.7527359
pdf(file.path(res_dir, "author_our_counts.pdf"), width = 10, height = 10)
plot(log10(auth_gene_sum + 1), log10(our_gene_sum + 1),
     xlab = "author log10(geneSum+1)", ylab = "ours log10(geneSum+1)", pch = 16, cex = 0.4)
abline(0, 1, col = "red")
dev.off()

my_GSE157827_7

每个细胞的基因向量相关性（随机抽一些细胞）：

set.seed(1)
k <- min(200, ncol(mat_ours_sub))
cells_pick <- sample(seq_len(ncol(mat_ours_sub)), k)
cors <- sapply(cells_pick, function(j){
  x <- as.numeric(mat_ours_sub[, j])
  y <- as.numeric(mat_auth_sub[, j])
  cor(log1p(x), log1p(y))
})
summary(cors)
pdf(file.path(res_dir, "author_our_corr.pdf"), width = 10, height = 10)
hist(cors, breaks = 30, main = "Per-cell cor(log1p counts)", xlab = "cor")
dev.off()

my_GSE157827_8

可以看到在重叠的细胞内，我的计数结果和作者的基本一致

为了看一看是什么原因导致细胞名不一致，我又按样本（GSM）分别看了重叠率，发现所有样本的重叠率都在20%-60%区间，没有说某几个样本完全重叠/不重叠的情况，说明barcode不匹配是发生在所有样本上的

询问了GPT，除了之前反复提过的STARsolo和cellranger计数上的区别，很有可能作者提供的计数矩阵和fastq数据不完全匹配——比如一个GSM对应多个SRR（技术重复/不同lane），我是计数了全部SRR，并且最后直接进行合并，作者可能只取了部分/合并时处理方式不同（比如过滤掉某些样本），这也可以解释为什么上面过滤时，我在第一次尝试时，虽然使用了和作者一样（甚至更严格）的阈值，但最后的细胞数却多了近1/3（24w-17w），后面才不得不逐渐提高阈值来让细胞数和作者的接近

为了验证这个想法，我按SRR分组计算重叠率，发现确实在SRR层面，重叠率差异很大，有些接近100%，有些只有30%-5%，这样的结果就很奇怪

之后我又看了“同一个GSM内，一个barcode会不会出现在多个SRR中”，结果发现很多barcode在GSM内出现次数都近似于该GSM包含的SRR数，就是说，这里的每个SRR是同一文库的lane拆分，而我在计数时把它们都当成独立run进行计数了

最后只能重跑一篇计数流程，把属于同一GSM的多个SRR当作多lane输入给STAR

不能直接把多个SRR的计数矩阵相加，因为UMI的去重是在计数阶段，直接相加会把跨lane的相同UMI叠加，产生系统性偏差

try2

STAR计数

先做一个SRR->GSM的映射srr2gsm.csv

my_GSE157827_9

与上一版本的代码相比，除了先把SRR都映射到GSM上

--soloUMIlen 10：作者的GSM说明里写的是Chromium Single Cell 3′ Library Kit v3，按理论来讲v3的UMI长度应该是12（官方pdf是这么说的），我之前写成了10，但实测改为12之后反而是错的
--soloFeatures Gene GeneFull：打开pre-mRNA计数，后续都用GeneFull中的结果
--soloCellFilter EmptyDrops_CR：让cell calling策略接近CellRanger（使用更像CellRanger的细胞筛选方法）

workDir=/public/home/GENE_proc/wth/GSE157827/
genomeDir=/public/home/wangtianhao/Desktop/STAR_ref/hg38/
fqDir=${workDir}/fastq
mapFile=${fqDir}/srr2gsm.csv
whitelist=/public/home/wangtianhao/Desktop/STAR_ref/whitelist/3M-february-2018.txt
hERV_gtf=/public/home/wangtianhao/Desktop/STAR_ref/transcripts.gtf
res_barcodes=barcodes
res_features=features
res_counts=counts
cd ${workDir}
module load miniconda3/base
dos2unix ${mapFile}
# 取所有GSM
GSM_LIST=$(cut -d',' -f2 "${mapFile}" | sort -u)
for GSM in ${GSM_LIST}; do
    GSM=${GSM//$'\r'/}
    echo "=============================="
    echo "[GSM] ${GSM}"
    SRR_LIST=$(awk -F',' -v g="${GSM}" '$2==g{print $1}' "${mapFile}")
    nSRR=$(echo "${SRR_LIST}" | wc -l | awk '{print $1}')
    echo "  SRR count: ${nSRR}"
    echo "${SRR_LIST}" | sed 's/^/  - /'
    cdna_files=$(echo "${SRR_LIST}" | sed "s|^|${fqDir}/|; s|$|_2.fastq.gz|" | paste -sd, -)
    bcumi_files=$(echo "${SRR_LIST}" | sed "s|^|${fqDir}/|; s|$|_1.fastq.gz|" | paste -sd, -)
    for f in $(echo "${cdna_files}" | tr ',' ' ') $(echo "${bcumi_files}" | tr ',' ' '); do
        if [[ ! -f "${f}" ]]; then
        echo "[ERROR] Missing fastq: ${f}" >&2
        exit 1
        fi
    done
    # STARsolo + stellarscope
    conda activate STAR
    mkdir -p star
    STAR \
        --runMode alignReads \
        --runThreadN 16 \
        --genomeDir ${genomeDir} \
        --readFilesIn ${cdna_files} ${bcumi_files} \
        --readFilesCommand zcat \
        --outFileNamePrefix star/${GSM} \
        --soloType CB_UMI_Simple \
        --soloCBstart 1 \
        --soloCBlen 16 \
        --soloUMIstart 17 \
        --soloUMIlen 10 \
        --soloBarcodeReadLength 0 \
        --soloCBwhitelist ${whitelist} \
        --soloFeatures Gene GeneFull \
        --clipAdapterType CellRanger4 \
        --soloCellFilter EmptyDrops_CR \
        --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \
        --soloUMIfiltering MultiGeneUMI_CR \
        --soloUMIdedup 1MM_CR \
        --outSAMtype BAM SortedByCoordinate \
        --outSAMattributes NH HI nM AS CR UR CB UB GX GN sS sQ sM \
        --outSAMunmapped Within \
        --outFilterScoreMin 30 \
        --limitOutSJcollapsed 5000000 \
        --outFilterMultimapNmax 500 \
        --outFilterMultimapScoreRange 5
    conda deactivate
    conda activate stellarscope
    mkdir -p stellarscope/${GSM}
    mkdir -p tmp/${GSM}
    samtools view -@1 -u -F 4 \
        -D CB:<(tail -n+1 star/${GSM}Solo.out/GeneFull/filtered/barcodes.tsv) \
        star/${GSM}Aligned.sortedByCoord.out.bam \
        | samtools sort -@16 -n -t CB -T ./tmp/${GSM} \
        > stellarscope/${GSM}/Aligned.sortedByCB.bam
    stellarscope assign \
        --outdir stellarscope/${GSM} \
        --nproc 16 \
        --stranded_mode F \
        --whitelist star/${GSM}Solo.out/GeneFull/filtered/barcodes.tsv \
        --pooling_mode individual \
        --reassign_mode best_exclude \
        --max_iter 500 \
        --updated_sam \
        stellarscope/${GSM}/Aligned.sortedByCB.bam \
        ${hERV_gtf}
    conda deactivate
    mkdir -p mtx/${GSM}/gene
    cp star/${GSM}Solo.out/GeneFull/filtered/barcodes.tsv mtx/${GSM}/gene/${res_barcodes}.tsv
    cp star/${GSM}Solo.out/GeneFull/filtered/features.tsv mtx/${GSM}/gene/${res_features}.tsv
    cp star/${GSM}Solo.out/GeneFull/filtered/matrix.mtx mtx/${GSM}/gene/${res_counts}.mtx
    mkdir -p mtx/${GSM}/hERV
    cp stellarscope/${GSM}/stellarscope-barcodes.tsv mtx/${GSM}/hERV/${res_barcodes}.tsv
    cp stellarscope/${GSM}/stellarscope-features.tsv mtx/${GSM}/hERV/${res_features}.tsv
    cp stellarscope/${GSM}/stellarscope-TE_counts.mtx mtx/${GSM}/hERV/${res_counts}.mtx
done
rm -rf ./tmp
echo "[DONE] GSM-level counting finished."
du -sh ./mtx

构建Seurat对象

唯一差别就是细胞命名，这里我直接和作者的对齐了

library(Seurat)
library(Matrix)
library(tidyverse)
library(readxl)
data_root <- "/public/home/GENE_proc/wth/GSE157827/mtx/"
meta_file <- "/public/home/GENE_proc/wth/GSE157827/raw_data/metadata/metadata.xlsx"
sample_file <- "/public/home/GENE_proc/wth/GSE157827/raw_data/metadata/sample_info.xlsx"
# 读取metadata
meta_map <- read_xlsx(meta_file)
sample_info <- read_xlsx(sample_file)
names(meta_map) <- make.names(names(meta_map))
names(sample_info) <- make.names(names(sample_info))
gsm_meta <- meta_map %>%  # GSM级别信息
  transmute(
    GSM = as.character(sample_id),
    group = as.character(diagnosis),
    orig.ident = as.character(individuals),
    tissue = as.character(tissue)
  ) %>%
  distinct(GSM, .keep_all = TRUE)
donor_info <- sample_info %>%  # 样本信息
  mutate(orig.ident = as.character(sample_id)) %>%
  select(-sample_id)
gsm_info <- gsm_meta %>%  # 合并
  left_join(donor_info, by = "orig.ident")
# 检查有无NA
any(is.na(gsm_info$orig.ident))
sum(is.na(gsm_info[[names(donor_info)[1]]]))
nrow(gsm_info)
gsm2indv <- setNames(gsm_info$orig.ident, gsm_info$GSM)  # GSM -> orig.ident
# 读取计数矩阵
normalize_barcode <- function(x) {  # 如果已经有`-数字`后缀（比如`-1`）就保留，否则补`-1`
  ifelse(grepl("-\\d+$", x), x, paste0(x, "-1"))
}
read_one_gsm <- function(gsm, gsm_info, data_root) {
  gene_dir <- file.path(data_root, gsm, "gene")
  herv_dir <- file.path(data_root, gsm, "hERV")
  # 读取矩阵
  gene_counts <- ReadMtx(
    mtx = file.path(gene_dir, "counts.mtx"),
    features = file.path(gene_dir, "features.tsv"),
    cells = file.path(gene_dir, "barcodes.tsv"),
    unique.features = TRUE
  )
  herv_counts <- ReadMtx(
    mtx = file.path(herv_dir, "counts.mtx"),
    features = file.path(herv_dir, "features.tsv"),
    cells = file.path(herv_dir, "barcodes.tsv"),
    feature.column = 1,
    unique.features = TRUE
  )
  # 取该GSM的样本信息
  info <- gsm_info %>% filter(GSM == gsm)
  indv <- info$orig.ident
  # 生成作者风格cellname：GSM_individuals_barcode-1
  gene_bc <- normalize_barcode(colnames(gene_counts))
  herv_bc <- normalize_barcode(colnames(herv_counts))
  colnames(gene_counts) <- paste(gsm, indv, gene_bc, sep = "_")
  colnames(herv_counts) <- paste(gsm, indv, herv_bc, sep = "_")
  # 对齐两套assay的细胞集合
  common_cells <- intersect(colnames(gene_counts), colnames(herv_counts))
  gene_counts <- gene_counts[, common_cells, drop = FALSE]
  herv_counts <- herv_counts[, common_cells, drop = FALSE]
  # 构建Seurat对象
  seu <- CreateSeuratObject(
    counts = gene_counts,
    assay = "RNA",
    project = "GSE157827"
  )
  seu[["HERV"]] <- CreateAssayObject(counts = herv_counts)
  # 添加metadata
  seu$GSM <- gsm
  seu$orig.ident <- indv
  seu$group <- info$group
  seu$tissue <- info$tissue
  extra_cols <- setdiff(colnames(info), c("GSM", "group", "orig.ident", "tissue"))
  for (cc in extra_cols) {
    seu[[cc]] <- info[[cc]][1]
  }
  return(seu)
}
gsm_ids <- list.dirs(data_root, full.names = FALSE, recursive = FALSE)
seu_list <- list()
for (gsm in gsm_ids) {
  tmp <- read_one_gsm(gsm, gsm_info = gsm_info, data_root = data_root)
  if (!is.null(tmp)) {
    seu_list[[gsm]] <- tmp
  }
}
seu <- Reduce(function(x, y) merge(x, y), seu_list)
rm(seu_list)
# 合并每个layers
seu <- JoinLayers(seu, assay = "RNA")
Layers(seu, assay = "RNA")  # 检查一下layers
# 线粒体/核糖体比例
seu <- PercentageFeatureSet(seu, pattern = "^MT-", col.name = "percent_mito")
seu <- PercentageFeatureSet(seu, pattern = "^RP[SL]", col.name = "percent_ribo")
seu$HERV_fraction <- seu$nCount_HERV / (seu$nCount_HERV + seu$nCount_RNA)
# 查看结果情况
seu
head(seu@meta.data)
table(seu$orig.ident)
table(seu$GSM)
seu$group <- factor(seu$group, levels = c("NC", "AD"))
# 保存数据
saveRDS(
  seu, 
  file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827.rds", 
  compress = "xz"
)

> seu
An object of class Seurat 
93660 features across 181653 samples within 2 assays 
Active assay: RNA (78691 features, 0 variable features)
 1 layer present: counts
 1 other assay present: HERV
> table(seu$orig.ident)
  AD1  AD10  AD13  AD19   AD2  AD20  AD21   AD4   AD5   AD6   AD8   AD9  NC11  NC12  NC14  NC15  NC16 
 6165 16437  1841  3898 15607  9430  8736  5895 12623  3204  2679 10269  3560 15727 10951  8492  6899 
 NC17  NC18   NC3   NC7 
14572 11821  6439  6408 

质控

过滤前

library(Seurat)
library(tidyverse)
library(patchwork)
library(Matrix)
seu <- readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827.rds")
res_dir <- "/public/home/GENE_proc/wth/GSE157827/res/"
feats <- c("nFeature_RNA", "nCount_RNA")
p_filt_b_1 <- VlnPlot(
  seu, features = feats,
  pt.size = 0,
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_b_2 <- VlnPlot(
  seu, features = feats,
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
feats <- c("percent_mito", "percent_ribo")
p_filt_b_3 <- VlnPlot(
  seu, features = feats, 
  pt.size = 0, 
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_b_4 <- VlnPlot(
  seu, features = feats, 
  pt.size = 0, 
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
pdf(file.path(res_dir, "before_filter.pdf"))
p <- (p_filt_b_1 / p_filt_b_2) | (p_filt_b_3 / p_filt_b_4)
print(p)
dev.off()

> fivenum(seu@meta.data$nFeature_RNA)
[1]    38   948  1618  2783 14543
> fivenum(seu@meta.data$nCount_RNA)
[1]    450   1367   2697   5627 118970
> fivenum(seu@meta.data$nCount_HERV)
[1]   0   3   8  16 375
> fivenum(seu@meta.data$nFeature_HERV)
[1]   0   3   6  13 191
> fivenum(seu@meta.data$HERV_fraction)
[1] 0.000000000 0.001904762 0.002690704 0.003473227 0.022727273
> fivenum(seu@meta.data$percent_mito)
[1]  0.0000000  0.4662729  1.0680017  2.6499303 96.6913580
> fivenum(seu@meta.data$percent_ribo)
[1]  0.0000000  0.2367424  0.3400397  0.5042546 15.1050420

my_GSE157827_10

设置阈值

# 细胞阈值
retained_c_umi_low <- seu$nFeature_RNA > 500
retained_c_umi_high <- seu$nFeature_RNA < 8000
retained_c_count_low  <- seu$nCount_RNA > 1000
retained_c_count_high <- seu$nCount_RNA < 40000
retained_c_mito <- seu$percent_mito < 14
retained_c_ribo <- seu$percent_ribo < 3
keep_cells <- retained_c_mito & retained_c_ribo & retained_c_umi_low & retained_c_umi_high
scRNA_cells <- subset(seu, cells = Cells(seu)[keep_cells])
counts_rna <- GetAssayData(scRNA_cells, assay = "RNA", slot = "counts")
feature_rowsum <- Matrix::rowSums(counts_rna > 0)
# 基因阈值
retained_f_low <- feature_rowsum > ncol(scRNA_cells) * 0.005
genes_keep <- names(feature_rowsum)[retained_f_low]
counts_herv <- GetAssayData(scRNA_cells, assay = "HERV", slot = "counts")
herv_rowsum <- Matrix::rowSums(counts_herv > 0)
# hERV位点阈值
herv_cutoff <- ceiling(ncol(scRNA_cells) * 0.0001)
herv_keep <- names(herv_rowsum)[herv_rowsum > herv_cutoff]
cat("HERV features before:", nrow(counts_herv), "\n")  # 14969
cat("HERV features kept:", length(herv_keep), "\n")  # 4326

my_GSE157827_11

过滤

scRNA_cells[["RNA"]] <- subset(scRNA_cells[["RNA"]], features = genes_keep)
scRNA_cells[["HERV"]] <- subset(scRNA_cells[["HERV"]], features = herv_keep)
scRNA_cells
table(scRNA_cells$group)
table(scRNA_cells$orig.ident)
rna_counts <- GetAssayData(scRNA_cells, assay = "RNA", slot = "counts")
scRNA_cells$nCount_RNA <- Matrix::colSums(rna_counts)
scRNA_cells$nFeature_RNA <- Matrix::colSums(rna_counts > 0)
herv_counts <- GetAssayData(scRNA_cells, assay = "HERV", slot = "counts")
scRNA_cells$nCount_HERV <- Matrix::colSums(herv_counts)
scRNA_cells$nFeature_HERV <- Matrix::colSums(herv_counts > 0)
scRNA_cells$HERV_fraction <- scRNA_cells$nCount_HERV / (scRNA_cells$nCount_HERV + scRNA_cells$nCount_RNA)

过滤后结果

feats <- c("nFeature_RNA", "nCount_RNA")
p_filt_a_1 <- VlnPlot(
  scRNA_cells, features = feats,
  pt.size = 0,
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_a_2 <- VlnPlot(
  scRNA_cells, features = feats,
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
# HERV?
feats <- c("nCount_HERV", "nFeature_HERV")
p_filt_a_3 <- VlnPlot(
  scRNA_cells, features = feats,
  pt.size = 0,
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_a_4 <- VlnPlot(
  scRNA_cells, features = feats,
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
p_filt_a_7 <- VlnPlot(
  scRNA_cells, features = "HERV_fraction",
  pt.size = 0,
  ncol = 1,
  group.by = "group"
) + NoLegend()
p_filt_a_8 <- VlnPlot(
  scRNA_cells, features = "HERV_fraction",
  pt.size = 0,
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
feats <- c("percent_mito", "percent_ribo")
p_filt_a_5 <- VlnPlot(
  scRNA_cells, features = feats, 
  pt.size = 0, 
  ncol = 2,
  group.by = "group"
) + NoLegend()
p_filt_a_6 <- VlnPlot(
  scRNA_cells, features = feats, 
  pt.size = 0, 
  ncol = 1,
  group.by = "orig.ident"
) + NoLegend()
pdf(file.path(res_dir, "after_filter.pdf"), width = 20, height = 20)
p <- (p_filt_a_1 / p_filt_a_2) | (p_filt_a_3 / p_filt_a_4) | (p_filt_a_5 / p_filt_a_6) | (p_filt_a_7 / p_filt_a_8)
print(p)
dev.off()
# 保存数据
saveRDS(
    scRNA_cells, 
    file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_filt.rds", 
    compress = "xz"
)

最终结果：

> dim(scRNA_cells)  # 不包括hERV
[1]  21002 164219 
> table(scRNA_cells$GSM)
GSM4775561 GSM4775562 GSM4775563 GSM4775564 GSM4775565 GSM4775566 GSM4775567 GSM4775568 GSM4775569 
      5865      15080       5435      12140        230       2167       9238      15703       1517 
GSM4775570 GSM4775571 GSM4775572 GSM4775573 GSM4775574 GSM4775575 GSM4775576 GSM4775577 GSM4775578 
      3549       9037       7691       6056       6130       1520      15185      10596       8175 
GSM4775579 GSM4775580 GSM4775581 
      3554      13900      11451 

作者的计数矩阵过滤后：

> dim(scRNA_filt)
[1]  17276 169516
> table(scRNA_filt$orig.ident)
GSM4775561 GSM4775562 GSM4775563 GSM4775564 GSM4775565 GSM4775566 GSM4775567 GSM4775568 GSM4775569 
      6173      15276       5980      12356        311       2405       9789      16013       1608 
GSM4775570 GSM4775571 GSM4775572 GSM4775573 GSM4775574 GSM4775575 GSM4775576 GSM4775577 GSM4775578 
      3614       9169       7786       6218       6372       1587      15568      10932       8385 
GSM4775579 GSM4775580 GSM4775581 
      4235      14263      11476 

与作者的计数矩阵对照

library(Seurat)
library(tidyverse)
library(patchwork)
library(Matrix)
seu_auth <- readRDS("/public/home/GENE_proc/wth/GSE157827/raw_data/GSE157827_filt.rds")
seu_ours <- readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_filt.rds")
res_dir <- "/public/home/GENE_proc/wth/GSE157827/res/"
auth_cells <- colnames(seu_auth)
our_cells <- colnames(seu_ours)
common_cells <- intersect(auth_cells, our_cells)
mat_ours <- GetAssayData(seu_ours, assay = "RNA", slot = "counts")
mat_auth <- GetAssayData(seu_auth, assay = "RNA", slot = "counts")
common_genes <- intersect(rownames(mat_ours), rownames(mat_auth))

> cat("ours cells:", length(our_cells), "\n")
ours cells: 164219 
> cat("auth cells:", length(auth_cells), "\n")
auth cells: 169516 
> cat("common cells:", length(common_cells), "\n")
common cells: 161921 
> cat("overlap % (of ours):", round(length(common_cells)/length(our_cells)*100, 2), "%\n")
overlap % (of ours): 98.6 %
> cat("overlap % (of auth):", round(length(common_cells)/length(auth_cells)*100, 2), "%\n")
overlap % (of auth): 95.52 %
> cat("ours genes:", length(rownames(mat_ours)), "\n")
ours genes: 21002 
> cat("auth genes:", length(rownames(mat_auth)), "\n")
auth genes: 17276 
> cat("common genes:", length(common_genes), "\n")
common genes: 13938 
> cat("overlap % (of ours):", round(length(common_genes)/length(rownames(mat_ours))*100, 2), "%\n")
overlap % (of ours): 66.37 %
> cat("overlap % (of auth):", round(length(common_genes)/length(rownames(mat_auth))*100, 2), "%\n")
overlap % (of auth): 80.68 %

在重叠的细胞上比较计数的一致性：

每个细胞的总UMI：

mat_ours_sub <- mat_ours[common_genes, common_cells, drop = FALSE]
mat_auth_sub <- mat_auth[common_genes, common_cells, drop = FALSE]
our_lib  <- Matrix::colSums(mat_ours_sub)
auth_lib <- Matrix::colSums(mat_auth_sub)
cor_lib <- cor(log10(our_lib + 1), log10(auth_lib + 1))
cat("cell library size corr (log10+1):", cor_lib, "\n")  # 0.9997134
pdf(file.path(res_dir, "author_our_umi.pdf"), width = 10, height = 10)
plot(log10(auth_lib + 1), log10(our_lib + 1),
     xlab = "author log10(lib+1)", ylab = "ours log10(lib+1)", pch = 16, cex = 0.4)
abline(0, 1, col = "red")
dev.off()

my_GSE157827_12

每个基因的总counts：

our_gene_sum  <- Matrix::rowSums(mat_ours_sub)
auth_gene_sum <- Matrix::rowSums(mat_auth_sub)
cor_gene <- cor(log10(our_gene_sum + 1), log10(auth_gene_sum + 1))
cat("gene total counts corr (log10+1):", cor_gene, "\n")  # 0.9746644
pdf(file.path(res_dir, "author_our_counts.pdf"), width = 10, height = 10)
plot(log10(auth_gene_sum + 1), log10(our_gene_sum + 1),
     xlab = "author log10(geneSum+1)", ylab = "ours log10(geneSum+1)", pch = 16, cex = 0.4)
abline(0, 1, col = "red")
dev.off()

my_GSE157827_13

每个细胞的基因向量相关性（随机抽3k细胞×3k基因）：

set.seed(1)
v_my <- as.vector(log1p(mat_ours_sub))
v_auth <- as.vector(log1p(mat_auth_sub))
cells_sample <- sample(colnames(mat_ours_sub), 3000)
genes_sample <- sample(rownames(mat_ours_sub), 3000)
my_sub <- as.matrix(mat_ours_sub[genes_sample, cells_sample])
auth_sub <- as.matrix(mat_auth_sub[genes_sample, cells_sample])
v_my_sub <- as.vector(log1p(my_sub))
v_auth_sub <- as.vector(log1p(auth_sub))
cor_flat <- cor(v_my_sub, v_auth_sub)
cor_flat  # 0.9677604
pdf(file.path(res_dir, "author_our_corr.pdf"), width = 10, height = 10)
par(pin = c(10, 10))
plot(v_my_sub, v_auth_sub, pch = ".", xlab = "log1p(my)", ylab = "log1p(author)")
abline(0, 1, col = "red")
dev.off()

my_GSE157827_14

抽取一部分细胞到本地

scRNA_cells <- readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_filt.rds")
set.seed(20260102)
prop_keep <- 1/3
cells_keep <- scRNA_cells@meta.data %>%
  rownames_to_column("cell") %>%
  group_by(orig.ident) %>%
  slice_sample(prop = prop_keep) %>%
  pull(cell)
scRNA_smaller <- subset(scRNA_cells, cells = cells_keep)
saveRDS(
    scRNA_smaller, 
    file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_filt_smaller.rds", 
    compress = "xz"
)

> ncol(scRNA_smaller)
[1] 54731
> table(scRNA_smaller$group)
   NC    AD 
25519 29212 
> table(scRNA_smaller$orig.ident)
 AD1 AD10 AD13 AD19  AD2 AD20 AD21  AD4  AD5  AD6  AD8  AD9 NC11 NC12 NC14 NC15 NC16 NC17 NC18  NC3 
1955 5234  505 1183 5026 3012 2563 1811 4046   76  722 3079  506 5061 3532 2725 1184 4633 3817 2018 
 NC7 
2043 

数据预处理

library(Seurat)
library(tidyverse)
library(patchwork)
library(Matrix)
library(clustree)
# seu <- readRDS("C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\my_data\\GSE157827_filt_smaller.rds")
# res_dir <- "C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\my_res"
seu <- readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_filt.rds")
res_dir <- "/public/home/GENE_proc/wth/GSE157827/res/"

RNA：常规预处理

seu <- NormalizeData(
  seu,
  assay = "RNA",
  normalization.method = "LogNormalize",
  scale.factor = 10000
)
seu <- FindVariableFeatures(
  seu,
  assay = "RNA",
  selection.method = "vst",
  nfeatures = 2000
)
seu <- ScaleData(
  seu,
  assay = "RNA",
  features = VariableFeatures(seu),
  vars.to.regress = c("nFeature_RNA", "percent_mito")
)
seu <- RunPCA(
  seu,
  assay = "RNA",
  features = VariableFeatures(seu),
  npcs = 50
)
pdf(file.path(res_dir, "ElbowPlot_all.pdf"))
p <- ElbowPlot(seu, ndims = 50, reduction = "pca")
print(p)
dev.off()

my_GSE157827_18

pc.use <- 1:30
seu <- RunUMAP(seu, reduction = "pca", dims = pc.use)
p_dim_group <- DimPlot(seu, group.by = "group")
p_dim_gsm <- DimPlot(seu, group.by = "orig.ident", raster = TRUE)
seu <- FindNeighbors(seu, dims = pc.use)
seu <- FindClusters(seu, resolution = c(0.01, 0.05, 0.1, 0.2, 0.5))
p_cutree <- clustree(seu@meta.data, prefix = "RNA_snn_res.")
# 选一个resolution作为最终cluster标签（教程选0.05）
Idents(seu) <- seu$RNA_snn_res.0.05
# 把cluster编号标到UMAP上
p_dim_cluster <- DimPlot(seu, label = TRUE)
# 拼图展示
pdf(file.path(res_dir, "data_preprocess.pdf"), width = 16, height = 16)
p <- (p_dim_group | p_dim_gsm) / (p_cutree | p_dim_cluster)
print(p)
dev.off()

my_GSE157827_23

按样本混合的较均匀，目前不需要去批次效应

HERV：特殊方法

用RNA的库深度(nCount_RNA)做LogNormalize

normalize_herv_by_rna_depth <- function(seu, scale.factor = 10000) {
  rna_counts  <- GetAssayData(seu, assay = "RNA",  slot = "counts")
  herv_counts <- GetAssayData(seu, assay = "HERV", slot = "counts")
  lib_rna <- Matrix::colSums(rna_counts)  # 基因counts作size factor
  herv_norm <- t(t(herv_counts) / lib_rna) * scale.factor
  herv_norm@x <- log1p(herv_norm@x)
  seu <- SetAssayData(seu, assay = "HERV", slot = "data", new.data = herv_norm)
  seu$HERV_fraction <- seu$nCount_HERV / (seu$nCount_HERV + seu$nCount_RNA)
  return(seu)
}
DefaultAssay(seu) <- "HERV"
seu <- normalize_herv_by_rna_depth(seu, scale.factor = 10000)

检查标准化结果是否正确

get_mat <- function(seu, assay, slot) {
  tryCatch(
    GetAssayData(seu, assay = assay, slot = slot),
    error = function(e) LayerData(seu, assay = assay, layer = slot)
  )
}
rna_counts  <- get_mat(seu, "RNA",  "counts")
herv_counts <- get_mat(seu, "HERV", "counts")
herv_data   <- get_mat(seu, "HERV", "data")
# RNA库深度一致性
rna_lib <- Matrix::colSums(rna_counts)
cat("cor(meta nCount_RNA, colSums(RNA counts)) = ",
    cor(seu$nCount_RNA, rna_lib), "\n")
print(summary(seu$nCount_RNA - as.numeric(rna_lib)))
# 验证计算正确性
set.seed(1)
cells <- sample(colnames(seu), 200)
feats <- sample(rownames(herv_counts), min(200, nrow(herv_counts)))
hsub <- herv_counts[feats, cells, drop = FALSE]
rna_lib_sub <- as.numeric(rna_lib[cells])
manual <- log1p(t(t(as.matrix(hsub)) / rna_lib_sub * 10000))
stored <- as.matrix(herv_data[feats, cells, drop = FALSE])
cat("max(abs(manual - stored)) = ", max(abs(manual - stored)), "\n")
cat("cor(manual, stored) = ", cor(as.vector(manual), as.vector(stored)), "\n")
# HERV_fraction一致性
herv_lib <- Matrix::colSums(herv_counts)
frac_manual <- as.numeric(herv_lib) / (as.numeric(rna_lib) + as.numeric(herv_lib))
cat("cor(meta HERV_fraction, manual) = ", cor(seu$HERV_fraction, frac_manual), "\n")
print(summary(seu$HERV_fraction - frac_manual))
# 标准化后是否仍强依赖RNA库深度
# HERV线性归一化总量（未log）与RNA库深度（应明显减弱相关）
herv_norm_linear <- as.numeric(herv_lib) / as.numeric(rna_lib) * 10000
plot(log10(as.numeric(rna_lib) + 1), log10(herv_norm_linear + 1),
     xlab = "log10(RNA library size + 1)",
     ylab = "log10(HERV normalized sum + 1)")
# 随便挑一个HERV feature（你也可以换成你感兴趣的ERV位点）看FeaturePlot
DefaultAssay(seu) <- "HERV"
feat1 <- rownames(herv_counts)[3]
FeaturePlot(seu, features = feat1, reduction = "umap")

my_GSE157827_16

my_GSE157827_17

> cat("cor(meta nCount_RNA, colSums(RNA counts)) = ", cor(seu$nCount_RNA, rna_lib), "\n")
cor(meta nCount_RNA, colSums(RNA counts)) =  1 
> print(summary(seu$nCount_RNA - as.numeric(rna_lib)))
   Min. 1st Qu.  Median    Mean 3rd Qu.  Max.
      0       0       0       0       0    0
> cat("max(abs(manual - stored)) = ", max(abs(manual - stored)), "\n")
max(abs(manual - stored)) =  0 
> cat("cor(manual, stored) = ", cor(as.vector(manual), as.vector(stored)), "\n")
cor(manual, stored) =  1 
> cat("cor(meta HERV_fraction, manual) = ", cor(seu$HERV_fraction, frac_manual), "\n")
cor(meta HERV_fraction, manual) =  1 
> print(summary(seu$HERV_fraction - frac_manual))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
      0       0       0       0       0       0 

标准化结果是合理的

对于第一张图，随着RNA_lib增大，hERV总counts会按1/RNA_lib下降，因此出现成排斜线（每条线对应一个固定的系数）；同时低RNA库深度的细胞上hERV方差更大，因此左侧更散
对于第二张图，每细胞HERV总量≈8、RNA_lib≈2700，标准化后范围大概在1.3~1.7
大部分hERV位点在大部分细胞里计数为0
标准化后，非0的值通常在0~3左右

细胞注释

DefaultAssay(seu) <- "RNA"
# marker基因
gene_list <- list(
  Astro = c("AQP4", "ADGRV1", "GPC5", "RYR3"),
  Endo  = c("CLDN5", "ABCB1", "EBF1"),
  Excit = c("CAMK2A", "CBLN2", "LDB2"),
  Inhib = c("GAD1", "LHFPL3", "PCDH15"),
  Mic   = c("C3", "LRMDA", "DOCK8"),
  Oligo = c("MBP", "PLP1", "ST18")
)
# 在各个聚类中的表达量
pdf(file.path(res_dir, "cluster_marker_gene.pdf"), width = 16, height = 12)
p <- DotPlot(seu, features = gene_list, assay = "RNA") +
  theme(axis.text.x = element_text(angle = 60, vjust = 0.8, hjust = 0.8))
print(p)
dev.off()
# 计算每个cluster对每个细胞类型的marker-set的平均表达
score_by_cluster <- function(seu, gene_list, group_col) {
  out <- lapply(names(gene_list), function(ct) {
    feats <- intersect(gene_list[[ct]], rownames(seu))
    if (length(feats) == 0) return(NULL)
    avg <- AverageExpression(
      seu,
      assays = "RNA",
      features = feats,
      group.by = group_col,
      slot = "data",
      verbose = FALSE
    )$RNA  # features x clusters
    data.frame(
      cluster = colnames(avg),
      celltype = ct,
      score = colMeans(avg, na.rm = TRUE),
      stringsAsFactors = FALSE
    )
  }) |> bind_rows()
  tab <- out |>
    pivot_wider(names_from = celltype, values_from = score) |>
    arrange(as.numeric(as.character(cluster)))
  mat <- as.matrix(tab[, intersect(names(gene_list), colnames(tab)), drop = FALSE])
  tab$pred <- colnames(mat)[max.col(mat, ties.method = "first")]
  tab
}
score_tab <- score_by_cluster(seu, gene_list, "RNA_snn_res.0.05")
print(score_tab, n = nrow(score_tab))

抽样数据：

my_GSE157827_19

my_GSE157827_20

Oligodendrocyte：0
Astrocyte：2
Microglia：7
Endothelial：9
Inhibitory neuron：4、5、6、11
Excitatory neuron：1、3、8、10、12

全部数据：

my_GSE157827_21

my_GSE157827_22

很巧合的，全部数据的聚类结果和抽样数据的完全相同

cl_Astro <- c("2")
cl_Endo  <- c("9")
cl_Mic   <- c("7")
cl_Oligo <- c("0")
cl_Inhib <- c("4", "5", "6", "11") 
# 其它细胞归为Excit
cl <- as.character(seu[["RNA_snn_res.0.05"]][, 1])
seu$celltype <- ifelse(cl %in% cl_Astro, "Astro",
                  ifelse(cl %in% cl_Endo,  "Endo",
                    ifelse(cl %in% cl_Mic,   "Mic",
                      ifelse(cl %in% cl_Oligo, "Oligo",
                        ifelse(cl %in% cl_Inhib, "Inhib", "Excit")))))
seu$celltype <- factor(seu$celltype, levels = c("Astro","Endo","Excit","Inhib","Mic","Oligo"))
Idents(seu) <- seu$celltype
table(seu$celltype, seu$group)
# 保存数据
saveRDS(
    seu, 
    # file = "C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\my_data\\GSE157827_smaller_celltype.rds", 
    file = "C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\my_data\\GSE157827_celltype.rds",
    compress = "xz"
)

> table(seu$celltype, seu$group)
           NC    AD
  Astro  8411  9695
  Endo    566  1589
  Excit 28234 31031
  Inhib 17585 18747
  Mic    3555  3909
  Oligo 18216 22681
> table(seu$celltype)
Astro  Endo Excit Inhib   Mic Oligo 
18106  2155 59265 36332  7464 40897 

教程里的注释：

> table(scRNA$celltype, useNA = "ifany")
Astro  Endo Excit Inhib   Mic Oligo 
19120  2241 62093 37239  7773 41050 

基本没有差别

画图展示

ct_cols <- c(
  Astro = "#1b9e77",
  Endo  = "#d95f02",
  Excit = "#7570b3",
  Inhib = "#e7298a",
  Mic   = "#66a61e",
  Oligo = "#e6ab02"
)
# 细胞类型在umap上的分布
dim_plot <- DimPlot(
  seu, 
  reduction = "umap", 
  cols = ct_cols, 
  label = FALSE
)
# 统计各细胞类型的细胞数和比例
ct_stat <- as.data.frame(table(seu$celltype))
colnames(ct_stat) <- c("celltype", "total")
ct_stat$celltype <- factor(ct_stat$celltype, levels = levels(seu@active.ident))
ct_stat$percentage <- 100 * ct_stat$total / sum(ct_stat$total)
celltype_bar <- ggplot(ct_stat, aes(x = celltype, y = percentage, fill = celltype)) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_manual(values = ct_cols) +
  theme_classic() +
  theme(axis.title.x = element_blank(), legend.position = "none") +
  ylab("Proportion of total nuclei (%)")
# 各细胞类型marker热图
genes_paper <- c(
  "SLC1A2","ADGRV1","GPC5","RYR3","GFAP",
  "CLDN5","FLT1","ABCB1","EBF1","MT2A",
  "RALYL","KCNIP4","CBLN2","LDB2","KCNQ5",
  "NXPH1","LHFPL3","PCDH15","GRIK1","ADARB2",
  "LRMDA","DOCK8","ARHGAP24","ARHGAP15","PLXDC2",
  "ST18","PLP1","CTNNA3","MBP","PIP4K2A"
)
genes_use <- intersect(genes_paper, rownames(seu))
length(genes_use) == length(genes_paper)  # 全在我的数据集中
seu <- ScaleData(seu, features = genes_use, verbose = FALSE)
heatmap <- DoHeatmap(
  seu,
  features = genes_use,
  disp.min = -1.5,
  disp.max = 1.5,
  angle = 0,
  group.colors = unname(ct_cols)
) + scale_fill_gradientn(
  colors = c("#f100ef", "black", "#f0e442"),
  limits = c(-1.5, 1.5)
)
pdf(file.path(res_dir, "celltype.pdf"), width = 24, height = 16)
p <- dim_plot + celltype_bar / heatmap
print(p)
dev.off()

my_GSE157827_15

计算差异基因，便于后面计算富集分数

scRNA <- readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_celltype.rds")
DefaultAssay(scRNA) <- "RNA"
scRNA$group <- factor(scRNA$group, levels = c("NC","AD"))
scRNA$celltype <- factor(
  scRNA$celltype,
  levels = c("Astro","Endo","Excit","Inhib","Mic","Oligo")
)
Idents(scRNA) <- scRNA$celltype
balance_cells <- function(obj, ct){
  sub <- subset(obj, idents = ct)
  ad <- WhichCells(sub, expression = group == "AD")
  nc <- WhichCells(sub, expression = group == "NC")
  if(length(ad) > 2000){
    n <- min(length(ad), length(nc))
    sub[, c(sample(ad, n), sample(nc, n))]
  } else {
    sub
  }
}
cts <- levels(scRNA$celltype)
obj_bal <- merge(x = balance_cells(scRNA, cts[1]),
                 y = lapply(cts[-1], \(ct) balance_cells(scRNA, ct)))
obj_bal$celltype <- factor(
  obj_bal$celltype,
  levels = c("Astro","Endo","Excit","Inhib","Mic","Oligo")
)
obj_bal <- JoinLayers(obj_bal, assay = "RNA")
obj_bal$compare <- paste(obj_bal$group, obj_bal$celltype, sep = "_")
scRNA$compare <- paste(scRNA$group, scRNA$celltype, sep = "_")
ct <- levels(obj_bal$celltype)
all_markers <- lapply(ct, function(x){
  message("Running: ", x)
  FindMarkers(
    obj_bal,
    group.by = "compare",
    ident.1 = paste0("AD_", x),
    ident.2 = paste0("NC_", x),
    logfc.threshold = 0.25,
    min.pct = 0.1
  )
})
names(all_markers) <- ct
lapply(all_markers, nrow)
all_markers_sig <- lapply(all_markers, function(df){
  df2 <- subset(
    df, 
    p_val_adj < 0.1
  )
  return(df2)
})
View(all_markers_sig[["Astro"]])
save(all_markers_sig, file = "/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_all_markers_sig.RData")

差异表达分析

library(Seurat)
library(tidyverse)
library(patchwork)
library(Matrix)
library(clustree)
library(edgeR)
# seu <- readRDS("C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\my_data\\GSE157827_smaller_celltype.rds")
# res_dir <- "C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\my_res"
seu <- readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/GSE157827_celltype.rds")
res_dir <- "/public/home/GENE_proc/wth/GSE157827/res/"

细胞类型level

根据开题报告里的大致思路，先尝试pseudo-bulk

sample_col <- "orig.ident"
seu$sample <- seu@meta.data[[sample_col]]
seu$group  <- factor(seu$group, levels = c("NC", "AD"))
# 每个细胞类型在AD/NC里各有多少个样本（该样本在该细胞类型下至少有1个细胞）
cell_n <- as.data.frame(table(celltype = seu$celltype, sample = seu$sample, group = seu$group)) %>%
  filter(Freq > 0)
sample_stat <- cell_n %>%
  group_by(celltype, group) %>%
  summarise(
    n_samples = n_distinct(sample),
    min_cells = min(Freq),
    median_cells = as.numeric(median(Freq)),
    .groups = "drop"
  ) %>%
  arrange(celltype, group)
# pseudo-bulk的计数矩阵（按细胞类型+样本聚合）
pb <- AggregateExpression(
  seu,
  assays = c("RNA", "HERV"),
  group.by = c("celltype", "sample"),
  slot = "counts",
  verbose = FALSE
)
pb_rna_counts  <- pb$RNA
pb_herv_counts <- pb$HERV
# 添加AD/NC信息
cn <- colnames(pb_herv_counts)
pb_meta <- tibble(pb_col = cn) %>%
  mutate(parts = strsplit(pb_col, "_")) %>%
  rowwise() %>%
  mutate(
    celltype = parts[[1]][1],
    sample   = parts[[2]][1]
  ) %>%
  ungroup()
sample2group <- seu@meta.data %>%
  distinct(sample, group)
pb_meta <- pb_meta %>%
  left_join(sample2group, by = "sample") %>%
  mutate(group = factor(group, levels = c("NC","AD")))
# 检查NA
print(table(is.na(pb_meta$group)))
print(table(pb_meta$celltype, pb_meta$group))
# 保存数据
saveRDS(
  list(pb_rna_counts = pb_rna_counts, pb_herv_counts = pb_herv_counts, sample_stat = sample_stat, pb_meta = pb_meta),
  file = "/public/home/GENE_proc/wth/GSE157827/my_data/pseudobulk_counts_by_celltype_sample.rds"
)

先在一个细胞类型上算hERV的差异，这里选用astro做test

# readRDS("/public/home/GENE_proc/wth/GSE157827/my_data/pseudobulk_counts_by_celltype_sample.rds")
data <- readRDS("C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\my_data\\pseudobulk_counts_by_celltype_sample.rds")
pb_rna_counts = data[["pb_rna_counts"]]; pb_herv_counts = data[["pb_herv_counts"]]; sample_stat = data[["sample_stat"]]; pb_meta = data[["pb_meta"]]; rm(data)
run_edger_one <- function(count_mat, pb_meta, celltype) {
  idx <- pb_meta$celltype == celltype
  meta <- pb_meta[idx, ]
  y <- DGEList(counts = count_mat[, idx, drop = FALSE])
  y$samples$group <- meta$group
  cat("features before filterByExpr:", nrow(y), "\n")
  keep <- filterByExpr(y, group = y$samples$group)
  y <- y[keep, , keep.lib.sizes = FALSE]
  cat("features after  filterByExpr:", nrow(y), "\n")
  y <- calcNormFactors(y)
  design <- model.matrix(~ group, data = y$samples)  # NC为基线
  y <- estimateDisp(y, design)
  fit <- glmQLFit(y, design)
  qlf <- glmQLFTest(fit, coef = "groupAD")
  res <- topTags(qlf, n = Inf)$table
  res$feature <- rownames(res)
  rownames(res) <- NULL
  res
}
res_herv_astro <- run_edger_one(pb_herv_counts, pb_meta, "Astro")
View(res_herv_astro)

features before filterByExpr: 4326 
features after  filterByExpr: 92 

my_GSE157827_24

不是“多重校正把信号掩盖了（92不算多），而是信号本身就不够强，即pseudo-bulk在astro大类的hERV差异上不显著（不等于“完全没差异”，只是在现在的方法下达不到显著）

之后我尝试了普通单细胞的方法(FindMarkers)

astro <- subset(seu, subset = celltype == "Astro")
DefaultAssay(astro) <- "HERV"
Idents(astro) <- astro$group  # AD vs NC
sc_herv_astro <- FindMarkers(
  astro,
  ident.1 = "AD",
  ident.2 = "NC",
  assay = "HERV",
  slot = "data",
  test.use = "MAST",
  latent.vars = c("orig.ident")  # 把样本作为协变量，减轻伪重复
)

my_GSE157827_25

去掉latent.vars = c("orig.ident")后

my_GSE157827_26

结果和pseudo-bulk也类似——把“样本/个体”作为协变量后，也只有少数能保持差异

对所有细胞类型都运行一遍pseudo-bulk分析流程

run_edger_one2 <- function(count_mat, pb_meta, celltype) {
  idx <- pb_meta$celltype == celltype
  meta <- pb_meta[idx, ]
  y <- DGEList(counts = count_mat[, idx, drop = FALSE])
  y$samples$group <- meta$group
  n_before <- nrow(y)
  keep <- filterByExpr(y, group = y$samples$group)
  y <- y[keep, , keep.lib.sizes = FALSE]
  n_after <- nrow(y)
  y <- calcNormFactors(y)
  design <- model.matrix(~ group, data = y$samples)
  y <- estimateDisp(y, design)
  fit <- glmQLFit(y, design)
  qlf <- glmQLFTest(fit, coef = "groupAD")
  res <- topTags(qlf, n = Inf)$table
  tibble(
    celltype = celltype,
    tested = n_after,
    minP = min(res$PValue),
    n_FDR_0.05 = sum(res$FDR < 0.05),
    n_FDR_0.10 = sum(res$FDR < 0.10),
    n_FDR_0.20 = sum(res$FDR < 0.20)
  )
}
cts <- levels(seu$celltype)
sum_tab <- bind_rows(lapply(cts, \(ct) run_edger_one2(pb_herv_counts, pb_meta, ct))) %>%
  arrange(minP)
View(sum_tab)
sum_tab_rna <- bind_rows(lapply(cts, \(ct) run_edger_one2(pb_rna_counts, pb_meta, ct))) %>%
  arrange(minP)
View(sum_tab_rna)

my_GSE157827_28

可以看到，我的pseudo-bulk HERV差异结果整体偏弱（在FDR意义下），同时Mic和Endo两类细胞把绝大多数HERV位点都过滤掉了（常见原因：伪bulk库里HERV总计数很低/太稀疏）。说明HERV在这个数据集、这个粒度（6大类）下的稳定差异可能本来就很弱，需要换一种更适合稀疏特征的过滤/检验策略

my_GSE157827_27

这说明我们构建pseudo-bulk矩阵+分组+edgeR的流程是没问题的，但RNA在“6大类细胞类型”这个粒度下也整体不显著，说明这个数据在“粗粒度细胞类型+简单group(AD/NC)”下，稳定差异本来就弱，再加上HERV更稀疏，所以更难显著。因此可以先细化差异分析的粒度，比如在某个细胞类型中划分亚群，在亚群中做伪bulk差异

细胞类型亚群level

my_GSE157827_29 my_GSE157827_30

my_GSE157827_31

> table(Idents(scRNA_astro))
  a1   a2   a4  a11   a7   a8  a12   a6  a10   a5   a9   a3 
3557 3545 1880  369 1250  940  151 1275  396 1591  469 2683 
> table(scRNA_astro$sle_reso, scRNA_astro$group)
        NC   AD
  a1  2011 1546
  a10  367   29
  a11  296   73
  a12   40  111
  a2  1936 1609
  a3   248 2435
  a4  1834   46
  a5    27 1564
  a6   610  665
  a7   631  619
  a8   378  562
  a9    33  436

教程的聚类结果：好像差不太多

  a1   a2   a8   a5   a4   a7   a9   a6   a3 
7043 3040  854 1394 1414 1059  550 1321 2445 

还是先计算一下富集分数（hERV的差异分析暂时用不到，后面可能用到，这块顺手做一下）

load("C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\my_data\\GSE157827_all_markers_sig.RData")
astro_up   <- rownames(subset(all_markers_sig$Astro, avg_log2FC > 0))
astro_down <- rownames(subset(all_markers_sig$Astro, avg_log2FC < 0))
scRNA_astro <- AddModuleScore(scRNA_astro, features = list(astro_up),   name = "ADup")
scRNA_astro <- AddModuleScore(scRNA_astro, features = list(astro_down), name = "ADdown")
VlnPlot(scRNA_astro, features = c("ADup1","ADdown1"), group.by = "sle_reso", pt.size = 0) / FeaturePlot(scRNA_astro, features = c("ADup1","ADdown1"))
df <- FetchData(scRNA_astro, vars = c("sle_reso","ADup1","ADdown1","group"))
score_tab <- df %>%
  group_by(sle_reso) %>%
  summarise(
    mean_up   = mean(ADup1),
    mean_down = mean(ADdown1),
    delta = mean_up - mean_down,
    n = n()
  ) %>%
  arrange(desc(delta))
View(score_tab)

my_GSE157827_32

my_GSE157827_33

up_core   <- c("a3", "a9", "a12", "a5")
down_core <- c("a4", "a10", "a11")
scRNA_astro$subpop <- "No change"
scRNA_astro$subpop[scRNA_astro$sle_reso %in% up_core]   <- "Up"
scRNA_astro$subpop[scRNA_astro$sle_reso %in% down_core] <- "Down"
scRNA_astro$subpop <- factor(scRNA_astro$subpop, levels=c("Up","Down","No change"))
prop <- prop.table(table(scRNA_astro$sle_reso, scRNA_astro$group), 1)
# 保存数据
saveRDS(
  scRNA_astro, 
  file = "C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\my_data\\GSE157827_astro.rds", 
  compress = "xz"
)

> table(scRNA_astro$subpop)
       Up      Down No change 
     4894      2645     10567 
> prop[c(up_core, down_core), ]
              NC         AD
  a3  0.09243384 0.90756616
  a9  0.07036247 0.92963753
  a12 0.26490066 0.73509934
  a5  0.01697046 0.98302954
  a4  0.97553191 0.02446809
  a10 0.92676768 0.07323232
  a11 0.80216802 0.19783198

选出用于hERV差异分析的亚群：

如果一个样本在该亚群里≥20细胞，则算做一个有效样本，如果一个亚群的AD/NC有效样本都多于5个，就用这个亚群
hERV差异分析是否应该在上面得到的差异明显的亚群里做？一个问题是这些亚群往往都只有极少数AD或NC细胞，而pseudo-bulk必须要求样本数够多
- 看来选那些NC-AD数比较均衡的亚群更好？
- 仿照之前教程复现中的做法，根据Up/Down将亚群合并，做Up/Down的hERV差异分析？
  
  然而这种方法也不能回避关键问题：某个样本要么是来自NC，要么是来自AD，而up/down中也是“up的AD多，down的NC多”，导致一个样本无法有足够多的、被划分成up/down的细胞
都试一下：
- 路线A：只在“AD/NC两组都有足够样本贡献”的亚群里做表达差异
- 路线B：按Up/Down做“普通单细胞”的HERV差异

先尝试路线A

# 选出适合pseudo-bulk的亚群
min_cells <- 20
min_samples_each_group <- 6
cell_tab <- scRNA_astro@meta.data %>%
  dplyr::transmute(sub = sle_reso, group = group, sample = orig.ident) %>%
  dplyr::count(sub, group, sample, name = "n_cells")
sum_tab <- cell_tab %>%
  dplyr::group_by(sub, group) %>%
  dplyr::summarise(n_samples_ge = sum(n_cells >= min_cells), .groups="drop") %>%
  tidyr::pivot_wider(names_from = group, values_from = n_samples_ge, values_fill = 0) %>%
  dplyr::mutate(keep = (NC >= min_samples_each_group) & (AD >= min_samples_each_group)) %>%
  dplyr::arrange(dplyr::desc(keep), sub)
keep_subs <- sum_tab %>% dplyr::filter(keep) %>% dplyr::pull(sub)
keep_subs  # "a1" "a2" "a6" "a7"

先拿a1做一个小测试

run_pb_herv_one_sub <- function(obj, sub_id, min_cells = 20, min_cpm = 1, min_samp = 3) {
  sub_obj <- subset(obj, subset = sle_reso == sub_id)
  pb_herv <- AggregateExpression(
    sub_obj,
    assays = "HERV",
    slot = "counts",
    group.by = "orig.ident",
    return.seurat = FALSE
  )$HERV
  saminfo <- sub_obj@meta.data %>%
    distinct(orig.ident, group) %>%
    rename(sample = orig.ident) %>%
    filter(sample %in% colnames(pb_herv)) %>%
    arrange(match(sample, colnames(pb_herv)))
  stopifnot(all(saminfo$sample == colnames(pb_herv)))
  saminfo$group <- factor(saminfo$group, levels = c("NC", "AD"))
  print(table(saminfo$group))
  cat("pb_herv dim:", paste(dim(pb_herv), collapse = " x "), "\n")
  y <- DGEList(counts = pb_herv, group = saminfo$group)
  cpm_mat <- cpm(y)
  keep <- (rowSums(cpm_mat[, saminfo$group == "NC", drop=FALSE] > min_cpm) >= min_samp) |
          (rowSums(cpm_mat[, saminfo$group == "AD", drop=FALSE] > min_cpm) >= min_samp)
  y <- y[keep, , keep.lib.sizes = FALSE]
  cat("features kept after CPM filter:", nrow(y), "\n")
  y <- calcNormFactors(y, method = "TMM")
  design <- model.matrix(~ group, data = saminfo)
  y <- estimateDisp(y, design, robust = TRUE)
  fit <- glmQLFit(y, design, robust = TRUE)
  qlf <- glmQLFTest(fit, coef = "groupAD")
  res <- topTags(qlf, n = Inf)$table %>%
    tibble::rownames_to_column("feature")
  cat("min PValue:", min(res$PValue), "\n")
  cat("min FDR  :", min(res$FDR), "\n")
  cat("FDR<0.05:", sum(res$FDR < 0.05), "\n")
  cat("FDR<0.10:", sum(res$FDR < 0.10), "\n")
  cat("FDR<0.20:", sum(res$FDR < 0.20), "\n")
  list(res = res, pb = pb_herv, saminfo = saminfo)
}
out_a1 <- run_pb_herv_one_sub(scRNA_astro, sub_id = "a1")
View(out_a1$res)

NC AD 
 9 11 
pb_herv dim: 4326 x 20 
features kept after CPM filter: 405 
min PValue: 0.0004331899 
min FDR  : 0.122591 
FDR<0.05: 0 
FDR<0.10: 0 
FDR<0.20: 4 

my_GSE157827_34

因为调整了feature的过滤方法，所以filterByExpr后剩下405个，比较正常。FDR最小为0.1226，在“稀疏+多重检验+效应可能不大”的场景下还算合理（HERV在样本间离散度大、且很多feature在一部分样本几乎0），因此现在能看到一些候选，但还不足以说强显著

因为hERV太稀疏，所以还要看是不是少数样本/outlier值影响了结果

最后确实得到的结果FDR都有些太大了，虽然经过检验（画小提琴图展示样本计数分布、leave-one-out重新计算）很多信号不是被某一个样本单点“拽出来”的那种极端情况，但不同样本对显著性会有影响

尝试路线B

在之前的教程中，我们在“找Up-Down的差异基因并GO富集”的步骤中：

marker_Astro <- FindMarkers(
  scRNA_astro_sub, 
  group.by = "subpop",
  ident.1 = "Down", 
  ident.2 = "Up"
)
marker_Astro <- marker_Astro[order(marker_Astro$p_val_adj, decreasing = F), ]
marker_Astro_sig <-  marker_Astro %>%
  rownames_to_column("gene") %>%
  filter(p_val_adj < 0.05, abs(avg_log2FC) >= 0.25) %>% 
  filter(pct.1 >= 0.1 | pct.2 >= 0.1) %>% 
  filter(!grepl("^(ENSG|MT-|LINC)|^(AC|AL)[0-9]", gene))

这里先简单参照一下

scUD <- readRDS("C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\my_data\\GSE157827_astro_sub.rds")
DefaultAssay(scUD) <- "HERV"
Idents(scUD) <- "subpop"
# Wilcoxon：不校正协变量
mk_wilcox <- FindMarkers(
  scUD,
  ident.1 = "Up", ident.2 = "Down",
  test.use = "wilcox",
  logfc.threshold = 0.5,
  min.pct = 0.01
)
View(mk_wilcox %>% arrange(p_val_adj))
# MAST：把“样本/库深度/线粒体”等作为协变量
mk_mast <- FindMarkers(
  scUD,
  ident.1 = "Up", ident.2 = "Down",
  test.use = "MAST",
  latent.vars = c("orig.ident", "nCount_RNA", "percent_mito"),
  logfc.threshold = 0.5,
  min.pct = 0.01
)
View(mk_mast %>% arrange(p_val_adj))

my_GSE157827_35

my_GSE157827_36

因为如果把样本作为协变量，由于每个样本内不可能同时有大量的AD和NC细胞，所以p_adj就变成了1

虽然教程里是这么做的，但GPT提出了一个问题：up-down的状态差异是否等于AD-NC的疾病差异，是否应该“在同一亚群/同一状态内比较（比如路线A）”，或者只在AD/NC内比较Up-Down，如果某些hERV确实是疾病差异，应该在多个亚群内都能看到一致方向的AD-NC差异

细胞轨迹test

简介

以“Monocle3/Slingshot/tradeSeq”这三种工具为例

简单来说：

Monocle3：想把细胞排序（例如构建pseudotime），并得到分支/主干结构。适合发育/分化、免疫激活连续谱、应激/反应性渐变等情况
Slingshot：已经有Seurat聚类（UMAP/PCA），只想在这个结构上找主线/分支。适合亚群内连续谱（而不是“从零建图”）
tradeSeq：给定pseudotime（来自Monocle/Slingshot等），对每个基因拟合GAM平滑曲线，然后做统计检验，回答“表达随状态轴如何变化、组间曲线是否不同”的问题，即“把轨迹变成可检验的差异结论”

所以，Monocle3和Slingshot是上游分析，tradeSeq是下游分析

Monocle3：降维 → learn_graph → order_cells → pseudotime

轨迹骨架(principal graph)：不是简单连cluster，而是试图在低维空间里学出一条“主干+分支”的结构
pseudotime：每个细胞在骨架上的位置，只代表“顺序/相对位置”，不等价真实时间，需要root（起点）
分支/分区(partition/branch)：把结构分成若干区域，分支点附近常有“状态转换”意义，每个分支不一定是“真实生物学分化”，有时是UMAP/图学习导致的结构
相比Slingshot，如果想要得到骨架图（像发育谱系那样），或者认为之前得到的聚类不完全准确、不想完全依赖聚类拓扑，就使用Monocle3

Slingshot：低维坐标（PCA/UMAP）+ cluster标签 → 在聚类间构建连接结构 → 在细胞层面拟合principal curves得到轨迹

路径(Lineage)：在“从起点cluster出发”的拓扑下，存在多个可能终点/分支路径
pseudotime矩阵(pt_mat)：cells × lineages，每个细胞在每条lineage上都有一个pseudotime，如果不属于该路径就是NA

一个细胞可能只属于某一条路径，也可能在分支附近对多条路径都有一定归属
cellWeights矩阵(cw_mat)：cells × lineages，这个细胞对某条lineage的归属程度/权重，分支点附近的细胞可能对两条lineage都有权重
相比Monocle3，如果已经有准确聚类，且“状态谱”更多是聚类的连续过渡，希望轨迹推断对聚类结构更稳健，就使用Slingshot

tradeSeq：

输入：
- 原始计数矩阵
- pseudotime：来自Monocle/Slingshot等
- cellWeights：存在分支、有多条路径时很重要
- conditions：如AD/NC，用来比较曲线
输出：拟合模型对象gam——每个基因都有一条（或多条lineage × condition的）平滑表达曲线模型
常用检验：
- associationTest(gam)：这个基因是否随pseudotime变化，适合找“状态轴上的标志基因/动态基因”
- conditionTest(gam)：AD-NC的曲线形状/水平是否不同，组间在轨迹上是否不同步/不同形态
- startVsEndTest(gam)：起点与终点是否显著不同，体现稳态与反应态的差异
- earlyDETest(gam)：两组在轨迹早期就分开了吗（“早期偏移”）
- diffEndTest(gam)：不同分支终点的表达是否不同（分支终末态比较）

总结：

Monocle3：在该细胞类型内部学习到一条（或多条分支的）连续状态轨迹；伪时间刻画了从稳态向反应/应激态的连续过渡
Slingshot：基于聚类拓扑与低维嵌入，拟合得到若干条潜在状态路径；细胞在主路径上的伪时间可用于描述状态推进，并用于下游动态分析
tradeSeq：沿伪时间拟合表达平滑曲线，识别出显著随轨迹变化的基因，以及在AD与NC间轨迹动态显著不同的基因/模块，提示疾病改变了同一状态轴上的转变过程

以astro细胞为例，直接使用之前已完成预处理的Seurat对象

Monocle3的pseudotime

数据准备

library(Seurat)
library(tidyverse)
library(patchwork)
scRNA <- readRDS("C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\data\\GSE157827_astro.rds")
res_dir <- "C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\res"

pseudotime分析

library(monocle3)
expr <- GetAssayData(scRNA, slot = "counts")
cell_meta <- scRNA@meta.data
gene_meta <- data.frame(
  gene_short_name = rownames(expr), 
  row.names = rownames(expr)
)
cds <- new_cell_data_set(
  expr,
  cell_metadata = cell_meta,
  gene_metadata = gene_meta
)
# 直接使用Seurat的UMAP结果
reducedDims(cds)$UMAP <- Embeddings(scRNA, "umap")
cds <- cluster_cells(cds, reduction_method = "UMAP")
# 学习轨迹图
cds <- learn_graph(cds, use_partition = TRUE)
# try1：用braak.tangle.stage最小的细胞当root
earliest <- min(scRNA$braak.tangle.stage, na.rm = TRUE)
root_cells <- colnames(scRNA)[which(scRNA$braak.tangle.stage == earliest)]
# 排序
cds <- order_cells(cds, root_cells = root_cells)
pt <- monocle3::pseudotime(cds)
# 写回Seurat
scRNA$pseudotime <- pt[colnames(scRNA)]

# 轨迹与分组分布
FeaturePlot(scRNA, features = "pseudotime")  # 看轨迹轴在UMAP上的走向
VlnPlot(scRNA, features = "pseudotime", group.by = "group", pt.size = 0)
VlnPlot(scRNA, features = "pseudotime", group.by = "braak.tangle.stage", pt.size = 0)

细胞轨迹test1

似乎不是“整张UMAP从一端平滑过渡到另一端”的那种形态，大部分都接近0，只有小部分有颜色

细胞轨迹test2

与上张图分布相似，两组都大量堆在接近0的位置，只有少量细胞在1–2.5的尾部，说明pseudotime的“有效动态范围”只覆盖了一小部分细胞，所以组间整体差异不容易被拉开

细胞轨迹test3

stage也都是大量细胞集中在低pseudotime，且stage之间没有非常清晰的“随分期单调递增”的趋势，说明当前这条pseudotime轴与Braak分期的对齐程度不强，不能得出“AD一定更反应/更晚期”的结论

提示root可能选的过宽（我在6000细胞中选择600个分期最早的作为root），导致pseudotime本身就不太像“单一方向的进程”
Braak是样本级标签+分期不一定覆盖连续状态+样本间异质性大，导致不构成稳定梯度

# 单基因平滑曲线
genes <- c("SLC1A2","ALDH1L1","GFAP")
df <- FetchData(scRNA, vars = c("pseudotime","group", genes))
df_long <- pivot_longer(df, cols = all_of(genes), names_to = "gene", values_to = "expr")
ggplot(df_long, aes(x = pseudotime, y = expr, color = group)) +
  geom_point(size = 0.2, alpha = 0.2) +
  geom_smooth(se = FALSE, method = "loess", span = 0.3) +
  facet_wrap(~gene, scales = "free_y") +
  theme_classic()

细胞轨迹test4

GFAP逐渐上升、SLC1A2较平（理论上应该下降）、ALDH1L1逐渐上升（确实是表达增强，但可能受技术因素影响），说明pseudotime与astro某些状态变化有关

可能的原因：用earliest分期的大量细胞当root，导致root覆盖太广，所以绝大多数细胞的pseudotime接近0，只有离这些root都远的一部分区域出现高pseudotime

考虑到之前的分析中，我们在差异基因中选了最top的一些计算了模块分数，能不能用这个模块分数来辅助选root

# 用之前得到的划分好up/down的astro
scRNA_astro_sub <- readRDS("C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\data\\GSE157827_astro_sub.rds")
scRNA_astro_sub$subpop <- factor(scRNA_astro_sub$subpop, levels = c("Down","Up"))
marker_Astro <- FindMarkers(
  scRNA_astro_sub, 
  group.by = "subpop",
  ident.1 = "Down", 
  ident.2 = "Up"
)
marker_Astro <- marker_Astro[order(marker_Astro$p_val_adj, decreasing = F), ]
marker_Astro_sig <- marker_Astro %>%
  rownames_to_column("gene") %>%
  filter(p_val_adj < 0.05, abs(avg_log2FC) >= 0.25) %>% 
  filter(pct.1 >= 0.1 | pct.2 >= 0.1) %>% 
  filter(!grepl("^(ENSG|MT-|LINC)|^(AC|AL)[0-9]", gene))
# Down子群体更高表达的基因
astro_down <- marker_Astro_sig %>%
  filter(avg_log2FC > 0) %>%
  arrange(p_val_adj, desc(abs(avg_log2FC))) %>%
  slice_head(n = 20) %>%
  pull(gene)
# Up子群体更高表达的基因
astro_up <- marker_Astro_sig %>%
  filter(avg_log2FC < 0) %>%
  arrange(p_val_adj, desc(abs(avg_log2FC))) %>%
  slice_head(n = 20) %>%
  pull(gene)
scRNA <- AddModuleScore(scRNA, features = list(astro_up), name = "ADup")
scRNA <- AddModuleScore(scRNA, features = list(astro_down), name = "ADdown")
FeaturePlot(scRNA, features = c("ADup1","ADdown1"))
VlnPlot(scRNA, features = c("ADup1","ADdown1"), group.by="group", pt.size=0)

细胞轨迹test5

细胞轨迹test6

在ADup和ADdown的层面，确实是形成了“状态区域”，也证明astro里确实存在“疾病相关表达程序”的整体偏移，提示可以使用up/down的模块分数来选root

try2：使用模块分数选出稳态端当root

稳态端：ADup1低+ADdown1高
反应端：ADup1高+ADdown1低

root_score <- scale(scRNA$ADdown1) - scale(scRNA$ADup1)
names(root_score) <- colnames(scRNA)
# 在earliest分期中
scRNA$braak_num <- suppressWarnings(as.numeric(as.character(scRNA$braak.tangle.stage)))
earliest <- min(scRNA$braak_num, na.rm = TRUE)
cand <- colnames(scRNA)[scRNA$braak_num == earliest]
# 取最稳态的前200个
cand <- cand[order(root_score[cand], decreasing = TRUE)]
root_cells <- head(cand, 200)
# 只要一条主轴
cds <- learn_graph(cds, use_partition = FALSE)
cds <- order_cells(cds, root_cells = root_cells)
scRNA$pseudotime <- monocle3::pseudotime(cds)[colnames(scRNA)]
# 画图代码同上

细胞轨迹test7

细胞轨迹test8

可以看到，umap图上变蓝的部分更多，AD-NC的pseudotime差异也更明显（AD更富集在“后段状态”），同时braak分期更晚的pseudotime更高（说明这条轴与braak有一定相关性）
SLC1A2随pseudotime变化不明显，GFAP后端状态与预期不符，可能是pseudotime高的细胞数在AD/NC不均衡，导致曲线在尾部容易被少量点带偏

想看一下“模块分数随pseudotime的趋势”

df <- FetchData(obj, vars = c("pseudotime", "group", "ADup1", "ADdown1")) %>%
  filter(!is.na(pseudotime))
df_long <- df %>%
  pivot_longer(cols = c("ADup1", "ADdown1"), names_to = "module", values_to = "score")
# 相关系数
cor.test(df$pseudotime, df$ADup1, method = "spearman")  # 总体相关
cor.test(df$pseudotime, df$ADdown1, method = "spearman")
df %>% group_by(group) %>%  # 分组相关
  summarise(
    rho_ADup   = cor(pseudotime, ADup1, method = "spearman"),
    rho_ADdown = cor(pseudotime, ADdown1, method = "spearman"),
    .groups = "drop"
  )
# LOESS平滑曲线
ggplot(df_long, aes(x = pseudotime, y = score, color = group)) +
  geom_point(size = 0.2, alpha = 0.12) +
  geom_smooth(se = FALSE, method = "loess", span = 0.35) +
  facet_wrap(~module, scales = "free_y") +
  theme_classic()
# 分箱均值曲线（避免尾部LOESS被少量点带偏）
df_long <- df_long %>%
  mutate(pt_bin = ggplot2::cut_number(pseudotime, n = 30))
sum_tab <- df_long %>%
  group_by(group, module, pt_bin) %>%
  summarise(
    pt_mid = mean(pseudotime),
    score_mean = mean(score),
    n = dplyr::n(),
    .groups = "drop"
  )
ggplot(sum_tab, aes(x = pt_mid, y = score_mean, color = group)) +
  geom_line() +
  geom_point(aes(size = n), alpha = 0.8) +
  facet_wrap(~module, scales = "free_y") +
  theme_classic() +
  guides(size = "none")

data:  df$pseudotime and df$ADup1
S = 3.3663e+10, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.2405992 

data:  df$pseudotime and df$ADdown1
S = 5.5218e+10, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
-0.2456553 

group  rho_ADup  rho_ADdown
NC	0.07784422	-0.2370371	
AD	0.33427566	-0.1747888	

细胞轨迹test9

ADdown1：NC全程都高于AD，并随pseudotime整体缓慢下降（ADdown基因集作为稳态/保护/对照端程序，在“后段状态”逐渐变弱
ADup1：AD在pseudotime后段明显上升，且上升幅度较NC更大（ADup基因集在“后段状态”增强，而且主要发生在AD细胞里）
相关系数的方向符合预期，rho值小说明“有信号但不是一条单因素决定的轴”（在AD这种复杂疾病状态谱里很正常）
分组相关系数说明这条pseudotime轴更能捕捉“AD中出现的反应/疾病相关程序的进行”

Slingshot+tradeSeq

数据准备：读取之前的Seurat对象并添加ADup和ADdown分数

library(Seurat)
library(tidyverse)
library(patchwork)
library(SingleCellExperiment)
library(slingshot)
library(tradeSeq)
library(scater)
res_dir <- "C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\res"
sceu <- readRDS("C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\data\\GSE157827_astro.rds")
sceu_astro_sub <- readRDS("C:\\Users\\17185\\Desktop\\hERV_calc\\GSE157827\\data\\GSE157827_astro_sub.rds")
sceu_astro_sub$subpop <- factor(sceu_astro_sub$subpop, levels = c("Down","Up"))
marker_Astro <- FindMarkers(
  sceu_astro_sub, 
  group.by = "subpop",
  ident.1 = "Down", 
  ident.2 = "Up"
)
marker_Astro <- marker_Astro[order(marker_Astro$p_val_adj, decreasing = F), ]
marker_Astro_sig <- marker_Astro %>%
  rownames_to_column("gene") %>%
  filter(p_val_adj < 0.05, abs(avg_log2FC) >= 0.25) %>% 
  filter(pct.1 >= 0.1 | pct.2 >= 0.1) %>% 
  filter(!grepl("^(ENSG|MT-|LINC)|^(AC|AL)[0-9]", gene))
astro_down <- marker_Astro_sig %>%
  filter(avg_log2FC > 0) %>%
  arrange(p_val_adj, desc(abs(avg_log2FC))) %>%
  slice_head(n = 20) %>%
  pull(gene)
astro_up <- marker_Astro_sig %>%
  filter(avg_log2FC < 0) %>%
  arrange(p_val_adj, desc(abs(avg_log2FC))) %>%
  slice_head(n = 20) %>%
  pull(gene)
sceu <- AddModuleScore(sceu, features = list(astro_up), name = "ADup")
sceu <- AddModuleScore(sceu, features = list(astro_down), name = "ADdown")

Seurat对象 → SingleCellExperiment对象 → 使用Slingshot建轨迹+pseudotime

sce <- as.SingleCellExperiment(sceu)
reducedDim(sce, "UMAP") <- Embeddings(sceu, "umap")
clus <- sceu$seurat_clusters
# 根据上回的经验，直接用更ADdown的细胞做起始cluster
score <- sceu$ADdown1 - sceu$ADup1
tab0 <- data.frame(cl = clus, score = score, group = sceu$group) %>%
  group_by(cl) %>%
  summarise(score_med = median(score, na.rm = TRUE),
            nc_prop   = mean(group == "NC", na.rm = TRUE),
            n = n(), .groups="drop") %>%
  arrange(desc(score_med), desc(nc_prop))
start_clus <- as.character(tab0$cl[1])
# slingshot
sce <- slingshot(
  sce,
  clusterLabels = "seurat_clusters",
  reducedDim = "UMAP",
  start.clus = start_clus
)
# UMAP + 轨迹曲线
plot(reducedDim(sce, "UMAP"), col = "grey80", pch = 16, cex = 0.4,
     xlab = "UMAP_1", ylab = "UMAP_2")
lines(SlingshotDataSet(sce), lwd = 2, col = "black")
# pseudotime（如果多条轴，先用第一条/主线）
pt_mat <- slingPseudotime(sce)
cw_mat <- slingCurveWeights(sce)
sceu$pseudotime_sling <- pt_mat[, 1]

细胞轨迹test10

> ncol(slingPseudotime(sce))
[1] 4
> table(sceu$seurat_clusters)
   0    1    2    3    4    5    6    7    8 
1799 1457  805  621  545  476  427  184  117 

这说明有4条可行的主路径（分支或不同终点），同时UMAP图中的曲线有交叉和回环，可能是因为UMAP的几何会扭曲全局结构（因此也有一种做法是在PCA空间中进行Slingshot），同时cluster分的较细且彼此连接关系复杂，导致Slingshot会给出多条可能路径，这里通常就需要选一条“最像反应性/激活轴”的主线来解释，这里为了简单测试就直接取了第一条，如果想认真选的话，可以用cellWeights统计每条lineage覆盖的细胞数，选覆盖最多的一条当主线，并只用这一条做后面的tradeSeq

# pseudotime是否与ADup/ADdown对齐
FeaturePlot(sceu, "pseudotime_sling") | VlnPlot(sceu, "pseudotime_sling", group.by = "group", pt.size = 0)
# 模块分数随pseudotime的趋势（辅助解释轨迹方向）
df <- FetchData(sceu, vars = c("pseudotime_sling","group","ADup1","ADdown1")) %>%
  filter(!is.na(pseudotime_sling))
df_long <- df %>%
  pivot_longer(cols = c("ADup1","ADdown1"), names_to = "module", values_to = "score") %>%
  mutate(pt_bin = ggplot2::cut_number(pseudotime_sling, n = 30))
sum_tab <- df_long %>%
  group_by(group, module, pt_bin) %>%
  summarise(
    pt_mid = mean(pseudotime_sling),
    score_mean = mean(score),
    n = n(),
    .groups = "drop"
  ) %>%
  filter(n >= 50)  # 可改 30/50/100
p1 <- ggplot(sum_tab, aes(x = pt_mid, y = score_mean, color = group)) +
  geom_line() +
  geom_point(aes(size = n), alpha = 0.8) +
  facet_wrap(~module, scales = "free_y") +
  theme_classic() +
  guides(size = "none")
# 样本级分析
meta <- FetchData(sceu, vars = c("pseudotime_sling","group","orig.ident")) %>%
  filter(!is.na(pseudotime_sling))
sample_tab <- meta %>%
  group_by(orig.ident, group) %>%
  summarise(pt_median = median(pseudotime_sling),
            pt_mean = mean(pseudotime_sling),
            .groups="drop")
wilcox.test(pt_median ~ group, data = sample_tab)
p2 <- ggplot(sample_tab, aes(x = group, y = pt_median)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.15, height = 0) +
  theme_classic()
p1 | p2

细胞轨迹test11

细胞轨迹test12

这里的结果和上面monocle3的类似（AD与NC的细胞分布存在偏移，并且与ADup/ADdown模块分数呈一致的趋势变化），说明选的这条线路很可能是在刻画“稳态 → 反应/应激”的连续变化

tradeSeq：轨迹上“变化基因”与“AD/NC 曲线不同”的基因

# 准备counts+选择基因集合（这里作为测试只使用HVG）
use_genes <- VariableFeatures(sceu)
counts <- GetAssayData(sceu, slot = "counts")[use_genes, ]
# 过滤NA
pt1 <- pt_mat[, 1, drop = FALSE]
cw1 <- cw_mat[, 1, drop = FALSE]
keep <- !is.na(pt1[,1]) & (cw1[,1] > 0)
counts2 <- counts[, keep]
pt1_2   <- pt1[keep, , drop = FALSE]
cw1_2   <- cw1[keep, , drop = FALSE]
cond2   <- factor(sceu$group[keep])
# 拟合GAM
cond <- sceu$group
set.seed(1)
gam <- fitGAM(
  counts = counts2,
  pseudotime = pt1_2,
  cellWeights = cw1_2,
  conditions = cond2,
  nknots = 6,
  verbose = TRUE
)
# 哪些基因沿轨迹变化（不分组）
asso <- associationTest(gam)
asso <- asso[order(asso$pvalue), ]
# 哪些基因在轨迹上AD/NC曲线不同
ct <- conditionTest(gam)
ct <- ct[order(ct$pvalue), ]

associationTest：找沿轨迹变化的基因（基础轨迹信号）
conditionTest：找“同一轨迹位置上，AD-NC表达曲线不同”的基因（疾病差异）

> head(asso, 10)
            waldStat df pvalue meanLogFC
KCNIP4     2119.4446 11      0 1.1746620
CSMD1      1911.0154 11      0 1.1147822
SYT1       1920.7033 11      0 1.3592505
AC109466.1  266.5855 11      0 1.2754283
ATRNL1     1310.6656 11      0 1.8489925
SNTG1      1336.8417 11      0 1.1888490
HS6ST3      996.8393 11      0 2.2509793
NRXN3      2272.4418 11      0 1.8331828
RIMS2      1343.5195 11      0 1.5773849
NEGR1       839.9709 11      0 0.7599727
> head(ct, 10)
         waldStat df pvalue
KCND2    90.97927  6      0
CADPS   117.12250  6      0
DPP10   170.40654  6      0
AGBL4   150.26273  6      0
MT-CO3   89.87541  6      0
MT-ATP6 208.06991  6      0
MT-CO2  148.37502  6      0
PHACTR1  99.24620  6      0
MT-ND3  196.30111  6      0
MT-ND4  118.87843  6      0

# top基因在轨迹上的平滑曲线
top_asso <- rownames(asso)[1:6]
top_ct   <- rownames(ct)[1:6]
association_top_genes <- plotSmoothers(gam, counts, gene = top_asso, alpha = 1)
conditionTest_top_genes <- plotSmoothers(gam, counts, gene = top_ct, alpha = 1)
association_top_genes | conditionTest_top_genes

细胞轨迹test13

这里得到的top基因很多都是神经元/突触相关基因，可能是因为细胞集内混入了神经元/双细胞/环境RNA，或者技术差异（比如线粒体基因含量）的影响。如果后续要进行分析，可能要剔除掉有“污染/双细胞”的cluster