Preranked GSEA#
Setup#
Download a MEME motif database file containing human transcription factor binding motifs from the Cis-BP resource, alongside the human genome in fasta format.
%%bash
[ -f data/cisBP_human.meme ] || wget -O data/cisBP_human.meme https://osf.io/download/uk6vn
[ -f data/GRCh38.fa ] || wget -O data/GRCh38.fa.gz https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/GRCh38.p13.genome.fa.gz
[ -f data/GRCh38.fa ] || gunzip -c data/GRCh38.fa.gz > data/GRCh38.fa
import gatac as ga
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
sc.set_figure_params(dpi=100, frameon=False)
MOTIF_FILE = "data/cisBP_human.meme"
peak_adata = ad.read_h5ad("data/pbmc_peaks.h5ad")
# Get marker peaks for cluster 0
markers = ga.tl.marker_peaks(peak_adata, "leiden")
print("Available clusters:", list(markers.keys()))
15:43:10 - gatac.tl.markers - INFO - Running marker peak detection for groups in 'leiden'
15:43:10 - gatac.tl.markers - INFO - Testing 13 groups, 178,350 features, 5,001 cells
15:43:10 - gatac.tl.markers - INFO - Using max_cells=500 per group for balanced comparison
15:43:12 - gatac.tl.markers - INFO - Marker peak detection complete. Results stored in adata.uns['marker_peaks']
Available clusters: ['0', '1', '10', '11', '12', '2', '3', '4', '5', '6', '7', '8', '9']
1. Prepare ranked peak list#
# Use cluster 0 markers ranked by log2 fold-change
cluster = "0"
marker_df = markers[cluster].sort_values("log2_fc", ascending=False)
# Build a DataFrame indexed by peak name with the expected column name
rankings = marker_df.set_index("feature")[["log2_fc"]].rename(columns={"log2_fc": "log2fc"})
print(f"Ranked {len(rankings):,} peaks for cluster {cluster}")
Ranked 5,575 peaks for cluster 0
2. Build motif gene sets#
Map each motif to the set of peaks that contain that motif.
motifs = ga.tl.read_motifs(MOTIF_FILE, unique=True)
ga.tl.scan_motifs(
peak_adata,
motifs=motifs,
genome_fasta="data/GRCh38.fa",
)
motif_names = [m.name for m in motifs]
# Build gene sets: motif → list of peaks containing that motif
motif_matrix = peak_adata.varm["motif_match"]
import scipy.sparse as sp
peak_names = peak_adata.var_names.tolist()
motif_terms = {
motif_names[j]: [peak_names[i] for i in motif_matrix[:, j].nonzero()[0]]
for j in range(motif_matrix.shape[1])
if motif_matrix[:, j].nnz > 0
}
print(f"Built {len(motif_terms)} motif gene sets")
15:43:18 - gatac.tl.chromvar - INFO - Scanning 1165 motifs in 178350 peaks...
15:43:18 - gatac.tl.chromvar - INFO - Using 0-based coordinate system
15:43:18 - gatac.tl.chromvar - INFO - Fetching sequences from genome...
15:43:20 - gatac.tl.chromvar - INFO - Background from sequences: A=0.2589 C=0.2412 G=0.2411 T=0.2588
15:43:20 - gatac.tl.chromvar - INFO - Computing score thresholds...
15:43:24 - gatac.tl.chromvar - INFO - Scanning motifs on GPU in 4 batch(es) of 50000 peaks...
motif_matrix = peak_adata.varm["motif_match"]
import scipy.sparse as sp
peak_names = peak_adata.var_names.tolist()
motif_terms = {
motif_names[j]: [peak_names[i] for i in motif_matrix[:, j].nonzero()[0]]
for j in range(motif_matrix.shape[1])
if motif_matrix[:, j].nnz > 0
}
print(f"Built {len(motif_terms)} motif gene sets")
Built 1165 motif gene sets
3. Run preranked GSEA#
import time
t0 = time.perf_counter()
gsea_df = ga.tl.gsea_motif_enrichment(
peak_adata,
rankings=rankings,
)
print(f"GSEA in {time.perf_counter() - t0:.1f}s")
gsea_df.nsmallest(10, "fdr")
17:24:26 - gatac.tl.motif - INFO - Built 1,165 motif feature sets (≥15 peaks; filter to ≥15)
17:24:26 - gatac.tl.motif - INFO - Running GSEA (gpu) on provided rankings...
17:24:27 - gatac.tl.gsea - INFO - GPU GSEA: 1137 feature sets, 5305 features, 1000 permutations (gs_batch=4, perm_batch=256)
17:24:27 - gatac.tl.gsea - INFO - Generating permutation indices...
GSEA in 11.1s
| motif | NES | pval | fdr | lead_edge_n | set_size | lead_edge_frac | |
|---|---|---|---|---|---|---|---|
| 0 | CEBPA | 5.364148 | 0.0 | 0.0 | 107 | 315 | 0.339683 |
| 1 | CEBPB | 4.925610 | 0.0 | 0.0 | 66 | 160 | 0.412500 |
| 2 | CEBPG | 4.235527 | 0.0 | 0.0 | 59 | 129 | 0.457364 |
| 3 | CEBPE | 3.591549 | 0.0 | 0.0 | 41 | 101 | 0.405941 |
| 4 | CEBPD | 3.550723 | 0.0 | 0.0 | 42 | 113 | 0.371681 |
| 5 | NFE2L2 | 3.454037 | 0.0 | 0.0 | 95 | 356 | 0.266854 |
| 6 | BACH1 | 3.372003 | 0.0 | 0.0 | 96 | 360 | 0.266667 |
| 7 | FOSL2 | 2.807798 | 0.0 | 0.0 | 86 | 345 | 0.249275 |
| 8 | TEF | 2.780295 | 0.0 | 0.0 | 33 | 94 | 0.351064 |
| 9 | BACH2 | 2.765251 | 0.0 | 0.0 | 117 | 402 | 0.291045 |
4. GSEA enrichment plot#
top_results = gsea_df.nsmallest(20, "fdr")
fig, ax = plt.subplots(figsize=(7, 6))
colors = ["#d62728" if nes > 0 else "#1f77b4" for nes in top_results["NES"]]
bars = ax.barh(range(len(top_results)), top_results["NES"], color=colors)
ax.set_yticks(range(len(top_results)))
ax.set_yticklabels(top_results["motif"])
ax.axvline(0, color="black", lw=0.8)
ax.set_xlabel("Normalized Enrichment Score")
ax.set_title(f"Top GSEA motif results — cluster {cluster}")
plt.tight_layout()
plt.show()